In [1]:
import os
import pandas as pd
import numpy as np
import neural_style
import download_models
from PIL import Image
import shutil

In [2]:
def choose_classes(labels, n_classes, n_train, n_test):
    """
    Randomly picks up to n_classes classes that have at least n_train train samples
    and at least n_test test samples in the given labels dataframe.

    Classes are visited in a random order (np.random.permutation); a class that does
    not have enough samples in either split -- including a class that has no rows at
    all for one of the splits -- is skipped.

    parameters:
        labels: pandas dataframe with the columns 'filename', 'label' and 'split'
                ('split' is compared against the values 'train' and 'test')
        n_classes: required number of classes
        n_train: required number of train samples per class
        n_test: required number of test samples per class
    returns:
        chosen_classes: a list with the names of the chosen classes; it may contain
                        fewer than n_classes entries when not enough classes qualify
    raises:
        AssertionError: if the dataframe holds fewer than n_classes distinct classes
    """
    #array of classes; NaN labels are dropped so that the number of classes and the
    #array we index into always agree (nunique() ignores NaN, unique() does not)
    classes = labels['label'].dropna().unique()
    #number of classes
    total_classes = len(classes)

    assert total_classes >= n_classes, "n_classes must be smaller than the number of available classes. (choose_classes function)"

    #random visiting order over all classes
    class_random_sampling = np.random.permutation(total_classes)

    #count samples for each (label, split) pair; the dict keys are (label, split) tuples
    samples_per_class = labels.groupby(['label', 'split'])['filename'].count().to_dict()

    chosen_classes = []
    for c in class_random_sampling:
        if len(chosen_classes) >= n_classes:
            break
        current_class = classes[c]
        #a split with no rows at all is simply counted as 0 samples
        train_samples_per_class = samples_per_class.get((current_class, 'train'), 0)
        test_samples_per_class = samples_per_class.get((current_class, 'test'), 0)
        if (train_samples_per_class < n_train) or (test_samples_per_class < n_test):
            #if not enough samples, we ignore this class and take the next one
            continue
        chosen_classes.append(current_class)

    return chosen_classes

def choose_styles(labels, n_styles):
    """
    Chooses n_styles distinct styles at random from the labels dataframe.

    parameters:
        labels: pandas dataframe containing a 'label' column with the style name of each file
        n_styles: number of styles to choose
    returns:
        chosen_styles: a list with the names of the n_styles chosen styles
    raises:
        AssertionError: if the dataframe holds fewer than n_styles distinct styles
    """
    #array of style classes; NaN labels are dropped so the count and the array agree
    classes = labels['label'].dropna().unique()
    #number of style classes
    total_classes = len(classes)

    assert total_classes >= n_styles, "n_styles must not be larger than the number of available styles. (choose_styles function)"

    #random permutation of all style indices; the first n_styles entries are the selection
    class_random_sampling = np.random.permutation(total_classes)

    #the previous loop compared the list itself to n_styles (never equal), so it never
    #stopped early and returned every style; slicing returns exactly n_styles of them
    chosen_styles = [classes[s] for s in class_random_sampling[:n_styles]]
    return chosen_styles
 
def choose_style_image(style_location, style, split):
    """
    Given a style and a split, reads labels.csv in style_location and returns the
    filename of one randomly chosen image of that style and split.

    parameters:
        style_location: folder containing the style labels.csv (columns used here:
                        'filename', 'label', 'split')
        style: name of the style
        split: which split the style image must belong to (e.g. 'train' or 'test')
    returns:
        chosen_image: the value of the 'filename' column of the chosen row (the bare
                      filename as stored in labels.csv, not a joined path)
    raises:
        IndexError: if labels.csv has no image for the requested style/split pair
    """
    label = os.path.join(style_location, "labels.csv")
    label_csv = pd.read_csv(label)

    #filenames of all images that have both the requested style and the requested split
    matches = (label_csv['label'] == style) & (label_csv['split'] == split)
    style_split_images = label_csv.loc[matches, 'filename']

    if len(style_split_images) == 0:
        raise IndexError("no style image found for style %r in split %r (%s)" % (style, split, label))

    #choose a random image from this list; np.random.rand() returns a scalar in [0, 1),
    #so no 1-element array has to be converted to int
    random_index = int(len(style_split_images) * np.random.rand())
    chosen_image = style_split_images.iloc[random_index]
    return chosen_image

In [3]:
def create_stylized_dataset(location, location_styles, n_classes, n_styles, n_train, n_test, p_train, output_location,
        #the rest are remaining neural style transfer arguments
        p_style_weight="1e2", p_content_weight="5e0", p_num_iterations="1000", p_learning_rate = "1e0", 
        p_gpu="0", p_image_size="512", p_style_blend_weights="None", p_normalize_weights="False", p_normalize_gradients="False", p_tv_weight="1e-3", p_init='random', p_init_image="None", p_optimizer='lbfgs', 
        p_lbfgs_num_correction="100",
        p_print_iter="0", p_save_iter="0", p_style_scale="1.0", p_original_colors = "0", p_model_file='models/vgg19-d01eb7cb.pth', p_disable_check="False",
        p_backend='nn', p_cudnn_autotune="False", p_pooling='max',
        p_seed="-1", p_content_layers='relu4_2', p_style_layers='relu1_1,relu2_1,relu3_1,relu4_1,relu5_1', p_multidevice_strategy='4,7,29'):
    """
    Given a domain, extracts (n_train, n_test) samples from n_classes. It assigns a dominant style to each class. For the class, 
        it assigns p_train*n_train (rounded down) samples to the dominant style, and (1-p_train)/(n_styles-1) (rounded down) samples to the non-dominant styles.
        Samples which are left out (due to rounding down) are assigned the dominant style. 
        Generates a .csv file which contains the filenames, the classes, the splits and the assigned style to them.
        TODO: Apply neural style transfer and create the actual datasets.
    parameters:
        location: the location(s) of the domain(s) to be used; if multiple domains given, choose randomly;
                format of domain folders:
                    location_folder
                        >data
                            >>all images will be in this folder
                        >labels.csv
                            >>this file will contain the columns: filename, label, split
                                >>filename: name of the file
                                >>label: name of the class
                                >>split: test or train; specifying the split of the sample
        location_styles: the location with the styles to be used; format:
            location_styles
                >data
                    >>all style images will be in this folder
                >labels.csv
                    >>this file will contain the columns: filename, label
                        >>filename: name of the style file
                        >>label: name of the style
                        >>split: which split the style is in

        n_classes: amount of classes from the dataset to apply styles to
        n_styles: amount of styles to apply
            (n_classes, n_styles) should be equal?
        n_train: amount of train samples per class
        n_test: amount of test samples per class
            (n_train, n_test) should be equal?
        p_train: amount of bias in train set (p=0.9 => 90% of images will be of dominant class)
        output_location: location where to output the label, data
    output:
        tbd
    """
    if type(location)==str:
        pass
    elif type(location) in (list,tuple):
        location = random.choice(location)
    else:
        #you can't have neither a list, tuple nor a str!!!!
        raise Exception("Please don't do this to me (╥﹏╥)")
    
    data = os.path.join(location, "data")
    label_loc = os.path.join(location, "labels.csv")
    labels = pd.read_csv(label_loc)

    
    chosen_classes = choose_classes(labels, n_classes, n_train, n_test)
    assert len(chosen_classes) == n_classes, "Likely there aren't enough classes to have at least n_train,n_test samples (too many or too little classes were chosen)"

    style_label_loc = os.path.join(location_styles, "labels.csv")
    style_labels = pd.read_csv(style_label_loc)    
    chosen_styles = choose_styles(style_labels, n_styles)    
    assert len(chosen_styles) == n_styles, "Likely there aren't enough styles offered in the style label.csv file"

    #dominant ratio: the ratio of train samples in the dominant style
    #converting ratio to percentage
    dominant_ratio = 100*p_train/100
    #non dominant ratio: the ratio of train samples in the non-dominant styles
    #converting ratio to percentage 100 is used to fix bug where 1-0.9=0.499999999999999999 -.-
    non_dominant_ratio = (100-100*p_train)/(n_styles-1)/100
    
    #amount of images in dominant and non-dominant styles
    images_dominant = int(n_train * dominant_ratio)
    images_non_dominant = int(n_train * non_dominant_ratio)
    
    images_test_set = int(n_test * (1/n_styles))
    #assert images_test_set * n_styles == n_test, 
    #                "Proportions not worked out for the test set; pick values which result in whole numbers, please"
    
    #code doesn't work if ratios result in non-integer numbers; could be improved
    #update; code has been improved by adding leftover images to the dominant class
    #assert (images_dominant + (n_styles-1) * images_non_dominant) == n_train, "Proportions not worked out; pick values which result in whole numbers"

    #the assumption is that n_classes, n_styles are equal
    #pairs each class with a style which will be dominant
    dominant_style_class = []
    for c,s in zip(chosen_classes, chosen_styles):
        dominant_style_class.append([c,s])

    labels_wstyles = pd.DataFrame()


    for d_s_c in dominant_style_class:
        #select the class from the d_s_c pair
        c = d_s_c[0]
        #select samples from current class
        samples = labels[labels['label']==c].reset_index(drop=True)
        #select train samples for current class c
        train_samples = samples[samples['split']=='train'].reset_index(drop=True)
        #select test samples for current class c
        test_samples = samples[samples['split']=='test'].reset_index(drop=True)

        #get permutations for train,test samples
        random_train_permutation = np.random.permutation(np.arange(len(train_samples)))
        random_test_permutation = np.random.permutation(np.arange(len(test_samples)))


        #select first n_train, n_test samples from permutation
        chosen_trains = train_samples.iloc[random_train_permutation[0:n_train]][['filename', 'label', 'split']]
        chosen_tests = test_samples.iloc[random_test_permutation[0:n_test]][['filename', 'label', 'split']]
        #reset indices of chosen samples
        chosen_trains = chosen_trains.reset_index(drop=True)
        chosen_tests = chosen_tests.reset_index(drop=True)

        #get permutations for train,test samples
        chosen_train_permutation = np.random.permutation(np.arange(len(chosen_trains)))
        chosen_test_permutation = np.random.permutation(np.arange(len(chosen_tests)))

        #assign styles to train set
        for s in chosen_styles:
            if d_s_c[1]==s:
                #we have the dominant style
                #select first images_dominant to be of dominant styles, the remaining will be left for the other styles
                chosen_images = chosen_train_permutation[:images_dominant]
                chosen_train_permutation = chosen_train_permutation[images_dominant:]
                chosen_trains.loc[chosen_images, 'style'] = s

            else:
                #we have non dominant style
                #select first images_non_dominant to be of non dominant styles, the remaining will be left for the other styles
                chosen_images = chosen_train_permutation[:images_non_dominant]
                chosen_train_permutation = chosen_train_permutation[images_non_dominant:]
                chosen_trains.loc[chosen_images, 'style'] = s

        if len(chosen_train_permutation)>0:
            #if there's leftover images, assign them to dominant class
            chosen_trains.loc[chosen_train_permutation, 'style'] = d_s_c[1]
        #assign styles to test set
        for s in chosen_styles:
            #we have non dominant style
            #select first images_test_set to be of non dominant styles, the remaining will be left for the other styles
            chosen_images = chosen_test_permutation[:images_test_set]
            chosen_test_permutation = chosen_test_permutation[images_test_set:]
            chosen_tests.loc[chosen_images, 'style'] = s

        #assign leftover images to the last style in chosen_styles
        if len(chosen_test_permutation) > 0:
            chosen_tests.loc[chosen_test_permutation, 'style'] = chosen_styles[-1]

        labels_wstyles = pd.concat((labels_wstyles, chosen_trains), ignore_index=True)
        labels_wstyles = pd.concat((labels_wstyles, chosen_tests), ignore_index=True)

    #for all entries, choose a style image of given style
    style_location_list = []
    for index,row in labels_wstyles.iterrows():
        style_location_list.append(choose_style_image(location_styles, row['style'], row['split']))

    style_location_df = pd.DataFrame(style_location_list, columns=['style_filename'])
    labels_wstyles['style_filename'] = style_location_df

    #TODO: add neural style transferoutput_location
    #TODO: formalize output
    output_label = os.path.join(output_location, 'label.csv')
    output_data = os.path.join(output_location, "data")

    #create folders necessary for output_data (and output_label)
    os.makedirs(output_data, exist_ok=True)
    
    
    labels_wstyles.to_csv(output_label)

    #create stylised dataset
    for index,row in labels_wstyles.iterrows():
        location_style_image = os.path.join(location_styles, "data", row['style_filename'])
        location_content_image = os.path.join(location, "data", row['filename'])
        location_output_image = os.path.join(output_location, "data", row['filename'])
        
        with Image.open(location_content_image) as img:
            width, height = img.size
            #choosing the smaller value between image size, and the requested p_image_size
            #target_image_size = int(max(width, height))
            target_image_size = min(p_image_size, int(max(width, height)))
            
        
        command = "/kaggle/usr/lib/neural_style/neural_style.py -style_image %s -style_blend_weights %s -content_image %s -image_size %s -gpu %s -content_weight %s -style_weight %s normalize_weights %s -normalize_gradients %s -tv_weight %s -num_iterations %s -init %s -init_image %s -optimizer %s -learning_rate %s -lbfgs_num_correction %s -print_iter %s -save_iter %s -output_image %s -style_scale %s -original_colors %s -pooling %s -model_file %s -disable_check %s -backend %s -cudnn_autotune %s -seed %s -content_layers %s -style_layers %s -multidevice_strategy %s" %(
                location_style_image, p_style_blend_weights, location_content_image, target_image_size, p_gpu, p_content_weight, p_style_weight, p_normalize_weights, p_normalize_gradients, p_tv_weight, p_num_iterations, p_init, p_init_image, p_optimizer, p_learning_rate, p_lbfgs_num_correction, p_print_iter, p_save_iter, location_output_image, p_style_scale, p_original_colors, p_pooling, p_model_file, p_disable_check, p_backend, p_cudnn_autotune, p_seed, p_content_layers, p_style_layers, p_multidevice_strategy)
        !python3 $command
        print("Finished running: %s" %command)
                    
    return 0

In [4]:
# Directory that receives the pretrained model files.
p_model_location = "/kaggle/working/models"
# Presumably fetches the model weights into that directory (the cell output below
# reports success) -- verify against download_models.
download_models.main(p_model_location)
# Path of the VGG-19 weight file, later passed to create_stylized_dataset as p_model_file.
p_model_file = os.path.join(p_model_location, "vgg19-d01eb7cb.pth")

All models have been successfully downloaded


In [None]:
# Experiment configuration.
# Content domain folder (expected to contain data/ and labels.csv).
location = "/kaggle/input/ter-set-1/archive/data/Human_Actions"
# Style collection folder (expected to contain data/ and labels.csv).
location_styles = "/kaggle/input/ter-set-1/Classified_Style_Dataset/Classified_Style_Dataset"
# Number of classes and styles; classes and styles are paired one-to-one.
n_classes = 3
n_styles = 3
# Samples drawn per class for each split.
n_train = 40
n_test = 39
# Share of each class' train samples that receives the class' dominant style.
p_train = 0.9
# Destination of label.csv and of the stylized images (in its data/ subfolder).
output_location = "/kaggle/working/output/Human_Actions_Stylized_Experiment_09122022"

# Runs one neural style transfer per selected image, so this call is long-running.
create_stylized_dataset(location, location_styles, n_classes, n_styles, n_train, n_test, p_train, output_location,
                        p_model_file = p_model_file, p_original_colors = "1", p_style_weight=25, p_image_size=512
        #the rest are remaining neural style transfer arguments
                       )
# The bare string below is not executed as code: it only keeps the remaining keyword
# arguments and their defaults around for copy/paste into the call above.
"""
        ,p_style_weight=1e2, p_content_weight=5e0, p_num_iterations=1000, p_learning_rate = 1e0, 
        p_gpu=0, p_image_size=512, p_style_blend_weights=None, p_normalize_weights=False, p_normalize_gradients=False, p_tv_weight=1e-3, p_init='random', p_init_image=None, p_optimizer='lbfgs', 
        p_lbfgs_num_correction=100,
        p_print_iter=0, p_save_iter=0, p_style_scale=1.0, p_original_colors = 0, p_model_file='models/vgg19-d01eb7cb.pth', p_disable_check=False, 
        p_backend='nn', p_cudnn_autotune=False, p_pooling='max',
        p_seed=-1, p_content_layers='relu4_2', p_style_layers='relu1_1,relu2_1,relu3_1,relu4_1,relu5_1', p_multidevice_strategy='4,7,29')
"""
# Ending the cell on print() presumably keeps the string literal above from being
# echoed as the cell's output value.
print()

All models have been successfully downloaded
VGG-19 Architecture Detected
Successfully loaded /kaggle/working/models/vgg19-d01eb7cb.pth
Capturing style target 1
Running optimization with L-BFGS
Finished running: /kaggle/usr/lib/neural_style/neural_style.py -style_image /kaggle/input/ter-set-1/Style_Dataset/Style_Dataset/data/Cathedral_5.png -style_blend_weights None -content_image /kaggle/input/ter-set-1/archive/data/Human_Actions/data/running_092.jpg -image_size 512 -gpu 0 -content_weight 5e0 -style_weight 25 normalize_weights False -normalize_gradients False -tv_weight 1e-3 -num_iterations 1000 -init random -init_image None -optimizer lbfgs -learning_rate 1e0 -lbfgs_num_correction 100 -print_iter 0 -save_iter 0 -output_image /kaggle/working/output/Human_Actions_Stylized_Experiment/data/running_092.jpg -style_scale 1.0 -original_colors 1 -pooling max -model_file /kaggle/working/models/vgg19-d01eb7cb.pth -disable_check False -backend nn -cudnn_autotune False -seed -1 -content_layers re

In [11]:
#shutil.rmtree("/kaggle/working/output/")
#os.remove("/kaggle/working/output_archive.zip")
#os.makedirs("/kaggle/working/")

In [7]:
# shutil is already imported in the first cell; this repeated import only matters if
# this cell is run on its own.
import shutil
# Zips the whole /kaggle/working/output folder into /kaggle/working/output_archive.zip
# (make_archive appends the ".zip" extension and returns the archive path).
shutil.make_archive("/kaggle/working/output_archive", 'zip', "/kaggle/working/output")

'/kaggle/working/output_archive.zip'

52