# PRETRAINED VGG16 FINE-TUNED AND RETRAINED ON SEM IMAGES + CATEGORIES

In [None]:
import matplotlib.pyplot as plt
from utils import *

## Import data

In [None]:
# Walk the labelled image tree below root_dir. For every file found, the
# directory path relative to root_dir is recorded as its class label and the
# '/'-normalised file path is recorded alongside it (the two lists stay aligned).
root_dir = './images/'
img_files_paths = []
class_labels = []

for current_dir, _subdirs, entries in os.walk(root_dir):
    # Same label for every file of this directory, so compute it once.
    label = os.path.relpath(current_dir, root_dir)
    for entry in entries:
        if 'DS_Store' in entry:
            # Skip macOS folder-metadata files.
            continue
        class_labels.append(label)
        full_path = os.path.join(root_dir, os.path.join(label, entry))
        # Normalise Windows separators so later '/'-splits work everywhere.
        img_files_paths.append(full_path.replace("\\", "/"))

In [None]:
def list_image_files(root):
    """Return the '/'-separated paths of every file found below *root*.

    The tree is walked recursively with ``os.walk``. Entries whose name
    contains ``'DS_Store'`` are skipped. Each path is built from the directory
    currently being visited (not from *root*), so files that live in nested
    sub-folders resolve to a path that actually exists.
    """
    found = []
    for current_dir, _subdirs, entries in os.walk(root):
        for entry in entries:
            if 'DS_Store' in entry:
                continue
            found.append(os.path.join(current_dir, entry).replace("\\", "/"))
    return found


# Collect the unlabelled image paths from both "unknown" folders, in order.
root_dir1 = './unknown_DB865/'
root_dir2 = './unknown_DB866/'
img_files_paths_unknown = list_image_files(root_dir1) + list_image_files(root_dir2)

In [None]:
class_type = 'pseudo-vs-all' #  all | pseudo-vs-all | genres

# Per-mode relabelling rule applied to every entry of class_labels:
#   pseudo-vs-all : keep 'pseudoracemus', everything else becomes 'other'
#   genres        : 'pseudoracemus' is folded into 'racemus'
#   all (or any value not listed here) : labels are left untouched
relabel_rules = {
    'pseudo-vs-all': lambda label: label if label == 'pseudoracemus' else 'other',
    'genres': lambda label: label if label != 'pseudoracemus' else 'racemus',
}

relabel = relabel_rules.get(class_type)
if relabel is not None:
    class_labels = [relabel(label) for label in class_labels]

In [None]:
# (label, count) pairs sorted from most to least frequent, plus the matching
# relative frequencies in the same order.
label_counter = Counter(class_labels).most_common()
n_labels = float(len(class_labels))
label_freq = [count / n_labels for _label, count in label_counter]
label_counter, label_freq

In [None]:
# Two-column table: column 0 holds the image paths, column 1 the class labels.
X = np.array(img_files_paths)
y = np.array(class_labels)
stacked = pd.DataFrame([X, y])  # one row per array ...
train_df = stacked.transpose()  # ... flipped so that each row is one image

In [None]:
# Augmentation / normalisation settings for the training images.
augmentation_options = {
    'preprocessing_function': preprocess_input,
    'rescale': 1./255,
    'rotation_range': 10,
    'brightness_range': [0.5,1.8],
    'zoom_range': [0.7,1],
    'horizontal_flip': True,
    'fill_mode': "reflect",
}
train_datagen = ImageDataGenerator(**augmentation_options)

# One image per batch, read from train_df (x_col 0 = path, y_col 1 = label),
# resized to 224x224 with one-hot ("categorical") targets.
train_generator = train_datagen.flow_from_dataframe(
    train_df,
    x_col=0,
    y_col=1,
    target_size=(224, 224),
    batch_size=1,
    class_mode='categorical',
)

## Example of data augmentation

In [None]:
# Visual sanity check: show nine random augmentations of one sample image.

# load the image and turn it into a batch of one sample
img = load_img('./4_classes_full/racemus/14052020_B1_A_04.jpg')
data = img_to_array(img)
samples = expand_dims(data, 0)

# Same geometric / brightness augmentation as the training generator, but
# WITHOUT preprocessing_function and rescale: the preview casts the batch to
# uint8 for imshow, and pixels scaled by 1/255 would all truncate to 0 or 1,
# i.e. an essentially black picture. Normalisation is a model-input concern
# and is not needed to look at the augmentation itself.
datagen = ImageDataGenerator(rotation_range=10,
                             brightness_range=[0.5,1.8],
                             zoom_range=[0.7,1],
                             horizontal_flip=True,
                             fill_mode="reflect")
# prepare iterator
it = datagen.flow(samples, batch_size=1)

# generate samples and plot them on a 3x3 grid
for i in range(9):
    # `plt` is the alias this file imports for matplotlib.pyplot
    plt.subplot(3, 3, i + 1)
    # builtin next() works on any iterator, independent of a .next() method
    batch = next(it)
    # pixel values are still in the 0-255 range here, so the cast is valid
    image = batch[0].astype('uint8')
    plt.imshow(image)

## Internal Validation + External test: VGG16 IMAGES+CATEGORIES

In [None]:
# Load the per-image category tables (semicolon-separated CSV) for the
# labelled set and for the unknown set; they are joined to the image paths
# by filename further down.
csv_options = {'sep': ';'}
struct_df = pd.read_csv('./Categories.csv', **csv_options)
struct_unknown_df = pd.read_csv('./Categories_unknown.csv', **csv_options)
struct_df

In [None]:
# Bare file name (text after the last '/') of every labelled image.
file_name_list = [path.rsplit('/', 1)[-1] for path in img_files_paths]

# filename | img_path | class, one row per labelled image
filename_df = pd.DataFrame(file_name_list)
filename_df['img_path'] = img_files_paths
filename_df['class'] = class_labels
filename_df.columns = ['filename', 'img_path', 'class']

# Attach the category columns of struct_df to each image via its file name.
full_ohe_df = filename_df.merge(struct_df, on='filename')
full_ohe_df

In [None]:
# Bare file name (text after the last '/') of every unknown image.
file_name_list = [path.rsplit('/', 1)[-1] for path in img_files_paths_unknown]

# filename | img_path, one row per unknown image
filename_df = pd.DataFrame(file_name_list)
filename_df['img_path'] = img_files_paths_unknown
filename_df.columns = ['filename', 'img_path']

# Attach the category columns of struct_unknown_df via the file name.
full_ohe_df_unknown = filename_df.merge(struct_unknown_df, on='filename')
full_ohe_df_unknown

In [None]:
# Column-wise totals of struct_df from the 4th column onward.
# axis=0 is spelled out: np.sum(DataFrame) relies on pandas' axis=None
# reduction, which is deprecated and will collapse to a single scalar.
struct_df.iloc[:, 3:].sum(axis=0)

In [None]:
# One-hot encode the class label and append the indicator columns on the right
# (later cells address them as the last n_classes columns).
# dtype is pinned to uint8 (the pre-2.0 pandas default): pandas >= 2.0 would
# otherwise emit bool columns, which do not mix cleanly with the other numeric
# target columns when they are pulled out together as one array.
class_ohe = pd.get_dummies(full_ohe_df['class'], dtype='uint8')
full_ohe_df = pd.concat([full_ohe_df, class_ohe],axis=1)
full_ohe_df

In [None]:
# Remove the bookkeeping columns so that only 'img_path' and the remaining
# target columns are left.
unused_columns = ['filename', 'trueclass', 'class']
full_ohe_df = full_ohe_df.drop(unused_columns, axis=1)
full_ohe_df

In [None]:
# Remove the bookkeeping columns from the unknown-image table as well.
unused_unknown_columns = ['filename', 'trueclass']
full_ohe_df_unknown = full_ohe_df_unknown.drop(unused_unknown_columns, axis=1)
full_ohe_df_unknown

In [None]:
################ PARAMETERS ################
# Run-wide settings read by the training cells below. Several of them are not
# referenced in the visible cells and are presumably consumed by helpers
# defined elsewhere (e.g. create_model) — confirm before changing them.

hidden_vgg = 128  # written into save_dir as "hn-<value>"; presumably a hidden-layer width — TODO confirm
dropout = True  # not read in the visible cells; presumably toggles dropout in the model — TODO confirm

lr_coeff = 5  # written into save_dir as "lr-<value>"
learning_rate = 10 / (10 ** lr_coeff)  # 10 / 10**5 = 1e-4 with the value above
reg = 'l2'  # not read in the visible cells; presumably the weight regulariser — TODO confirm

k_folds = 4  # n_splits of the StratifiedKFold cross-validation
batch_size = 32  # batch size of the training flow_from_dataframe generators
n_epochs = 20  # epochs passed to fit_generator
n_steps = 5  # steps_per_epoch passed to fit_generator
n_classes = 2  # number of trailing one-hot class columns in full_ohe_df
num_struct = 6  # not read in the visible cells; presumably the number of category columns — TODO confirm
rs = 42  # random_state of the StratifiedKFold shuffle
num_replica = 2  # not read in the visible cells. NOTE(review): save_dir hard-codes "3rep" — confirm which is intended
cweights = 'yes'  # 'yes' passes class_weight to fit_generator; any other value (e.g. 'no') trains without it
############################################

In [None]:
# Give the unknown-image table the same trailing class columns as the
# labelled table, filled with zeros as placeholders.
n_unknown_rows = len(full_ohe_df_unknown)
for label_column in full_ohe_df.columns[-n_classes:]:
    full_ohe_df_unknown[label_column] = [0] * n_unknown_rows
full_ohe_df_unknown

In [None]:
# Per-class weight, keyed by class index c (position among the last n_classes
# one-hot columns of full_ohe_df):
#     weight[c] = (1 / number of rows in class c) * (total rows / n_classes)
# so a class holding exactly 1/n_classes of the rows gets weight 1.
n_rows = len(full_ohe_df)
class_weight = {
    c: (1 / np.sum(full_ohe_df.iloc[:, c - n_classes].values)) * (n_rows / n_classes)
    for c in range(n_classes)
}

# INTERNAL VALIDATION

In [None]:
# Stratified k-fold cross-validation. For every fold: train a fresh model on
# the training rows, then store the model, its training history, the held-out
# rows and the predictions for both the held-out rows and the unknown images.

# Output folder for this run; its name encodes the main settings.
save_dir = './Results/FINAL_class-{}+struct_hn-{}_epochs-{}_lr-{}_{}_3rep/'.format(n_classes, hidden_vgg, n_epochs, lr_coeff, cweights)
# makedirs also creates a missing './Results/' parent (os.mkdir would fail).
# An already existing run folder still raises, so earlier results are never
# silently overwritten.
os.makedirs(save_dir)

test_indices = []
unknown_df = full_ohe_df_unknown

# Stratify on the table that is actually indexed below. full_ohe_df comes from
# an inner merge and can hold fewer rows than img_files_paths / class_labels,
# in which case fold indices computed on those lists would point at the wrong
# (or non-existent) rows. The class of each row is recovered from the trailing
# n_classes one-hot columns.
fold_labels = full_ohe_df.iloc[:, -n_classes:].values.argmax(axis=1)
kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state = rs)

for i, (train_index, test_index) in enumerate(kf.split(full_ohe_df, fold_labels), start=1):

    train_df = full_ohe_df.iloc[train_index]
    test_df = full_ohe_df.iloc[test_index]

    test_indices.append(test_index)

    # Keep a record of which rows were held out in this fold.
    with open(save_dir + 'cv{}_test_files.pickle'.format(i), 'wb') as handle:
        pickle.dump(test_df, handle)

    print("\n=========================================")
    print("====== K Fold Validation step %d/%d =======" % (i,k_folds))
    print("=========================================\n")

    # Image data generators from dataframes: augmentation for training only.
    train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input,
                                       rescale=1./255,
                                       rotation_range=10,
                                       brightness_range=[0.5,1.8],
                                       zoom_range=[0.7,1],
                                       horizontal_flip=True,
                                       fill_mode="reflect")

    test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input, rescale=1./255)

    # Every column after 'img_path' is handed over as a raw target.
    train_generator = train_datagen.flow_from_dataframe(train_df, x_col='img_path',
                                                        y_col=train_df.columns[1:].tolist(),
                                                        target_size = (224, 224),
                                                        batch_size = batch_size,
                                                        shuffle=True,
                                                        class_mode = 'raw')

    # Evaluation generators: one image per step and a fixed order, so that the
    # predictions line up with the dataframe rows.
    test_generator = test_datagen.flow_from_dataframe(test_df, x_col='img_path',
                                                      y_col=test_df.columns[1:].tolist(),
                                                      target_size = (224, 224),
                                                      class_mode ='raw',
                                                      batch_size = 1,
                                                      shuffle = False)

    # Columns are taken from unknown_df itself, as the external-test cell does.
    unknown_generator = test_datagen.flow_from_dataframe(unknown_df, x_col='img_path',
                                                         y_col=unknown_df.columns[1:].tolist(),
                                                         target_size = (224, 224),
                                                         class_mode ='raw',
                                                         batch_size = 1,
                                                         shuffle = False)

    # Fresh model for every fold (create_model is defined outside this cell).
    model = create_model()

    n_training_samples = len(train_generator.filenames)
    n_test_samples = len(test_generator.filenames)

    # own_*_generator_func are defined outside this cell; presumably they wrap
    # the generators created above — confirm. class_weight is only applied
    # when cweights == 'yes'; None is the "no weighting" value.
    hist = model.fit_generator(
        own_train_generator_func(),
        epochs=n_epochs,
        validation_data=own_test_generator_func(),
        validation_steps=n_test_samples,
        steps_per_epoch=n_steps,
        class_weight=class_weight if cweights == 'yes' else None)

    # serialize architecture to JSON
    model_json = model.to_json()
    with open(save_dir + 'cv{}model.json'.format(i), "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights(save_dir + 'cv{}model.h5'.format(i))
    print("Saved model to disk")

    with open(save_dir + 'cv{}_history.pickle'.format(i), 'wb') as handle:
        pickle.dump(hist.history, handle)

    # Rewind the held-out generator before predicting on it.
    test_generator.reset()
    test_prob = model.predict_generator(own_test_generator_func(), steps=test_df.shape[0])
    with open(save_dir + 'cv{}_test-prob.pickle'.format(i), 'wb') as handle:
        pickle.dump(test_prob, handle)

    print("TEST PROBABILITIES")
    print(test_prob)

    unknown_prob = model.predict_generator(own_test_generator_func_unknown(), steps=unknown_df.shape[0])
    with open(save_dir + 'cv{}_unknown-prob.pickle'.format(i), 'wb') as handle:
        pickle.dump(unknown_prob, handle)

    print("UNKNOWN PROBABILITIES")
    print(unknown_prob)

# EXTERNAL TEST

In [None]:
# External test: fit one model on every labelled row, then predict the
# unknown images and store model, history and predictions in save_dir.
unknown_df = full_ohe_df_unknown
train_df = full_ohe_df

# Image data generators from dataframes (augmentation for training only).
augmentation = {
    'preprocessing_function': preprocess_input,
    'rescale': 1./255,
    'rotation_range': 10,
    'brightness_range': [0.5,1.8],
    'zoom_range': [0.7,1],
    'horizontal_flip': True,
    'fill_mode': "reflect",
}
train_datagen = ImageDataGenerator(**augmentation)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input, rescale=1./255)

# Every column after 'img_path' is handed over as a raw target.
train_generator = train_datagen.flow_from_dataframe(
    train_df,
    x_col='img_path',
    y_col=train_df.columns[1:].tolist(),
    target_size=(224, 224),
    batch_size=batch_size,
    shuffle=True,
    class_mode='raw',
)

# One image per step, fixed order, so predictions line up with unknown_df rows.
unknown_generator = test_datagen.flow_from_dataframe(
    unknown_df,
    x_col='img_path',
    y_col=unknown_df.columns[1:].tolist(),
    target_size=(224, 224),
    class_mode='raw',
    batch_size=1,
    shuffle=False,
)

# create model
model = create_model()

n_training_samples = len(train_generator.filenames)

# Class weights are applied only when cweights == 'yes'; None means unweighted.
hist = model.fit_generator(
    own_train_generator_func(),
    epochs=n_epochs,
    steps_per_epoch=n_steps,
    class_weight=class_weight if cweights == 'yes' else None,
)

# serialize architecture to JSON
model_json = model.to_json()
with open(save_dir + 'model.json', "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(save_dir + 'model.h5')
print("Saved model to disk")

with open(save_dir + 'history.pickle', 'wb') as handle:
    pickle.dump(hist.history, handle)

unknown_prob = model.predict_generator(
    own_test_generator_func_unknown(),
    steps=unknown_df.shape[0],
)
with open(save_dir + 'unknown-prob.pickle', 'wb') as handle:
    pickle.dump(unknown_prob, handle)

print("UNKNOWN PROBABILITIES")
print(unknown_prob)