# Data augmentation: three flipped copies of each image (horizontal, vertical, and both)

In [None]:
import cv2
import os

# original images:
input_dir = "../../train_1"

# where the flipped copies are written:
output_dir = "../../train_1_flip"

# create the output folder (exist_ok avoids the check-then-create race)
os.makedirs(output_dir, exist_ok=True)

# cv2.flip codes keyed by the suffix used in the output filename
flip_codes = {
    "h": 1,    # horizontal
    "v": 0,    # vertical
    "hv": -1,  # horizontal and vertical
}

# list every entry of the input folder
image_files = os.listdir(input_dir)

for img_file in image_files:
    img_path = os.path.join(input_dir, img_file)

    # cv2.imread returns None instead of raising when a file cannot be decoded
    img = cv2.imread(img_path)
    if img is None:
        print("No carrega correctament", img_path)
        continue

    # splitext keeps dots inside the name ("a.b.jpg" -> "a.b"); split('.')[0]
    # would map "a.b.jpg" and "a.c.jpg" to the same output file.
    stem = os.path.splitext(img_file)[0]

    # NOTE: outputs are named "<stem>_flipped_<suffix>.jpg", so any dataframe used
    # to read this folder must list those names rather than the original ones.
    for suffix, flip_code in flip_codes.items():
        out_path = os.path.join(output_dir, f"{stem}_flipped_{suffix}.jpg")
        # cv2.imwrite reports failure through its return value
        if not cv2.imwrite(out_path, cv2.flip(img, flip_code)):
            print("Could not write", out_path)

# Overview

Can a computer "learn" to classify artists by their paintings? 

ResNet50 is a good model for classifying ImageNet data. How about a set of 38 artists?

We use transfer learning to re-train a ResNet50 model to identify one of 38 artists who have more than ~~300~~ ***200*** paintings in the dataset. 

This notebook is part of a project for CSC 480 taught by [Dr. Franz J. Kurfess](http://users.csc.calpoly.edu/~fkurfess/) at Cal Poly.

A web application is [in development](https://github.com/SomethingAboutImages/WebImageClassifier) to make use of the model that this notebook outputs. 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt
from random import seed # for setting seed
import tensorflow
from IPython import sys_info

import gc # garbage collection

In [None]:
MY_SEED = 42 # 480 could work too
seed(MY_SEED)
np.random.seed(MY_SEED)
tensorflow.random.set_seed(MY_SEED)

print(sys_info())
# get module information
!pip freeze > frozen-requirements.txt
# append system information to file
with open("frozen-requirements.txt", "a") as file:
    file.write(sys_info())

In [None]:
from tensorflow.python.client import device_lib
# print out the CPUs and GPUs:
# shows whatever device_lib.list_local_devices() reports for this machine
print(device_lib.list_local_devices())

In [None]:
# https://stackoverflow.com/questions/25705773/image-cropping-tool-python
# Painting images can be very large; setting MAX_IMAGE_PIXELS to None removes
# Pillow's pixel-count limit so those files can still be opened.
from PIL import Image
Image.MAX_IMAGE_PIXELS = None

In [None]:
# globals

# The commented-out paths below are alternative input locations that are not
# used by this notebook as written; only TRAIN_DIRS and TEST_DIR are active.

# DATA_DIR = '../input/painters-train-part-1/'

# TRAIN_1_DIR = '../input/painters-train-part-1/train_1/train_1/'
# TRAIN_2_DIR = '../input/painters-train-part-1/train_2/train_2/'
# TRAIN_3_DIR = '../input/painters-train-part-1/train_3/train_3/'

# TRAIN_4_DIR = '../input/painters-train-part-2/train_4/train_4/'
# TRAIN_5_DIR = '../input/painters-train-part-2/train_5/train_5/'
# TRAIN_6_DIR = '../input/painters-train-part-2/train_6/train_6/'

# TRAIN_7_DIR = '../input/painters-train-part-3/train_7/train_7/'
# TRAIN_8_DIR = '../input/painters-train-part-3/train_8/train_8/'
# TRAIN_9_DIR = '../input/painters-train-part-3/train_9/train_9/'

# Training directories; entry i is paired with all_train_dfs[i] when the
# generators are built.
TRAIN_DIRS = ['../../train_1_flip']

# Directory the test generator reads from.
TEST_DIR = '../input/painter-test/test/test/'

In [None]:
# Load the metadata table; later cells read its 'new_filename', 'artist'
# and 'in_train' columns.
df = pd.read_csv('../../all_data_info.csv')
print("df.shape", df.shape)
display(df)

In [None]:
# quick fix for corrupted files
list_of_corrupted = [
    '3917.jpg', '18649.jpg', '20153.jpg', '41945.jpg', '79499.jpg',
    '91033.jpg', '92899.jpg', '95347.jpg', '100532.jpg', '101947.jpg',
]

# completely get rid of them: keep only rows whose file is not in the list
is_corrupted = df["new_filename"].isin(list_of_corrupted)
df = df[~is_corrupted]

display(df)

In [None]:
# Split on the 'in_train' flag and keep only the two columns used downstream.
columns_kept = ['artist', 'new_filename']
train_df = df.loc[df["in_train"] == True, columns_kept]
test_df = df.loc[df['in_train'] == False, columns_kept]

print("test_df.shape", test_df.shape)
print("train_df.shape", train_df.shape)

# artist -> number of training paintings, in order of first appearance
artists = {}
for artist_name in train_df['artist']:
    artists[artist_name] = artists.get(artist_name, 0) + 1
# print(artists)

# only artists with at least 200 training paintings become classes
training_set_artists = [
    artist_name
    for artist_name, n_paintings in artists.items()
    if int(n_paintings) >= 200
]

print("\nnumber of artists:", len(training_set_artists))

print("\nlist of artists...")
print(training_set_artists)


In [None]:
# Keep only the training rows whose artist is one of the selected classes.
t_df = train_df[train_df["artist"].isin(training_set_artists)]

t_df.head(5)

In [None]:
# One frame per leading digit of the filename (1..9); frame k-1 in the list
# holds the rows whose 'new_filename' starts with the digit k.
all_train_dfs = [
    t_df[t_df['new_filename'].str.startswith(str(leading_digit))]
    for leading_digit in range(1, 10)
]

# individual names are kept because a later cell deletes them explicitly
(t1_df, t2_df, t3_df,
 t4_df, t5_df, t6_df,
 t7_df, t8_df, t9_df) = all_train_dfs

t9_df.head(5)

In [None]:
# Use the public tensorflow.keras namespace throughout: mixing standalone
# `keras` objects with `tensorflow.python.keras` (a private path) can make
# Sequential reject the ResNet50 base as a foreign layer type.
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D

# The pretrained ResNet50 weights expect ResNet50's own input preprocessing;
# the MobileNetV2 variant rescales pixels differently and feeds the frozen
# base inputs it was not trained on.
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# specify the model that classifies 38 artists 🎨 🖌

In [None]:
# number of artist classes the model will predict
len(training_set_artists)

In [None]:
num_classes = len(training_set_artists) # one class per artist
weights_notop_path = '../input/resnet50/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'

# ResNet50 without its classification top, average-pooled to one feature
# vector per image; frozen so that only the new Dense head is trained.
resnet_base = ResNet50(include_top=False, weights=weights_notop_path, pooling='avg')
resnet_base.trainable = False

# softmax head with one output per artist
artist_head = Dense(num_classes, activation='softmax')

model = Sequential([resnet_base, artist_head])

# Compile Model

In [None]:
model.compile(
  optimizer='adam', # Adam is a commonly recommended default optimizer
  loss='categorical_crossentropy', # aka "log loss" -- the cost function to minimize
  # so the 'optimizer' algorithm will minimize the 'loss' function
  metrics=['accuracy'] # ask it to report % of correct predictions
)

# setup the image data generator for each training directory 

In [None]:
# model globals
IMAGE_SIZE = 224 # images are resized to IMAGE_SIZE x IMAGE_SIZE by the generators
BATCH_SIZE = 96 # batch size of the training and validation generators
TEST_BATCH_SIZE = 17 # because test has 23817 images and factors of 23817 are 3*17*467
                     # it is important that this number evenly divides the total num images
VAL_SPLIT = 0.25 # fraction of each training dataframe held out for validation

In [None]:
def setup_generators(
    val_split, train_dataframe, train_dir,
    img_size, batch_size, my_seed, list_of_classes,
    test_dataframe, test_dir, test_batch_size
):
    """Build the training, validation and test image generators for one directory.

    Args:
        val_split: fraction of `train_dataframe` reserved for validation.
        train_dataframe: frame with 'new_filename' and 'artist' columns.
        train_dir: directory containing the files named in `train_dataframe`.
        img_size: images are resized to (img_size, img_size).
        batch_size: batch size of the training and validation generators.
        my_seed: seed passed to every generator.
        list_of_classes: class names, passed as `classes` to the labelled flows.
        test_dataframe: frame with a 'new_filename' column for the test images.
        test_dir: directory containing the test images.
        test_batch_size: batch size of the (unshuffled, unlabelled) test generator.

    Returns:
        Tuple (train_gen, valid_gen, test_gen).

    Raises:
        Exception: if `preprocess_input` has not been imported (or is falsy).
    """
    print("-"*20)
    # A missing import surfaces as NameError, which a plain truthiness test can
    # never catch; translate it so the actionable message is actually shown.
    try:
        preprocessing_fn = preprocess_input
    except NameError:
        preprocessing_fn = None
    if not preprocessing_fn:
          raise Exception("please do import call 'from tensorflow.python.keras.applications.resnet50 import preprocess_input'")

    # training and validation share one ImageDataGenerator so that
    # validation_split partitions the same dataframe consistently
    data_gen = ImageDataGenerator(
        preprocessing_function=preprocessing_fn,
        validation_split=val_split)

    print(len(train_dataframe), "images in", train_dir, "and validation_split =", val_split)

    # Arguments common to the training and validation flows; only `subset`
    # (and a fresh reset_index() copy of the dataframe) differ between them.
    labelled_flow_kwargs = dict(
        directory=train_dir,
        x_col='new_filename',
        y_col='artist',
        has_ext=True,
        target_size=(img_size, img_size),
        batch_size=batch_size,
        seed=my_seed,
        shuffle=True,
        class_mode='categorical',
        classes=list_of_classes,
    )

    print("\ntraining set ImageDataGenerator")
    train_gen = data_gen.flow_from_dataframe(
        dataframe=train_dataframe.reset_index(), # call reset_index() so keras can start with index 0
        subset="training",
        **labelled_flow_kwargs
    )

    print("\nvalidation set ImageDataGenerator")
    valid_gen = data_gen.flow_from_dataframe(
        dataframe=train_dataframe.reset_index(), # call reset_index() so keras can start with index 0
        subset="validation",
        **labelled_flow_kwargs
    )

    # the test generator has no validation split and yields images only
    test_data_gen = ImageDataGenerator(preprocessing_function=preprocessing_fn)

    print("\ntest set ImageDataGenerator")
    test_gen = test_data_gen.flow_from_dataframe(
        dataframe=test_dataframe.reset_index(), # call reset_index() so keras can start with index 0
        directory=test_dir,
        x_col='new_filename',
        y_col=None,
        has_ext=True,
        batch_size=test_batch_size,
        seed=my_seed,
        shuffle=False, # dont shuffle test directory
        class_mode=None,
        target_size=(img_size,img_size)
    )

    return (train_gen, valid_gen, test_gen)

print("defined setup_generators()")

In [None]:
# delete some unused dataframes to free some RAM for training
# NOTE(review): all_train_dfs still references the t1_df..t9_df frames, so
# removing these names alone does not release them; only df and t_df can
# actually be reclaimed here. Re-running this cell raises NameError because
# the names no longer exist.
del df
del t_df
del t1_df
del t2_df
del t3_df
del t4_df
del t5_df
del t6_df
del t7_df
del t8_df
del t9_df
gc.collect()

In [None]:
# Suffixes the flip-augmentation cell appends to each image stem
# ("<stem>_flipped_h.jpg", "<stem>_flipped_v.jpg", "<stem>_flipped_hv.jpg").
FLIP_SUFFIXES = ("_flipped_h", "_flipped_v", "_flipped_hv")


def expand_to_flipped_filenames(frame, suffixes=FLIP_SUFFIXES):
    """Return `frame` with one row per flipped copy of every listed image.

    Each 'new_filename' such as '123.jpg' is replaced by '123<suffix>.jpg'
    for every suffix; all other columns are repeated unchanged.
    """
    stems = frame["new_filename"].str.rsplit(".", n=1).str[0]
    return pd.concat(
        [frame.assign(new_filename=stems + suffix + ".jpg") for suffix in suffixes],
        ignore_index=True,
    )


train_gens = [None]*len(TRAIN_DIRS)
valid_gens = [None]*len(TRAIN_DIRS)
test_gen  = None # only 1 test_gen
for i, train_dir in enumerate(TRAIN_DIRS):
    dir_df = all_train_dfs[i]
    # A "*_flip" directory only contains the flipped copies, whose names differ
    # from the 'new_filename' values in the CSV. Without this mapping none of
    # the dataframe's files would be found in the directory.
    if os.path.basename(os.path.normpath(train_dir)).endswith("_flip"):
        dir_df = expand_to_flipped_filenames(dir_df)

    # setup_generators rebuilds the test generator on every call; the last one is kept
    train_gens[i], valid_gens[i], test_gen = setup_generators(
        train_dataframe=dir_df, train_dir=train_dir,
        val_split=VAL_SPLIT, img_size=IMAGE_SIZE, batch_size=BATCH_SIZE, my_seed=MY_SEED,
        list_of_classes=training_set_artists, test_dataframe=test_df,
        test_dir=TEST_DIR, test_batch_size=TEST_BATCH_SIZE
    )

# TRAINING TIME!  🎉 🎊 🎁

In [None]:
# Total number of fit calls across all training directories: 5 per directory.
MAX_EPOCHS = 5 * len(train_gens) # should be a multiple of 9 because need evenly train each train_dir
                                 # NOTE(review): "9" presumably refers to a run with nine training
                                 # directories; the value here is a multiple of len(train_gens)
DIR_EPOCHS = 1 # fit each train_dir at least this many times before overfitting

In [None]:
# One History object per fit call, in training order.
histories = []

# `e` counts fit calls; each call trains one directory for DIR_EPOCHS epochs,
# and directories are visited round-robin until MAX_EPOCHS calls were made.
e=0
while ( e < MAX_EPOCHS):
    for train_gen, valid_gen in zip(train_gens, valid_gens):
        # train_gen.n = number of images for training
        # (at least one step, so a set smaller than one batch is still used)
        STEP_SIZE_TRAIN = max(1, train_gen.n//train_gen.batch_size)
        # valid_gen.n = number of images for validation
        STEP_SIZE_VALID = max(1, valid_gen.n//valid_gen.batch_size)
        print("STEP_SIZE_TRAIN",STEP_SIZE_TRAIN)
        print("STEP_SIZE_VALID",STEP_SIZE_VALID)
        # Model.fit accepts generators directly in TensorFlow 2;
        # fit_generator is deprecated and absent from recent releases.
        histories.append(
            model.fit(train_gen,
                      steps_per_epoch=STEP_SIZE_TRAIN,
                      validation_data=valid_gen,
                      validation_steps=STEP_SIZE_VALID,
                      epochs=DIR_EPOCHS)
        )
        e+=1

# Evaluate the model 🧐 🤔

In [None]:
def _history_series(history, *candidate_keys):
    """Return the first metric list in `history` stored under one of `candidate_keys`."""
    for key in candidate_keys:
        if key in history:
            return history[key]
    raise KeyError("none of {} found in history (available: {})".format(
        candidate_keys, sorted(history)))


# Concatenate the per-fit metric lists into one curve per metric.
accuracies = []
val_accuracies = []
losses = []
val_losses = []
for hist in histories:
    if hist:
        # TensorFlow 2 records metrics=['accuracy'] as 'accuracy'/'val_accuracy';
        # older Keras versions used 'acc'/'val_acc'. Accept either spelling.
        accuracies += _history_series(hist.history, 'accuracy', 'acc')
        val_accuracies += _history_series(hist.history, 'val_accuracy', 'val_acc')
        losses += hist.history['loss']
        val_losses += hist.history['val_loss']

In [None]:
# Plot training & validation curves: one figure for accuracy, one for loss,
# each overlaying the training and validation values per fit call.
curve_specs = [
    ('Model accuracy', 'Accuracy', accuracies, val_accuracies),
    ('Model loss', 'Loss', losses, val_losses),
]
for plot_title, y_label, train_curve, valid_curve in curve_specs:
    plt.plot(train_curve)
    plt.plot(valid_curve)
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
import time
# Save the trained model under a timestamped name so runs do not overwrite each other.
# NOTE(review): the "e45" in the filename is hard-coded and does not follow
# MAX_EPOCHS (5 * len(train_gens)) -- confirm the intended naming.
timestr = time.strftime("%Y%m%d-%H%M%S") # e.g: 20181109-180140
model.save('painters_adam_e45_'+timestr+'.h5')

# Predict the output 🔮 🎩

In [None]:
# Number of generator steps passed to the prediction call below.
PRED_STEPS = len(test_gen) #100 # default would have been len(test_gen)

In [None]:
# Need to reset the test_gen before predicting.
# This is important because forgetting to reset the test generator results in outputs with a weird order.
test_gen.reset()
# Model.predict accepts generators directly in TensorFlow 2;
# predict_generator is deprecated and absent from recent releases.
pred=model.predict(test_gen, verbose=1, steps=PRED_STEPS)

In [None]:
# number of predictions followed by the raw model outputs
print(len(pred),"\n",pred)

In [None]:
# index of the highest-scoring class for every prediction row
predicted_class_indices=np.argmax(pred,axis=1)

In [None]:
# sanity check: how many indices there are and which range they cover
print(len(predicted_class_indices),"\n",predicted_class_indices)
print("it has values ranging from ",min(predicted_class_indices),"...to...",max(predicted_class_indices))

In [None]:
# Invert the generator's artist -> index mapping, then translate every
# predicted class index into its artist name.
artist_to_index = train_gens[0].class_indices
labels = {class_index: artist for artist, class_index in artist_to_index.items()}
predictions = [labels[class_index] for class_index in predicted_class_indices]

In [None]:
# Show the artist -> index mapping, its inverse, and the first prediction
# together with the class index it maps back to.
print("*"*20+"\nclass_indices\n"+"*"*20+"\n",train_gens[0].class_indices,"\n")
print("*"*20+"\nlabels\n"+"*"*20+"\n",labels,"\n")
print("*"*20+"\npredictions has", len(predictions),"values that look like","'"+str(predictions[0])+"' which is the first prediction and corresponds to this index of the classes:",train_gens[0].class_indices[predictions[0]])

In [None]:
# Save the results to a CSV file.
filenames=test_gen.filenames[:len(predictions)] # because "ValueError: arrays must all be same length"

# Build the filename -> artist lookup once instead of scanning test_df for
# every file (and without Series.get_values(), which pandas 1.0 removed).
# drop_duplicates keeps the first row per filename, matching the previous
# "first match" behaviour.
artist_by_filename = (
    test_df.drop_duplicates(subset='new_filename')
           .set_index('new_filename')['artist']
           .to_dict()
)
real_artists = [artist_by_filename[f] for f in filenames]

results=pd.DataFrame({"Filename":filenames,
                      "Predictions":predictions,
                      "Real Values":real_artists})
results.to_csv("results.csv",index=False)

In [None]:
# preview the first rows of the saved results table
results.head()

In [None]:
# number of artists the model was trained on
len(training_set_artists)

In [None]:
# names of the artists the model was trained on
print(training_set_artists)

In [None]:
# Test accuracy, split by whether the true artist is one of the trained classes.
count = 0
match = 0
unexpected_count = 0
unexpected_match = 0
match_both_expected_unexpected = 0 # not used by this cell; kept in case other cells read it

# set membership avoids scanning the artist list for every test row
trained_artists = set(training_set_artists)

for p, r in zip(results['Predictions'], results['Real Values']):
    if r in trained_artists:
        count += 1
        if p == r:
            match += 1
    else:
        # predictions are always one of the trained artists, so a match here is
        # not expected; the counter is kept as a sanity check
        unexpected_count += 1
        if p == r:
            unexpected_match += 1

# An empty group is reported as nan instead of raising ZeroDivisionError.
print("test accuracy on new images for TRAINED artsits")
acc = match/count if count else float("nan")
print(match,"/",count,"=","{:.4f}".format(acc))

print("test accuracy on new images for UNEXPECTED artsits")
u_acc = unexpected_match/unexpected_count if unexpected_count else float("nan")
print(unexpected_match,"/",unexpected_count,"=","{:.4f}".format(u_acc))

print("test accuracy on new images")
total_match = match+unexpected_match
total_count = count+unexpected_count
total_acc = (total_match)/(total_count) if total_count else float("nan")
print(total_match,"/",total_count,"=","{:.4f}".format(total_acc))

So, it seems like the model may have learned some interesting patterns related to the artists that it expects. 

**Questions to explore:**
* [What does the model actually "see"](https://arxiv.org/abs/1312.6034) in a painting by [Pablo Picasso](https://www.wikiart.org/en/pablo-picasso/) as opposed to [Vincent van Gogh](https://www.wikiart.org/en/vincent-van-gogh)?
* What would happen if we trained the model on the full artist dataset or at least on artists with over 200 paintings in the dataset?
* Can the accuracy be improved with techniques like data augmentation or with a custom convolutional neural network? How about doing transfer learning with a different [pre-trained model](https://keras.io/applications)?
* How can the learning rate be tuned to improve the accuracy?
* Would a regularization technique like [dropout](https://machinelearningmastery.com/dropout-regularization-deep-learning-models-keras/) be helpful?
* This notebook uses the [Adam](https://keras.io/optimizers/#adam) optimizer... what if we tried RMSprop?
* How about using an [ensemble of models](https://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/)?