In [2]:
from scipy.sparse import csr_matrix, save_npz, load_npz
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import scipy.io
import math
from keras.callbacks import CSVLogger
import json
from PIL import Image
import numpy as np
from matplotlib import pyplot as plt

# Show string values in full in DataFrame output instead of truncating them with "..."
pd.set_option('display.max_colwidth', None)

In [3]:
import math
# 
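# Parameters of split_data:
#   data = DataFrame with the columns 'actor_name' and 'path'
#   test_size = ratio of the test set (0.0 - 1.0)
#   (there is no batch_size parameter: the amount of pictures per class for the
#    training set is limited inside the function itself)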

def split_data(data, test_size, max_train=9, max_per_class=12):
    """Split a DataFrame of images into per-actor train and test sets.

    For every actor (in order of first appearance in `data`) the leading rows
    go to the training set and the rows that follow go to the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'actor_name' and 'path'.
    test_size : float
        Ratio of each actor's images reserved for the test set (0.0 - 1.0).
    max_train : int or None, default 9
        Upper bound (non-negative) on training images per actor.
        None disables the cap.
    max_per_class : int or None, default 12
        Upper bound (non-negative) on the images of one actor that are used
        at all (train + test); later rows are dropped. None disables the cap.

    Returns
    -------
    tuple of pandas.DataFrame
        (train_set, test_set), each with the columns 'actor_name' and 'path'
        and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If `test_size` is outside the range [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    train_names, train_paths = [], []
    test_names, test_paths = [], []
    for actor in data['actor_name'].unique():
        actor_rows = data[data['actor_name'] == actor]
        # Pull both columns out once instead of calling .iloc[i][col] per cell.
        names = actor_rows['actor_name'].tolist()
        paths = actor_rows['path'].tolist()

        # Every actor keeps at least one training image, even when test_size == 1.
        train_size = max(1, math.ceil(len(paths) * (1 - test_size)))
        if max_train is not None:
            train_size = min(train_size, max_train)

        train_names.extend(names[:train_size])
        train_paths.extend(paths[:train_size])
        # Test rows are whatever follows the training rows, up to the per-actor cap.
        test_names.extend(names[train_size:max_per_class])
        test_paths.extend(paths[train_size:max_per_class])

    train_set = pd.DataFrame({'actor_name': train_names, 'path': train_paths})
    test_set = pd.DataFrame({'actor_name': test_names, 'path': test_paths})
    return train_set, test_set
            

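# functions to filter actors by image count: remove_single_occurrences keeps actors with more than 'n' images, remove_single_occurrences2 keeps actors with fewer than 'n' images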

def remove_single_occurrences(df, n):
    """Return the rows of `df` whose actor appears in more than `n` rows.

    The comparison is strict, so actors with exactly `n` images are dropped
    as well. The index of the surviving rows is left untouched.
    """
    images_per_actor = df['actor_name'].value_counts()
    # Look every row's actor up in the count table to get a per-row image count.
    per_row_count = images_per_actor[df['actor_name']].values
    keep = per_row_count > n
    return df[keep]

def remove_single_occurrences2(df, n):
    """Return the rows of `df` whose actor appears in fewer than `n` rows.

    The comparison is strict, so actors with exactly `n` images are dropped.
    The index of the surviving rows is left untouched.
    """
    images_per_actor = df['actor_name'].value_counts()
    # Look every row's actor up in the count table to get a per-row image count.
    per_row_count = images_per_actor[df['actor_name']].values
    keep = per_row_count < n
    return df[keep]

# function to load training & test sets into DataFrames
def load_data(filename : str):
    """Read a saved train/test CSV and return its 'actor_name' and 'path' columns.

    Any other column in the file (such as a stored index) is discarded. The
    result is a two-column DataFrame with a fresh 0..n-1 index.
    """
    table = pd.read_csv(filename).reset_index()
    # Stack the two columns as rows of one array, then flip it back to columns.
    stacked = np.vstack((table['actor_name'], table['path']))
    result = pd.DataFrame(stacked).T
    result.columns = ['actor_name', 'path']
    return result

#function to predict the class
# predict a class using img file
def predict_class(filepath, target_size=(128, 128)):
    """Predict the class label for a single image file.

    The image is prepared the same way the training generators in this
    notebook prepare their input: RGB channel order and pixel values rescaled
    by 1/255. Feeding BGR pixels in the 0..255 range, as this function did
    before, does not match what the model was trained on.

    Relies on the notebook-level globals `model` (the trained Keras model) and
    `lookup_table` (mapping from class index, as a string, to class label).

    Parameters
    ----------
    filepath : str
        Path of the image file to classify.
    target_size : tuple of int, default (128, 128)
        (width, height) the image is resized to before prediction; it has to
        match the input size of `model`.

    Returns
    -------
    The `lookup_table` entry for the class with the highest predicted score.

    Raises
    ------
    FileNotFoundError
        If the file is missing or OpenCV cannot decode it.
    """
    import cv2  # kept local, as before: only this function needs OpenCV

    img = cv2.imread(filepath)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image file: {filepath}")

    # OpenCV decodes to BGR, whereas the Keras generators feed RGB images.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, target_size)

    # Same rescaling as ImageDataGenerator(rescale=1./255) used for training.
    batch = np.expand_dims(img.astype('float32') / 255.0, axis=0)
    return lookup_table[str(np.argmax(model.predict(batch)))]

def sample_till(df, maxo, random_state=None):
    """Keep at most `maxo` images per actor and return the rows shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'actor_name' and 'path'.
    maxo : int
        Maximum number of rows kept per actor (the first ones in `df` order).
        Values below 1 yield an empty result.
    random_state : int or None, default None
        Seed forwarded to `DataFrame.sample` to make the shuffle reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns 'actor_name' and 'path', rows in random order.
    """
    limit = max(maxo, 0)
    names_list = []
    paths_list = []
    # The actors come from `df` itself; the previous version read the global
    # `train_set`, so the argument was ignored when choosing the classes.
    for name in df['actor_name'].unique():
        # Slicing keeps exactly `limit` rows; the old `i > maxo` test let one
        # extra row through.
        paths = df.loc[df['actor_name'] == name, 'path'].tolist()[:limit]
        names_list.extend([name] * len(paths))
        paths_list.extend(paths)

    capped = pd.DataFrame({'actor_name': names_list, 'path': paths_list})
    # frac=1 returns every row, in shuffled order.
    return capped.sample(frac=1, random_state=random_state)

#function to save the train & test sets as well as the lookup table
def save_info(train_gen, filename1, train_set, filename2, test_set, filename3):
    """Write the lookup table and the train/test sets to disk.

    filename1 receives a JSON object mapping class index -> class name (the
    inverse of `train_gen.class_indices`); filename2 and filename3 receive the
    train and test DataFrames as CSV.
    """
    index_to_class = {index: name for name, index in train_gen.class_indices.items()}
    with open(filename1, 'w', encoding='utf8') as out_file:
        json.dump(index_to_class, out_file)
    for frame, target in ((train_set, filename2), (test_set, filename3)):
        frame.to_csv(target)

In [4]:
## Restore the train & test sets and the lookup table that were saved from an earlier training of this model
train_set = load_data(r'data/train_set.csv')
test_set = load_data(r'data/test_set.csv')
lookup_table = {}
with open('data/lookup_table.json', 'r') as f:
    # Parse straight from the file handle; keys are class indices stored as strings
    lookup_table = json.load(f)

In [7]:
# Number of training images per actor
train_set['actor_name'].value_counts()

Ariel_Sharon         62
Hugo_Chavez          57
Junichiro_Koizumi    48
Jean_Chretien        44
Jacques_Chirac       42
                     ..
Leonardo_DiCaprio     7
Jeong_Se-hyun         7
Fernando_Gonzalez     7
John_Abizaid          7
Bill_Frist            7
Name: actor_name, Length: 179, dtype: int64

## training the model

In [263]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Keep only actors with more than 8 images (remove_single_occurrences compares with a strict '>'); no sorting happens here
# NOTE(review): new_meta is not defined in the visible cells - it has to exist in the kernel already; confirm where it is built
data = new_meta
data = remove_single_occurrences(data, 8)

# Splitting into train and validation sets, stratified by actor - can be commented out if loading the sets from csv
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42, stratify = data['actor_name'])

#Image preprocessing
img_width, img_height = 128, 128
batch_size = 8
#Normalizing the images: pixel values are rescaled from [0, 255] to [0, 1]
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)
#Creating the generators for training and test\validation sets
#Keras expects target_size as (height, width)
train_generator = train_datagen.flow_from_dataframe(
    train_set,
    x_col='path',
    y_col='actor_name',
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical')

#Reuse the class order of the training generator, so that a class index refers to the
#same actor in both generators even if an actor is missing from the validation set
val_generator = val_datagen.flow_from_dataframe(
    test_set,
    x_col='path',
    y_col='actor_name',
    target_size=(img_height, img_width),
    classes=list(train_generator.class_indices),
    batch_size=batch_size,
    class_mode='categorical')
#Printing number of classes
num_classes = len(train_generator.class_indices)
print(f"Number of unique classes: {num_classes}")

#Model architecture: three Conv2D + MaxPooling2D stages followed by a dense classifier head
#l2 regularization is used on the hidden dense layer (stated goal: slow down the learning curve)
model = Sequential()
model.add(Conv2D(8, (3, 3), activation='relu', input_shape=(img_width, img_height, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))
for filters in (16, 32):
    model.add(Conv2D(filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation = 'relu', kernel_regularizer='l2'))
#One softmax output per class known to the training generator
model.add(Dense(len(train_generator.class_indices), activation='softmax'))

#Compiling the model
model.compile(optimizer='adam', loss="categorical_crossentropy", metrics=['accuracy'])

Found 2734 validated image filenames belonging to 179 classes.
Found 684 validated image filenames belonging to 179 classes.
Number of unique classes: 179


In [268]:
# Print the layer-by-layer overview of the model (output shapes and parameter counts)
model.summary()

Model: "sequential_47"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_134 (Conv2D)         (None, 126, 126, 8)       224       
                                                                 
 max_pooling2d_134 (MaxPooli  (None, 63, 63, 8)        0         
 ng2D)                                                           
                                                                 
 conv2d_135 (Conv2D)         (None, 61, 61, 16)        1168      
                                                                 
 max_pooling2d_135 (MaxPooli  (None, 30, 30, 16)       0         
 ng2D)                                                           
                                                                 
 conv2d_136 (Conv2D)         (None, 28, 28, 32)        4640      
                                                                 
 max_pooling2d_136 (MaxPooli  (None, 14, 14, 32)     

In [265]:
# Train the model
# since we've experienced crashes during the training,
# the training runs in a for loop: after every `epochs_per_round` epochs the model
# is saved and the log of that round is written to a csv file

epochs = 400
epochs_per_round = 5
for i in range(epochs // epochs_per_round):
    # Each fit() call resumes from the current weights; its own log restarts at "Epoch 1"
    history = model.fit(
        train_generator,
        steps_per_epoch=len(train_generator),
        epochs=epochs_per_round,
        validation_data=val_generator,
        validation_steps=len(val_generator))

    ## save stuff after every round
    epochs_done = (i + 1) * epochs_per_round
    hist_df = pd.DataFrame(history.history)
    hist_csv_file = str(batch_size) + '_history_' + str(epochs_done) + ".csv"
    # Let pandas open the file itself so the line endings are handled correctly
    hist_df.to_csv(hist_csv_file)
    # The checkpoint name now uses the same epoch count as the history file; the former
    # `str(i+1 * 5)` evaluated to i + 5 and produced _epoch5, _epoch6, ...
    model.save(str(batch_size) + '_epoch' + str(epochs_done) + '.h5')

# Save the trained model
model.save(str(batch_size) + '.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


KeyboardInterrupt: 

In [267]:
# Persist the class lookup table and the current train/test sets
# NOTE(review): these files are written to the working directory, while the loading cell reads them from data/ - confirm they are moved there
save_info(train_generator, "lookup_table.json", train_set, "train_set.csv", test_set, "test_set.csv")