In [3]:
from scipy.sparse import csr_matrix, save_npz, load_npz
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import scipy.io
import math
from keras.callbacks import CSVLogger
import json


# Show the full contents of every DataFrame cell when a frame is displayed,
# so long strings (e.g. image paths) are not cut off with "...".
pd.set_option('display.max_colwidth', None)

In [51]:
import math
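# Helper functions used by the rest of the notebook.
#
# split_data parameters:
#   data      = DataFrame with 'actor_name' and 'path' columns
#   test_size = ratio of the test set (0.0 - 1.0)
#   (at most 9 images per actor are placed in the training set)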

def split_data(data, test_size):
    """Split image metadata into per-actor train and test DataFrames.

    Rows are processed actor by actor, in the order they appear in ``data``.
    For every actor the leading rows go to the training set and the rows
    that follow (up to and including position 11) go to the test set; any
    rows beyond that are not used.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``actor_name`` and ``path``.
    test_size : float
        Fraction of each actor's images intended for the test set.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_set, test_set)``, each with columns
        ``['actor_name', 'path']``.
    """
    max_train_per_actor = 9   # never more than 9 training rows per actor
    max_rows_per_actor = 12   # rows at position 12 and later are ignored

    train_names, train_paths = [], []
    test_names, test_paths = [], []

    for actor in data['actor_name'].unique():
        rows = data[data['actor_name'] == actor]

        wanted = math.ceil(len(rows) * (1 - test_size))
        if wanted == 0:
            wanted = 1
        # A negative request (test_size > 1) means "nothing for training";
        # clamp it so the slices below do not count from the end.
        cut = max(min(wanted, max_train_per_actor), 0)

        head = rows.iloc[:cut]
        tail = rows.iloc[cut:max_rows_per_actor]

        train_names.extend(head['actor_name'])
        train_paths.extend(head['path'])
        test_names.extend(tail['actor_name'])
        test_paths.extend(tail['path'])

    def _as_frame(names, paths):
        # Stack the two lists as rows, then transpose into two columns.
        frame = pd.DataFrame(np.vstack((names, paths))).T
        frame.columns = ['actor_name', 'path']
        return frame

    return _as_frame(train_names, train_paths), _as_frame(test_names, test_paths)
            

# function to remove actors that have 'n' or fewer images in the dataset (only actors with more than 'n' images are kept)

def remove_single_occurrences(df, n):
    """Return the rows of ``df`` whose actor appears more than ``n`` times.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``actor_name`` column.
    n : int
        Threshold; an actor is kept only if its row count is strictly
        greater than ``n``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index and order.
    """
    per_actor = df['actor_name'].value_counts()
    # Look up the count for each row's actor, in row order.
    row_counts = per_actor.loc[df['actor_name']].to_numpy()
    keep = row_counts > n
    return df[keep]

# function to load training & test sets into DataFrames
def load_data(filename : str):
    """Read a metadata CSV and return its actor/path columns as a DataFrame.

    Parameters
    ----------
    filename : str
        Path of a CSV file that has ``actor_name`` and ``path`` columns.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``actor_name`` and ``path``, one row per CSV record.
    """
    columns = ['actor_name', 'path']
    table = pd.read_csv(filename).reset_index()
    # Stack the two columns as rows, then transpose back into column form.
    stacked = np.vstack([table[name] for name in columns])
    frame = pd.DataFrame(stacked).T
    frame.columns = columns
    return frame

#function to predict the class
def predict_class(model : "Sequential", train_gen, img_path, width, height):
    """Predict the class label of a single image file.

    The image is prepared the same way the notebook's generators prepare
    training data (RGB, rescaled by 1/255) and is fed to the model as a
    batch of one.

    Parameters
    ----------
    model : Sequential
        Trained model exposing ``predict``. The annotation is a string so
        that defining this function does not require ``Sequential`` to be
        imported yet.
    train_gen :
        Generator used for training; only its ``class_indices`` mapping
        (label -> index) is read.
    img_path : str
        Path of the image to classify.
    width, height : int
        Sizes of the first and second spatial axes of the model input, i.e.
        the model receives an array of shape ``(1, width, height, 3)``.
        This is the same axis order the notebook uses for ``input_shape``.

    Returns
    -------
    The label whose class index has the highest predicted score.
    """
    from PIL import Image
    import numpy as np

    # Force 3 channels: the model is built for RGB input, whatever the file holds.
    img = Image.open(img_path).convert('RGB')
    # PIL sizes are (columns, rows); requesting (height, width) yields an
    # array of shape (width, height, 3), matching the model's input_shape.
    # NEAREST matches the Keras generators' default interpolation - see the
    # flow_from_dataframe documentation.
    img = img.resize((height, width), Image.NEAREST)

    x_new = np.asarray(img, dtype='float32') / 255
    x_new = x_new[np.newaxis, ...]  # add the batch axis

    # Predict on the prepared batch, not on the raw image.
    predictions = model.predict(x_new)

    lookup_table = {index: label for label, index in train_gen.class_indices.items()}
    prediction = int(np.argmax(predictions[0]))

    return lookup_table[prediction]

#function to save the train & test sets as well as the lookup table
def save_info(train_gen, filename1, train_set, filename2, test_set, filename3):
    """Write the class lookup table and both data splits to disk.

    Parameters
    ----------
    train_gen :
        Object with a ``class_indices`` mapping (label -> index); the
        inverted mapping (index -> label) is what gets saved.
    filename1 : str
        Destination of the lookup table, written as JSON (UTF-8).
    train_set, test_set : pandas.DataFrame
        Frames written with ``DataFrame.to_csv``.
    filename2, filename3 : str
        CSV destinations for ``train_set`` and ``test_set`` respectively.
    """
    index_to_label = {index: label for label, index in train_gen.class_indices.items()}
    with open(filename1, 'w', encoding='utf8') as handle:
        json.dump(index_to_label, handle)

    for frame, target in ((train_set, filename2), (test_set, filename3)):
        frame.to_csv(target)

In [42]:
# setting up the meta data from the folder's structure of the LFW Dataset

def build_lfw_metadata(root):
    """Collect (actor_name, path) records from an LFW-style folder tree.

    Every sub-directory of ``root`` that holds at least two files contributes
    one record per file; the directory name is used as the actor name.
    Paths are built with ``os.path`` so the result is valid on any OS (on
    Windows the strings are the same as with manual ``"\\"`` joining).

    Parameters
    ----------
    root : str
        Top-level dataset directory (one sub-directory per actor).

    Returns
    -------
    pandas.DataFrame
        Columns ``actor_name`` and ``path``; empty if nothing qualifies.
    """
    names, image_paths = [], []
    for root_dir, sub_dirs, files in os.walk(root):
        sub_dirs.sort()  # fixed traversal order -> reproducible row order
        if root_dir == root:
            continue
        if len(files) >= 2:
            actor_name = os.path.basename(root_dir)
            for file in sorted(files):
                names.append(actor_name)
                image_paths.append(os.path.join(root_dir, file))
    return pd.DataFrame({'actor_name': names, 'path': image_paths})


lfw_path = r'../../../data/lfw-deepfunneled'
lfw_df = build_lfw_metadata(lfw_path)

# Raw lists kept under their original names in case another cell reads them.
actor_list = lfw_df['actor_name'].tolist()
paths = lfw_df['path'].tolist()

# Shuffle the rows; the fixed seed makes Restart & Run All reproducible.
meta = lfw_df.sample(frac = 1, random_state = 42)

In [44]:
# Display the metadata frame (pandas truncates the middle rows of a long frame).
meta

Unnamed: 0,actor_name,path
3393,Gray_Davis,../../../data/lfw-deepfunneled\Gray_Davis\Gray_Davis_0002.jpg
3213,Gerhard_Schroeder,../../../data/lfw-deepfunneled\Gerhard_Schroeder\Gerhard_Schroeder_0067.jpg
1288,Charles_Moose,../../../data/lfw-deepfunneled\Charles_Moose\Charles_Moose_0006.jpg
3838,Hun_Sen,../../../data/lfw-deepfunneled\Hun_Sen\Hun_Sen_0004.jpg
6905,Paula_Zahn,../../../data/lfw-deepfunneled\Paula_Zahn\Paula_Zahn_0001.jpg
...,...,...
8264,Ted_Maher,../../../data/lfw-deepfunneled\Ted_Maher\Ted_Maher_0002.jpg
8702,Toshihiko_Fukui,../../../data/lfw-deepfunneled\Toshihiko_Fukui\Toshihiko_Fukui_0003.jpg
8853,Vincent_Brooks,../../../data/lfw-deepfunneled\Vincent_Brooks\Vincent_Brooks_0003.jpg
8499,Tom_Ridge,../../../data/lfw-deepfunneled\Tom_Ridge\Tom_Ridge_0003.jpg


In [6]:
# Reload the metadata from the saved CSV (this replaces any `meta` built in an
# earlier cell). The path is assembled with os.path.join: the old literal relied
# on "\l" and "\m" not being escape sequences, which newer Python versions warn
# about, and it only worked with Windows separators.
meta = load_data(os.path.join('data', 'lfw-deepfunneled', 'metadata.csv'))

## training the model

In [49]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Order the rows by actor, then keep only actors that have more than 8 images
new_meta = meta.sort_values(by=['actor_name'])
data = remove_single_occurrences(new_meta, 8)

# Sanity check on the remaining classes (first/last label alphabetically, and how many)
unique_labels = data['actor_name'].unique()
print(f"Min: {min(unique_labels)}, Max: {max(unique_labels)}, Count: {len(unique_labels)}")

# Stratified 80/20 split; `test_set` is used as the validation set during training
train_set, test_set = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['actor_name'])

# Image preprocessing settings
img_width, img_height = 128, 128
batch_size = 8

# Both generators only rescale pixel values to [0, 1]
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)

# Options shared by the training and validation generators
flow_options = dict(
    x_col='path',
    y_col='actor_name',
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='categorical')

train_generator = train_datagen.flow_from_dataframe(train_set, **flow_options)
val_generator = val_datagen.flow_from_dataframe(test_set, **flow_options)

num_classes = len(train_generator.class_indices)
print(f"Number of unique classes: {num_classes}")

# Model architecture: three conv/pool stages followed by a dense classifier head
feature_layers = [
    Conv2D(32, (3, 3), activation='relu', input_shape=(img_width, img_height, 3)),
    MaxPooling2D(pool_size=(2, 2)),
]
for filters in (64, 128):
    feature_layers.append(Conv2D(filters, (3, 3), activation='relu'))
    feature_layers.append(MaxPooling2D(pool_size=(2, 2)))

classifier_layers = [
    Flatten(),
    Dense(512, activation = 'relu', kernel_regularizer='l2'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),  # one output per class
]

model = Sequential(feature_layers + classifier_layers)

# Compile the model
model.compile(optimizer='adam', loss="categorical_crossentropy", metrics=['accuracy'])

Min: Abdullah_Gul, Max: Zhu_Rongji, Count: 184
Found 3646 validated image filenames belonging to 184 classes.
Found 912 validated image filenames belonging to 184 classes.
Number of unique classes: 184


In [52]:
# Train the model
# Training crashed occasionally, so it is done in rounds of 5 epochs: after
# every round the model is checkpointed and that round's history is written
# to a CSV file, which limits how much work a crash can lose.

epochs = 200
epochs_per_round = 5
rounds = epochs // epochs_per_round  # 40 rounds of 5 epochs

# The history files go into this folder; create it if it is missing.
os.makedirs('history', exist_ok=True)

for i in range(rounds):
    history = model.fit(
        train_generator,
        steps_per_epoch=len(train_generator),
        epochs=epochs_per_round,
        validation_data=val_generator,
        validation_steps=len(val_generator))

    # Total epochs trained so far. The parentheses matter: `i+1 * 5` is
    # `i + 5`, which labelled the checkpoints 5, 6, 7, ... instead of 5, 10, 15, ...
    epochs_done = (i + 1) * epochs_per_round

    ## save stuff after 5 epochs
    hist_df = pd.DataFrame(history.history)
    hist_csv_file = str(batch_size) + '_history_' + str(epochs_done) + ".csv"
    # Let pandas open the file itself so line endings are handled correctly.
    hist_df.to_csv('history/' + hist_csv_file)
    model.save(str(batch_size) + '_epoch' + str(epochs_done) + '.h5')

# Save the trained model
model.save(str(batch_size) + '.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [52]:
# Persist the index -> label lookup table (JSON) and the train/validation splits (CSV)
# so predictions can be mapped back to actor names later.
save_info(train_generator, "lookup_table.json", train_set, "train_set.csv", test_set, "test_set.csv")

In [53]:
# NOTE(review): leftover debug print, presumably a "did the notebook get this far" marker;
# consider deleting this cell before sharing the notebook.
print("shit")

shit
