# Celebrity Look-Alike CNN Deep Learning Project

#### Mohamed Youssef - 211001821


## Index

* [Including Necessary Libraries](#libs)
<br>
    * [Other Utilities](#uti)
<br>
* [Data Preprocessing](#dataset)
<br>
    * [Reading Meta Data](#metadata)
    * [Creating Data Frame](#dataframe)
    * [Data Cleaning](#clean)


## Including Necessary Libraries and Initializing Utilities<a id='libs'></a>


In [271]:
# Python built-in libraries
from datetime import datetime, timedelta
import time
import warnings
import os

# Libraries for preprocessing and visualizing the data
import scipy.io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# OpenCV library for image processing
import cv2

# Tensorflow and Keras libraries for deep learning models
import tensorflow as tf

import keras
from keras.preprocessing import image
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import (
    Dense,
    Activation,
    Dropout,
    Flatten,
    Input,
    Convolution2D,
    ZeroPadding2D,
    MaxPooling2D,
    Activation,
)
from keras.layers import Conv2D, AveragePooling2D
from keras.models import Model, Sequential
from keras import metrics
from keras.models import model_from_json


# Sklearn train test split function
from sklearn.model_selection import train_test_split

### Other Utilities<a id='uti'></a>

In [272]:
# Silencing all warnings by swapping warnings.warn for a no-op stand-in
def warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing."""
    return None


warnings.warn = warn

In [273]:
# Base directory for the notebook: the process's current working directory
# at the moment this cell runs (the dataset location is built from it)
path = os.getcwd()

In [274]:
# Configuring the GPU
# Alternative: pin the card at driver level by setting
# os.environ["CUDA_VISIBLE_DEVICES"] before TensorFlow initialises.

# The device type has to be the literal "GPU"; with any other string
# (e.g. "gpus") no device matches and the list comes back empty, so the
# configuration below would never run.
gpus = tf.config.experimental.list_physical_devices("GPU")

print("Num GPUs Available: ", gpus)

if gpus:
    # Prefer the second GPU for training, but fall back to the only card on
    # single-GPU machines (indexing gpus[1] there would raise IndexError).
    training_gpu = gpus[1] if len(gpus) > 1 else gpus[0]
    try:
        tf.config.experimental.set_visible_devices(training_gpu, "GPU")
        # Grow GPU memory on demand instead of reserving all of it up front
        tf.config.experimental.set_memory_growth(training_gpu, True)

    except RuntimeError as e:
        # Device settings cannot be changed once the runtime is initialised
        print(e)

Num GPUs Available:  []


## Data Preprocessing <a id='dataset'></a>

### Reading Meta Data <a id='metadata'></a>

In [275]:
# Loading the IMDB metadata .mat file found under the working directory
# stored in `path`; the result is indexed by variable name ("imdb" is used below)
data_mat = scipy.io.loadmat(path + "/Data/imdb_crop/imdb_crop/imdb.mat")

In [276]:
# Counting the records: the size of the second axis of the first "imdb" field
imdb_struct = data_mat["imdb"][0][0]
records = imdb_struct[0].shape[1]
print("Total no. of records: ", records)

Total no. of records:  460723


In [277]:
# Reading the field (column) names from the dtype of the "imdb" struct
features = list(data_mat["imdb"][0][0].dtype.names)
feature_count = len(features)
print("Total no. of features: ", feature_count)

Total no. of features:  10


In [278]:
# Replacing the raw field names with readable column titles, position by position
print("Current Features Names:\n", features)
readable_names = (
    "Date of Birth",
    "Photo Taken Date",
    "Full Img Path",
    "Gender",
    "Celebrity Name",
    "Face Location",
    "Face Score",
    "Second Face Score",
    "Celeb Names",
    "Celeb ID",
)
for position, readable_name in enumerate(readable_names):
    features[position] = readable_name
print("New Features Names:\n", features)

Current Features Names:
 ['dob', 'photo_taken', 'full_path', 'gender', 'name', 'face_location', 'face_score', 'second_face_score', 'celeb_names', 'celeb_id']
New Features Names:
 ['Date of Birth', 'Photo Taken Date', 'Full Img Path', 'Gender', 'Celebrity Name', 'Face Location', 'Face Score', 'Second Face Score', 'Celeb Names', 'Celeb ID']


### Creating Data Frame <a id='dataframe'></a>

In [279]:
# Building an empty frame: one row per record, one column per feature
row_index = range(records)
data_df = pd.DataFrame(columns=features, index=row_index)

In [280]:
# Extracting the data from the mat file and storing it in a dataframe
# A membership test replaces the scan over every key of the loaded dict.
if "imdb" in data_mat:
    current_array = data_mat["imdb"][0][0]
    # Fields are visited by position; `features` was derived from this same
    # struct's dtype names, so position j maps to column features[j].
    for j in range(len(current_array)):
        # The trailing [0] takes the first row of each field
        data_df[features[j]] = pd.DataFrame(current_array[j][0])

In [281]:
# Displaying 5 random rows as a quick sanity check
# (no random_state is passed, so the rows shown differ between runs)
data_df.sample(5)

Unnamed: 0,Date of Birth,Photo Taken Date,Full Img Path,Gender,Celebrity Name,Face Location,Face Score,Second Face Score,Celeb Names,Celeb ID
249712,717665,2008,[13/nm0226813_rm4227306496_1964-11-24_2008.jpg],1.0,[Garret Dillahunt],"[[343.75, 35.75, 382.8, 74.8]]",2.535535,,,6625
414024,723401,2012,[37/nm4083737_rm3029578752_1980-8-8_2012.jpg],1.0,[Tobias Santelmann],"[[1, 1, 2683, 4540]]",-inf,,,19015
376585,708685,2014,[81/nm0662981_rm3337275136_1940-4-24_2014.jpg],1.0,[Michael Parks],"[[460.8, 346.112, 606.208, 491.52]]",2.56934,1.791728,,13672
316367,724334,2010,[18/nm0544718_rm2410592768_1983-2-27_2010.jpg],0.0,[Kate Mara],"[[980.7022401101137, 227.89128617925704, 1656....",3.176025,,,10569
228689,728396,2007,[80/nm1519680_rm641439744_1994-4-12_2007.jpg],0.0,[Saoirse Ronan],"[[1, 1, 323, 485]]",-inf,,,17164


In [282]:
# Summary statistics of the numeric columns.
# Face Score still holds -inf entries here, which is why its mean is reported
# as -inf and its std as NaN.
data_df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,Date of Birth,Photo Taken Date,Gender,Face Score,Second Face Score,Celeb ID
count,460723.0,460723.0,452261.0,460723.0,213797.0,460723.0
mean,718987.731774,2005.461555,0.581996,-inf,2.452904,10116.802404
std,13253.963535,9.054475,0.493231,,1.064432,5742.153266
min,47.0,1961.0,0.0,-inf,0.730926,1.0
25%,716370.0,2004.0,0.0,1.757891,1.583692,5294.0
50%,719935.0,2008.0,1.0,2.980097,2.355163,10066.0
75%,723073.0,2011.0,1.0,4.006376,3.228071,14922.0
max,734963.0,2015.0,1.0,7.381689,6.395435,20284.0


In [283]:
# Listing each column's dtype and non-null count to spot missing values
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460723 entries, 0 to 460722
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Date of Birth      460723 non-null  int32  
 1   Photo Taken Date   460723 non-null  uint16 
 2   Full Img Path      460723 non-null  object 
 3   Gender             452261 non-null  float64
 4   Celebrity Name     460723 non-null  object 
 5   Face Location      460723 non-null  object 
 6   Face Score         460723 non-null  float64
 7   Second Face Score  213797 non-null  float64
 8   Celeb Names        20284 non-null   object 
 9   Celeb ID           460723 non-null  uint16 
dtypes: float64(3), int32(1), object(4), uint16(2)
memory usage: 28.1+ MB


### Data Cleaning <a id='clean'></a>

In [284]:
# Keeping only the pictures in which a face was detected
# (rows whose Face Score is -inf are discarded)
has_face = data_df["Face Score"] != -np.inf
data_df = data_df[has_face]
removed_count = records - len(data_df)
print("Total no. of records with no face detected removed:", removed_count)
records = len(data_df)  # Updating the total number of records
print("Total no. of records after removing records with no face detected:", records)

Total no. of records with no face detected removed: 62302
Total no. of records after removing records with no face detected: 398421


In [285]:
# Keeping only the pictures that have no second face score recorded
single_face = data_df["Second Face Score"].isna()
data_df = data_df[single_face]
removed_count = records - len(data_df)
print(
    "Total no. of records with more than one face detected removed:",
    removed_count,
)
records = len(data_df)  # Updating the total number of records
print("Total no. of records after removing pictures with more the one face:", records)

Total no. of records with more than one face detected removed: 213797
Total no. of records after removing pictures with more the one face: 184624


In [286]:
# Removing pictures with low face scores
# Minimum face score a picture needs in order to be kept
MIN_FACE_SCORE = 3
# .copy() turns the filtered result into an independent frame. Later cells
# assign new columns into data_df; without the copy pandas treats those writes
# as assignments into a slice (SettingWithCopyWarning, which goes unnoticed
# here because warnings.warn has been replaced with a no-op).
data_df = data_df[data_df["Face Score"] >= MIN_FACE_SCORE].copy()
print(
    "Total no. of records with low face scores removed:",
    records - len(data_df),
)
records = len(data_df)  # Updating the total number of records
print("Total no. of records after removing pictures with low face scores:", records)

Total no. of records with low face scores removed: 89390
Total no. of records after removing pictures with low face scores: 95234


In [287]:
# Helper that unwraps the value stored at position 0 of an indexable object
def extract_data(arr):
    """Return the element at index 0 of ``arr``."""
    first_element = arr[0]
    return first_element

In [288]:
# Unwrapping the array-valued cells of the name and image-path columns
for wrapped_column in ("Celebrity Name", "Full Img Path"):
    data_df[wrapped_column] = data_df[wrapped_column].apply(extract_data)

In [289]:
# Discarding the columns that are not needed past this point
unused_columns = ["Celeb ID", "Celeb Names", "Face Score", "Second Face Score"]
data_df.drop(columns=unused_columns, inplace=True)

In [290]:
# Creating a function to convert the matlab serial date number to python date time object
def convert_date(date_number):
    """Convert a MATLAB serial date number, or a bare 4-digit year, to a date.

    Parameters
    ----------
    date_number : int or float
        Either a value whose string form is exactly 4 characters long
        (treated as a year) or a MATLAB serial date number.

    Returns
    -------
    str
        The year formatted as "%Y" when the input is a 4-character year.
    datetime.datetime
        The converted date for a serial date number; the fractional part of
        the number becomes the time of day.
    pandas.NaT
        When the serial number cannot be represented as a ``datetime``
        (result before year 1, ordinal below 1, NaN, infinity, or a value
        beyond the supported range).
    """
    str_date = str(date_number)
    # If the length of the string is 4, then the date is in the format YYYY
    if len(str_date) == 4:
        return datetime.strptime(str_date, "%Y").strftime("%Y")

    try:
        # The 366-day shift accounts for the offset between MATLAB serial
        # dates and Python proleptic-Gregorian ordinals.
        return (
            datetime.fromordinal(int(date_number))
            + timedelta(days=date_number % 1)
            - timedelta(days=366)
        )
    except (OverflowError, ValueError):
        # OverflowError: the shifted date falls outside datetime's range.
        # ValueError: fromordinal() rejects ordinals < 1 or past year 9999,
        # and int() rejects NaN. Either way the value is unusable.
        return pd.NaT

In [291]:
# Converting each MATLAB serial date number in "Date of Birth" via convert_date
# (values whose conversion overflows come back as NaT)
data_df["Date of Birth"] = data_df["Date of Birth"].apply(convert_date)

In [292]:
# Normalising "Photo Taken Date" through convert_date: 4-digit values take the
# year branch, so each entry becomes a "YYYY" string (not a datetime object)
data_df["Photo Taken Date"] = data_df["Photo Taken Date"].apply(convert_date)

In [293]:
# Creating a function to calculate the age of the celebrity
def calculate_age(born, photo_taken, photo_month=1, photo_day=1):
    """Return the age in whole years at the time a photo was taken.

    Only the year of the photo is known, so the day within that year is an
    assumption controlled by ``photo_month`` / ``photo_day``.

    Parameters
    ----------
    born : datetime-like
        Date of birth; must expose ``year``, ``month`` and ``day``.
    photo_taken : str
        Year the photo was taken, formatted as "%Y".
    photo_month, photo_day : int, optional
        Assumed month and day of the photo within ``photo_taken``.
        The defaults (January 1) reproduce the original behaviour.
        NOTE(review): with January 1 the result is one year lower than
        ``photo_taken - birth year`` for everyone not born on January 1;
        a mid-year reference such as (7, 1) may be a better estimate.

    Returns
    -------
    int or float
        The age in years, or ``np.nan`` when ``born`` has no date attributes
        (e.g. a string) or its date fields are themselves NaN.
    """
    photo_date = datetime.strptime(photo_taken, "%Y").replace(
        month=photo_month, day=photo_day
    )
    try:
        # True (counts as 1) when the birthday had not yet occurred on the
        # assumed photo date, so one year is subtracted.
        birthday_pending = (photo_date.month, photo_date.day) < (born.month, born.day)
        return photo_date.year - born.year - birthday_pending
    except AttributeError:
        return np.nan

In [294]:
# Deriving the age for every record, row by row
ages = data_df.apply(
    lambda row: calculate_age(row["Date of Birth"], row["Photo Taken Date"]),
    axis=1,
)
# Storing the ages as nullable integers so missing ages stay missing
data_df["Age at Photo"] = ages.astype("Int64")

In [295]:
# Discarding every row that still has at least one missing value
data_df.dropna(axis=0, how="any", inplace=True)
remaining_records = len(data_df)
print("Total no. of records after removing null values:", remaining_records)

Total no. of records after removing null values: 93500


In [296]:
# Renumbering the rows from 0 after the filtering steps; drop=True discards
# the old index instead of keeping it as a column
data_df.reset_index(drop=True, inplace=True)

In [297]:
# Displaying 5 random rows of the cleaned data
# (no random_state is passed, so the rows shown differ between runs)
data_df.sample(5)

Unnamed: 0,Date of Birth,Photo Taken Date,Full Img Path,Gender,Celebrity Name,Face Location,Age at Photo
89276,1982-11-30,2011,46/nm0193846_rm2782044416_1982-11-30_2011.jpg,0.0,Elisha Cuthbert,"[[387.58393455260597, 72.80212735862438, 467.6...",28
49705,1966-04-18,2001,67/nm0919867_rm1079810048_1966-4-18_2001.jpg,1.0,Frederick Weller,"[[288.7828876545509, 100.86100440158292, 400.9...",34
37895,1971-08-14,2011,56/nm0100556_rm1469102080_1971-8-14_2011.jpg,1.0,Raoul Bova,"[[788.48, 231.424, 1081.344, 524.288]]",39
76760,1980-12-03,2010,87/nm1564087_rm1401523968_1980-12-3_2010.jpg,0.0,Jenna Dewan Tatum,"[[113.708, 34.06, 150.912, 71.264]]",29
27825,1981-04-28,2007,95/nm0004695_rm4081948928_1981-4-28_2007.jpg,0.0,Jessica Alba,"[[134.13730584594, 67.38965292297, 283.6775249...",25


In [298]:
# Confirming via dtypes and non-null counts that no missing values remain
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93500 entries, 0 to 93499
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date of Birth     93500 non-null  datetime64[ns]
 1   Photo Taken Date  93500 non-null  object        
 2   Full Img Path     93500 non-null  object        
 3   Gender            93500 non-null  float64       
 4   Celebrity Name    93500 non-null  object        
 5   Face Location     93500 non-null  object        
 6   Age at Photo      93500 non-null  Int64         
dtypes: Int64(1), datetime64[ns](1), float64(1), object(4)
memory usage: 5.1+ MB
