# Celebrity Look-Alike CNN Deep Learning Project

#### Mohamed Youssef - 211001821


## Index

* [Including Necessary Libraries](#libs)
<br>
    * [Other Utilities](#uti)
<br>
* [Data Preprocessing](#dataset)
<br>
    * [Reading Meta Data](#metadata)
    * [Creating Data Frame](#dataframe)
    * [Data Cleaning](#clean)


## Including Necessary Libraries and Initializing Utilities<a id='libs'></a>


In [39]:
# Python built-in libraries
from datetime import datetime, timedelta
import time
import warnings
import os

# Libraries for preprocessing and visualizing the data
import scipy.io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# OpenCV library for image processing
import cv2

# Tensorflow and Keras libraries for deep learning models
import tensorflow as tf

import keras
from keras.preprocessing import image
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import (
    Dense,
    Activation,
    Dropout,
    Flatten,
    Input,
    Convolution2D,
    ZeroPadding2D,
    MaxPooling2D,
    Activation,
)
from keras.layers import Conv2D, AveragePooling2D
from keras.models import Model, Sequential
from keras import metrics
from keras.models import model_from_json


# Sklearn train test split function
from sklearn.model_selection import train_test_split

### Other Utilities<a id='uti'></a>

In [40]:
# Disabling warnings by swapping warnings.warn for a function that ignores its arguments
def warn(*args, **kwargs):
    """Accept any positional and keyword arguments, do nothing, and return None."""
    return None


warnings.warn = warn

In [41]:
# Setting the path to the current working directory of the kernel process.
# The location of the data file loaded later in the notebook is built by
# appending a relative path to this string.
path = os.getcwd()

In [42]:
# Configuring the GPU for training the model
gpus = tf.config.list_physical_devices("GPU")

if gpus:
    print("Num GPUs Available: ", gpus)

    # Prefer the second GPU when the machine has more than one; otherwise use
    # the only GPU. Choosing explicitly avoids using an exception as control flow.
    selected_gpu = gpus[1] if len(gpus) > 1 else gpus[0]

    try:
        # Restrict TensorFlow to the selected device and let it allocate GPU
        # memory on demand instead of reserving it all up front.
        tf.config.experimental.set_visible_devices(selected_gpu, "GPU")
        tf.config.experimental.set_memory_growth(selected_gpu, True)
    except RuntimeError as gpu_error:
        # TensorFlow raises RuntimeError when the devices have already been
        # initialized (e.g. this cell is re-run in a live kernel). The settings
        # cannot be changed at that point, so report it and keep going.
        print("Could not change the GPU configuration:", gpu_error)
else:
    print("No GPU Available")

Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Data Preprocessing <a id='dataset'></a>

### Reading Meta Data <a id='metadata'></a>

In [43]:
# Loading the mat file containing the data from the Data folder under the working directory
mat_file_path = path + "/Data/imdb_crop/imdb_crop/imdb.mat"
data_mat = scipy.io.loadmat(mat_file_path)

In [44]:
# Getting the total number of records: the size of the second dimension
# of the first field stored in the "imdb" struct
imdb_first_field = data_mat["imdb"][0][0][0]
records = imdb_first_field.shape[1]
print("Total no. of records: ", records)

Total no. of records:  460723


In [45]:
# Getting the column names of the data from the field names of the "imdb" struct
features = list(data_mat["imdb"][0][0].dtype.names)
print("Total no. of features: ", len(features))

Total no. of features:  10


In [46]:
# Renaming the columns of the data: each raw field name is replaced, position by
# position, with a human-readable column name
print("Current Features Names:\n", features)
readable_names = (
    "Date of Birth",
    "Photo Taken Date",
    "Full Img Path",
    "Gender",
    "Celebrity Name",
    "Face Location",
    "Face Score",
    "Second Face Score",
    "Celeb Names",
    "Celeb ID",
)
for position, readable_name in enumerate(readable_names):
    features[position] = readable_name
print("New Features Names:\n", features)

Current Features Names:
 ['dob', 'photo_taken', 'full_path', 'gender', 'name', 'face_location', 'face_score', 'second_face_score', 'celeb_names', 'celeb_id']
New Features Names:
 ['Date of Birth', 'Photo Taken Date', 'Full Img Path', 'Gender', 'Celebrity Name', 'Face Location', 'Face Score', 'Second Face Score', 'Celeb Names', 'Celeb ID']


### Creating Data Frame <a id='dataframe'></a>

In [47]:
# Creating an empty dataframe with one row per record and one column per feature
data_df = pd.DataFrame(columns=features, index=range(records))

In [48]:
# Extracting the data from the mat file and storing it in the dataframe:
# every field of the "imdb" struct becomes the column with the same position
if "imdb" in data_mat:
    imdb_struct = data_mat["imdb"][0][0]
    for field_idx in range(len(imdb_struct)):
        field_values = imdb_struct[field_idx][0]
        # Assignment aligns on the index, so a field shorter than the dataframe
        # leaves the remaining rows empty (NaN)
        data_df[features[field_idx]] = pd.DataFrame(field_values)

In [49]:
# Displaying 5 random rows of the data (no random_state is passed, so the
# selected rows differ from run to run)
data_df.sample(5)

Unnamed: 0,Date of Birth,Photo Taken Date,Full Img Path,Gender,Celebrity Name,Face Location,Face Score,Second Face Score,Celeb Names,Celeb ID
434499,725504,2011,[40/nm0885840_rm2911743488_1986-5-12_2011.jpg],0.0,[Emily VanCamp],"[[1, 1, 425, 640]]",-inf,,,5871
127176,714239,2008,[51/nm0001751_rm1746198016_1955-7-9_2008.jpg],1.0,[Jimmy Smits],"[[314.792, 49.704, 376.704, 111.616]]",1.855542,0.989908,,9221
246986,725235,2005,[62/nm2129662_rm2027542272_1985-8-16_2005.jpg],0.0,[Cristin Milioti],"[[223.92888020754896, 258.22563100871037, 300....",2.503853,2.496622,,4099
273909,718428,2002,[91/nm0130191_rm2860420608_1966-12-27_2002.jpg],0.0,[Eva LaRue],"[[165.834, 149.85000000000002, 213.12, 197.136...",1.231028,,,6162
20740,707840,2010,[64/nm0000164_rm4138897152_1937-12-31_2010.jpg],1.0,[Anthony Hopkins],"[[296.44800000000004, 345.6, 737.2800000000001...",3.090811,1.121087,,1316


In [50]:
# Describing the data: summary statistics (count, mean, std, min, quartiles, max)
# of the numeric columns
data_df.describe()

Unnamed: 0,Date of Birth,Photo Taken Date,Gender,Face Score,Second Face Score,Celeb ID
count,460723.0,460723.0,452261.0,460723.0,213797.0,460723.0
mean,718987.731774,2005.461555,0.581996,-inf,2.452904,10116.802404
std,13253.963535,9.054475,0.493231,,1.064432,5742.153266
min,47.0,1961.0,0.0,-inf,0.730926,1.0
25%,716370.0,2004.0,0.0,1.757891,1.583692,5294.0
50%,719935.0,2008.0,1.0,2.980097,2.355163,10066.0
75%,723073.0,2011.0,1.0,4.006376,3.228071,14922.0
max,734963.0,2015.0,1.0,7.381689,6.395435,20284.0


In [51]:
# Checking for null values in the data: info() reports the non-null count and
# dtype of every column, so columns with fewer non-null entries than rows have gaps
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460723 entries, 0 to 460722
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Date of Birth      460723 non-null  int32  
 1   Photo Taken Date   460723 non-null  uint16 
 2   Full Img Path      460723 non-null  object 
 3   Gender             452261 non-null  float64
 4   Celebrity Name     460723 non-null  object 
 5   Face Location      460723 non-null  object 
 6   Face Score         460723 non-null  float64
 7   Second Face Score  213797 non-null  float64
 8   Celeb Names        20284 non-null   object 
 9   Celeb ID           460723 non-null  uint16 
dtypes: float64(3), int32(1), object(4), uint16(2)
memory usage: 28.1+ MB


### Data Cleaning <a id='clean'></a>

In [52]:
# Removing pictures with no face detected (rows whose face score is -inf)
face_detected = data_df["Face Score"] != -np.inf
data_df = data_df[face_detected]
removed_count = records - len(data_df)
print("Total no. of records with no face detected removed:", removed_count)
records = len(data_df)  # Updating the total number of records
print("Total no. of records after removing records with no face detected:", records)

Total no. of records with no face detected removed: 62302
Total no. of records after removing records with no face detected: 398421


In [53]:
# Removing pictures with more than one face detected: only rows without a
# second face score (NaN) are kept
single_face = data_df["Second Face Score"].isna()
data_df = data_df[single_face]
removed_count = records - len(data_df)
print("Total no. of records with more than one face detected removed:", removed_count)
records = len(data_df)  # Updating the total number of records
print("Total no. of records after removing pictures with more the one face:", records)

Total no. of records with more than one face detected removed: 213797
Total no. of records after removing pictures with more the one face: 184624


In [54]:
# Removing pictures with low face scores
MIN_FACE_SCORE = 3  # minimum face score a picture must have to be kept

# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignments in the following cells operate on a real dataframe
# instead of a slice (avoids pandas' SettingWithCopyWarning code path).
data_df = data_df[data_df["Face Score"] >= MIN_FACE_SCORE].copy()
print(
    "Total no. of records with low face scores removed:",
    records - len(data_df),
)
records = len(data_df)  # Updating the total number of records
print("Total no. of records after removing pictures with low face scores:", records)

Total no. of records with low face scores removed: 89390
Total no. of records after removing pictures with low face scores: 95234


In [55]:
# Creating a function to extract first index of an array
def extract_data(arr):
    """Return the first element of ``arr``, or NaN when ``arr`` is empty.

    Parameters
    ----------
    arr : sequence or array
        Any object that supports ``arr[0]`` (list, tuple, string, numpy array).

    Returns
    -------
    object
        ``arr[0]`` when it exists; ``np.nan`` when ``arr`` has no elements, so an
        empty cell becomes a missing value instead of aborting a whole ``.apply``.
    """
    try:
        return arr[0]
    except IndexError:
        # Empty input has no first element; mark it as missing so the row can
        # be removed later by dropna() like any other incomplete record.
        return np.nan

In [56]:
# Extracting the data from the array columns of the dataframe: each cell of
# these columns holds an array, of which only the first element is kept
for array_column in ("Celebrity Name", "Full Img Path"):
    data_df[array_column] = data_df[array_column].apply(extract_data)

In [57]:
# Dropping the unnecessary columns from the dataframe (modifies data_df in place)
unused_columns = [
    "Celeb ID",
    "Celeb Names",
    "Face Score",
    "Second Face Score",
    "Face Location",
]
data_df.drop(columns=unused_columns, inplace=True)

In [58]:
# Creating a function to convert the matlab serial date number to python date time object
def convert_date(date_number):
    """Convert a MATLAB serial date number (or a plain 4-digit year) to a date value.

    Parameters
    ----------
    date_number : int or float
        Either a value whose string form has exactly 4 characters (treated as a
        calendar year, e.g. ``2011``) or a MATLAB serial date number, whose
        fractional part is interpreted as a fraction of a day.

    Returns
    -------
    str, datetime.datetime or pandas.NaT
        * the year as a ``"YYYY"`` string for 4-character inputs;
        * a ``datetime`` for serial date numbers;
        * ``pd.NaT`` when the serial number cannot be represented as a
          ``datetime`` (out of range, below 1, or NaN).

    Raises
    ------
    ValueError
        If a 4-character input is not a valid year for ``strptime``.
    """
    str_date = str(date_number)
    # If the length of the string is 4, then the date is in the format YYYY
    if len(str_date) == 4:
        return datetime.strptime(str_date, "%Y").strftime("%Y")

    try:
        # Whole days -> date, fractional part -> time of day. The 366-day shift
        # compensates for the different day-number origins of MATLAB and Python.
        return (
            datetime.fromordinal(int(date_number))
            + timedelta(days=date_number % 1)
            - timedelta(days=366)
        )
    except (OverflowError, ValueError):
        # OverflowError: the shifted date falls before datetime.min.
        # ValueError: the ordinal is below 1, or the input is NaN.
        return pd.NaT

In [59]:
# Converting the matlab serial date number in every "Date of Birth" cell with
# convert_date (element-wise via Series.apply)
data_df["Date of Birth"] = data_df["Date of Birth"].apply(convert_date)

In [60]:
# Passing the Photo Taken Date through convert_date. For values whose string
# form has 4 characters, convert_date returns the year as a "YYYY" string
# (not a datetime object).
data_df["Photo Taken Date"] = data_df["Photo Taken Date"].apply(convert_date)

In [61]:
# Creating a function to calculate the age of the celebrity
def calculate_age(born, photo_taken):
    """Return the age in whole years at the time the photo was taken.

    Parameters
    ----------
    born : datetime-like
        Date of birth; must expose ``year``, ``month`` and ``day``.
    photo_taken : str
        Year of the photo as a ``"YYYY"`` string. Only the year is parsed, so
        the photo date is taken to be 1 January of that year.

    Returns
    -------
    int or float
        The age, or ``np.nan`` when ``born`` lacks the date attributes.
    """
    photo_date = datetime.strptime(photo_taken, "%Y")
    try:
        # True (counts as 1) when the birthday has not yet occurred in the photo year
        birthday_pending = (photo_date.month, photo_date.day) < (born.month, born.day)
        return photo_date.year - born.year - birthday_pending
    except AttributeError:
        return np.nan

In [62]:
# Calculating the age of the celebrity at the time of each photo
def _age_for_row(row):
    """Apply calculate_age to the birth date and photo year of one dataframe row."""
    return calculate_age(row["Date of Birth"], row["Photo Taken Date"])


data_df["Age at Photo"] = data_df.apply(_age_for_row, axis=1)
# Changing the datatype of the age column to the nullable integer type,
# which keeps missing ages as <NA>
data_df["Age at Photo"] = data_df["Age at Photo"].astype("Int64")

In [63]:
# Dropping the Date of Birth and Photo Taken Date columns (in place), now that
# the age has been derived from them
date_columns = ["Date of Birth", "Photo Taken Date"]
data_df.drop(columns=date_columns, inplace=True)

In [68]:
# Dropping any rows with null values (in place) and reporting how many were removed
data_df.dropna(inplace=True)
removed_count = records - len(data_df)
print("Total no. of records with null values removed:", removed_count)
records = len(data_df)  # Updating the total number of records
print("Total no. of records after removing null values:", len(data_df))

Total no. of records with null values removed: 1734
Total no. of records after removing null values: 93500


In [69]:
# Resetting the index of the dataframe in place; drop=True discards the old
# index (which has gaps after the filtering steps) instead of keeping it as a column
data_df.reset_index(drop=True, inplace=True)

In [70]:
# Displaying 5 random rows of the cleaned data (no random_state is passed, so
# the selected rows differ from run to run)
data_df.sample(5)

Unnamed: 0,Full Img Path,Gender,Celebrity Name,Age at Photo
31700,42/nm0005042_rm1011005696_1963-6-6_2012.jpg,1.0,Jason Isaacs,48
4920,86/nm0000186_rm3272907520_1946-1-20_2001.jpg,1.0,David Lynch,54
40330,87/nm1406387_rm2436406016_1966-11-19_2005.jpg,1.0,Rocco DiSpirito,38
66975,38/nm0947338_rm3200354304_1989-3-11_2009.jpg,1.0,Anton Yelchin,19
46923,20/nm0517820_rm3248002048_1986-7-2_2006.jpg,0.0,Lindsay Lohan,19


In [67]:
# Checking for null values in the cleaned data: every column's non-null count
# reported by info() should equal the number of rows
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93500 entries, 0 to 93499
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Full Img Path   93500 non-null  object 
 1   Gender          93500 non-null  float64
 2   Celebrity Name  93500 non-null  object 
 3   Age at Photo    93500 non-null  Int64  
dtypes: Int64(1), float64(1), object(2)
memory usage: 2.9+ MB
