In [None]:
# import libraries
import os
from os import listdir
import pandas as pd
import numpy as np
from numpy import asarray
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
import matplotlib.pyplot as plt
import cv2
import secrets

# FIRST TASK: CLASSIFY FACES FROM NON-FACES

In [None]:
# image dataset path
dataset = 'faceornot/'

In [None]:
# check image names
for picture in listdir(dataset):
    # skip the Thumbs.db entry; everything else is printed as-is
    if picture == 'Thumbs.db':
        continue
    print(picture)
            

In [None]:
# function that creates features and targets
def load_images(directory):
    """Load every image in ``directory`` as a 224x224 grayscale array with a face / non-face label.

    Files named ``Thumbs.db`` are skipped. A file whose name starts with
    ``'f'`` is labelled 1; every other file is labelled 0.

    Parameters
    ----------
    directory : str
        Folder holding the image files.

    Returns
    -------
    X_TRAIN : numpy.ndarray
        ``uint8`` array stacking one 224x224 grayscale image per file.
    Y_TRAIN : numpy.ndarray
        ``uint8`` array of 0/1 labels aligned with ``X_TRAIN``.
    """
    gray_scaled = []
    targets = []
    print('Converting to numpy array ...')
    # listdir() order is arbitrary; sorting keeps the sample order (and any later split) reproducible
    for picture in sorted(listdir(directory)):
        if picture == 'Thumbs.db':
            continue

        # os.path.join works whether or not `directory` ends with a path separator
        photo = load_img(os.path.join(directory, picture), target_size=(224, 224))

        # convert image to numpy array
        photo = img_to_array(photo, dtype='uint8')

        # load_img yields RGB channel order, so the grayscale conversion must be
        # RGB2GRAY; BGR2GRAY would apply the red and blue weights to the wrong channels
        gray_scaled.append(cv2.cvtColor(photo, cv2.COLOR_RGB2GRAY))

        # label targets: 1 for files whose name starts with 'f', 0 otherwise
        targets.append(1 if picture[0] == 'f' else 0)
    print(' ')

    X_TRAIN = asarray(gray_scaled, dtype='uint8')
    Y_TRAIN = asarray(targets, dtype='uint8')
    print(' ')
    print('Done!')
    return X_TRAIN, Y_TRAIN

In [None]:
# calling the function
X,Y = load_images(dataset) #images = load_images(dataset)

In [None]:
plt.imshow(X[1])

In [None]:
plt.imshow(X[47])

#### Reshape the data

In [None]:
X.shape

In [None]:
# Flatten each image to one row; the sample count is taken from the data instead of being hard-coded
X = np.asarray(X).reshape((len(X), -1))

In [None]:
X.shape

#### Split the data

In [None]:
# split the dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test = train_test_split(X,Y, test_size = 0.2, random_state = 3)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
#from sklearn.preprocessing import StandardScaler
#sc_x = StandardScaler()
#x_train = sc_x.fit_transform(x_train)
#x_test = sc_x.transform(x_test)

### Reducing the dimension of the Images

In [None]:
# Application of PCA
from sklearn.decomposition import PCA
pca = PCA()  #kernel ='rbf' # this takes the indpendent variables that interpret the dataset the best.

In [None]:
x_train_check = pca.fit(x_train)  #x_train_check = pca.fit_transform(x_train) 

In [None]:
explained_variance = pca.explained_variance_ratio_
explained_variance # this was used to know how many components the PCA would explain "the most the variance".


In [None]:
cumsum = np.cumsum(explained_variance) # find the cummulative summation of that array
cumsum

In [None]:
# np.argmax on the boolean mask returns the index of the first entry of cumsum that is >= 0.96;
# adding 1 turns that 0-based index into a number of components.
# (The original note reports index 40, i.e. 41 components, for this dataset — not re-verified here.)
# Gotcha: if no entry reached the threshold, argmax of an all-False mask is 0 and n_comp would be 1.
n_comp = np.argmax(cumsum >= 0.96) + 1

In [None]:
n_comp

In [None]:
pca = PCA(n_components = n_comp)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

In [None]:
# Basically the eigen faces are the vectors that best represent the large image matrices

In [None]:
eigenfaces = pca.components_.reshape((n_comp, 224, 224))

In [None]:
plt.imshow(eigenfaces[0])

In [None]:
plt.imshow(eigenfaces[1])

In [None]:
plt.imshow(eigenfaces[2])

In [None]:
# let us convert back to the original images from the compressed format.
X_recover = pca.inverse_transform(x_train)
# -1 lets NumPy infer the number of images instead of hard-coding the training-set size
X_recover = X_recover.reshape((-1, 224, 224))

In [None]:
plt.imshow(X_recover[0])

In [None]:
plt.imshow(X_recover[1])

In [None]:
plt.imshow(X_recover[2])

#### Classifying using Support Vector Machines

In [None]:
from sklearn.svm import SVC
Fclassifier = SVC()
#Fclassifier.fit(x_train, y_train)

In [None]:
# Using grid search to obtain optimal hyperparameters 
from sklearn.model_selection import GridSearchCV
parameters = [{'C' : [1,100,1000], 'kernel':['linear']},
              {'C' : [1,100,1000], 'kernel':['rbf'], 'gamma':[0.5,0.1, 0.01,0.001,0.0001]}]

grid_search = GridSearchCV(estimator = Fclassifier, 
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs =-1)
grid_search = grid_search.fit(x_train, y_train)

In [None]:
best_accuracy = grid_search.best_score_
best_accuracy * 100 # Multiplied by 100 to get in percentage 

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Same candidate values as the grid search: 3 linear settings plus 3 x 5 RBF settings,
# i.e. 18 distinct combinations in total.
parameters = [{'C' : [1,100,1000], 'kernel':['linear']},
              {'C' : [1,100,1000], 'kernel':['rbf'], 'gamma':[0.5,0.1, 0.01,0.001,0.0001]}]


# NOTE(review): n_iter=100 is larger than the 18 combinations listed above, so this search
# presumably cannot try anything the grid search did not already cover — confirm against the
# scikit-learn RandomizedSearchCV documentation and consider lowering n_iter.
# No `cv` or `scoring` is passed here, unlike the GridSearchCV call earlier, so the two
# best_score_ values are not produced under the same settings.
random_search =  RandomizedSearchCV(Fclassifier,parameters,n_iter =100, random_state = 0, verbose = 1)

random_search = random_search.fit(x_train, y_train)

In [None]:
best_accuracy = random_search.best_score_
best_accuracy * 100

In [None]:
# best parameters for randomsearch
best_parametersR = random_search.best_params_
best_parametersR

In [None]:
# best parameters for grid search
best_parameters = grid_search.best_params_
best_parameters

In [None]:
# GridSearchCV was created with its default refit behaviour, so best_estimator_ has already
# been retrained on the full training set; fitting it a second time is redundant.
Fclassifier = grid_search.best_estimator_
Fclassifier

In [None]:
y_pred = Fclassifier.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cn = confusion_matrix(y_test, y_pred)

In [None]:
print(cn)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# accuracy_score takes (y_true, y_pred), matching the confusion_matrix / classification_report calls
print(accuracy_score(y_test, y_pred) * 100)

In [None]:
# analysis

In [None]:
# What does the x_test look like?
x_test

In [None]:
x_test.shape

In [None]:
test_recover = pca.inverse_transform(x_test)

In [None]:
# -1 lets NumPy infer the number of test images instead of hard-coding it
test_recover = test_recover.reshape((-1, 224, 224))

In [None]:
plt.imshow(test_recover[0])

In [None]:
plt.imshow(test_recover[1])

In [None]:
plt.imshow(test_recover[2])

In [None]:
plt.imshow(test_recover[3])

In [None]:
plt.imshow(test_recover[4])

In [None]:
plt.imshow(test_recover[5])

In [None]:
print(y_pred)

In [None]:
print(y_test)

# SECOND TASK: IDENTIFY WHO IS IN THE IMAGE

In [None]:
# image dataset path
dataset = 'whoisthis/'

In [None]:
name_file = pd.read_csv('name_tags.csv')

In [None]:
name_file.head()

In [None]:
name_file.shape

In [None]:
def names(name_file):
    """Build a lookup from image name to tag.

    Parameters
    ----------
    name_file : pandas.DataFrame
        Table with an ``image_name`` column and a ``tags`` column.

    Returns
    -------
    dict
        Maps each ``image_name`` value to the ``tags`` value on the same row.
        If an image name occurs more than once, the last row wins.
    """
    # Pair the two columns positionally so the result does not depend on the
    # DataFrame having a default 0..n-1 index (e.g. after filtering or re-ordering).
    return dict(zip(name_file["image_name"], name_file["tags"]))


In [None]:
see = names(name_file)

In [None]:
see

In [None]:
def name_passcode(list_):
    """Return a dict giving each entry of ``list_`` a random 48-character hex token.

    Repeated entries share a single key; the token kept for it is the one
    generated for its last occurrence.
    """
    return {label: secrets.token_hex(24) for label in list_}

In [None]:
def load_images_targets(directory, name_file):
    """Load the images in ``directory`` as grayscale arrays with passcode identity labels.

    Each file (except ``Thumbs.db``) is resized to 224x224, converted to
    grayscale, and labelled with the tag that ``name_file`` lists for the
    file name up to its first ``'.'``. Every distinct tag is then replaced by
    a random hex passcode produced by ``name_passcode``.

    Parameters
    ----------
    directory : str
        Folder holding the image files.
    name_file : pandas.DataFrame
        Table handed to ``names`` to obtain the image-name -> tag lookup.

    Returns
    -------
    X_TRAIN : numpy.ndarray
        ``uint8`` array stacking one 224x224 grayscale image per file.
    Y_TRAIN : numpy.ndarray
        Passcode label of each image, aligned with ``X_TRAIN``.
    maps : dict
        Tag -> passcode mapping used to build ``Y_TRAIN``.

    Raises
    ------
    KeyError
        If an image's name has no entry in the lookup built from ``name_file``.
    """
    targets = []
    gray_scaled = []
    name_targ = names(name_file)
    print('Converting to numpy array ...')
    # listdir() order is arbitrary; sorting keeps the sample order (and any later split) reproducible
    for picture in sorted(listdir(directory)):
        if picture == 'Thumbs.db':
            continue

        # os.path.join works whether or not `directory` ends with a path separator
        photo = load_img(os.path.join(directory, picture), target_size=(224, 224))

        # convert image to numpy array
        photo = img_to_array(photo, dtype='uint8')

        # load_img yields RGB channel order, so the grayscale conversion must be
        # RGB2GRAY; BGR2GRAY would apply the red and blue weights to the wrong channels
        gray_scaled.append(cv2.cvtColor(photo, cv2.COLOR_RGB2GRAY))

        # label targets: look the tag up by the file name without its extension
        targets.append(name_targ[picture.split('.')[0]])

    print(' ')
    # one random passcode per distinct tag, then translate every label
    maps = name_passcode(targets)
    passcode_targets = [maps[tag] for tag in targets]

    X_TRAIN = asarray(gray_scaled, dtype='uint8')
    Y_TRAIN = asarray(passcode_targets)
    print(' ')
    print('Done!')
    return X_TRAIN, Y_TRAIN, maps

In [None]:
X,Y, mapper = load_images_targets(dataset, name_file)

#### RESHAPE THE DATA

In [None]:
mapper

In [None]:
for keys in mapper:
    print(keys, '==>', mapper[keys])

In [None]:
# creating an inverse mapper (passcode -> name)
inv_mapper = {code: name for name, code in mapper.items()}
    
   

In [None]:
inv_mapper

In [None]:
Y[:5]

In [None]:
X.shape

In [None]:
# Flatten each image to one row; the sample count is taken from the data instead of being hard-coded
X_r = np.asarray(X).reshape((len(X), -1))

In [None]:
X_r.shape

#### SPLIT THE DATA

In [None]:
# split the dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test = train_test_split(X_r,Y, test_size = 0.2, random_state = 3)

In [None]:
x_train.shape

In [None]:
x_test.shape

#### REDUCING THE DIMENSION OF THE IMAGES

In [None]:
# Application of PCA
from sklearn.decomposition import PCA
pca = PCA()  #kernel ='rbf' # this takes the indpendent variables that interpret the dataset the best.

In [None]:
x_train_check = pca.fit(x_train)

In [None]:
explained_variance = pca.explained_variance_ratio_
explained_variance # this was used to know how many components the PCA would explain "the most the variance".


In [None]:
cumsum1 = np.cumsum(explained_variance) # find the cummulative summation of that array
cumsum1

In [None]:
n_comp = np.argmax(cumsum1 >= 0.99) + 1 

In [None]:
n_comp

In [None]:
pca1= PCA(n_components = n_comp)
x_train = pca1.fit_transform(x_train)
x_test = pca1.transform(x_test)

In [None]:
eigenfaces = pca1.components_.reshape((n_comp, 224, 224))

In [None]:
plt.imshow(eigenfaces[0])

In [None]:
plt.imshow(eigenfaces[1])

In [None]:
plt.imshow(eigenfaces[2])

In [None]:
# let us convert back to the original images from the compressed format.
X_recover = pca1.inverse_transform(x_train)
# -1 lets NumPy infer the number of images instead of hard-coding the training-set size
X_recover = X_recover.reshape((-1, 224, 224))

In [None]:
plt.imshow(X_recover[0])

In [None]:
plt.imshow(X_recover[1])

In [None]:
plt.imshow(X_recover[2])

#### CLASSIFYING USING SUPPORT VECTOR MACHINES

In [None]:
from sklearn.svm import SVC
Wclassifier = SVC(class_weight='balanced')

In [None]:
from sklearn.model_selection import GridSearchCV
parameters =   [{'C' : [1,100,1000], 'kernel':['linear']},
              {'C' : [1,100,1000], 'kernel':['rbf'], 'gamma':[0.5,0.1, 0.01,0.001,0.0001]}]

             

grid_search = GridSearchCV(estimator = Wclassifier, 
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 6,
                           n_jobs =-1)
grid_search = grid_search.fit(x_train, y_train)

In [None]:
best_accuracy = grid_search.best_score_
best_accuracy * 100 # Multiplied by 100 to get in percentage 

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Same candidate values as the grid search: 3 linear settings plus 3 x 5 RBF settings,
# i.e. 18 distinct combinations in total.
parameters = [{'C' : [1,100,1000], 'kernel':['linear']},
              {'C' : [1,100,1000], 'kernel':['rbf'], 'gamma':[0.5,0.1, 0.01,0.001,0.0001]}]


# NOTE(review): n_iter=100 is larger than the 18 combinations listed above, so this search
# presumably cannot try anything the grid search did not already cover — confirm against the
# scikit-learn RandomizedSearchCV documentation and consider lowering n_iter.
# No `cv` or `scoring` is passed here, unlike the GridSearchCV call earlier, so the two
# best_score_ values are not produced under the same settings.
random_search =  RandomizedSearchCV(Wclassifier,parameters,n_iter =100, random_state = 0, verbose = 1)

random_search = random_search.fit(x_train, y_train)

In [None]:
best_accuracy = random_search.best_score_
best_accuracy * 100

In [None]:
# best parameters for randomsearch
best_parametersR = random_search.best_params_
best_parametersR

In [None]:
# best parameters for gridsearch
best_parametersG = grid_search.best_params_
best_parametersG

In [None]:
# GridSearchCV was created with its default refit behaviour, so best_estimator_ has already
# been retrained on the full training set; fitting it a second time is redundant.
Wclassifier = grid_search.best_estimator_
Wclassifier

In [None]:
y_pred = Wclassifier.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cn = confusion_matrix(y_test, y_pred)

In [None]:
print(cn)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# accuracy_score takes (y_true, y_pred), matching the confusion_matrix / classification_report calls
print(accuracy_score(y_test, y_pred) * 100)

## Using SGDClassifier as a linear Support Vector Machine classifier.

In [None]:
# Stochastic gradient descent with the hyper-parameters loss='hinge' and penalty='l2'
# (i.e. ridge regularization) is equivalent to a linear support vector machine classifier.

# This matters because of the online-learning capability (i.e. .partial_fit()) that SGDClassifier provides.

from sklearn.linear_model import SGDClassifier
# Fix the seed so the SGD updates, and therefore the reported scores, are reproducible between runs
Wclassifier2 = SGDClassifier(loss="hinge", penalty="l2", random_state=0)

In [None]:
Wclassifier2.partial_fit(x_train, y_train, classes=np.unique(y_train))

# This link explains the use of classes above: https://stackoverflow.com/questions/42147302/sklearn-sgd-partial-fit

In [None]:
y_pred2 = Wclassifier2.predict(x_test)

In [None]:
cn2 = confusion_matrix(y_test, y_pred2)

In [None]:
print(cn2)

In [None]:
print(classification_report(y_test, y_pred2))

In [None]:
# accuracy_score takes (y_true, y_pred), matching the confusion_matrix / classification_report calls
print(accuracy_score(y_test, y_pred2) * 100)

In [None]:
# When using .fit() both models perform exactly the same, but when using .partial_fit() it performs lower than the normal svm. 
# This is a problem more data can solve.

### Visualising the predictions for the normal SVM Classifier.

In [None]:
# lets convert back to the names
y_test =[inv_mapper[i] for i in y_test]
y_pred =[inv_mapper[i] for i in y_pred]


In [None]:
zipped = zip(y_pred, y_test)
print('Predicted | Actual')
print('-------------------')
for l1,l2 in zipped:
    print(f'- {l1} | {l2} ')

### Visualising the predictions for the SGD Classifier(linear SVM Classifier).

In [None]:
y_pred2 =[inv_mapper[i] for i in y_pred2]


In [None]:
zipped = zip(y_pred2, y_test)
print('Predicted | Actual')
print('-------------------')
for l1,l2 in zipped:
    print(f'- {l1} | {l2} ')

# USING LBPHFACERECOGNIZER

#### LBPH is used to see whether it performs a lot better than the SVC on the given dataset (maybe a 30+ increase in accuracy).

In [None]:
def load_images_targets(directory, name_file):
    """Load the images in ``directory`` as grayscale arrays labelled with their tags.

    NOTE: this redefines the ``load_images_targets`` declared earlier in the
    notebook. This version keeps the plain tags as labels (no passcode
    mapping) and returns two values instead of three.

    Each file (except ``Thumbs.db``) is resized to 224x224, converted to
    grayscale, and labelled with the tag that ``name_file`` lists for the
    file name up to its first ``'.'``.

    Parameters
    ----------
    directory : str
        Folder holding the image files.
    name_file : pandas.DataFrame
        Table with an ``image_name`` column and a ``tags`` column.

    Returns
    -------
    X_TRAIN : numpy.ndarray
        ``uint8`` array stacking one 224x224 grayscale image per file.
    Y_TRAIN : numpy.ndarray
        Tag of each image, aligned with ``X_TRAIN``.

    Raises
    ------
    KeyError
        If an image's name has no entry in ``name_file``.
    """
    targets = []
    gray_scaled = []
    # Build the image-name -> tag lookup directly from the columns instead of calling the
    # names() helper: a later cell rebinds `names` to a list, so re-running this function
    # after that cell would otherwise fail.
    name_targ = dict(zip(name_file["image_name"], name_file["tags"]))
    print('Converting to numpy array ...')
    # listdir() order is arbitrary; sorting keeps the sample order (and any later split) reproducible
    for picture in sorted(listdir(directory)):
        if picture == 'Thumbs.db':
            continue

        # os.path.join works whether or not `directory` ends with a path separator
        photo = load_img(os.path.join(directory, picture), target_size=(224, 224))

        # convert image to numpy array
        photo = img_to_array(photo, dtype='uint8')

        # load_img yields RGB channel order, so the grayscale conversion must be
        # RGB2GRAY; BGR2GRAY would apply the red and blue weights to the wrong channels
        gray_scaled.append(cv2.cvtColor(photo, cv2.COLOR_RGB2GRAY))

        # label targets: look the tag up by the file name without its extension
        targets.append(name_targ[picture.split('.')[0]])

    X_TRAIN = asarray(gray_scaled, dtype='uint8')
    Y_TRAIN = asarray(targets)
    print(' ')
    print('Done!')
    return X_TRAIN, Y_TRAIN

In [None]:
X,Y = load_images_targets(dataset, name_file)

In [None]:
# The LBPHFaceRecognizer does not take categorical targets, so the names are converted to integers.
hold = Y.tolist() # convert array to list
uniq = set(hold) # remove duplicates
# Sort so every name gets the same integer id on each run; iterating a set of strings
# directly does not give a stable order between Python sessions.
# NOTE(review): this rebinds `names`, shadowing the names() helper defined earlier in the
# notebook; any cell that calls names(...) will fail if it is re-run after this one.
names = sorted(uniq)

In [None]:
# dictionary mapping each name to its integer id, and the reverse lookup
mapped = {name: idx for idx, name in enumerate(names)}
inv_mapped = dict(enumerate(names))

In [None]:
mapped

In [None]:
# create a new Y
Y_i = [mapped[i] for i in hold]    

In [None]:
# To make sure the targets and the features are still mapped correctly, let's check the first, the last and a random image

# The first
plt.imshow(X[0])

In [None]:
Y_i[0]

In [None]:
# The Second
plt.imshow(X[43])

In [None]:
Y_i[43]

In [None]:
# The Last
plt.imshow(X[119])

In [None]:
Y_i[119]

In [None]:
# Split the data
x_train, x_test, y_train,y_test = train_test_split(X,Y_i, test_size = 0.2, random_state = 3)

In [None]:
x_train.shape

In [None]:
Y_i[:5]

In [None]:
x_train.shape

In [None]:
#x_train = x_train.reshape((120, 224, 224))

In [None]:
x_train.shape

In [None]:
clf = cv2.face.LBPHFaceRecognizer_create()
clf.train(x_train, np.array(y_train))

In [None]:
x_test.shape

In [None]:
# clf.predict returns a pair per image; only its first element (the predicted label) is kept
y_pred = [clf.predict(image)[0] for image in x_test]

In [None]:
cn = confusion_matrix(y_test, y_pred)

In [None]:
cn

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# accuracy_score takes (y_true, y_pred), matching the confusion_matrix / classification_report calls
print(accuracy_score(y_test, y_pred) * 100)

In [None]:
y_pred = [inv_mapped[i] for i in y_pred]
y_test = [inv_mapped[i] for i in y_test]

In [None]:
zipped = zip(y_pred, y_test)
print('Predicted | Actual')
print('-------------------')
for l1,l2 in zipped:
    print(f'- {l1} | {l2} ')
   

### It only increased by 7, which is not impressive. The best solution is to get more data, possibly +150 images per face.

###### REFERENCES

In [None]:
#  https://www.kaggle.com/hamishdickson/preprocessing-images-with-dimensionality-reduction

#  https://scikit-learn.org/stable/auto_examples/applications/plot_face_recognition.html

#  https://scikit-learn.org/stable/modules/sgd.html#sgd

# https://shankarmsy.github.io/posts/pca-sklearn.html