In [1]:
from sklearn.datasets import fetch_lfw_pairs

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import Perceptron

In [2]:
lfw_dataset_train = fetch_lfw_pairs(subset='train', resize = 1)
lfw_dataset_test = fetch_lfw_pairs(subset='test', resize = 1)

In [3]:
X_train = lfw_dataset_train.pairs
y_train = lfw_dataset_train.target

X_test = lfw_dataset_test.pairs
y_test = lfw_dataset_test.target

In [4]:
x_train_1 = X_train[:, 0]  # x_train_1.shape is (60000, 28, 28)
x_train_2 = X_train[:, 1]

x_test_1 = X_test[:, 0]  # x_val_1.shape = (60000, 28, 28)
x_test_2 = X_test[:, 1]

In [5]:
x_train_1.shape

(2200, 125, 94)

In [6]:
def flatten_arr(data):
    arr = []
    for image in data:
        arr.append(image.flatten())
    return np.array(arr)

In [7]:
x_train_1 = flatten_arr(x_train_1)
x_train_2 = flatten_arr(x_train_2)

x_test_1 = flatten_arr(x_test_1)
x_test_2 = flatten_arr(x_test_2)

In [8]:
x_train_1.shape, x_train_2.shape, x_test_1.shape, x_test_2.shape

((2200, 11750), (2200, 11750), (1000, 11750), (1000, 11750))

In [9]:
# x_train_1 = x_train_1.flatten()
# x_train_2 = x_train_2.flatten()

# x_test_1 = x_test_1.flatten()
# x_test_2 = x_test_2.flatten()

In [10]:
scaler = StandardScaler()
def normalize(X_train,X_test):
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

  
x_train_1, x_test_1 = normalize(x_train_1, x_test_1)
x_train_2, x_test_2 = normalize(x_train_2, x_test_2)

In [11]:
def difference(data1, data2):
    sub = data1 - data2
    return sub

x_train_sub = difference(x_train_1, x_train_2)
x_test_sub = difference(x_test_1, x_test_2)

In [12]:
x_train_sub.shape, x_test_sub.shape

((2200, 11750), (1000, 11750))

In [13]:
# x_train_1 = x_train_1[:, 0]  # x_train_1.shape is (60000, 28, 28)
# x_train_2 = x_train_2[:, 1]

# x_test_1 = x_test_1[:, 0]  # x_val_1.shape = (60000, 28, 28)
# x_test_2 = x_test_2[:, 1]

In [14]:
def distances(data1, data2):
    cosine  = []
    euclidean = []
    cityblock = []
    for index, image in enumerate(data1):
        cosine.append(distance.cosine(image, data2[index]))
        euclidean.append(distance.euclidean(image, data2[index]))
        cityblock.append(distance.chebyshev(image, data2[index]))
    return cosine, euclidean, cityblock

x_train_cosine, x_train_euclidean, x_train_cityblock = distances(x_train_1, x_train_2)
x_test_cosine, x_test_euclidean, x_test_cityblock = distances(x_test_1, x_test_2)

In [15]:
df_train = pd.DataFrame(columns=["cosine_distance", "euclidean_distance", "cityblock", "difference", "label"])

df_test = pd.DataFrame(columns=["cosine_distance", "euclidean_distance", "cityblock", "difference", "label"])

In [16]:
df_train["cosine_distance"] = list(x_train_cosine)
df_train["euclidean_distance"] = list(x_train_euclidean)
df_train["cityblock"] = list(x_train_cityblock)
df_train["difference"] = list(x_train_sub)
df_train["label"] = list(y_train)

In [17]:
df_test["cosine_distance"] = list(x_test_cosine)
df_test["euclidean_distance"] = list(x_test_euclidean)
df_test["cityblock"] = list(x_test_cityblock)
df_test["difference"] = list(x_test_sub)
df_test["label"] = list(y_test)

In [18]:
df_train.head(3)

Unnamed: 0,cosine_distance,euclidean_distance,cityblock,difference,label
0,0.624871,81.479301,3.076822,"[0.47313946, 0.3971879, 0.07584522, 0.06514443...",1
1,1.36065,172.871582,3.975001,"[-0.76932395, -0.15090233, 0.62559354, 0.98460...",1
2,0.906941,150.094681,4.344463,"[0.11918616, 0.09976739, -0.0039057136, -0.048...",1


In [19]:
df_test.head(3)

Unnamed: 0,cosine_distance,euclidean_distance,cityblock,difference,label
0,0.745172,102.169167,3.709883,"[1.106498, 0.95236087, 0.64023924, 0.49182248,...",1
1,1.072152,157.613907,4.20989,"[0.112229094, -0.16099897, -0.35348526, -0.278...",1
2,1.107152,126.401039,3.05385,"[-0.5141104, -0.596771, -0.60187906, -0.662098...",1


In [20]:
# convert True -> 1 and False -> 0
df_train["label"] = df_train["label"].astype(int)
df_test["label"] = df_test["label"].astype(int)

features = df_train.drop(columns=["label","difference"]).columns.tolist()

x_train = df_train.drop(columns=["label","difference"])
y_train = df_train["label"]
x_test = df_test.drop(columns=["label","difference"])
y_test = df_test["label"]

In [52]:
x_train


Unnamed: 0,cosine_distance,euclidean_distance,cityblock
0,0.624871,81.479301,3.076822
1,1.360650,172.871582,3.975001
2,0.906941,150.094681,4.344463
3,0.209680,83.075729,3.372981
4,0.808415,147.448212,3.440904
...,...,...,...
2195,0.951820,125.993195,4.345860
2196,1.093717,175.461868,4.655218
2197,0.967957,125.480942,3.907397
2198,0.581426,122.287132,3.769382


In [57]:
xgb_cl = xgb.XGBClassifier(n_estimators=250,
                             max_depth=8,
                             objective= 'binary:logistic',
                             seed=27)

In [58]:
# Fit
xgb_cl.fit(x_train, y_train)

# Predict
y_pred = xgb_cl.predict(x_test)

# Score
accuracy_score(y_test, y_pred)

0.605

In [33]:
from sklearn.metrics import confusion_matrix, classification_report

target = lfw_dataset_test.target
target_names = lfw_dataset_test.target_names


print(classification_report(y_pred, y_test, target_names=target_names))

                   precision    recall  f1-score   support

Different persons       0.64      0.60      0.62       533
      Same person       0.57      0.61      0.59       467

         accuracy                           0.61      1000
        macro avg       0.61      0.61      0.61      1000
     weighted avg       0.61      0.61      0.61      1000



In [49]:
model = RandomForestClassifier(n_estimators=250, max_depth=8, criterion = "log_loss")
model.fit(x_train, y_train)

In [50]:
# Predict
y_pred = model.predict(x_test)

# Score
accuracy_score(y_test, y_pred)

0.64