In [None]:
# https://github.com/maede43/ensemble-learning-lfw/blob/main/Ensemble_learning_LFW.ipynb

In [1]:
from sklearn.datasets import fetch_lfw_pairs

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import Perceptron

In [2]:
lfw_dataset_train = fetch_lfw_pairs(subset='train')
lfw_dataset_test = fetch_lfw_pairs(subset='test')

In [67]:
X_train = lfw_dataset_train.pairs
y_train = lfw_dataset_train.target

X_test = lfw_dataset_test.pairs
y_test = lfw_dataset_test.target

In [68]:
x_train_1 = X_train[:, 0]  # x_train_1.shape is (60000, 28, 28)
x_train_2 = X_train[:, 1]

x_test_1 = X_test[:, 0]  # x_val_1.shape = (60000, 28, 28)
x_test_2 = X_test[:, 1]

In [69]:
x_train_1.shape

(2200, 62, 47)

In [70]:
def flatten_arr(data):
    arr = []
    for image in data:
        arr.append(image.flatten())
    return np.array(arr)

In [71]:
x_train_1 = flatten_arr(x_train_1)
x_train_2 = flatten_arr(x_train_2)

x_test_1 = flatten_arr(x_test_1)
x_test_2 = flatten_arr(x_test_2)

In [72]:
x_train_1.shape, x_train_2.shape, x_test_1.shape, x_test_2.shape

((2200, 2914), (2200, 2914), (1000, 2914), (1000, 2914))

In [73]:
# x_train_1 = x_train_1.flatten()
# x_train_2 = x_train_2.flatten()

# x_test_1 = x_test_1.flatten()
# x_test_2 = x_test_2.flatten()

In [74]:
scaler = StandardScaler()
def normalize(X_train,X_test):
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

  
x_train_1, x_test_1 = normalize(x_train_1, x_test_1)
x_train_2, x_test_2 = normalize(x_train_2, x_test_2)

In [75]:
def difference(data1, data2):
    sub = data1 - data2
    return sub

x_train_sub = difference(x_train_1, x_train_2)
x_test_sub = difference(x_test_1, x_test_2)

In [76]:
x_train_sub.shape, x_test_sub.shape

((2200, 2914), (1000, 2914))

In [77]:
# x_train_1 = x_train_1[:, 0]  # x_train_1.shape is (60000, 28, 28)
# x_train_2 = x_train_2[:, 1]

# x_test_1 = x_test_1[:, 0]  # x_val_1.shape = (60000, 28, 28)
# x_test_2 = x_test_2[:, 1]

In [78]:
def distances(data1, data2):
    cosine  = []
    euclidean = []
    cityblock = []
    for index, image in enumerate(data1):
        cosine.append(distance.cosine(image, data2[index]))
        euclidean.append(distance.euclidean(image, data2[index]))
        cityblock.append(distance.chebyshev(image, data2[index]))
    return cosine, euclidean, cityblock

x_train_cosine, x_train_euclidean, x_train_cityblock = distances(x_train_1, x_train_2)
x_test_cosine, x_test_euclidean, x_test_cityblock = distances(x_test_1, x_test_2)

In [79]:
df_train = pd.DataFrame(columns=["cosine_distance", "euclidean_distance", "cityblock", "difference", "label"])

df_test = pd.DataFrame(columns=["cosine_distance", "euclidean_distance", "cityblock", "difference", "label"])

In [80]:
df_train["cosine_distance"] = list(x_train_cosine)
df_train["euclidean_distance"] = list(x_train_euclidean)
df_train["cityblock"] = list(x_train_cityblock)
df_train["difference"] = list(x_train_sub)
df_train["label"] = list(y_train)

In [81]:
df_test["cosine_distance"] = list(x_test_cosine)
df_test["euclidean_distance"] = list(x_test_euclidean)
df_test["cityblock"] = list(x_test_cityblock)
df_test["difference"] = list(x_test_sub)
df_test["label"] = list(y_test)

In [82]:
df_train.head(3)

Unnamed: 0,cosine_distance,euclidean_distance,cityblock,difference,label
0,0.621189,40.542305,2.914099,"[0.32709953, 0.066055536, 0.060812466, 0.19297...",1
1,1.366259,86.276566,3.846085,"[-0.243422, 0.78147167, 0.6042254, 0.6420846, ...",1
2,0.906725,74.652199,4.188267,"[0.12834334, 0.0021681786, -0.102196336, -0.20...",1


In [83]:
df_test.head(3)

Unnamed: 0,cosine_distance,euclidean_distance,cityblock,difference,label
0,0.740514,50.444973,3.64234,"[0.88399583, 0.3118025, 0.21575242, 0.23412043...",1
1,1.074314,78.348885,4.159446,"[-0.118907735, -0.54705584, -0.33331823, -0.47...",1
2,1.10707,63.010399,2.9761,"[-0.55708647, -0.5926435, -0.47381562, -0.4158...",1


In [84]:
# convert True -> 1 and False -> 0
df_train["label"] = df_train["label"].astype(int)
df_test["label"] = df_test["label"].astype(int)

features = df_train.drop(columns=["label","difference"]).columns.tolist()

x_train = df_train.drop(columns=["label","difference"])
y_train = df_train["label"]
x_test = df_test.drop(columns=["label","difference"])
y_test = df_test["label"]

In [85]:
params = {
    'objective': 'multi:softprob',
    'num_class': 2,
    'eval_metric': 'mlogloss',
    'booster': 'gbtree'
}

In [86]:
xgb_cl = xgb.XGBClassifier(params)



In [87]:
dtrain = xgb.DMatrix(x_train, label=y_train)

In [88]:
# Train the XGBoost model
num_rounds = 250
model = xgb.train(params, dtrain, num_rounds)

# Create the XGBoost DMatrix for test data
dtest = xgb.DMatrix(x_test)

# Make predictions on the test set
y_pred_proba = model.predict(dtest)
y_pred = y_pred_proba.argmax(axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 62.40%


In [89]:
# # Fit
# model.fit(x_train, y_train)

# # Predict
# y_pred = xgb_cl.predict(x_test)

# # Score
# accuracy_score(y_test, y_pred)

In [90]:
from sklearn.metrics import confusion_matrix, classification_report

target = lfw_dataset_test.target
target_names = lfw_dataset_test.target_names


print(classification_report(y_pred, y_test, target_names=target_names))

                   precision    recall  f1-score   support

Different persons       0.64      0.62      0.63       520
      Same person       0.60      0.63      0.62       480

         accuracy                           0.62      1000
        macro avg       0.62      0.62      0.62      1000
     weighted avg       0.62      0.62      0.62      1000



In [91]:
import lightgbm as lgb

In [92]:
lgb_train = lgb.Dataset(x_train, y_train, feature_name=features)
lgb_test = lgb.Dataset(x_test, y_test, feature_name = features)
# lgb.early_stopping(15)

<lightgbm.callback._EarlyStoppingCallback at 0x7f66732c13c0>

In [98]:
params = {
'task': 'train'
, 'boosting_type': 'gbdt'
, 'objective': 'multiclass'
, 'num_class': 2
, 'metric': 'multi_logloss'
}
 
gbm = lgb.train(params, lgb_train, num_boost_round=250, valid_sets=[lgb_test], early_stopping_rounds = 15) #early_stopping_rounds = 15,

TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

In [95]:
predictions = gbm.predict(x_test)

In [96]:
y_pred = []
for prediction in predictions:
  prediction_class = np.argmax(prediction)
  y_pred.append(prediction_class)

accuracy = accuracy_score(y_pred, y_test)
print("accuracy: {:.1f}%".format(accuracy * 100))

accuracy: 61.4%


In [97]:
from sklearn.metrics import confusion_matrix, classification_report

target = lfw_dataset_test.target
target_names = lfw_dataset_test.target_names


print(classification_report(y_pred, y_test, target_names=target_names))

                   precision    recall  f1-score   support

Different persons       0.62      0.61      0.62       504
      Same person       0.61      0.61      0.61       496

         accuracy                           0.61      1000
        macro avg       0.61      0.61      0.61      1000
     weighted avg       0.61      0.61      0.61      1000

