In [1]:
import numpy as np
from sklearn import datasets
import sys
sys.path.append('..')
from sklearn.decomposition import PCA
from PCA_FDA.FDA_Edo import FisherDiscriminantAnalysis
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display

from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix


from imblearn.over_sampling import RandomOverSampler 

### IMPORTAZIONE DEL DATASET
Importiamo il dataset LFW (Labeled Faces in the Wild) e lo dividiamo nelle componenti rilevanti per l'analisi: dati, immagini, nomi delle classi e target.

In [2]:
# Fetch the LFW people dataset, keeping only people with at least 100 pictures;
# resize=0.45 is the image rescaling ratio handed to the loader.
lfw_people = datasets.fetch_lfw_people(resize=0.45, min_faces_per_person=100)

# Unpack the fields used in the rest of the notebook: the feature matrix, the
# images, the integer class targets and the class names.
face_data, face_images = lfw_people.data, lfw_people.images
face_targets, face_tnames = lfw_people.target, lfw_people.target_names

### DIVISIONE DEL DATASET
Divisione del dataset nella parte di Training e nella componente di Testing

In [3]:
# Build X_trainval, y_trainval, X_test, y_test (plus the matching test images).
# Fraction of the samples held out as the test set. A further validation split
# of X_trainval (val_p = 0.25) was sketched in an earlier draft but is not used.
test_p = 0.4

# Fixed seed so the split is reproducible; earlier drafts tried 5890 and a
# random draw via np.random.randint(0, 100000).
random_state = 19990704
print('random_seed = ', random_state)

# The three arrays are shuffled and split together, so rows stay aligned.
# The training-side images are not needed and are discarded into `_`.
X_trainval, X_test, y_trainval, y_test, _, img_test = train_test_split(
    face_data,
    face_targets,
    face_images,
    test_size=test_p,
    random_state=random_state,
    shuffle=True,
)

random_seed =  19990704


## OVERSAMPLING


In [4]:
# Random over-sampler seeded with the same value as the train/test split.
# NOTE: the resampling call below is commented out, so `ros` is only created
# here and the training data is NOT rebalanced by this cell.
ros = RandomOverSampler(random_state=random_state)
# X_trainval, y_trainval = ros.fit_resample(X_trainval, y_trainval)

# RIDUZIONE DELLA DIMENSIONALITÀ
## PCA
Usiamo la PCA per evitare che l'analisi discriminante di Fisher abbia problemi di singolarità della matrice da invertire.

## FDA
Usiamo la FDA per ridurre significativamente la dimensionalità del problema e come metodo di predizione

In [5]:
# --- PCA ---
# Project the training data onto (n_samples - n_classes) principal components
# before FDA, so that FDA works in a reduced space (the notebook uses PCA to
# avoid a singular matrix in the Fisher analysis).
# The count is clamped to the number of features: PCA cannot return more
# components than the data has features, so without the clamp the cell would
# fail whenever n_samples - n_classes exceeds n_features.
n_samples, n_features = X_trainval.shape
n_classes = face_tnames.shape[0]
n_components_pca = min(n_samples - n_classes, n_features)
pca = PCA(n_components=n_components_pca)
pca_data = pca.fit_transform(X_trainval)

# --- FDA ---
# FDA is asked for (n_classes - 1) components.
n_components_fda = n_classes - 1
fda = FisherDiscriminantAnalysis(n_components_fda)
fda.fit(pca_data, y_trainval)
fda_data = fda.transform(pca_data)

In [6]:
# Report the feature dimension at each stage: input data, after PCA, after FDA.
for label, size in (
    ('Dimensionalità Dati: ', X_trainval.shape[1]),
    ('Dimensionalità PCA: ', n_components_pca),
    ('Dimensionalità FDA: ', n_components_fda),
):
    print(label, size)

Dimensionalità Dati:  2352
Dimensionalità PCA:  679
Dimensionalità FDA:  4


## VALUTAZIONE DEL TEST SET

Utilizziamo il metodo dell'FDA per determinare a quale classe appartiene ciascun dato, anche se in realtà esso è un metodo di riduzione della dimensionalità che modifica la geometria dello spazio.

In [7]:
# Predictions on the data the models were fitted on (already in PCA space).
y_pred_trainval = fda.evaluate(pca_data)

# Predictions on the held-out data: project it with the PCA fitted on the
# training data, then pass the projection to fda.evaluate.
# The projection is stored under its own name instead of overwriting X_test,
# so the cell is idempotent: re-running it no longer feeds already-projected
# data back into pca.transform, which expects the original feature dimension.
X_test_pca = pca.transform(X_test)
y_pred = fda.evaluate(X_test_pca)

In [8]:
# Sanity check: true labels, predicted labels and the PCA-projected training
# data should all have the same number of rows.
print(*(arr.shape for arr in (y_trainval, y_pred_trainval, pca_data)))

(684,) (684,) (684, 679)


In [9]:
def _weighted_scores(y_true, y_hat):
    """Return (precision, recall, F1), each averaged with average='weighted'.

    zero_division=0 is passed to all three metrics, so an undefined per-class
    score is counted as 0 without raising a warning, and the three metrics
    treat that case consistently.
    """
    options = {'average': 'weighted', 'zero_division': 0}
    return (
        precision_score(y_true, y_hat, **options),
        recall_score(y_true, y_hat, **options),
        f1_score(y_true, y_hat, **options),
    )


# Scores on the data used for fitting and on the held-out test data.
prec_trainval, rec_trainval, f1_trainval = _weighted_scores(y_trainval, y_pred_trainval)
prec, rec, f1 = _weighted_scores(y_test, y_pred)

# One row per data split, one column per metric.
df_perf = pd.DataFrame(
    {
        'Precision': [prec_trainval, prec],
        'Recall': [rec_trainval, rec],
        'F1': [f1_trainval, f1],
    },
    index=['train. + val.', 'test'],
)
display(df_perf)


Unnamed: 0,Precision,Recall,F1
train. + val.,1.0,1.0,1.0
test,0.766413,0.767544,0.763882


In [10]:
# Test-set confusion matrices: raw counts, then normalised over the true
# labels (normalize='true') and over the predicted labels (normalize='pred').
cmat, cmat_norm_true, cmat_norm_pred = (
    confusion_matrix(y_test, y_pred, normalize=mode)
    for mode in (None, 'true', 'pred')
)

# Label rows and columns with the class names so the tables are readable.
df_cmat, df_cmat_norm_true, df_cmat_norm_pred = (
    pd.DataFrame(matrix, index=face_tnames, columns=face_tnames)
    for matrix in (cmat, cmat_norm_true, cmat_norm_pred)
)

display(df_cmat, df_cmat_norm_true, df_cmat_norm_pred)


Unnamed: 0,Colin Powell,Donald Rumsfeld,George W Bush,Gerhard Schroeder,Tony Blair
Colin Powell,74,6,16,0,1
Donald Rumsfeld,2,26,16,1,10
George W Bush,8,7,185,5,6
Gerhard Schroeder,2,1,3,36,5
Tony Blair,2,1,13,1,29


Unnamed: 0,Colin Powell,Donald Rumsfeld,George W Bush,Gerhard Schroeder,Tony Blair
Colin Powell,0.762887,0.061856,0.164948,0.0,0.010309
Donald Rumsfeld,0.036364,0.472727,0.290909,0.018182,0.181818
George W Bush,0.037915,0.033175,0.876777,0.023697,0.028436
Gerhard Schroeder,0.042553,0.021277,0.06383,0.765957,0.106383
Tony Blair,0.043478,0.021739,0.282609,0.021739,0.630435


Unnamed: 0,Colin Powell,Donald Rumsfeld,George W Bush,Gerhard Schroeder,Tony Blair
Colin Powell,0.840909,0.146341,0.06867,0.0,0.019608
Donald Rumsfeld,0.022727,0.634146,0.06867,0.023256,0.196078
George W Bush,0.090909,0.170732,0.793991,0.116279,0.117647
Gerhard Schroeder,0.022727,0.02439,0.012876,0.837209,0.098039
Tony Blair,0.022727,0.02439,0.055794,0.023256,0.568627
