# Latent Fingerprints Clustering - Model and Evaluation

### Importing Libraries

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
rs = 123
import time
#Preprocessing
from sklearn import preprocessing
import category_encoders as ce
import glob
# scikit-learn libraries
from sklearn import metrics
from sklearn import utils
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold



## VIC Implementation

In [10]:
def VIC(folder_path, classifier, k = 10):
    """Cross-validate `classifier` on every CSV partition matching a glob pattern.

    For each matching file the function:
      1. loads it with pandas and replaces +/-inf and NaN with 0;
      2. uses the last column as the label vector and the columns from
         index 2 up to (but excluding) the last two as features;
      3. label-encodes the first feature column;
      4. shuffles rows with a fixed seed and runs k-fold cross-validation of
         a StandardScaler -> classifier pipeline, scored with macro-averaged
         one-vs-one ROC AUC computed on predicted probabilities.

    Parameters
    ----------
    folder_path : str
        Glob pattern passed to ``glob.glob`` (e.g. ``'csv_partitions/*.csv'``).
    classifier : estimator
        scikit-learn style classifier; it must expose ``predict_proba``
        because the AUC scorer works on class probabilities.
    k : int, default 10
        Forwarded as ``cv`` to ``model_selection.cross_validate``.

    Returns
    -------
    dict
        ``{file_path: {classifier: mean_test_auc}}`` in the order the files
        were returned by ``glob.glob``. Empty when nothing matches the pattern.
    """
    best_auc = {}
    for file in glob.glob(folder_path):
        # read as pandas data frame
        db = pd.read_csv(file)
        # missing values: treat +/-inf like NaN, then zero-fill everything
        db = db.replace([np.inf, -np.inf], np.nan).fillna(0)
        # define X and y (y is taken directly as a 1-D vector)
        y = db.iloc[:, -1].values
        # Explicit copy: the first column is overwritten in place below, and
        # the array handed out by pandas may be a (read-only) view of the frame.
        X = db.iloc[:, 2:-2].to_numpy(copy=True)
        # encode categorical variables
        X[:, 0] = preprocessing.LabelEncoder().fit_transform(X[:, 0])
        # Pipeline - Scale and Evaluate with CV
        pipeline = Pipeline(steps=[('s', StandardScaler()),
                                   ('c', classifier)])
        X, y = utils.shuffle(X, y, random_state=0)
        # 'roc_auc_ovo' is scikit-learn's built-in macro one-vs-one AUC scorer on
        # predict_proba; it replaces make_scorer(..., needs_proba=True), whose
        # `needs_proba` argument is deprecated in recent scikit-learn releases.
        auc = model_selection.cross_validate(pipeline, X, y, cv=k,
                                             scoring='roc_auc_ovo')
        fold_scores = auc['test_score']
        mean_auc = sum(fold_scores) / len(fold_scores)
        # Print result
        print(f'Partition: {file}')
        print(f'Classifier: {classifier} \n AUC: {mean_auc}')
        # Mean AUC of this partition, keyed by the classifier that produced it
        best_auc[file] = {classifier: mean_auc}
    return best_auc

In [25]:
import csv
from pathlib import Path
def create_csv(classifier_results, output):
    """Write per-partition AUC results to a CSV file.

    Parameters
    ----------
    classifier_results : dict
        Mapping ``{partition_path: {classifier: auc}}`` as returned by ``VIC``.
    output : str or path-like
        Destination file; it is created or overwritten.

    The file gets a ``file, classifier, auc`` header followed by one row per
    (partition, classifier) pair. ``file`` is the partition's file name without
    directory or extension; ``classifier`` and ``auc`` are written via ``str()``.
    """
    with open(output, 'w', newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(["file", "classifier", "auc"])
        for partition, scores in classifier_results.items():
            stem = Path(partition).stem
            # Emit every classifier recorded for the partition. Indexing only
            # the first key would drop additional entries and raise IndexError
            # when a partition has no scores at all.
            for clf, auc in scores.items():
                writer.writerow([stem, str(clf), str(auc)])

In [28]:
# Glob pattern selecting every partition CSV that will be evaluated
folder = "csv_partitions/*.csv"

In [13]:
# Logistic regression (default hyperparameters) on every partition
results_LG = VIC(folder_path=folder, classifier=LogisticRegression())

Partition: csv_partitions/sc_c2_th0.csv
Classifier: LogisticRegression() 
 AUC: 0.7281579478828144
Partition: csv_partitions/sc_c2_th1.csv
Classifier: LogisticRegression() 
 AUC: 0.7227136904231382
Partition: csv_partitions/sc_c2_th2.csv
Classifier: LogisticRegression() 
 AUC: 0.7133222560425918
Partition: csv_partitions/sc_c2_th3.csv
Classifier: LogisticRegression() 
 AUC: 0.7074824406157374
Partition: csv_partitions/sc_c2_th4.csv
Classifier: LogisticRegression() 
 AUC: 0.7071780650445822
Partition: csv_partitions/sc_c2_th5.csv
Classifier: LogisticRegression() 
 AUC: 0.7001124766967848
Partition: csv_partitions/sc_c2_th6.csv
Classifier: LogisticRegression() 
 AUC: 0.6992642047556588
Partition: csv_partitions/sc_c2_th7.csv
Classifier: LogisticRegression() 
 AUC: 0.6946328586443887
Partition: csv_partitions/sc_c2_th8.csv
Classifier: LogisticRegression() 
 AUC: 0.6935128478835514
Partition: csv_partitions/sc_c2_th9.csv
Classifier: LogisticRegression() 
 AUC: 0.6888706852067313
Partition:

In [19]:
# Gaussian naive Bayes on every partition
results_GNB = VIC(folder_path=folder, classifier=GaussianNB())

Partition: csv_partitions/sc_c2_th0.csv
Classifier: GaussianNB() 
 AUC: 0.7162533242321191
Partition: csv_partitions/sc_c2_th1.csv
Classifier: GaussianNB() 
 AUC: 0.7130332811715933
Partition: csv_partitions/sc_c2_th2.csv
Classifier: GaussianNB() 
 AUC: 0.6998101010999366
Partition: csv_partitions/sc_c2_th3.csv
Classifier: GaussianNB() 
 AUC: 0.696771889966425
Partition: csv_partitions/sc_c2_th4.csv
Classifier: GaussianNB() 
 AUC: 0.6944121282225229
Partition: csv_partitions/sc_c2_th5.csv
Classifier: GaussianNB() 
 AUC: 0.6877058657030761
Partition: csv_partitions/sc_c2_th6.csv
Classifier: GaussianNB() 
 AUC: 0.683731527231247
Partition: csv_partitions/sc_c2_th7.csv
Classifier: GaussianNB() 
 AUC: 0.678744851266323
Partition: csv_partitions/sc_c2_th8.csv
Classifier: GaussianNB() 
 AUC: 0.6763543211764604
Partition: csv_partitions/sc_c2_th9.csv
Classifier: GaussianNB() 
 AUC: 0.673865437111228
Partition: csv_partitions/sc_c2_th10.csv
Classifier: GaussianNB() 
 AUC: 0.6709575505653822
Pa

In [27]:
# Gradient boosting on every partition; seeded with the notebook-wide `rs`
# so that repeated runs report the same AUCs.
results_GBC = VIC(folder, GradientBoostingClassifier(random_state=rs))

NameError: name 'folder' is not defined

In [None]:
# Remaining classifiers, evaluated on the `folder` glob pattern.
# (`Path` is the pathlib class, not a pattern string, so it cannot be passed to VIC.)
results_LDA = VIC(folder, LinearDiscriminantAnalysis(solver = 'lsqr'))
results_KNN = VIC(folder, KNeighborsClassifier(5))
# Random forests are stochastic: seed with the notebook-wide `rs` for reproducible AUCs.
results_RF = VIC(folder, RandomForestClassifier(n_estimators=100, max_depth=2, n_jobs=-1, random_state=rs))

In [26]:
# Persist the Gaussian naive Bayes AUCs
create_csv(classifier_results=results_GNB, output='classGNB.csv')

In [None]:
# Persist the gradient boosting AUCs
create_csv(classifier_results=results_GBC, output='classGBC.csv')