# Baseline ML

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score, KFold
#Import classical libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

# Switch matplotlib to its built-in 'ggplot' style sheet.
plt.style.use('ggplot')



In [3]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library diagnostics (e.g. convergence,
# undefined-metric or deprecation warnings) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Load the credit-card default dataset (comma is read_csv's default separator).
df = pd.read_csv('UCI_Credit_Card.csv')


In [4]:
# Pull the target column out of the frame: `pop` returns it and removes it from df in place,
# leaving df with the feature columns only.
df_labels = df.pop('default.payment.next.month')

In [5]:
# Hold out 20% of the rows as a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df_labels,
    test_size=0.2,
    random_state=42,
)

In [5]:
from sklearn.model_selection import cross_validate
from tqdm import tqdm
def evaluate_ml_model(_models, X, y, n_fold=10, metric='precision'):
    '''Cross-validate several models and tabulate the mean/std of every score.

    Parameters
    ----------
    _models : iterable of (str, estimator)
        Pairs of a short display name and the model handed to ``cross_validate``.
    X, y :
        Features and labels, forwarded unchanged to ``cross_validate``.
    n_fold : int, default 10
        Number of splits of the ``KFold`` splitter (built without shuffling).
    metric : str or list of str, default 'precision'
        Forwarded to ``cross_validate`` as ``scoring``.

    Returns
    -------
    pandas.DataFrame
        Two rows per model, labelled ``'<name> mean (%)'`` and ``'<name> std (%)'``,
        and one column per key of the ``cross_validate`` result. Every value is
        multiplied by 100 and rounded to two decimals. An empty frame is returned
        when ``_models`` is empty. The table is also printed.

    Notes
    -----
    The x100 scaling is applied to *every* key returned by ``cross_validate``,
    so timing columns such as ``fit_time`` / ``score_time`` hold 100 x their raw
    value rather than a percentage.
    '''
    kfold = KFold(n_splits=n_fold)
    frames = []      # one two-row (mean, std) frame per model
    row_labels = []  # final index labels, two per model
    for name, model in tqdm(_models):
        # -------------------
        # k-fold cross validation
        cv_results = cross_validate(model, X, y, cv=kfold, scoring=metric)

        # -------------------
        # Mean and standard deviation of each returned array, as rounded "percent"
        score_names = list(cv_results)
        means = [round(100 * scores.mean(), 2) for scores in cv_results.values()]
        stds = [round(100 * scores.std(), 2) for scores in cv_results.values()]

        # -------------------
        # Collect the per-model rows; they are concatenated once after the loop.
        # (DataFrame.append no longer exists in pandas >= 2.0.)
        frames.append(pd.DataFrame([means, stds], columns=score_names))
        row_labels.extend([name + ' mean (%)', name + ' std (%)'])

    # pd.concat rejects an empty list, so fall back to an empty frame.
    results = pd.concat(frames) if frames else pd.DataFrame()
    results.index = row_labels
    print(results)
    return results

In [7]:
# Baseline classifiers as (short name, estimator) pairs; seeds are fixed where the estimator takes one.
models = [
    ('LR', LogisticRegression(max_iter=1000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=7)),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    ('SVM', SVC(random_state=42)),
]

# Scorer names requested from cross-validation.
_metrics = [
    'precision',
    'recall',
    'f1',
    'accuracy',
    'matthews_corrcoef',
    'balanced_accuracy',
]

In [8]:
# Empty placeholder; NOTE(review): df_results is reassigned by the evaluate_ml_model call
# in the following cell, so this initialisation appears redundant.
df_results = pd.DataFrame()

In [9]:
# 10-fold cross-validation of every baseline on the training split, scoring all of _metrics.
df_results = evaluate_ml_model(
    models,
    X_train,
    y_train,
    n_fold=10,
    metric=_metrics,
)

100%|█████████████████████████████████████████████| 5/5 [01:26<00:00, 17.30s/it]

               fit_time  score_time  test_precision  test_recall  test_f1  \
LR mean (%)       23.51        0.47            0.00         0.00     0.00   
LR std (%)        12.02        0.22            0.00         0.00     0.00   
KNN mean (%)       0.44        9.93           38.74        15.45    22.07   
KNN std (%)        0.45        8.67            2.03         1.51     1.76   
CART mean (%)     33.97        0.35           37.79        40.53    39.10   
CART std (%)       1.19        0.03            1.51         1.51     1.34   
NB mean (%)        0.56        0.35           24.71        88.41    38.62   
NB std (%)         0.03        0.03            0.89         1.55     1.15   
SVM mean (%)     695.94       98.88            0.00         0.00     0.00   
SVM std (%)       13.41        0.63            0.00         0.00     0.00   

               test_accuracy  test_matthews_corrcoef  test_balanced_accuracy  
LR mean (%)            77.81                   -0.22                   49




In [10]:
# Display the results table returned by evaluate_ml_model.
df_results

Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_accuracy,test_matthews_corrcoef,test_balanced_accuracy
LR mean (%),23.51,0.47,0.0,0.0,0.0,77.81,-0.22,49.99
LR std (%),12.02,0.22,0.0,0.0,0.0,0.86,0.44,0.01
KNN mean (%),0.44,9.93,38.74,15.45,22.07,75.84,12.43,54.26
KNN std (%),0.45,8.67,2.03,1.51,1.76,0.76,1.64,0.65
CART mean (%),33.97,0.35,37.79,40.53,39.1,72.02,20.99,60.76
CART std (%),1.19,0.03,1.51,1.51,1.34,0.77,1.45,0.75
NB mean (%),0.56,0.35,24.71,88.41,38.62,37.7,11.94,55.82
NB std (%),0.03,0.03,0.89,1.55,1.15,0.81,1.74,0.88
SVM mean (%),695.94,98.88,0.0,0.0,0.0,77.82,0.0,50.0
SVM std (%),13.41,0.63,0.0,0.0,0.0,0.86,0.0,0.0


## Santander Dataset

## Full dataset

In [4]:
# Load the Santander transaction dataset (comma is read_csv's default separator).
df = pd.read_csv('santander_transaction.csv')

In [5]:
# Show the column labels of the loaded frame.
df.columns

Index(['target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6',
       'var_7', 'var_8',
       ...
       'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195',
       'var_196', 'var_197', 'var_198', 'var_199'],
      dtype='object', length=201)

In [6]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(1000, 201)

In [8]:
# Pull the 'target' column out of the frame: `pop` returns it and removes it from df in place,
# leaving df with the feature columns only.
df_labels = df.pop('target')

In [9]:
# Hold out 20% of the rows as a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df_labels,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Baseline classifiers as (short name, estimator) pairs; seeds are fixed where the estimator takes one.
models = [
    ('LR', LogisticRegression(max_iter=1000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=7)),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    ('SVM', SVC(random_state=42)),
]

# Scorer names requested from cross-validation.
_metrics = [
    'precision',
    'recall',
    'f1',
    'accuracy',
    'matthews_corrcoef',
    'balanced_accuracy',
]

In [11]:
# Empty placeholder; NOTE(review): df_results2 is reassigned by the evaluate_ml_model call
# in the following cell, so this initialisation appears redundant.
df_results2 = pd.DataFrame()

In [14]:
# 10-fold cross-validation of every baseline on the training split, scoring all of _metrics.
df_results2 = evaluate_ml_model(
    models,
    X_train,
    y_train,
    n_fold=10,
    metric=_metrics,
)

100%|████████████████████████████████████████████| 5/5 [09:27<00:00, 113.56s/it]

               fit_time  score_time  test_precision  test_recall  test_f1  \
LR mean (%)     5636.00        3.72           66.83        68.04    67.33   
LR std (%)      3996.84        3.46            6.93         7.15     6.55   
KNN mean (%)       0.18       23.80           56.44        38.38    45.14   
KNN std (%)        0.05       11.86            6.90         8.04     6.68   
CART mean (%)      7.75        0.27           54.12        52.56    53.22   
CART std (%)       1.04        0.01            5.37         8.82     7.00   
NB mean (%)        0.20        0.25           72.29        71.11    71.55   
NB std (%)         0.02        0.01            6.07         6.79     5.65   
SVM mean (%)       4.54        1.10           63.91        63.42    63.35   
SVM std (%)        0.13        0.03            7.84         7.97     6.79   

               test_accuracy  test_matthews_corrcoef  test_balanced_accuracy  
LR mean (%)            66.62                   32.95                   66




In [15]:
# Display the results table returned by evaluate_ml_model.
df_results2

Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_accuracy,test_matthews_corrcoef,test_balanced_accuracy
LR mean (%),5636.0,3.72,66.83,68.04,67.33,66.62,32.95,66.48
LR std (%),3996.84,3.46,6.93,7.15,6.55,6.02,12.06,6.03
KNN mean (%),0.18,23.8,56.44,38.38,45.14,53.5,8.01,53.82
KNN std (%),0.05,11.86,6.9,8.04,6.68,2.29,5.38,2.65
CART mean (%),7.75,0.27,54.12,52.56,53.22,53.62,6.22,53.1
CART std (%),1.04,0.01,5.37,8.82,7.0,4.52,9.75,4.88
NB mean (%),0.2,0.25,72.29,71.11,71.55,71.38,42.61,71.31
NB std (%),0.02,0.01,6.07,6.79,5.65,5.26,10.56,5.3
SVM mean (%),4.54,1.1,63.91,63.42,63.35,63.0,26.19,63.06
SVM std (%),0.13,0.03,7.84,7.97,6.79,5.57,10.71,5.35


In [16]:
# Print one LaTeX table row per model: "<model> & mean (std) & ... \\".
# df_results2 holds a "mean" row followed by a "std" row for each model, so walk it in steps of two.
# Positions 2, 3, 4, 6, 7 are the precision, recall, f1, MCC and balanced-accuracy columns
# (position 5, accuracy, is intentionally not part of the row).
metric_positions = (2, 3, 4, 6, 7)
for j in range(0, len(df_results2.index) - 1, 2):
    model_name = df_results2.index[j].split()[0]
    # .iloc[row, col] is purely positional; integer keys via plain [] on a
    # label-indexed row rely on a deprecated positional fallback.
    cells = [
        f'{df_results2.iloc[j, pos]} ({df_results2.iloc[j + 1, pos]})'
        for pos in metric_positions
    ]
    # '\\\\' prints the two backslashes LaTeX needs to terminate a table row.
    print(f'{model_name} & ' + ' & '.join(cells) + ' \\\\')

LR & 66.83 (6.93) & 68.04 (7.15) &  67.33 (6.55) & 32.95 (12.06) & 66.48 (6.03) \
KNN & 56.44 (6.9) & 38.38 (8.04) &  45.14 (6.68) & 8.01 (5.38) & 53.82 (2.65) \
CART & 54.12 (5.37) & 52.56 (8.82) &  53.22 (7.0) & 6.22 (9.75) & 53.1 (4.88) \
NB & 72.29 (6.07) & 71.11 (6.79) &  71.55 (5.65) & 42.61 (10.56) & 71.31 (5.3) \
SVM & 63.91 (7.84) & 63.42 (7.97) &  63.35 (6.79) & 26.19 (10.71) & 63.06 (5.35) \


# Fraud dataset 

In [4]:
# Load the bank fraud-detection dataset (comma is read_csv's default separator).
df = pd.read_csv('fraud_detection_bank_dataset.csv')

In [6]:
# Discard the 'Unnamed: 0' column, then pull the 'targets' column out of the frame:
# `pop` returns it and removes it from df in place, leaving only the features.
df = df.drop(columns=['Unnamed: 0'])
df_labels = df.pop('targets')

In [7]:
# Hold out 20% of the rows as a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df_labels,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Baseline classifiers as (short name, estimator) pairs; seeds are fixed where the estimator takes one.
models = [
    ('LR', LogisticRegression(max_iter=1000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=7)),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    ('SVM', SVC(random_state=42)),
]

# Scorer names requested from cross-validation.
_metrics = [
    'precision',
    'recall',
    'f1',
    'accuracy',
    'matthews_corrcoef',
    'balanced_accuracy',
]

In [9]:
# Empty placeholder; NOTE(review): df_results is reassigned by the evaluate_ml_model call
# in the following cell, so this initialisation appears redundant.
df_results = pd.DataFrame()

In [10]:
# 10-fold cross-validation of every baseline on the training split, scoring all of _metrics.
df_results = evaluate_ml_model(
    models,
    X_train,
    y_train,
    n_fold=10,
    metric=_metrics,
)

100%|█████████████████████████████████████████████| 5/5 [03:24<00:00, 40.91s/it]

               fit_time  score_time  test_precision  test_recall  test_f1  \
LR mean (%)     1118.09        0.88           71.52        46.77    56.49   
LR std (%)        41.09        0.50            3.54         2.59     2.26   
KNN mean (%)       0.79       10.96           74.35        64.39    68.99   
KNN std (%)        0.15       12.63            1.64         2.61     2.03   
CART mean (%)     17.12        0.35           80.70        81.93    81.29   
CART std (%)       0.35        0.02            2.07         1.80     1.42   
NB mean (%)        1.64        0.43           28.45        96.91    43.97   
NB std (%)         0.16        0.04            1.08         0.87     1.30   
SVM mean (%)     739.78      155.14            0.00         0.00     0.00   
SVM std (%)       19.70        3.91            0.00         0.00     0.00   

               test_accuracy  test_matthews_corrcoef  test_balanced_accuracy  
LR mean (%)            80.80                   46.56                   69




In [11]:
# Print one LaTeX table row per model: "<model> & mean (std) & ... \\".
# df_results holds a "mean" row followed by a "std" row for each model, so walk it in steps of two.
# Positions 2, 3, 4, 6, 7 are the precision, recall, f1, MCC and balanced-accuracy columns
# (position 5, accuracy, is intentionally not part of the row).
metric_positions = (2, 3, 4, 6, 7)
for j in range(0, len(df_results.index) - 1, 2):
    model_name = df_results.index[j].split()[0]
    # .iloc[row, col] is purely positional; integer keys via plain [] on a
    # label-indexed row rely on a deprecated positional fallback.
    cells = [
        f'{df_results.iloc[j, pos]} ({df_results.iloc[j + 1, pos]})'
        for pos in metric_positions
    ]
    # '\\\\' prints the two backslashes LaTeX needs to terminate a table row.
    print(f'{model_name} & ' + ' & '.join(cells) + ' \\\\')

LR & 71.52 (3.54) & 46.77 (2.59) &  56.49 (2.26) & 46.56 (2.82) & 69.98 (1.3) \
KNN & 74.35 (1.64) & 64.39 (2.61) &  68.99 (2.03) & 59.06 (2.77) & 78.14 (1.5) \
CART & 80.7 (2.07) & 81.93 (1.8) &  81.29 (1.42) & 74.43 (1.92) & 87.4 (0.98) \
NB & 28.45 (1.08) & 96.91 (0.87) &  43.97 (1.3) & 12.62 (1.45) & 54.11 (0.58) \
SVM & 0.0 (0.0) & 0.0 (0.0) &  0.0 (0.0) & 0.0 (0.0) & 50.0 (0.0) \


## Without highly correlated columns

In [12]:
# Feature columns removed from both splits for this section's experiment.
cols = [
    'col_8', 'col_9', 'col_10', 'col_11', 'col_12',
    'col_18', 'col_19', 'col_20', 'col_21', 'col_35',
    'col_51', 'col_52', 'col_53', 'col_70', 'col_71',
    'col_7', 'col_22', 'col_54', 'col_56',
]

# Apply the same column removal to the training and the test features.
X_train = X_train.drop(columns=cols)
X_test = X_test.drop(columns=cols)

In [13]:
# Baseline classifiers as (short name, estimator) pairs; seeds are fixed where the estimator takes one.
models = [
    ('LR', LogisticRegression(max_iter=1000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=7)),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    ('SVM', SVC(random_state=42)),
]

# Scorer names requested from cross-validation.
_metrics = [
    'precision',
    'recall',
    'f1',
    'accuracy',
    'matthews_corrcoef',
    'balanced_accuracy',
]

In [14]:
# Empty placeholder; NOTE(review): df_results is reassigned by the evaluate_ml_model call
# in the following cell, so this initialisation appears redundant.
df_results = pd.DataFrame()

In [15]:
# 10-fold cross-validation of every baseline on the reduced training split, scoring all of _metrics.
df_results = evaluate_ml_model(
    models,
    X_train,
    y_train,
    n_fold=10,
    metric=_metrics,
)

100%|█████████████████████████████████████████████| 5/5 [02:12<00:00, 26.49s/it]

               fit_time  score_time  test_precision  test_recall  test_f1  \
LR mean (%)      491.06        0.62           71.54        47.27    56.88   
LR std (%)       237.90        0.28            2.77         1.96     1.62   
KNN mean (%)       0.67        7.27           74.34        64.56    69.09   
KNN std (%)        0.48        2.93            1.77         2.36     1.91   
CART mean (%)     15.73        0.33           80.68        81.69    81.17   
CART std (%)       0.37        0.01            1.87         2.06     1.63   
NB mean (%)        1.35        0.37           28.43        96.95    43.96   
NB std (%)         0.09        0.03            1.07         0.88     1.30   
SVM mean (%)     661.29      144.84            0.00         0.00     0.00   
SVM std (%)        9.95        2.44            0.00         0.00     0.00   

               test_accuracy  test_matthews_corrcoef  test_balanced_accuracy  
LR mean (%)            80.90                   46.89                   70




In [16]:
# Print one LaTeX table row per model: "<model> & mean (std) & ... \\".
# df_results holds a "mean" row followed by a "std" row for each model, so walk it in steps of two.
# Positions 2, 3, 4, 6, 7 are the precision, recall, f1, MCC and balanced-accuracy columns
# (position 5, accuracy, is intentionally not part of the row).
metric_positions = (2, 3, 4, 6, 7)
for j in range(0, len(df_results.index) - 1, 2):
    model_name = df_results.index[j].split()[0]
    # .iloc[row, col] is purely positional; integer keys via plain [] on a
    # label-indexed row rely on a deprecated positional fallback.
    cells = [
        f'{df_results.iloc[j, pos]} ({df_results.iloc[j + 1, pos]})'
        for pos in metric_positions
    ]
    # '\\\\' prints the two backslashes LaTeX needs to terminate a table row.
    print(f'{model_name} & ' + ' & '.join(cells) + ' \\\\')

LR & 71.54 (2.77) & 47.27 (1.96) &  56.88 (1.62) & 46.89 (1.88) & 70.2 (0.88) \
KNN & 74.34 (1.77) & 64.56 (2.36) &  69.09 (1.91) & 59.16 (2.65) & 78.22 (1.39) \
CART & 80.68 (1.87) & 81.69 (2.06) &  81.17 (1.63) & 74.27 (2.25) & 87.28 (1.21) \
NB & 28.43 (1.07) & 96.95 (0.88) &  43.96 (1.3) & 12.58 (1.36) & 54.07 (0.54) \
SVM & 0.0 (0.0) & 0.0 (0.0) &  0.0 (0.0) & 0.0 (0.0) & 50.0 (0.0) \
