# Baseline ML

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score, KFold
#Import classical libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

# Apply matplotlib's 'ggplot' style sheet to figures created after this point.
plt.style.use('ggplot')



In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the credit-card CSV (comma-separated, which is read_csv's default).
df = pd.read_csv('UCI_Credit_Card.csv')

In [4]:
# Split the target off the feature frame: pop() returns the column and
# removes it from df in place.
df_labels = df.pop('default.payment.next.month')

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df_labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
from sklearn.model_selection import cross_validate
from tqdm import tqdm
def evaluate_ml_model(_models, X, y, n_fold=10, metric='precision'):
    '''Cross-validate several models and tabulate their scores.

    Each ``(name, model)`` pair in ``_models`` is passed to
    ``cross_validate`` with a ``KFold(n_splits=n_fold)`` splitter. The mean
    and standard deviation of every array that ``cross_validate`` returns
    are collected into a single summary table.

    Parameters
    ----------
    _models : iterable of (str, estimator)
        Label / estimator pairs to evaluate. The label is used to build
        the row names of the result.
    X, y
        Features and target, forwarded unchanged to ``cross_validate``.
    n_fold : int, default 10
        Number of splits given to ``KFold``.
    metric : str or list of str, default 'precision'
        Forwarded to ``cross_validate`` as ``scoring``.

    Returns
    -------
    pandas.DataFrame
        Two rows per model, labelled ``'<name> mean (%)'`` and
        ``'<name> std (%)'``, and one column per key returned by
        ``cross_validate``. The table is also printed before returning.

    Notes
    -----
    Every value is multiplied by 100 and rounded to two decimals. This
    applies to *all* keys returned by ``cross_validate``, so any timing
    columns it reports (e.g. ``fit_time`` / ``score_time``) are scaled too
    and should not be read as percentages.
    '''
    kfold = KFold(n_splits=n_fold)

    # Rows are accumulated in plain lists and the frame is built once at the
    # end: DataFrame.append was removed in pandas 2.0, and growing a frame
    # inside a loop copies it on every iteration anyway.
    rows = []
    row_labels = []

    for name, model in tqdm(_models):
        # k-fold cross validation for this model
        cv_results = cross_validate(model, X, y, cv=kfold, scoring=metric)

        # One "mean" row and one "std" row per model, keyed by result name
        rows.append({key: round(100 * values.mean(), 2)
                     for key, values in cv_results.items()})
        rows.append({key: round(100 * values.std(), 2)
                     for key, values in cv_results.items()})
        row_labels.extend([name + ' mean (%)', name + ' std (%)'])

    # With no models this yields an empty frame, as the original did.
    results = pd.DataFrame(rows, index=row_labels)
    print(results)
    return results

In [7]:
# Baseline classifiers, each paired with the short label used in the results table.
models = [
    ('LR', LogisticRegression(max_iter=1000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=7)),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    ('SVM', SVC(random_state=42)),
]

# Scorer names handed to evaluate_ml_model.
_metrics = [
    'precision',
    'recall',
    'f1',
    'accuracy',
    'matthews_corrcoef',
    'balanced_accuracy',
]

In [8]:
# Placeholder frame; it is reassigned with the evaluate_ml_model result in the next cell.
df_results = pd.DataFrame()

In [9]:
# 10-fold cross-validation of every baseline model on the training split.
df_results = evaluate_ml_model(
    models,
    X_train,
    y_train,
    n_fold=10,
    metric=_metrics,
)

100%|█████████████████████████████████████████████| 5/5 [01:26<00:00, 17.30s/it]

               fit_time  score_time  test_precision  test_recall  test_f1  \
LR mean (%)       23.51        0.47            0.00         0.00     0.00   
LR std (%)        12.02        0.22            0.00         0.00     0.00   
KNN mean (%)       0.44        9.93           38.74        15.45    22.07   
KNN std (%)        0.45        8.67            2.03         1.51     1.76   
CART mean (%)     33.97        0.35           37.79        40.53    39.10   
CART std (%)       1.19        0.03            1.51         1.51     1.34   
NB mean (%)        0.56        0.35           24.71        88.41    38.62   
NB std (%)         0.03        0.03            0.89         1.55     1.15   
SVM mean (%)     695.94       98.88            0.00         0.00     0.00   
SVM std (%)       13.41        0.63            0.00         0.00     0.00   

               test_accuracy  test_matthews_corrcoef  test_balanced_accuracy  
LR mean (%)            77.81                   -0.22                   49




In [10]:
# Display the summary table returned by evaluate_ml_model.
df_results

Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_accuracy,test_matthews_corrcoef,test_balanced_accuracy
LR mean (%),23.51,0.47,0.0,0.0,0.0,77.81,-0.22,49.99
LR std (%),12.02,0.22,0.0,0.0,0.0,0.86,0.44,0.01
KNN mean (%),0.44,9.93,38.74,15.45,22.07,75.84,12.43,54.26
KNN std (%),0.45,8.67,2.03,1.51,1.76,0.76,1.64,0.65
CART mean (%),33.97,0.35,37.79,40.53,39.1,72.02,20.99,60.76
CART std (%),1.19,0.03,1.51,1.51,1.34,0.77,1.45,0.75
NB mean (%),0.56,0.35,24.71,88.41,38.62,37.7,11.94,55.82
NB std (%),0.03,0.03,0.89,1.55,1.15,0.81,1.74,0.88
SVM mean (%),695.94,98.88,0.0,0.0,0.0,77.82,0.0,50.0
SVM std (%),13.41,0.63,0.0,0.0,0.0,0.86,0.0,0.0


## Fraud Dataset

## Full dataset

In [11]:
# Load the fraud-detection CSV (comma-separated, which is read_csv's default).
# Note: this rebinds `df`, replacing the frame loaded earlier in the notebook.
df = pd.read_csv('fraud_detection_bank_dataset.csv')

In [12]:
# Drop the 'Unnamed: 0' column, then split the target off the remaining
# features: pop() returns the column and removes it from df in place.
df = df.drop(columns=['Unnamed: 0'])
df_labels = df.pop('targets')

In [13]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df_labels,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Baseline classifiers, each paired with the short label used in the results table.
models = [
    ('LR', LogisticRegression(max_iter=1000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=7)),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    ('SVM', SVC(random_state=42)),
]

# Scorer names handed to evaluate_ml_model.
_metrics = [
    'precision',
    'recall',
    'f1',
    'accuracy',
    'matthews_corrcoef',
    'balanced_accuracy',
]

In [15]:
# Placeholder frame; it is reassigned with the evaluate_ml_model result in the next cell.
df_results2 = pd.DataFrame()

In [None]:
# 10-fold cross-validation of every baseline model on the training split.
df_results2 = evaluate_ml_model(
    models,
    X_train,
    y_train,
    n_fold=10,
    metric=_metrics,
)

 80%|████████████████████████████████████         | 4/5 [00:35<00:05,  5.16s/it]

In [None]:
# Display the summary table returned by evaluate_ml_model.
df_results2

## Without highly correlated features

In [None]:
# Feature columns removed for this run (the section heading describes them
# as highly correlated).
cols = [
    'col_8', 'col_9', 'col_10', 'col_11', 'col_12',
    'col_18', 'col_19', 'col_20', 'col_21',
    'col_35',
    'col_51', 'col_52', 'col_53',
    'col_70', 'col_71',
    'col_7', 'col_22', 'col_54', 'col_56',
]

# Remove the same columns from both splits so they keep matching schemas.
X_train = X_train.drop(columns=cols)
X_test = X_test.drop(columns=cols)

In [None]:
# Baseline classifiers, each paired with the short label used in the results table.
models = [
    ('LR', LogisticRegression(max_iter=1000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=7)),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    ('SVM', SVC(random_state=42)),
]

# Scorer names handed to evaluate_ml_model.
_metrics = [
    'precision',
    'recall',
    'f1',
    'accuracy',
    'matthews_corrcoef',
    'balanced_accuracy',
]

In [None]:
# Placeholder frame; it is reassigned with the evaluate_ml_model result in the next cell.
df_results3 = pd.DataFrame()

In [None]:
# 10-fold cross-validation of every baseline model on the training split.
df_results3 = evaluate_ml_model(
    models,
    X_train,
    y_train,
    n_fold=10,
    metric=_metrics,
)