In [None]:
!pip install xgboost==1.7.5
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score, precision_score
import warnings
warnings.filterwarnings("ignore")

In [None]:
def _load_split(csv_path):
    """Read one data split from CSV and binarise its OCCUPATION column.

    The first CSV column is consumed as the index and then replaced by a
    fresh 0..n-1 index. Every OCCUPATION value different from 1 is set to 0.
    """
    frame = pd.read_csv(csv_path, index_col=0)
    frame.index = range(len(frame))
    frame.loc[frame['OCCUPATION'] != 1, 'OCCUPATION'] = 0
    return frame


# Both splits go through exactly the same preparation.
train_data = _load_split('train_data.csv')
test_data = _load_split('test_data.csv')

## Prepping data

In [None]:
# Columns of train_data / test_data that are one-hot encoded in the following cells
# (they are removed from the feature frames and replaced by their dummy columns).
cat_vars = ['MARRIAGE', 'EDUCATION']

In [None]:
# create an encoder for each cat_vars
encoders = [OneHotEncoder(categories='auto') for _ in range(len(cat_vars))] 
# encode each of the cat_vars with their respective encoder
encoded_tr = [encoders[i].fit_transform(train_data[[cat_var]]).todense() for i,cat_var in enumerate(cat_vars)]
encoded_test = [encoders[i].fit_transform(test_data[[cat_var]]).todense() for i,cat_var in enumerate(cat_vars)]

In [None]:
# drop the label column and also drop the cat_vars 
# this way we can combine the encoded categorical variables with the continuous variables 
X_train = pd.concat([train_data.iloc[:,:-1].drop(cat_vars, axis=1), 
                     pd.DataFrame(np.concatenate(encoded_tr, axis=1))], axis=1)
X_test = pd.concat([test_data.iloc[:,:-1].drop(cat_vars, axis=1), 
                    pd.DataFrame(np.concatenate(encoded_test, axis=1))], axis=1)
y_train = train_data.iloc[:,-1] 
y_test = test_data.iloc[:,-1]
X_train = X_train.rename(columns={0:'Marriage 1',1:'Marriage 2',2:'Marriage 3',3:'Edu 1',4:'Edu 2',5:'Edu 3',
                                  6:'Edu 4',7:'Edu 5',8:'Edu 6',9:'Edu 7'})
# Note that in the testing data, we do not have Marriage 3 and Edu 6
X_test = X_test.rename(columns={0:'Marriage 1',1:'Marriage 2',2:'Edu 1',3:'Edu 2',4:'Edu 3',
                                  5:'Edu 4',6:'Edu 5',7:'Edu 7'})
X_train = X_train.astype('float64')
X_test = X_test.astype('float64')

### Normalize Continuous Features - The testing data is normalized with the mean and standard deviation computed from the training data.

In [None]:
# Positions (in X_train / X_test column order) of the columns that get z-scored.
scaled_positions = [0, 1, 2, 3, 4, 5, 8]
for pos in scaled_positions:
    train_col = X_train.iloc[:, pos]
    # Statistics come from the training split only and are reused for the test split.
    mu, sigma = train_col.mean(), train_col.std()
    X_train.iloc[:, pos] = (train_col - mu) / sigma
    X_test.iloc[:, pos] = (X_test.iloc[:, pos] - mu) / sigma

In [None]:
# Preview the training features after the selected columns have been z-scored.
X_train.head()

In [None]:
# Preview the test features, scaled with the training mean / standard deviation.
X_test.head()

In [None]:
# Remove one dummy level per categorical variable to avoid perfect multicollinearity
# among the remaining dummy columns.
# NOTE(review): only X_train is modified here; X_test is expected to already lack
# these two columns at this point. Re-running this cell raises KeyError.
X_train = X_train.drop(columns=['Marriage 3', 'Edu 6'])

In [None]:
# Check that 'Marriage 3' and 'Edu 6' are gone from the training features.
X_train.head()

In [None]:
# Test features for comparison; the drop above was applied to X_train only.
X_test.head()

### Classifier Models

In [None]:
# Baseline classifiers to train, each paired with the label printed next to its metrics.
# Estimators that accept random_state are seeded with 0 so repeated runs match.
models = [
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('Logistic Regression', LogisticRegression()),
    # Disabled: presumably because LinearSVC has no predict_proba, which the
    # evaluation loop calls on every model.
    # ('SVM', LinearSVC(max_iter=10000, dual="auto")),
    ('KNN', KNeighborsClassifier()),
    ('DecisionTree', DecisionTreeClassifier(random_state=0)),
    ('RandomForest', RandomForestClassifier(random_state=0)),
    ('GradientBoost', GradientBoostingClassifier(random_state=0)),
    ('XGBoost', XGBClassifier(random_state=0, objective='binary:logistic')),
]

### Fitting the Models to the Data Using the Default Options

In [None]:
# Fit every baseline model with its default options and print train / test metrics.
for classifier, model in models:
    model.fit(X_train, y_train)

    # Hard predictions on both splits, plus positive-class scores for the ROC curve.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    Q = model.predict_proba(X_test)[:, 1]

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred)

    # For a 2x2 matrix, ravel() yields TN, FP, FN, TP in that order.
    TN, FP, FN, TP = confusion_matrix(y_test, y_test_pred, sample_weight=None).ravel()
    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)

    fpr, tpr, _ = roc_curve(y_test, Q)
    roc_auc = auc(fpr, tpr)

    report = [
        f"Classifier: {classifier}",
        f"Accuracy Score on Training Data = {train_accuracy:.4f}",
        "Performance on Testing Data:",
        f"Accuracy Score = {test_accuracy:.4f}",
        f"True Positive Rate = {TPR:.4f}",
        f"True Negative Rate = {TNR:.4f}",
        f"F1 Score= {f1:.4f}",
        f"Precision Score= {precision:.4f}",
        f"AUC= {roc_auc:.4f}",
        "\n",
    ]
    print("\n".join(report))

In [None]:
### Parameter Tuning Function

In [None]:
def optimal_tuning(model, model_param_grid):
    """Grid-search `model` over `model_param_grid` and return the winning parameters.

    Runs 3-fold cross-validation on the notebook-level X_train / y_train and
    scores every candidate on both F1 and recall. The best candidate is chosen
    by F1 (the `refit` metric).

    Returns:
        dict: the `best_params_` of the fitted search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=model_param_grid,
        scoring=['f1', 'recall'],
        refit='f1',
        cv=3,
    )
    return search.fit(X_train, y_train).best_params_

### Optimal Threshold Determining Function

In [None]:
def optimal_threshold(model, thresholds=None):
    """Fit `model` and report the probability cut-off with the best test F1.

    The model is fitted on the notebook-level X_train / y_train. Its
    positive-class probabilities on X_test are then turned into 0/1 predictions
    at each candidate threshold (predict 1 when probability > threshold) and
    scored against y_test.

    Args:
        model: estimator exposing `fit` and `predict_proba`.
        thresholds: iterable of candidate cut-offs; defaults to 0.1, 0.2, ..., 0.9.

    Returns:
        A one-row DataFrame with columns "Optimal Threshold", "Accuracy",
        "True Positive Rate", "True Negative Rate", "F1 Score",
        "Precision Score" and "AUC" for the best threshold. Candidates are
        ranked by F1, ties broken by true positive rate.

    Raises:
        ValueError: if `thresholds` is empty.

    NOTE(review): the threshold is selected on the same test split whose metrics
    are reported, so the reported scores are optimistically biased. Selecting it
    on a validation split would avoid that.
    """
    if thresholds is None:
        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    thresholds = list(thresholds)
    if not thresholds:
        raise ValueError("thresholds must contain at least one value")

    model.fit(X_train, y_train)
    Q = model.predict_proba(X_test)[:, 1]

    # The ROC curve / AUC depend only on the scores, not on the cut-off,
    # so compute them once instead of once per threshold.
    fpr, tpr, _ = roc_curve(y_test, Q)
    roc_auc = auc(fpr, tpr)

    rows = []
    for cutoff in thresholds:
        preds = np.where(Q > cutoff, 1, 0)  # if prob > threshold, predict 1
        # labels=[0, 1] keeps the matrix 2x2 even when every prediction is one class.
        TN, FP, FN, TP = confusion_matrix(y_test, preds, labels=[0, 1]).ravel()
        rows.append({
            "THRESHOLD": cutoff,
            "accuracy": accuracy_score(y_test, preds),
            "true positive rate": TP / (TP + FN),
            "true negative rate": TN / (TN + FP),
            "F1 Score": f1_score(y_test, preds),
            "Precision Score": precision_score(y_test, preds),
            "AUC": roc_auc,
        })

    results = pd.DataFrame(rows, columns=["THRESHOLD", "accuracy", "true positive rate",
                                          "true negative rate", "F1 Score",
                                          "Precision Score", "AUC"])
    best_row = results.sort_values(['F1 Score', 'true positive rate'],
                                   ascending=[False, False]).iloc[0]
    output = pd.DataFrame(best_row).T
    output.columns = ["Optimal Threshold", "Accuracy", "True Positive Rate",
                      "True Negative Rate", "F1 Score", "Precision Score", "AUC"]
    return output

### Logistic Regression - L2 Penalty

In [None]:
# Tune the inverse regularisation strength C of an L2-penalised logistic regression,
# then rebuild the model with the winning C and pick its decision threshold.
lgr2_param_grid = {'C': np.linspace(0.1, 10, 100)}
lgr2_reg_params = optimal_tuning(LogisticRegression(penalty='l2'), lgr2_param_grid)['C']
lgr2 = LogisticRegression(penalty='l2', C=lgr2_reg_params)
threshold = optimal_threshold(lgr2)
threshold

### Logistic Regression - L1 Penalty

In [None]:
# Same procedure with an L1 penalty; the liblinear solver is used for it.
lgr1_param_grid = {'C': np.linspace(0.05, 5, 100)}
lgr1_reg_params = optimal_tuning(
    LogisticRegression(solver='liblinear', penalty='l1'),
    lgr1_param_grid,
)['C']
lgr1 = LogisticRegression(solver='liblinear', penalty='l1', C=lgr1_reg_params)
threshold = optimal_threshold(lgr1)
threshold

### Linear Discriminant Analysis

In [None]:
# Candidate shrinkage values: no shrinkage, automatic shrinkage, and 100 fixed values in
# [0, 1]. The linspace must be unpacked into the list; left as a single array element it
# is one (invalid) candidate, so no numeric shrinkage would ever be evaluated.
lda_shrinkage_values = [None, 'auto'] + np.linspace(0, 1, 100).tolist()
# The 'svd' solver does not support shrinkage, so it is only paired with None. The
# 'shrinkage' key is kept in both sub-grids so best_params_ always contains it.
lda_param_grid = [
    {'solver': ['svd'], 'shrinkage': [None]},
    {'solver': ['lsqr', 'eigen'], 'shrinkage': lda_shrinkage_values},
]
lda = LinearDiscriminantAnalysis()
lda_params = optimal_tuning(lda, lda_param_grid)
lda = LinearDiscriminantAnalysis(solver=lda_params['solver'], shrinkage=lda_params['shrinkage'])
threshold = optimal_threshold(lda)
threshold

### Decision Tree

In [None]:
# Each grid entry must be a flat list of candidate values. The numeric ranges are
# therefore unpacked; nesting a whole array inside the list makes it a single invalid
# candidate, which silently restricts the search to the non-numeric options.
# NOTE: this is the full grid (11 * 10 * 5 * 13 * 2 candidates, 3 folds each), so the
# search takes noticeably longer than before.
dt_param_grid = {
    'max_depth': np.linspace(2, 20, 10, dtype=int).tolist() + [None],
    'min_samples_split': np.linspace(0.1, 1.0, 10),
    'min_samples_leaf': np.linspace(0.1, 0.5, 5),
    'max_features': [None, 'sqrt', 'log2'] + np.linspace(0.1, 1.0, 10).tolist(),
    'criterion': ['gini', 'entropy']
}
dt = DecisionTreeClassifier(random_state=0)
dt_params = optimal_tuning(dt, dt_param_grid)
# best_params_ holds exactly the keys of the grid, so it can be passed straight through.
dt = DecisionTreeClassifier(random_state=0, **dt_params)
threshold = optimal_threshold(dt)
threshold

### K-Nearest Neighbour

In [None]:
# Only the Minkowski metric uses the power parameter p; p=1 is the Manhattan distance
# and p=2 the Euclidean distance. Listing 'euclidean' and 'manhattan' as separate
# metrics alongside p=1..10 re-evaluates the same models many times, so the grid keeps
# 'minkowski' only and covers the same set of distinct distance functions.
knn_param_grid = {
    'n_neighbors': np.linspace(1, 20, 20, dtype=int),
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski'],
    'p': np.linspace(1, 10, 10, dtype=int)
}
knn = KNeighborsClassifier()
knn_params = optimal_tuning(knn, knn_param_grid)
knn = KNeighborsClassifier(n_neighbors=knn_params['n_neighbors'],
                           weights=knn_params['weights'],
                           metric=knn_params['metric'],
                           p=knn_params['p'])
threshold = optimal_threshold(knn)
threshold

### Final Results of All Selected Classifiers in Sorted Table

In [None]:
# Tuned estimators keyed by the row label they get in the summary table.
tuned_models = {
    'Logistic Regression - L2 Penalty': lgr2,
    'Logistic Regression - L1 Penalty': lgr1,
    'Linear Discriminant Analysis': lda,
    'Decision Tree': dt,
    'K-Nearest Neighbour': knn,
}

final_results = pd.DataFrame(columns=["Optimal Threshold", "Accuracy", "True Positive Rate",
                                      "True Negative Rate", "F1 Score", "Precision Score", "AUC"])
# optimal_threshold refits each estimator and returns a one-row frame; keep that row.
for row_label, estimator in tuned_models.items():
    final_results.loc[row_label] = optimal_threshold(estimator).iloc[0]

# Rank best-first on every metric, F1 taking precedence.
ranking_keys = ['F1 Score', 'True Positive Rate', "Precision Score", "AUC",
                "True Negative Rate", "Accuracy"]
final_results.sort_values(ranking_keys, ascending=False)