## import libraries

In [2]:
# import necessary libraries

import pandas as pd
import numpy as np
import missingno as msno 
import seaborn as sns
import matplotlib.pyplot as plt 

#sklearn

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# import custom functions

from custom_functions import get_data_summary, our_metrics, eval_metrics, evaluate_model


import warnings
# NOTE(review): with no category or module filter this silences EVERY warning
# for the rest of the session, including any raised by the library calls in
# the cells below. Consider narrowing it if a result looks suspicious.
warnings.filterwarnings('ignore')

# Import test and train data, target

In [3]:
# load data
#
# Every CSV is read with the same settings; train/test pairs are unpacked
# together so each feature variant (orig / minmax / std) and the target stay
# side by side. Files are read in the order train, then test, for each pair.

X_train_orig, X_test_orig = (
    pd.read_csv(csv_path, delimiter=',')
    for csv_path in ('data/X_train_orig.csv', 'data/X_test_orig.csv')
)
X_train_minmax, X_test_minmax = (
    pd.read_csv(csv_path, delimiter=',')
    for csv_path in ('data/X_train_minmax.csv', 'data/X_test_minmax.csv')
)
X_train_std, X_test_std = (
    pd.read_csv(csv_path, delimiter=',')
    for csv_path in ('data/X_train_std.csv', 'data/X_test_std.csv')
)
y_train, y_test = (
    pd.read_csv(csv_path, delimiter=',')
    for csv_path in ('data/y_train.csv', 'data/y_test.csv')
)

In [4]:
# verify shape
#
# One line per loaded frame: the display label followed by the frame's
# (rows, columns) tuple. Labels are kept exactly as they were printed before.

print("\n".join(
    f"{label}{frame.shape}"
    for label, frame in (
        (" X_train_ orig : ", X_train_orig),
        (" X_test_orig : ", X_test_orig),
        (" X_train_minmax : ", X_train_minmax),
        (" X_test_minmax : ", X_test_minmax),
        (" X_train_std : ", X_train_std),
        (" X_test_std : ", X_test_std),
        (" y_train: ", y_train),
        (" y_test : ", y_test),
    )
))



 X_train_ orig : (3357518, 116)
 X_test_orig : (1119173, 116)
 X_train_minmax : (3357518, 116)
 X_test_minmax : (1119173, 116)
 X_train_std : (3357518, 116)
 X_test_std : (1119173, 116)
 y_train: (3357518, 1)
 y_test : (1119173, 1)


# K-neighbours classifier 




In [5]:
# on X_train_orig, X_test_orig

# initialize and fit/train model on data
# y_train is read from CSV as a single-column DataFrame (shape (n, 1)); flatten
# it to 1-D so the classifier receives a plain label vector instead of a
# column vector.

knn_orig = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn_orig.fit(X_train_orig, y_train.to_numpy().ravel())

# predict on test
# Run once and reuse below: every KNN predict call repeats the neighbour
# search against the full training set.

y_pred_orig = knn_orig.predict(X_test_orig)

# Make probability predictions
# Column 1 is the probability of the second entry in knn_orig.classes_
# (assumed to be the positive class -- TODO confirm the target encoding).
train_probs_knn_orig = knn_orig.predict_proba(X_train_orig)[:, 1]
test_probs_knn_orig = knn_orig.predict_proba(X_test_orig)[:, 1]

train_preds_knn_orig = knn_orig.predict(X_train_orig)
test_preds_knn_orig = y_pred_orig  # same test predictions, no second pass

# NOTE(review): train scores are computed on the data the model memorised, so
# each point can be its own nearest neighbour -- read them as optimistic.
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs_knn_orig)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, test_probs_knn_orig)}')
# A constant score cannot rank samples, so this is the 0.5 no-skill reference.
print(f'Baseline ROC AUC: {roc_auc_score(y_test, np.ones(len(y_test)))}')

print(eval_metrics(y_test, y_pred_orig))
print(evaluate_model(test_preds_knn_orig, test_probs_knn_orig, train_preds_knn_orig, train_probs_knn_orig))


In [None]:
# on X_train_minmax, X_test_minmax

# initialize and fit/train model on data
# y_train is read from CSV as a single-column DataFrame (shape (n, 1)); flatten
# it to 1-D so the classifier receives a plain label vector instead of a
# column vector.

knn_minmax = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn_minmax.fit(X_train_minmax, y_train.to_numpy().ravel())

# predict on test
# Run once and reuse below: every KNN predict call repeats the neighbour
# search against the full training set.

y_pred_minmax = knn_minmax.predict(X_test_minmax)

# Make probability predictions
# Column 1 is the probability of the second entry in knn_minmax.classes_
# (assumed to be the positive class -- TODO confirm the target encoding).
train_probs_knn_minmax = knn_minmax.predict_proba(X_train_minmax)[:, 1]
test_probs_knn_minmax = knn_minmax.predict_proba(X_test_minmax)[:, 1]

train_preds_knn_minmax = knn_minmax.predict(X_train_minmax)
test_preds_knn_minmax = y_pred_minmax  # same test predictions, no second pass

# NOTE(review): train scores are computed on the data the model memorised, so
# each point can be its own nearest neighbour -- read them as optimistic.
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs_knn_minmax)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, test_probs_knn_minmax)}')
# A constant score cannot rank samples, so this is the 0.5 no-skill reference.
print(f'Baseline ROC AUC: {roc_auc_score(y_test, np.ones(len(y_test)))}')

print(eval_metrics(y_test, y_pred_minmax))
print(evaluate_model(test_preds_knn_minmax, test_probs_knn_minmax, train_preds_knn_minmax, train_probs_knn_minmax))

In [None]:
# on X_train_std, X_test_std

# initialize and fit/train model on data
# y_train is read from CSV as a single-column DataFrame (shape (n, 1)); flatten
# it to 1-D so the classifier receives a plain label vector instead of a
# column vector.

knn_std = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn_std.fit(X_train_std, y_train.to_numpy().ravel())

# predict on test
# Run once and reuse below: every KNN predict call repeats the neighbour
# search against the full training set.

y_pred_std = knn_std.predict(X_test_std)

# Make probability predictions
# Column 1 is the probability of the second entry in knn_std.classes_
# (assumed to be the positive class -- TODO confirm the target encoding).
train_probs_knn_std = knn_std.predict_proba(X_train_std)[:, 1]
test_probs_knn_std = knn_std.predict_proba(X_test_std)[:, 1]

train_preds_knn_std = knn_std.predict(X_train_std)
test_preds_knn_std = y_pred_std  # same test predictions, no second pass

# NOTE(review): train scores are computed on the data the model memorised, so
# each point can be its own nearest neighbour -- read them as optimistic.
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs_knn_std)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, test_probs_knn_std)}')
# A constant score cannot rank samples, so this is the 0.5 no-skill reference.
print(f'Baseline ROC AUC: {roc_auc_score(y_test, np.ones(len(y_test)))}')

print(eval_metrics(y_test, y_pred_std))
print(evaluate_model(test_preds_knn_std, test_probs_knn_std, train_preds_knn_std, train_probs_knn_std))

In [None]:

# DistanceMetric is exposed under sklearn.metrics in current scikit-learn;
# fall back to the older sklearn.neighbors location for legacy installs.
try:
    from sklearn.metrics import DistanceMetric
except ImportError:
    from sklearn.neighbors import DistanceMetric

# Mahalanobis distance needs the covariance BETWEEN FEATURES
# (n_features x n_features). np.cov treats each row as a variable by default,
# which for a samples-by-features frame would request an
# n_samples x n_samples matrix; rowvar=False makes the columns the variables.
# NOTE(review): the metric inverts V, so constant or perfectly collinear
# columns would make it singular -- verify before using this metric.
metric = DistanceMetric.get_metric('mahalanobis', V=np.cov(X_train_orig, rowvar=False))
