# Section 1

Objective: to compare how models with different hyperparameter settings perform versus the baseline model on training and test data. 

The sample error is intentional.

The dataset used for this task can be found here:

https://www.kaggle.com/ronitf/heart-disease-uci

In [22]:
import pandas as pd
import numpy as np
from statistics import mean
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [23]:
# Read csv
df = pd.read_csv("heart.csv")

In [24]:
# Checking out the DF
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [25]:
# Checking the class (in)balance and NaN's

print(df["target"].value_counts())
print("Number of NaNs: ", df.isnull().sum(axis = 0))

1    165
0    138
Name: target, dtype: int64
Number of NaNs:  age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [26]:
# Getting the class labels and drop class column
y_res = df["target"]
df.drop(['target'], 1, inplace=True)

# Splitting the dataset into 50/50
# Stratified split for equal class proportions in each dataset
X_train, X_test, y_train, y_test = train_test_split(df, y_res, stratify=y_res,test_size=0.5, random_state=1)

# Note to myself:
# X_train = Training data of first half
# X_test = Second half, evaluate on this data. Don't touch
# y_train = Class labels for X_train
# y_test = Class labels for X_test. Don't touch

In [27]:
def hundred_settings_ten_folds(X, y):
    
    #Copying X_train and y_train
    X_train = X.copy();
    y_train = y.copy();
    
    # Stratified for keeping class proportions in each fold
    skf = StratifiedKFold(n_splits=10)

    # scaler = MinMaxScaler()

    # Using Knn as learning algo
    clf = KNeighborsClassifier()
    
    # kNN.get_params().keys()
    # pipe = Pipeline(steps=[("scaler", scaler), ("kNN", clf)])

    # approx. 100 different parameter settings
    grid_param = {
         'n_neighbors': [1, 3, 5, 7, 10, 15, 20, 25],
         'weights': ['uniform', 'distance'],
         'algorithm': ['auto', 'ball_tree', "kd_tree", "brute"],
         'leaf_size': [10, 30]
    }
    
    # Grid search with 100 settings and 10 stratified splits
    grid_sr = GridSearchCV(param_grid=grid_param,
                           scoring='accuracy',
                           estimator=clf,
                           cv=skf,
                           n_jobs=-1)
    
    # Fit the model according to best params
    grid_sr.fit(X_train, y_train)
    
    # Display best params
    best_parameters = grid_sr.best_params_
    print("Best params: ", best_parameters)  
    
    # Display individual mean accuracy for each fold
    fold_scores = grid_sr.cv_results_['mean_test_score']
    print("Fold scores: ", fold_scores)
    
    # Display best cv result (accuracy)
    best_result = grid_sr.best_score_
    print("Best cv acc: ", best_result)
    
    return fold_scores;


In [28]:
hyper_p_accuracies = hundred_settings_ten_folds(X_train, y_train)

Best params:  {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 7, 'weights': 'distance'}
Fold scores:  [0.60958333 0.60958333 0.64208333 0.62875    0.64166667 0.655
 0.64791667 0.66125    0.64833333 0.64166667 0.62875    0.62833333
 0.62166667 0.64833333 0.63541667 0.63541667 0.60958333 0.60958333
 0.64208333 0.62875    0.64166667 0.655      0.64791667 0.66125
 0.64833333 0.64166667 0.62875    0.62833333 0.62166667 0.64833333
 0.63541667 0.63541667 0.60958333 0.60958333 0.64208333 0.62875
 0.64166667 0.655      0.64791667 0.66125    0.64833333 0.64166667
 0.62875    0.62833333 0.62166667 0.64833333 0.63541667 0.63541667
 0.60958333 0.60958333 0.64208333 0.62875    0.64166667 0.655
 0.64791667 0.66125    0.64833333 0.64166667 0.62875    0.62833333
 0.62166667 0.64833333 0.63541667 0.63541667 0.60958333 0.60958333
 0.64208333 0.62875    0.64166667 0.655      0.64791667 0.66125
 0.64833333 0.64166667 0.62875    0.62833333 0.62166667 0.64833333
 0.63541667 0.63541667 0.60958333 0.6095

In [29]:
def baseline(X, y):
    
    #Copying X_train and y_train
    X_train = X.copy();
    y_train = y.copy();
    
    # SK-fold 10 times
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    
    # Using kNN
    clf = KNeighborsClassifier()
    
    # Get accuracies
    accuracy = cross_val_score(clf, X_train, y_train, scoring="balanced_accuracy", cv=cv)
    print(accuracy)
    print("Avarage accuracy: ", mean(accuracy))
    
    return accuracy


In [30]:
# Printing baseline accuracies
baseline_accuracies = baseline(X_train, y_train)

[0.42063492 0.41666667 0.53571429 0.67857143 0.58928571 0.74107143
 0.86607143 0.33035714 0.74107143 0.58928571]
Avarage accuracy:  0.5908730158730159


In [621]:
# Checking how many times the HP model outperformed the mean accuracy of the baseline model 

count = 0
for ac in hyper_p_accuracies:
    if(ac > mean(baseline_accuracies)):
        count = count + 1

print("Number of tuned models that scored higher than the mean of baseline models: ", count)
if(len(hyper_p_accuracies) > 50):
    print("Tuned models outperformed the baseline in the majority of the cases.")
else:
    print("Tuned models did not outperform the baseline in the majority of the cases.")

Number of tuned models that scored higher than the mean of baseline models:  96
Tuned models outperformed the baseline in the majority of the cases.


In [31]:
# Due to 96 tuned models outperforming the baseline model, 
# the tuned model should outperform the baseline model on the test set.

# Running kNN on test data
# With best params
def test_with_params(X_train, X_test, y_train, y_test):
    
    clf = KNeighborsClassifier(algorithm='auto', n_neighbors=7, weights='distance')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    print("Hyper parameter model accuracy: ", accuracy)

# Running kNN on test data
# With no params 
def test_baseline(X_train, X_test, y_train, y_test):
    
    clf = KNeighborsClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    print("Baseline model accuracy: ", accuracy)

In [32]:
# Checking difference in accuracies between the two models
test_with_params(X_train, X_test, y_train, y_test)
test_baseline(X_train, X_test, y_train, y_test)

Hyper parameter model accuracy:  0.6842105263157895
Baseline model accuracy:  0.6644736842105263


In my run, the HP models outperformed the baseline models in 96 / 100 cases. Due to this, the results on the final comparison was expected.  

#Final comment: 

Low accuracy may be a result of the small dataset and the case could also be that kNN is not performing well on this dataset.  

# Section 2

Objective: to compare how class imbalance affects the accuracy, AUC and precision.

In this task, I selected a dataset with approx. 20k instances, and scaled it down to approx. 4k instances of class 0 and 1k instances of class 1. This was done by randomly picking 5k samples from the original dataset during the split.

The dataset can be found here:

https://www.kaggle.com/volodymyrgavrysh/fraud-detection-bank-dataset-20k-records-binary

In [577]:
# Read DF
df_fd = pd.read_csv("fraud_detection_bank_dataset.csv")

# Checking out stuff and extracting the class labels
df2 = df_fd.copy()
df2.info()
y_res2 = df2["targets"]

# Splitting the dataset into 75/25
# Stratified split for equal class proportions in each dataset
X_train2, X_test2, y_train2, y_test2 = train_test_split(df2, y_res2, stratify=y_res2, test_size=0.25)

# Class proportions of the test data. I hope 3756 vs 1360 this is ok :) 
display(y_test2.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20468 entries, 0 to 20467
Columns: 114 entries, Unnamed: 0 to targets
dtypes: float64(1), int64(113)
memory usage: 17.8 MB


0    3757
1    1360
Name: targets, dtype: int64

In [578]:
# MinMax normalization
scaler = MinMaxScaler()

scaler.fit_transform(X_train2)
scaler.transform(X_test2)

array([[2.74504056e-01, 8.69187310e-04, 3.86161659e-03, ...,
        0.00000000e+00, 7.09504685e-02, 0.00000000e+00],
       [6.41747288e-01, 0.00000000e+00, 1.74566229e-03, ...,
        0.00000000e+00, 6.02409639e-02, 0.00000000e+00],
       [6.56259162e-01, 8.69187310e-04, 1.62928481e-02, ...,
        0.00000000e+00, 2.67737617e-02, 0.00000000e+00],
       ...,
       [1.15899541e-01, 0.00000000e+00, 4.49640288e-04, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.82204632e-01, 0.00000000e+00, 1.45471858e-03, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [3.95436333e-01, 8.69187310e-04, 1.43620398e-02, ...,
        0.00000000e+00, 9.77242303e-02, 0.00000000e+00]])

In [579]:
def kNN_5k_test():
    
    # Dropping some columns
    X_train_5k = X_train2.drop(['targets', 'Unnamed: 0'], 1)
    X_test_5k = X_test2.drop(['targets', 'Unnamed: 0'], 1)

    # Train and predict with kNN
    clf = KNeighborsClassifier()
    clf.fit(X_train_5k, y_train2)
    y_pred2 = clf.predict(X_test_5k)
    y_pred_proba = clf.predict_proba(X_test_5k)
    
    # Relevant metrics for the objective
    cm = confusion_matrix(y_test2, y_pred2)
    accuracy = accuracy_score(y_test2, y_pred2)
    auc = roc_auc_score(y_test2, y_pred_proba[:, 1])
    
    # Print metrics
    print(cm)
    print("Test Accuracy: ", accuracy)    
    print("Test AUC: ", auc)   
    print(classification_report(y_test2, y_pred2, target_names=["0", "1"]))

In [580]:
# There is a difference in precision between the classes,
# whereas the minority class scores lower
# Accuracy and AUC of ~.85

kNN_5k_test()

[[3459  298]
 [ 470  890]]
Test Accuracy:  0.8499120578463943
Test AUC:  0.8539950915154457
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      3757
           1       0.75      0.65      0.70      1360

    accuracy                           0.85      5117
   macro avg       0.81      0.79      0.80      5117
weighted avg       0.85      0.85      0.85      5117



In [581]:
#Copying X_test2 before resampling
X_test3 = X_test2.copy()

# Resampling the data by selecting 1359 samples of class 0 from X_test3
# (corresponding to the amount of test instances of class 1 in X_test2) 
# For the sample function, replace is by default false (no replacement), 
# but I choose to demonstrate it here anyways.
X_test3 = X_test3.groupby('targets', group_keys=False).apply(lambda x: x.sample(1359, replace=False, random_state=1)).reset_index()
y_res3 = X_test3['targets']

# Had to drop some columns (indexes) that appeared, and also the classes
X_test3.drop(['index', 'Unnamed: 0', 'targets'], axis=1, inplace=True)

# Print classes
print(y_res3.value_counts())

1    1359
0    1359
Name: targets, dtype: int64


In [582]:
def kNN_2k_test():
    
    # Drop some columns
    X_train_2k = X_train2.drop(['targets', 'Unnamed: 0'], 1)
    X_test_2k = X_test3.copy()
    
    # Train and predict with kNN
    clf = KNeighborsClassifier()
    clf.fit(X_train_2k, y_train2)
    y_pred3 = clf.predict(X_test_2k)
    y_pred_proba = clf.predict_proba(X_test_2k)
  
    # Relevant metrics for the objective
    cm = confusion_matrix(y_res3, y_pred3)
    accuracy = accuracy_score(y_res3, y_pred3)
    auc = roc_auc_score(y_res3, y_pred_proba[:, 1])

    # Print metrics
    print(cm)
    print("Test Accuracy: ", accuracy)    
    print("Test AUC: ", auc)
    print(classification_report(y_res3, y_pred3, target_names=["0", "1"]))

In [583]:
# We can see a an increase in precision for the previous minority class
# Interestingly, the previous majority class' precision decreased
# Accuracy decreased due to decreased rel. frequency of the majority class
# and the AUC is more or less the same since changes in proportion 
# of the classes does not affect the score.

kNN_2k_test()

[[1258  101]
 [ 470  889]]
Test Accuracy:  0.7899190581309786
Test AUC:  0.8531426767615238
              precision    recall  f1-score   support

           0       0.73      0.93      0.82      1359
           1       0.90      0.65      0.76      1359

    accuracy                           0.79      2718
   macro avg       0.81      0.79      0.79      2718
weighted avg       0.81      0.79      0.79      2718

