In [1]:
#imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#sets
PATH = "../Datasets/Modified/mod_UCI_Credit_Card.csv"

In [3]:
df = pd.read_csv(PATH)

In [4]:
df.head()

Unnamed: 0,CreditAmount,SEX,MARRIAGE,AGE,REPAYMENT_status_in_September,REPAYMENT_status_in_August,REPAYMENT_status_in_July,REPAYMENT_status_in_June,REPAYMENT_status_in_May,REPAYMENT_status_in_April,...,PAY_amount_in_September,PAY_amount_in_August,PAY_amount_in_July,PAY_amount_in_June,PAY_amount_in_May,PAY_amount_in_April,Default payment,EDUCLVL_school,EDUCLVL_university,EDUCLVL_high_school
0,20000.0,2,1,24,2,2,-1,-1,-2,-2,...,0.0,689.0,0.0,0.0,0.0,0.0,1,0,1,0
1,120000.0,2,2,26,-1,2,0,0,0,2,...,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1,0,1,0
2,90000.0,2,2,34,0,0,0,0,0,0,...,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0,0,1,0
3,50000.0,2,1,37,0,0,0,0,0,0,...,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0,0,1,0
4,50000.0,1,1,57,-1,0,-1,0,0,0,...,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0,0,1,0


In [5]:
corr = df.corr()

In [6]:
def get_best_correlators(dataframe_corr,to_cor,n2=1,n3=0):
    correlators_list = []
    for k,v in dataframe_corr[to_cor].items():
        n = str(abs(v))
        if int(n[2])>=n2 and int(n[3])>=n3:
            correlators_list.append(k)
    return correlators_list

In [7]:
features = get_best_correlators(corr,"Default payment")

In [8]:
len(corr["Default payment"])

26

In [9]:
len(features)

7

In [10]:
corr["Default payment"].sort_values(ascending=False)

Default payment                  1.000000
REPAYMENT_status_in_September    0.327235
REPAYMENT_status_in_August       0.264091
REPAYMENT_status_in_July         0.234446
REPAYMENT_status_in_June         0.214696
REPAYMENT_status_in_May          0.204116
REPAYMENT_status_in_April        0.187187
EDUCLVL_high_school              0.031394
EDUCLVL_university               0.031347
AGE                              0.012883
BILL_amount_in_April            -0.004716
BILL_amount_in_May              -0.006009
BILL_amount_in_June             -0.009421
BILL_amount_in_July             -0.013478
BILL_amount_in_August           -0.013505
BILL_amount_in_September        -0.018931
MARRIAGE                        -0.031518
SEX                             -0.038858
PAY_amount_in_April             -0.054335
PAY_amount_in_May               -0.055746
EDUCLVL_school                  -0.056231
PAY_amount_in_July              -0.057372
PAY_amount_in_June              -0.057543
PAY_amount_in_August            -0

In [11]:
#Set up X and y

y = df["Default payment"].values
X = df[features]

In [12]:
#train and test set
#SPlitting into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test

In [22]:
#choose the best number for x up to 20 nearest neighbors
def pick_n():
    train_score = []
    test_score = []
    k_vals = []

    for k in range(1, 21):
        k_vals.append(k)
        knn = KNeighborsClassifier(n_neighbors = k)
        knn.fit(X_train, y_train)

        tr_score = knn.score(X_train, y_train)
        train_score.append(tr_score)

        te_score = knn.score(X_test, y_test)
        test_score.append(te_score)
        
        max_test_score = max(test_score)
        max_test_scores_ind = [i for i, v in enumerate(test_score) if v == max_test_score]
        print('Max test score {} and k = {}'.format(max_test_score * 100, list(map(lambda x: x + 1, max_test_scores_ind))))
        opt_K = list(map(lambda x: x + 1, max_test_scores_ind))[0]
        
        return opt_K


In [23]:
opt_k = pick_n()

Max test score 72.33264793237377 and k = [1]


In [24]:
knn = KNeighborsClassifier(opt_k)


In [25]:
#train/fit

knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.7233264793237377

In [26]:
#predict

In [27]:
y_pred = knn.predict(X_test)

In [28]:
confusion_matrix(y_test,y_pred)
pd.crosstab(y_test, y_pred, rownames = ['Actual'], colnames =['Predicted'], margins = True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5569,1286,6855
1,1136,763,1899
All,6705,2049,8754


In [29]:
#adjust/verify F1 score
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.81      0.82      6855
           1       0.37      0.40      0.39      1899

    accuracy                           0.72      8754
   macro avg       0.60      0.61      0.60      8754
weighted avg       0.73      0.72      0.73      8754



In [30]:
#refit on entire set