In [41]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

np.random.seed(1)

In [42]:
PATH = "lab/data/"
df_ari_raw = pd.read_csv(f'{PATH}Credit.csv')
df_ari_raw = df_ari_raw.drop("Unnamed: 0", axis=1)
df_ari = pd.get_dummies(df_ari_raw)
df_ari_y = df_ari.apply(lambda row: 1 if row['Income'] > 50 else 0, axis=1)  # making new column

## a) K-Nearest Neighbors Classifier & Decision Trees

In [43]:
kf = KFold(n_splits=5, shuffle=True, random_state=2)

In [51]:
def perform_classifier_routine(classifier_class, X_train, y_train, X_test, y_test, verbose, **kwargs):
    classifier = classifier_class(**kwargs)
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    if verbose is True:
        print(f"{kwargs=}, {score=}")
    return classifier, score


def get_best_classifier(X_train, y_train, X_test, y_test, verbose):
    best_score = -1
    for neighbours in range(1, 10):
        knn, new_score = perform_classifier_routine(KNeighborsClassifier, X_train, y_train, X_test, y_test, verbose,
                                                    n_neighbors=neighbours)
        if new_score > best_score:
            best_neighbours = neighbours
            best_score = new_score
            best_knn = knn
    print(f"{best_neighbours=} with {best_score=}")

    best_score = -1
    for depth in range(1, 10):
        dt, new_score = perform_classifier_routine(DecisionTreeClassifier, X_train, y_train, X_test, y_test, verbose,
                                                   max_depth=depth)
        if new_score > best_score:
            best_depth = depth
            best_score = new_score
            best_dt = dt
    print(f"{best_depth=} with {best_score=}")

    best_score = -1
    for estimators in [10, 25, 50, 100, 200]:
        rf, new_score = perform_classifier_routine(RandomForestClassifier, X_train, y_train, X_test, y_test, verbose,
                                                   n_estimators=estimators)
        if new_score > best_score:
            best_estimators = estimators
            best_score = new_score
            best_rf = rf
    print(f"{best_estimators=} with {best_score=}")


def find_best_classifier(data_frame, data_frame_y, verbose=True):
    for i, (train_index, test_index) in enumerate(kf.split(data_frame)):
        print(f"{i}-th fold:")
        X_train, X_test = data_frame.iloc[train_index], data_frame.iloc[test_index]
        y_train, y_test = data_frame_y.iloc[train_index], data_frame_y.iloc[test_index]

        get_best_classifier(X_train, y_train, X_test, y_test, verbose)
        print()



In [52]:
find_best_classifier(df_ari, df_ari_y, verbose=False)

0-th fold:
    Income  Limit  Rating  Cards  Age  Education  Balance  Gender_ Male  \
0   14.891   3606     283      2   34         11      333             1   
1  106.025   6645     483      3   82         15      903             0   
3  148.924   9504     681      3   36         11      964             0   
4   55.882   4897     357      2   68         16      331             1   
5   80.180   8047     569      4   77         10     1151             1   

   Gender_Female  Student_No  Student_Yes  Married_No  Married_Yes  \
0              0           1            0           0            1   
1              1           0            1           0            1   
3              1           1            0           1            0   
4              0           1            0           0            1   
5              0           1            0           1            0   

   Ethnicity_African American  Ethnicity_Asian  Ethnicity_Caucasian  
0                           0                0 

Feature names seen at fit time, yet now missing:
- Age
- Balance
- Cards
- Education
- Ethnicity_African American
- ...



ValueError: X has 2 features, but KNeighborsClassifier is expecting 16 features as input.

## b) K-Nearest Neighbors Classifier & Decision Trees

In [26]:
PATH = "lab/data/"
df_cards_raw = pd.read_csv(f'{PATH}Credit.csv')
df_cards = df_cards_raw.drop("Unnamed: 0", axis=1)
df_cards = pd.get_dummies(df_cards)  # one hot encoding for category data
df_cards_y = df_cards.pop('Cards')

Model test score:  0.325


In [40]:
find_best_classifier(df_cards, df_cards_y, verbose=False)

0-th fold:
best_neighbours=2
best_depth=1

1-th fold:
best_neighbours=1
best_depth=3

2-th fold:
best_neighbours=7
best_depth=7

3-th fold:
best_neighbours=8
best_depth=7

4-th fold:
best_neighbours=3
best_depth=1

