# Task 4 Classification

In [36]:
import pandas as pd 

df = pd.read_excel('data/default.xls', header=1, index_col='ID')
df.sample(5)

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27400,280000,1,2,1,45,-1,-1,-1,-1,-1,...,1207,1207,1207,1207,1207,1207,1207,1207,1207,0
29979,310000,1,2,1,39,0,0,0,0,0,...,219409,216540,210675,10029,9218,10029,8049,8040,10059,0
12335,120000,2,1,2,27,-1,-1,-1,-1,-1,...,390,390,0,390,390,390,390,0,780,0
8689,50000,1,2,2,24,0,0,0,0,0,...,32945,29348,23570,6013,3011,1019,1019,2015,17,0
16892,120000,1,1,2,33,1,-2,-2,-1,-1,...,600,0,0,0,0,600,0,0,0,0


## Task 4.1.1 Creating Feature matrix X and target vector y

In [37]:
from sklearn.model_selection import train_test_split

def get_X_y(df, test_size=0.3):
    X = df.loc[:, df.columns != 'default payment next month']
    y = df['default payment next month']

    return train_test_split(X, y, test_size=test_size, random_state=0)

X_train, X_test, y_train, y_test = get_X_y(df, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((21000, 23), (9000, 23), (21000,), (9000,))

# Task 4.1.2 Fit a Decision Tree Classifier

In [38]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

def fit_decision_tree(X_train, X_test, y_train, y_test):
    clf = DecisionTreeClassifier(max_depth=10, random_state=1)
    clf.fit(X_train, y_train)
    # predictions = clf.predict(X_test)
    scores = cross_val_score(clf, X_test, y_test, cv=5)

    return scores 

scores = fit_decision_tree(X_train, X_test, y_train, y_test)
scores.mean()

0.7985555555555555

# Task 4.1.3 One Hot Encoding

In [39]:
def one_hot_encoding(df):
    categorical_cols = [
        'SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'
    ]
    df_encoded = pd.get_dummies(df, columns=categorical_cols, dtype=int)
    return df_encoded

df_encoded = one_hot_encoding(df)
df_encoded.sample(5)

Unnamed: 0_level_0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_6_-2,PAY_6_-1,PAY_6_0,PAY_6_2,PAY_6_3,PAY_6_4,PAY_6_5,PAY_6_6,PAY_6_7,PAY_6_8
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16557,180000,27,134168,120734,124730,120423,113901,114077,4345,5966,...,0,0,1,0,0,0,0,0,0,0
18555,70000,25,5412,5177,6291,9085,9270,6821,0,1206,...,0,0,1,0,0,0,0,0,0,0
8320,50000,42,46100,46949,18755,11112,11374,5919,2133,1058,...,0,1,0,0,0,0,0,0,0,0
17442,30000,34,29678,28878,26758,27462,28014,29826,0,1751,...,0,0,1,0,0,0,0,0,0,0
27595,50000,34,11340,11367,10982,10243,10826,11699,3200,3000,...,0,0,1,0,0,0,0,0,0,0


In [41]:
scores_encoded = fit_decision_tree(*get_X_y(df_encoded, test_size=0.3))
scores_encoded.mean()

0.7967777777777778

# Task 4.1.4 Grid Search

In [58]:
from sklearn.model_selection import GridSearchCV

parameters = {
    # "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": range(2, 15),
    # 'max_leaf_nodes': [5, 10, 20, 100],
    'min_samples_split': [2, 5, 10, 20]
}

grid_search = GridSearchCV(
    DecisionTreeClassifier(),
    parameters, 
    cv=5, 
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 52 candidates, totalling 260 fits
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=5; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=5; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=5; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=5; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=5; total time=   0.0s
[CV] END ..................max_depth=2, min_samples_split=10; total time=   0.0s
[CV] END ..................max_depth=2, min_sam

In [59]:
results = pd.DataFrame(grid_search.cv_results_)
results = results.sort_values('rank_test_score')
results.iloc[0, :]

# Best min_samples_split=2
# Best max_depth=3

mean_fit_time                                              0.054485
std_fit_time                                               0.001228
mean_score_time                                            0.001466
std_score_time                                             0.000461
param_max_depth                                                   3
param_min_samples_split                                           2
params                     {'max_depth': 3, 'min_samples_split': 2}
split0_test_score                                          0.822857
split1_test_score                                          0.825952
split2_test_score                                          0.818333
split3_test_score                                          0.814048
split4_test_score                                          0.815238
mean_test_score                                            0.819286
std_test_score                                             0.004515
rank_test_score                                 

Other preprocessing methods that could improve the result include scaling the numerical values