# Task 4 Classification

In [7]:
import pandas as pd 
import numpy as np 

# * Seeding for reproducibility 
np.random.seed(0)
df = pd.read_excel('data/default.xls', header=1, index_col='ID')
df.sample(5)

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8226,20000,1,1,2,33,1,2,2,2,2,...,18453,19755,19288,0,2260,0,1600,0,644,0
10795,20000,2,2,2,35,0,0,2,0,0,...,19000,19000,20000,3400,0,1000,0,1000,0,0
9164,230000,2,1,1,44,1,-1,-1,-1,-1,...,933,0,0,949,2873,933,0,0,0,0
26592,100000,1,2,1,42,0,0,0,0,0,...,17758,18774,20272,5000,2000,2000,2000,2000,2000,0
6632,150000,1,1,2,29,-2,-2,-2,-2,-2,...,6469,5138,7810,6989,833,6488,5153,7833,7130,0


## Task 4.1.1 Creating Feature matrix X and target vector y

In [8]:
from sklearn.model_selection import train_test_split

def get_X_y(df, test_size=0.3):
    X = df.loc[:, df.columns != 'default payment next month']
    y = df['default payment next month']

    return train_test_split(X, y, test_size=test_size, random_state=0)

X_train, X_test, y_train, y_test = get_X_y(df, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((21000, 23), (9000, 23), (21000,), (9000,))

# Task 4.1.2 Fit a Decision Tree Classifier

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

def fit_decision_tree(X_train, X_test, y_train, y_test):
    clf = DecisionTreeClassifier(max_depth=10, random_state=1)
    clf.fit(X_train, y_train)
    # predictions = clf.predict(X_test)
    scores = cross_val_score(clf, X_test, y_test, cv=5)

    return scores 

scores = fit_decision_tree(X_train, X_test, y_train, y_test)
scores.mean()

0.7985555555555555

# Task 4.1.3 One Hot Encoding

In [10]:
def one_hot_encoding(df):
    categorical_cols = [
        'SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'
    ]
    df_encoded = pd.get_dummies(df, columns=categorical_cols, dtype=int)
    return df_encoded

df_encoded = one_hot_encoding(df)
df_encoded.sample(5)

Unnamed: 0_level_0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_6_-2,PAY_6_-1,PAY_6_0,PAY_6_2,PAY_6_3,PAY_6_4,PAY_6_5,PAY_6_6,PAY_6_7,PAY_6_8
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26643,90000,24,86724,91394,78767,74007,29480,27941,7016,3800,...,0,0,1,0,0,0,0,0,0,0
17126,60000,26,-25,-25,-25,1901,38746,39178,0,0,...,0,0,1,0,0,0,0,0,0,0
27972,110000,29,18817,21068,16412,16809,8682,8861,2600,1102,...,0,0,0,1,0,0,0,0,0,0
5764,20000,35,17584,19819,11280,3680,4480,1650,3013,1005,...,0,1,0,0,0,0,0,0,0,0
15606,20000,27,19667,15,15,41,10522,12340,0,0,...,0,0,1,0,0,0,0,0,0,0


In [11]:
scores_encoded = fit_decision_tree(*get_X_y(df_encoded, test_size=0.3))
scores_encoded.mean()

0.7967777777777778

# Task 4.1.4 Grid Search

In [12]:
from sklearn.model_selection import GridSearchCV

parameters = {
    "max_depth": range(2, 15),
    # 'max_leaf_nodes': [5, 10, 20, 100],
    'min_samples_split': [2, 5, 10, 20]
}

grid_search = GridSearchCV(
    DecisionTreeClassifier(),
    parameters, 
    cv=5, 
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 52 candidates, totalling 260 fits
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.0s
[CV] END ..................max_depth=2, min_samples_split=10; total time=   0.1s
[CV] END ..................max_depth=2, min_samples_split=10; total time=   0.0s
[CV] END ..................max_depth=2, min_samples_split=10; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=2, min_samples_split=2; total time=   0.0s
[CV] END ..................max_depth=2, min_samples_split=20; total time=   0.0s
[CV] END ..................max_depth=2, min_samples_split=20; total time=   0.0s
[CV] END ..................max_depth=2, min_samples_split=20; total time=   0.0s
[CV] END ...................max_depth=2, min_sa

In [16]:
results = pd.DataFrame(grid_search.cv_results_)
results = results.sort_values('rank_test_score')
results.head(1)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.069793,0.005888,0.001744,0.000603,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.822857,0.825952,0.818333,0.814048,0.815238,0.819286,0.004515,1


Other preprocessing methods that could improve the result include scaling the numerical values