In [1]:
import pandas as pd
from scipy.stats.mstats import winsorize
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from scipy.stats.mstats import winsorize
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline



# 1. Load Data

In [2]:
# Read the training and test CSVs the same way: the first column of each file
# becomes the row index.
train_set, test_set = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('cs-training.csv', 'cs-test.csv')
)

# Read the data dictionary; header=1 takes the column names from the second
# row of the sheet.
data_desp = pd.read_excel('Data Dictionary.xls', header=1)

In [3]:
# Display the data dictionary (variable name, description, type) for reference.
data_desp

Unnamed: 0,Variable Name,Description,Type
0,SeriousDlqin2yrs,Person experienced 90 days past due delinquenc...,Y/N
1,RevolvingUtilizationOfUnsecuredLines,Total balance on credit cards and personal lin...,percentage
2,age,Age of borrower in years,integer
3,NumberOfTime30-59DaysPastDueNotWorse,Number of times borrower has been 30-59 days p...,integer
4,DebtRatio,"Monthly debt payments, alimony,living costs di...",percentage
5,MonthlyIncome,Monthly income,real
6,NumberOfOpenCreditLinesAndLoans,Number of Open loans (installment like car loa...,integer
7,NumberOfTimes90DaysLate,Number of times borrower has been 90 days or m...,integer
8,NumberRealEstateLoansOrLines,Number of mortgage and real estate loans inclu...,integer
9,NumberOfTime60-89DaysPastDueNotWorse,Number of times borrower has been 60-89 days p...,integer


# 2. Feature Extraction

## 2.1 Data Processing 

In [4]:
# Preprocess every feature column (all columns after the first; the first
# column is left untouched and is used as the label when the model is fitted):
# mean-impute -> winsorize -> standardize.
feature_cols = train_set.columns[1:]

# Work on an explicit float64 copy. Several features are integer-typed, and
# writing float results back into integer columns through `.iloc[...] = ...`
# relies on silent upcasting, which recent pandas versions deprecate.
features = train_set[feature_cols].astype('float64')

# Replace missing values with the mean of their own column.
features = features.fillna(features.mean())

# Clip the lowest and highest 2.5% of each column to the nearest retained value.
# winsorize returns a numpy masked array, so `.data` unwraps it to a plain
# ndarray before it is handed back to pandas.
features = features.apply(
    lambda col: pd.Series(
        winsorize(col.to_numpy(), limits=[0.025, 0.025]).data,
        index=col.index,
    ),
    axis=0,
)

# Standardize each column to zero mean and unit (sample) standard deviation.
features = (features - features.mean()) / features.std()

# Rebuild train_set with the same column layout (first column, then the
# processed features) so later cells can keep indexing it positionally.
train_set = pd.concat([train_set.iloc[:, :1], features], axis=1)

## 2.2 PCA

In [5]:
# Project the feature columns (every column after the first) onto their first
# three principal components.
# random_state is pinned for reproducibility: depending on the scikit-learn
# version, the default svd_solver='auto' may select the randomized solver for
# data of this shape, which would make the components vary slightly between runs.
pca = PCA(n_components=3, random_state=0)
train_comp = pca.fit_transform(train_set.iloc[:, 1:])

# 3. Grid Search 

In [6]:
# Candidate hyper-parameters for the pipeline step named "DT"; the
# "<step>__<parameter>" keys route each value to that step.
# NOTE(review): a tree limited to at most 4 leaves can never be deeper than 3,
# so the max_depth candidates below (5, 8, 10) should all give the same tree --
# confirm whether larger max_leaf_nodes values were intended.
param_grid = dict(
    DT__max_depth=[5, 8, 10],
    DT__max_leaf_nodes=[2, 4],
)

In [7]:
# Two-step pipeline: standardize the inputs, then fit a Gini decision tree.
# The max_depth / max_leaf_nodes given here are only starting values; the grid
# search replaces them through the "DT__..." entries of param_grid.
scaler_step = ('scaler', StandardScaler())
tree_step = (
    'DT',
    DecisionTreeClassifier(criterion='gini', max_depth=5, max_leaf_nodes=2),
)
estimator = Pipeline(steps=[scaler_step, tree_step])

In [8]:
# Exhaustive search over param_grid with 5-fold cross-validation, running the
# fits in parallel (n_jobs=-1).
# No `scoring` is passed, so the estimator's own default score is used.
# NOTE(review): if the target classes are imbalanced, that default (accuracy
# for a classifier) can sit close to the majority-class rate -- consider an
# explicit scoring such as 'roc_auc'.
search_options = dict(param_grid=param_grid, cv=5, n_jobs=-1)
grid_search = GridSearchCV(estimator, **search_options)

In [9]:
# Fit the search on the PCA components, using the first column of train_set as
# the label vector. The call stays the last expression so the fitted search
# object is displayed as the cell output.
labels = train_set.iloc[:, 0].values
grid_search.fit(train_comp, labels)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('DT',
                                        DecisionTreeClassifier(max_depth=5,
                                                               max_leaf_nodes=2))]),
             n_jobs=-1,
             param_grid={'DT__max_depth': [5, 8, 10],
                         'DT__max_leaf_nodes': [2, 4]})

## 3.1 Result

**Optimal Parameters**

In [10]:
# Pipeline configured with the best parameter combination found by the search.
grid_search.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('DT', DecisionTreeClassifier(max_depth=5, max_leaf_nodes=4))])

**Best Mean Cross-Validated Accuracy**

In [11]:
# Mean cross-validated score (over the 5 folds) of the best parameter combination.
grid_search.best_score_

0.9333666666666668