In [1]:
import numpy as np
import pandas as pd
import warnings 
from Functions import *

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Training data
# Load csv file and fill the missing value with columns means
Training_Set = pd.read_csv('cs-training.csv').drop(columns='Unnamed: 0')
Training_Set = Training_Set.fillna(Training_Set.mean()).copy()

# Testing data
Test_Set = pd.read_csv('cs-test.csv').drop(columns='Unnamed: 0')
Test_Set = Test_Set.fillna(Test_Set.mean()).copy()

In [4]:
# Preprocessing data (normalization)
Y_Train = Training_Set['SeriousDlqin2yrs'].values
Y_Test = np.round(pd.read_csv('sampleEntry.csv')['Probability'].values).astype(int)

_X_Train_, _X_Test_ = RemoveOutlier(Training_Set.iloc[:, 1:]).values, RemoveOutlier(Test_Set.iloc[:, 1:]).values

Scaler = StandardScaler()
Scaler.fit(_X_Train_)

X_Train, X_Test = Scaler.transform(_X_Train_), Scaler.transform(_X_Test_)

# X_Combined, Y_Combined = np.vstack((X_Train, X_Test)), np.hstack((Y_Train, Y_Test))

In [6]:
# 5 features
# Logistic Regression
Pipe_LR= make_pipeline(StandardScaler(), PCA(n_components=5), LogisticRegression(random_state=1, solver='lbfgs'))

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

param_grid = [{'logisticregression__C': param_range}]

gs = GridSearchCV(estimator=Pipe_LR, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  refit=True,
                  cv=5,
                  n_jobs=-1)

gs = gs.fit(X_Train, Y_Train)
print('Best scores:', gs.best_score_)
print('Best parameters:', gs.best_params_)
      
clf = gs.best_estimator_
print('Test accuracy: %.3f' % clf.score(X_Test, Y_Test))

Best scores: 0.93316
Best parameters: {'logisticregression__C': 0.0001}
Test accuracy: 0.983


In [7]:
# 5 features
Pipe_SVC= make_pipeline(StandardScaler(), PCA(n_components=5), SVC(random_state=1, max_iter=100))

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

param_grid = [{'svc__C': param_range, 
               'svc__gamma': param_range, 
               'svc__kernel': ['linear']},
              {'svc__C': param_range, 
               'svc__gamma': param_range, 
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=Pipe_SVC, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  refit=True,
                  cv=5,
                  n_jobs=-1)

gs = gs.fit(X_Train, Y_Train)
print('Best scores:', gs.best_score_)
print('Best parameters:', gs.best_params_)


clf = gs.best_estimator_
print('Test accuracy: %.3f' % clf.score(X_Test, Y_Test))





















Best scores: 0.9330933333333334
Best parameters: {'svc__C': 1.0, 'svc__gamma': 1000.0, 'svc__kernel': 'rbf'}
Test accuracy: 0.983


In [9]:
Pipe_Tree = make_pipeline(StandardScaler(), PCA(n_components=5), DecisionTreeClassifier(random_state=1, criterion='gini'))


param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]}]

gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=5,
                  n_jobs=-1)

gs = gs.fit(X_Train, Y_Train)
print('Best scores:', gs.best_score_)
print('Best parameters:', gs.best_params_)

clf = gs.best_estimator_
print('Test accuracy: %.3f' % clf.score(X_Test, Y_Test))

Best scores: 0.93316
Best parameters: {'max_depth': 1}
Test accuracy: 0.983
