# Course 5 Task 3 - Credit One Classification - Additional Drill-in
### Andrew Janzen
### September 10, 2019

### Objectives:
* Apply classification modeling to dataset
* Consolidate findings to final report

In [1]:
#Imports
#numpy,pandas,scipy, math, matplotlib
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt

In [2]:
#Import dataset processed in Course 5 Task 2
pcf = pd.read_csv('prepped_credit_file.csv', header =0)

In [3]:
#Estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [4]:
#Model metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [5]:
#Cross validation
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

In [6]:
modelSVC = SVC()
modelRFC = RandomForestClassifier()
modelKNC = KNeighborsClassifier()

In [7]:
modelSVC_o = SVC()
modelRFC_o = RandomForestClassifier()
modelKNC_o = KNeighborsClassifier()

In [8]:
from sklearn.model_selection import GridSearchCV

# Quick Validation

In [9]:
pcf.head()

Unnamed: 0.1,Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,...,BILL_PCT_6,PAY_6MO_PCT_LIM,LIMIT_PER_AGE,BILL1_PCT_LIM,EDU_BUCK,SEX_BUCK,MAR_BUCK,def_word,AGE_BUCK,LIMIT_BAL_BUCK
0,0,1,20000,2,2,1,24,2,2,-1,...,0.0,0.03,833.0,0.2,UNIV,FEML,MARR,DEFAULT,<25,<50K
1,1,2,120000,2,2,2,26,-1,2,0,...,0.03,0.04,4615.0,0.02,UNIV,FEML,SING,DEFAULT,<30,<200K
2,2,3,90000,2,2,2,34,0,0,0,...,0.17,0.12,2647.0,0.32,UNIV,FEML,SING,PAID,<35,<100K
3,3,4,50000,2,2,1,37,0,0,0,...,0.59,0.17,1351.0,0.94,UNIV,FEML,MARR,PAID,<40,<100K
4,4,5,50000,1,2,1,57,-1,0,-1,...,0.38,1.18,877.0,0.17,UNIV,MALE,MARR,PAID,50+,<100K


In [10]:
pcf.columns

Index(['Unnamed: 0', 'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
       'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1',
       'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
       'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month', 'BILL_PCT_1', 'BILL_PCT_2', 'BILL_PCT_3',
       'BILL_PCT_4', 'BILL_PCT_5', 'BILL_PCT_6', 'PAY_6MO_PCT_LIM',
       'LIMIT_PER_AGE', 'BILL1_PCT_LIM', 'EDU_BUCK', 'SEX_BUCK', 'MAR_BUCK',
       'def_word', 'AGE_BUCK', 'LIMIT_BAL_BUCK'],
      dtype='object')

### Analysis: "Unnamed: 0" is the row numbers from the output in C5T2... I will drop that column in building my test and training datasets.

### The buckets/discretization we applied with C5T2 will need to be dropped or converted to dummy variables in our classification analysis. For now I will not include them in my classification test modeling.

# BASE - Creating Training and Test Datasets

## BASE - Defining Features and Dependent Variable

In [11]:
depVar = pcf['default payment next month']

### Point of View for Feature Selection
- Initial dataset will include only original columns (no feature engineering)
- Excludes discritization - will need to use dummy variables for that
- Will include columns from Task 2 that were determined to be highly correlated

In [12]:
features = pcf.iloc[:,2:25]

In [13]:
test_size = .3
seed = 40
X_train, X_test, Y_train, Y_test = train_test_split(features, depVar, test_size=test_size, random_state=seed)

In [14]:
#Independent Variable Training Set (X Training)
X_train_count = len(X_train.index)
print('The number of observations in the X Training set are:',str(X_train_count))
X_train.head()

The number of observations in the X Training set are: 21000


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
14632,20000,1,3,2,35,0,0,0,0,0,...,20269,18654,18914,20200,2618,2276,791,686,1600,0
5372,270000,1,2,2,29,0,0,0,0,-1,...,17555,17803,1852,2044,3022,1000,2054,1857,2069,6327
8154,80000,1,2,2,30,0,0,0,-2,-2,...,0,0,0,0,3750,0,0,0,0,0
6468,340000,2,5,1,27,0,0,0,0,0,...,282231,216946,216403,218209,11000,11031,8000,8000,8300,7500
10581,90000,2,1,2,24,0,0,0,0,0,...,90370,26420,27398,28295,4054,4444,1420,1398,1345,3265


In [15]:
#Dependent Variable Training Set (Y Training)
Y_train_count = len(Y_train.index)
print('The number of observations in the Y training set are:',str(Y_train_count))
Y_train.head()

The number of observations in the Y training set are: 21000


14632    0
5372     0
8154     0
6468     0
10581    0
Name: default payment next month, dtype: int64

In [16]:
#Independent Variable Test Set (X Test)
X_test_count = len(X_test.index)
print('The number of observations in the X Training set are:',str(X_test_count))
X_test.head()

The number of observations in the X Training set are: 9000


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
28478,100000,2,2,2,35,-1,-1,-1,-1,-1,...,326,326,652,326,2691,326,326,652,0,326
3956,380000,1,1,2,33,0,0,0,0,0,...,80080,81162,89571,96627,10000,5000,5000,10000,10000,10000
7014,150000,2,2,2,41,-1,-1,-1,0,-1,...,1995,285,4926,5523,40824,2340,0,4970,5527,903
18659,50000,2,2,1,22,0,0,2,0,0,...,26570,28085,27009,28142,2134,1000,2001,910,1510,500
15274,170000,2,3,2,29,0,0,0,0,0,...,119050,120225,122345,124302,4200,4300,4400,4518,4500,5000


In [17]:
#Dependent Variable Testing Set (Y Test)
Y_test_count = len(Y_test.index)
print('The number of observations in the Y Test set are:',str(Y_test_count))
Y_test.head()

The number of observations in the Y Test set are: 9000


28478    1
3956     0
7014     0
18659    0
15274    1
Name: default payment next month, dtype: int64

# BASE - Initial Modeling

## SVM Classification

In [18]:
#Model Fitting
modelSVC.fit(X_train,Y_train)
print(cross_val_score(modelSVC, X_train, Y_train))
modelSVC.score(X_train,Y_train)



[0.77903157 0.77814286 0.77839691]


0.9938571428571429

### Including Cross Validation

In [19]:
#Model Fitting
modelSVC.fit(X_train,Y_train)
predSVC=modelSVC.predict(X_train)
print(cross_val_score(modelSVC, X_train, Y_train))
print('Train kappa: %.2f' 
% cohen_kappa_score(y1=Y_train, y2=predSVC))
print('Train Accuracy: %.2f' 
% accuracy_score(y_true=Y_train, y_pred=predSVC))



[0.77903157 0.77814286 0.77839691]
Train kappa: 0.98
Train Accuracy: 0.99


### Testing

In [20]:
predSVC_test=modelSVC.predict(X_test)
print('Test kappa: %.2f' 
% cohen_kappa_score(y1=Y_test, y2=predSVC_test))
print('Test Accuracy: %.2f' 
% accuracy_score(y_true=Y_test, y_pred=predSVC_test))

Test kappa: 0.02
Test Accuracy: 0.78


## Random Forest Classification

In [21]:
#Model Fitting
modelRFC.fit(X_train,Y_train)
predRFC=modelRFC.predict(X_train)
print(cross_val_score(modelRFC, X_train, Y_train))
print('Train kappa: %.2f' 
% cohen_kappa_score(y1=Y_train, y2=predRFC))
print('Train Accuracy: %.2f' 
% accuracy_score(y_true=Y_train, y_pred=predRFC))



[0.80159977 0.79642857 0.80168596]
Train kappa: 0.94
Train Accuracy: 0.98


### Testing

In [22]:
predRFC_test=modelRFC.predict(X_test)
print('Test kappa: %.2f' 
% cohen_kappa_score(y1=Y_test, y2=predRFC_test))
print('Test Accuracy: %.2f' 
% accuracy_score(y_true=Y_test, y_pred=predRFC_test))

Test kappa: 0.34
Test Accuracy: 0.81


## KNN Classification

In [23]:
#Model Fitting
modelKNC.fit(X_train,Y_train)
predKNC=modelKNC.predict(X_train)
print(cross_val_score(modelKNC, X_train, Y_train))
print('Train kappa: %.2f' 
% cohen_kappa_score(y1=Y_train, y2=predKNC))
print('Train Accuracy: %.2f' 
% accuracy_score(y_true=Y_train, y_pred=predKNC))



[0.74917869 0.753      0.75410773]
Train kappa: 0.35
Train Accuracy: 0.82


In [24]:
predKNC_test=modelKNC.predict(X_test)
print('Test kappa: %.2f' 
% cohen_kappa_score(y1=Y_test, y2=predKNC_test))
print('Test Accuracy: %.2f' 
% accuracy_score(y_true=Y_test, y_pred=predKNC_test))

Test kappa: 0.13
Test Accuracy: 0.76


# Initial Model Assessment

### Training Results
* KNN did not perform well.
* Both RF and SVM performed exceptionally (too) well. Likely need to remove some features.

### Testing Results
* RF performed the best, with moderate kappa score and 82% accuracy


# OPTIMIZED -  In Depth Modeling
Need to:
* Change features selected, include engineered fields
* Remove field identified in Task 2 that were highly correlated
* Apply One-Hot encoding for discritization (Age, Education, etc)
* Tune model parameters

# OPTIMIZED - Creating Training and Test Datasets

## OPTIMIZED - Defining Features and Dependent Variable

In [25]:
pcf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 41 columns):
Unnamed: 0                    30000 non-null int64
ID                            30000 non-null int64
LIMIT_BAL                     30000 non-null int64
SEX                           30000 non-null int64
EDUCATION                     30000 non-null int64
MARRIAGE                      30000 non-null int64
AGE                           30000 non-null int64
PAY_1                         30000 non-null int64
PAY_2                         30000 non-null int64
PAY_3                         30000 non-null int64
PAY_4                         30000 non-null int64
PAY_5                         30000 non-null int64
PAY_6                         30000 non-null int64
BILL_AMT1                     30000 non-null int64
BILL_AMT2                     30000 non-null int64
BILL_AMT3                     30000 non-null int64
BILL_AMT4                     30000 non-null int64
BILL_AMT5               

# Question - Can only payment history and basic demographic info predict default?

In [29]:
pcf_o = pcf.drop('LIMIT_PER_AGE',1)
pcf_o = pcf_o.drop('BILL_AMT1',1)
pcf_o = pcf_o.drop('BILL_AMT2',1)
pcf_o = pcf_o.drop('BILL_AMT3',1)
pcf_o = pcf_o.drop('BILL_AMT4',1)
pcf_o = pcf_o.drop('BILL_AMT5',1)
pcf_o = pcf_o.drop('BILL_AMT6',1)
pcf_o = pcf_o.drop('BILL_PCT_1',1)
pcf_o = pcf_o.drop('BILL_PCT_2',1)
pcf_o = pcf_o.drop('BILL_PCT_3',1)
pcf_o = pcf_o.drop('BILL_PCT_4',1)
pcf_o = pcf_o.drop('BILL_PCT_5',1)
pcf_o = pcf_o.drop('BILL_PCT_6',1)
pcf_o = pcf_o.drop('PAY_AMT1',1)
pcf_o = pcf_o.drop('PAY_AMT2',1)
pcf_o = pcf_o.drop('PAY_AMT3',1)
pcf_o = pcf_o.drop('PAY_AMT4',1)
pcf_o = pcf_o.drop('PAY_AMT5',1)
pcf_o = pcf_o.drop('PAY_AMT6',1)
pcf_o = pcf_o.drop('BILL1_PCT_LIM',1)
pcf_o = pcf_o.drop('LIMIT_BAL',1)
pcf_o = pcf_o.drop('SEX',1)
pcf_o = pcf_o.drop('AGE',1)
pcf_o = pcf_o.drop('MARRIAGE',1)
pcf_o = pcf_o.drop('EDUCATION',1)
pcf_o = pcf_o.drop('Unnamed: 0',1)
pcf_o = pcf_o.drop('ID',1)
pcf_o = pcf_o.drop('def_word',1)
pcf_o = pcf_o.drop('PAY_6MO_PCT_LIM',1)

In [30]:
pcf_o.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 12 columns):
PAY_1                         30000 non-null int64
PAY_2                         30000 non-null int64
PAY_3                         30000 non-null int64
PAY_4                         30000 non-null int64
PAY_5                         30000 non-null int64
PAY_6                         30000 non-null int64
default payment next month    30000 non-null int64
EDU_BUCK                      30000 non-null object
SEX_BUCK                      30000 non-null object
MAR_BUCK                      30000 non-null object
AGE_BUCK                      30000 non-null object
LIMIT_BAL_BUCK                30000 non-null object
dtypes: int64(7), object(5)
memory usage: 2.7+ MB


In [31]:
mar_buck_d = pd.get_dummies(pcf_o['MAR_BUCK'], prefix_sep="_", drop_first=True)

In [32]:
pcf_o_dum = pd.merge(pcf_o, mar_buck_d, how='left', right_index=True, left_index=True)

In [33]:
pcf_o_dum = pcf_o_dum.drop('MAR_BUCK',1)

In [34]:
pcf_o_dum.head()

Unnamed: 0,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default payment next month,EDU_BUCK,SEX_BUCK,AGE_BUCK,LIMIT_BAL_BUCK,MARR,OTHR,SING
0,2,2,-1,-1,-2,-2,1,UNIV,FEML,<25,<50K,1,0,0
1,-1,2,0,0,0,2,1,UNIV,FEML,<30,<200K,0,0,1
2,0,0,0,0,0,0,0,UNIV,FEML,<35,<100K,0,0,1
3,0,0,0,0,0,0,0,UNIV,FEML,<40,<100K,1,0,0
4,-1,0,-1,0,0,0,0,UNIV,MALE,50+,<100K,1,0,0


"NORMAL STATE" is "DIVORCED"

In [35]:
edu_buck_d = pd.get_dummies(pcf_o['EDU_BUCK'], prefix_sep="_", drop_first=True)

In [36]:
pcf_o_dum = pd.merge(pcf_o_dum, edu_buck_d, how='left', right_index=True, left_index=True)

In [37]:
pcf_o_dum = pcf_o_dum.drop('EDU_BUCK',1)

In [38]:
pcf_o_dum.head()

Unnamed: 0,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default payment next month,SEX_BUCK,AGE_BUCK,LIMIT_BAL_BUCK,MARR,OTHR_x,SING,HS,OTHR_y,UNIV
0,2,2,-1,-1,-2,-2,1,FEML,<25,<50K,1,0,0,0,0,1
1,-1,2,0,0,0,2,1,FEML,<30,<200K,0,0,1,0,0,1
2,0,0,0,0,0,0,0,FEML,<35,<100K,0,0,1,0,0,1
3,0,0,0,0,0,0,0,FEML,<40,<100K,1,0,0,0,0,1
4,-1,0,-1,0,0,0,0,MALE,50+,<100K,1,0,0,0,0,1


* "NORMAL STATE" IS "GRAD"
* OTHR_x is marital status
* OTHR_y is education

In [39]:
sex_buck_d = pd.get_dummies(pcf_o['SEX_BUCK'], prefix_sep="_", drop_first=True)

In [40]:
pcf_o_dum = pd.merge(pcf_o_dum, sex_buck_d, how='left', right_index=True, left_index=True)

In [41]:
pcf_o_dum = pcf_o_dum.drop('SEX_BUCK',1)

In [42]:
pcf_o_dum.head()

Unnamed: 0,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default payment next month,AGE_BUCK,LIMIT_BAL_BUCK,MARR,OTHR_x,SING,HS,OTHR_y,UNIV,MALE
0,2,2,-1,-1,-2,-2,1,<25,<50K,1,0,0,0,0,1,0
1,-1,2,0,0,0,2,1,<30,<200K,0,0,1,0,0,1,0
2,0,0,0,0,0,0,0,<35,<100K,0,0,1,0,0,1,0
3,0,0,0,0,0,0,0,<40,<100K,1,0,0,0,0,1,0
4,-1,0,-1,0,0,0,0,50+,<100K,1,0,0,0,0,1,1


"NORMAL STATE" is "FEMALE"

In [43]:
age_buck_d = pd.get_dummies(pcf_o['AGE_BUCK'], prefix_sep="_", drop_first=True)

In [44]:
pcf_o_dum = pd.merge(pcf_o_dum, age_buck_d, how='left', right_index=True, left_index=True)

In [45]:
pcf_o_dum = pcf_o_dum.drop('AGE_BUCK',1)

In [46]:
pcf_o_dum.head()

Unnamed: 0,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default payment next month,LIMIT_BAL_BUCK,MARR,OTHR_x,SING,HS,OTHR_y,UNIV,MALE,<25,<30,<35,<40,<50
0,2,2,-1,-1,-2,-2,1,<50K,1,0,0,0,0,1,0,1,0,0,0,0
1,-1,2,0,0,0,2,1,<200K,0,0,1,0,0,1,0,0,1,0,0,0
2,0,0,0,0,0,0,0,<100K,0,0,1,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,<100K,1,0,0,0,0,1,0,0,0,0,1,0
4,-1,0,-1,0,0,0,0,<100K,1,0,0,0,0,1,1,0,0,0,0,0


"NORMAL STATE" for age is 50+

In [47]:
bal_buck_d = pd.get_dummies(pcf_o['LIMIT_BAL_BUCK'], prefix_sep="_", drop_first=True)

In [48]:
pcf_o_dum = pd.merge(pcf_o_dum, bal_buck_d, how='left', right_index=True, left_index=True)

In [49]:
pcf_o_dum = pcf_o_dum.drop('LIMIT_BAL_BUCK',1)

In [50]:
pcf_o_dum.head()

Unnamed: 0,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default payment next month,MARR,OTHR_x,SING,...,MALE,<25,<30,<35,<40,<50,<100K,<200K,<300K,<50K
0,2,2,-1,-1,-2,-2,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1
1,-1,2,0,0,0,2,1,0,0,1,...,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
4,-1,0,-1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0


"NORMAL STATE" IS >300K

# SLIM - Create Test and Training Sets

In [53]:
depVar_o = pcf_o['default payment next month']

In [55]:
features_o = pcf_o_dum.drop('default payment next month',1)

In [56]:
X_train_o, X_test_o, Y_train_o, Y_test_o = train_test_split(features_o, depVar_o, test_size=test_size, random_state=seed)

In [57]:
#Independent Variable Training Set (X Training)
X_train_count_o = len(X_train_o.index)
print('The number of observations in the X Training set are:',str(X_train_count_o))
X_train_o.head()

The number of observations in the X Training set are: 21000


Unnamed: 0,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,MARR,OTHR_x,SING,HS,...,MALE,<25,<30,<35,<40,<50,<100K,<200K,<300K,<50K
14632,0,0,0,0,0,0,0,0,1,1,...,1,0,0,0,1,0,0,0,0,1
5372,0,0,0,0,-1,-1,0,0,1,0,...,1,0,1,0,0,0,0,0,1,0
8154,0,0,0,-2,-2,-2,0,0,1,0,...,1,0,0,1,0,0,1,0,0,0
6468,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
10581,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0


# SLIM - Modeling

## SVM

In [58]:
#Model Fitting
modelSVC_o.fit(X_train_o,Y_train_o)
predSVC_o=modelSVC_o.predict(X_train_o)
print(cross_val_score(modelSVC_o, X_train_o, Y_train_o))
print('Train kappa: %.2f' 
% cohen_kappa_score(y1=Y_train_o, y2=predSVC_o))
print('Train Accuracy: %.2f' 
% accuracy_score(y_true=Y_train_o, y_pred=predSVC_o))



[0.82031138 0.815      0.82197457]
Train kappa: 0.37
Train Accuracy: 0.82


## RF

In [59]:
#Model Fitting
modelRFC_o.fit(X_train_o,Y_train_o)
predRFC_o = modelRFC_o.predict(X_train_o)
print(cross_val_score(modelRFC_o, X_train_o, Y_train_o))
print('Train kappa: %.2f' 
% cohen_kappa_score(y1 = Y_train_o, y2 = predRFC_o))
print('Train Accuracy: %.2f' 
% accuracy_score(y_true = Y_train_o, y_pred = predRFC_o))



[0.79003    0.77971429 0.7918274 ]
Train kappa: 0.70
Train Accuracy: 0.91


In [61]:
RF_param_grid = {
    'n_estimators': [100, 1000],
    'max_features': ['sqrt', 'log2']
}

In [62]:
CV_RFC_o = GridSearchCV(estimator = modelRFC_o, param_grid = RF_param_grid, cv = 5)
CV_RFC_o.fit(X_train_o, Y_train_o)
print(CV_RFC_o.best_params_)

{'max_features': 'sqrt', 'n_estimators': 1000}


In [63]:
modelRFC_o1 = RandomForestClassifier(n_jobs=-1,max_features="log2", n_estimators=1000, oob_score = True)

In [64]:
modelRFC_o1.fit(X_train_o,Y_train_o)
predRFC_o1 = modelRFC_o1.predict(X_train_o)
print(cross_val_score(modelRFC_o1, X_train_o, Y_train_o))
print('Train kappa: %.2f' 
% cohen_kappa_score(y1 = Y_train_o, y2 = predRFC_o1))
print('Train Accuracy: %.2f' 
% accuracy_score(y_true = Y_train_o, y_pred = predRFC_o1))



[0.79988573 0.786      0.8001143 ]
Train kappa: 0.73
Train Accuracy: 0.91


## RFC - Testing

In [65]:
predRFC_test_o1=modelRFC_o1.predict(X_test_o)
print('Test kappa: %.2f' 
% cohen_kappa_score(y1=Y_test_o, y2=predRFC_test_o1))
print('Test Accuracy: %.2f' 
% accuracy_score(y_true=Y_test_o, y_pred=predRFC_test_o1))

Test kappa: 0.33
Test Accuracy: 0.80


## KNC

In [60]:
#Model Fitting
modelKNC_o.fit(X_train_o,Y_train_o)
predKNC_o=modelKNC_o.predict(X_train_o)
print(cross_val_score(modelKNC_o, X_train_o, Y_train_o))
print('Train kappa: %.2f' 
% cohen_kappa_score(y1=Y_train_o, y2=predKNC_o))
print('Train Accuracy: %.2f' 
% accuracy_score(y_true=Y_train_o, y_pred=predKNC_o))



[0.79231538 0.782      0.79225604]
Train kappa: 0.46
Train Accuracy: 0.84


# Final Analysis

### My initial classification notebook overfit the RF model. I removed several features to focus on payment trends and demographic data. The result was a training model accuracy of 91% and performance on a test data set of 80%.