In [58]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sklearn as sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree 
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedKFold

In [2]:
df= pd.read_csv('default_data.csv')

In [3]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
df.drop('ID',axis=1,inplace=True)

In [5]:
df.rename(columns={"default payment next month": "Default"}, inplace=True)

In [6]:
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


#Data preprocessing starts:

#Checking for null values in whole data frame

In [7]:
df.isna().any().sum()

0

In [8]:
list(df['SEX'].unique())

[2, 1]

In [9]:
list(df['EDUCATION'].unique())

[2, 1, 3, 5, 4, 6, 0]

#Above the education column shows total seven unique values. I will accumulate 5,6,0 values to 
#4 value which represents others.

In [10]:
df['EDUCATION']=np.where(df['EDUCATION'] == 5, 4, df['EDUCATION'])

In [11]:
list(df['EDUCATION'].unique())

[2, 1, 3, 4, 6, 0]

In [12]:
df['EDUCATION']=np.where(df['EDUCATION'] == 6, 4, df['EDUCATION'])

In [13]:
list(df['EDUCATION'].unique())

[2, 1, 3, 4, 0]

In [14]:
df['EDUCATION']=np.where(df['EDUCATION'] == 0, 4, df['EDUCATION'])

In [15]:
list(df['EDUCATION'].unique())

[2, 1, 3, 4]

In [16]:
list(df['MARRIAGE'].unique())

[1, 2, 3, 0]

In [17]:
df['MARRIAGE']=np.where(df['MARRIAGE'] == 0, 3, df['MARRIAGE'])

In [18]:
list(df['MARRIAGE'].unique())

[1, 2, 3]

In [19]:
dummies=pd.get_dummies(df.MARRIAGE)

In [20]:
dummies.head()

Unnamed: 0,1,2,3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0


In [21]:
merged=pd.concat([df,dummies],axis='columns') 

In [22]:
merged.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default,1,2,3
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,689,0,0,0,0,1,1,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,0,1000,1000,1000,0,2000,1,0,1,0
2,90000,2,2,2,34,0,0,0,0,0,...,1518,1500,1000,1000,1000,5000,0,0,1,0
3,50000,2,2,1,37,0,0,0,0,0,...,2000,2019,1200,1100,1069,1000,0,1,0,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,2000,36681,10000,9000,689,679,0,1,0,0


In [23]:
merged.rename(columns={1:'MARRIED',2:'SINGLE',3:'OTHERS'},inplace=True)

In [24]:
merged.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default,MARRIED,SINGLE,OTHERS
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,689,0,0,0,0,1,1,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,0,1000,1000,1000,0,2000,1,0,1,0
2,90000,2,2,2,34,0,0,0,0,0,...,1518,1500,1000,1000,1000,5000,0,0,1,0
3,50000,2,2,1,37,0,0,0,0,0,...,2000,2019,1200,1100,1069,1000,0,1,0,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,2000,36681,10000,9000,689,679,0,1,0,0


In [25]:
merged.drop('MARRIAGE',axis=1,inplace=True)
merged.head(5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default,MARRIED,SINGLE,OTHERS
0,20000,2,2,24,2,2,-1,-1,-2,-2,...,0,689,0,0,0,0,1,1,0,0
1,120000,2,2,26,-1,2,0,0,0,2,...,0,1000,1000,1000,0,2000,1,0,1,0
2,90000,2,2,34,0,0,0,0,0,0,...,1518,1500,1000,1000,1000,5000,0,0,1,0
3,50000,2,2,37,0,0,0,0,0,0,...,2000,2019,1200,1100,1069,1000,0,1,0,0
4,50000,1,2,57,-1,0,-1,0,0,0,...,2000,36681,10000,9000,689,679,0,1,0,0


In [26]:
merged.drop('OTHERS',axis=1,inplace=True)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
LIMIT_BAL    30000 non-null int64
SEX          30000 non-null int64
EDUCATION    30000 non-null int64
MARRIAGE     30000 non-null int64
AGE          30000 non-null int64
PAY_0        30000 non-null int64
PAY_2        30000 non-null int64
PAY_3        30000 non-null int64
PAY_4        30000 non-null int64
PAY_5        30000 non-null int64
PAY_6        30000 non-null int64
BILL_AMT1    30000 non-null int64
BILL_AMT2    30000 non-null int64
BILL_AMT3    30000 non-null int64
BILL_AMT4    30000 non-null int64
BILL_AMT5    30000 non-null int64
BILL_AMT6    30000 non-null int64
PAY_AMT1     30000 non-null int64
PAY_AMT2     30000 non-null int64
PAY_AMT3     30000 non-null int64
PAY_AMT4     30000 non-null int64
PAY_AMT5     30000 non-null int64
PAY_AMT6     30000 non-null int64
Default      30000 non-null int64
dtypes: int64(24)
memory usage: 5.5 MB


In [28]:
y=merged.Default

In [29]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: Default, dtype: int64

In [30]:
x=merged.drop('Default',axis=1)

In [31]:
x.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,MARRIED,SINGLE
0,20000,2,2,24,2,2,-1,-1,-2,-2,...,0,0,0,689,0,0,0,0,1,0
1,120000,2,2,26,-1,2,0,0,0,2,...,3455,3261,0,1000,1000,1000,0,2000,0,1
2,90000,2,2,34,0,0,0,0,0,0,...,14948,15549,1518,1500,1000,1000,1000,5000,0,1
3,50000,2,2,37,0,0,0,0,0,0,...,28959,29547,2000,2019,1200,1100,1069,1000,1,0
4,50000,1,2,57,-1,0,-1,0,0,0,...,19146,19131,2000,36681,10000,9000,689,679,1,0


##LOGISTIC REGRESSION with RFECV

In [32]:
logit=LogisticRegression()

In [33]:
rfecv_logit=RFECV(estimator=logit, step=1, cv=StratifiedKFold(10),min_features_to_select=10,scoring='accuracy')

In [34]:
rfecv_logit.fit(x,y)









RFECV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
   estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
   min_features_to_select=10, n_jobs=None, scoring='accuracy', step=1,
   verbose=0)

In [35]:
rfecv_logit.get_support()

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True])

In [36]:
features_lr = x.columns[rfecv_logit.get_support()]

In [37]:
features_lr

Index(['SEX', 'EDUCATION', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5',
       'MARRIED', 'SINGLE'],
      dtype='object')

In [38]:
rfecv_logit.score(x,y)

0.8099666666666666

 Decision Tree with RFECV

In [39]:
DT=tree.DecisionTreeClassifier()

In [40]:
rfecv_DT=RFECV(estimator=DT, step=1, cv=StratifiedKFold(10),min_features_to_select=10,scoring='accuracy')

In [41]:
rfecv_DT.fit(x,y)

RFECV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
   estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
   min_features_to_select=10, n_jobs=None, scoring='accuracy', step=1,
   verbose=0)

In [42]:
rfecv_DT.get_support()

array([ True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False])

In [43]:
features_DT = x.columns[rfecv_DT.get_support()]

In [44]:
features_DT

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3',
       'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'MARRIED'],
      dtype='object')

In [45]:
rfecv_DT.score(x,y)

0.9993

#RANDOM FOREST CLASSIFIER

In [46]:
rfc = RandomForestClassifier()


In [47]:
rfecv_RFC = RFECV(estimator=rfc, step=1, cv=StratifiedKFold(10),min_features_to_select=10, scoring='accuracy')

In [48]:
rfecv_RFC.fit(x,y)









RFECV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
   estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
   min_features_to_select=10, n_jobs=None, scoring='accuracy', step=1,
   verbose=0)

In [49]:
rfecv_RFC.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [50]:
features_RFC = x.columns[rfecv_RFC.get_support()]

In [51]:
rfecv_RFC.score(x,y)

0.9791333333333333

#SUPPORT VECTOR MACHINE LEARNING

In [52]:
svm=SVC()

In [53]:
rfecv_SVM = RFECV(estimator=svm, step=1, cv=StratifiedKFold(10),min_features_to_select=10, scoring='accuracy')

In [56]:
rfecv_SVM.fit(x,y)



RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes

In [59]:
rfe_SVM= RFE(SVC(),n_features_to_select=10)

In [60]:
rfe_SVM.fit(x,y)



RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes