In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold

k_fold = KFold(n_splits = 10, shuffle=True, random_state=1)

In [3]:
train = pd.read_csv('Final_train.csv')
test = pd.read_csv('test_lAUu6dG.csv')
train.head()

Unnamed: 0.1,Unnamed: 0,id,gender,married,family,education,selfemployed,appin,coappin,loanam,loanterm,credit,property,status
0,1,LP001003,0.0,1.0,2.0,1,0.0,4583,1508.0,128.0,360.0,1.0,Rural,0
1,2,LP001005,0.0,1.0,1.0,1,1.0,3000,0.0,66.0,360.0,1.0,Urban,1
2,3,LP001006,0.0,1.0,1.0,0,0.0,2583,2358.0,120.0,360.0,1.0,Urban,1
3,4,LP001008,0.0,0.0,1.0,1,0.0,6000,0.0,141.0,360.0,1.0,Urban,1
4,5,LP001011,0.0,1.0,3.0,1,1.0,5417,4196.0,267.0,360.0,1.0,Urban,1


In [4]:
test.columns = ['id', 'gender', 'married', 'family', 'education', 'selfemployed', 'appin', 'coappin', 'loanam', 'loanterm',
                'credit', 'property']

test.head()

Unnamed: 0,id,gender,married,family,education,selfemployed,appin,coappin,loanam,loanterm,credit,property
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [5]:
train.dtypes

Unnamed: 0        int64
id               object
gender          float64
married         float64
family          float64
education         int64
selfemployed    float64
appin             int64
coappin         float64
loanam          float64
loanterm        float64
credit          float64
property         object
status            int64
dtype: object

In [6]:
train.property.replace({'Urban' : 1, 'Semiurban' : 2, 'Rural' : 3}, inplace = True)

train.isnull().sum()

Unnamed: 0       0
id               0
gender          10
married          2
family          10
education        0
selfemployed    26
appin            0
coappin          0
loanam           0
loanterm         0
credit           0
property         0
status           0
dtype: int64

In [7]:
train.gender.replace({np.nan : 0}, inplace=True)
train.married.replace({np.nan : 0}, inplace=True)
train.family.replace({np.nan : 1}, inplace=True)
train.selfemployed.replace({np.nan : 0}, inplace=True)

train.isnull().sum()

Unnamed: 0      0
id              0
gender          0
married         0
family          0
education       0
selfemployed    0
appin           0
coappin         0
loanam          0
loanterm        0
credit          0
property        0
status          0
dtype: int64

In [8]:
print(train.appin.describe())
print(train.coappin.describe())
print(train.loanam.describe())
print(train.loanterm.describe())

count      495.000000
mean      4419.842424
std       2424.680958
min       1000.000000
25%       2879.000000
50%       3704.000000
75%       5275.500000
max      16692.000000
Name: appin, dtype: float64
count     495.000000
mean     1371.971556
std      1437.965878
min         0.000000
25%         0.000000
50%      1387.000000
75%      2241.000000
max      5701.000000
Name: coappin, dtype: float64
count    495.000000
mean     134.753535
std       59.374710
min        9.000000
25%      100.000000
50%      126.000000
75%      160.000000
max      496.000000
Name: loanam, dtype: float64
count    495.000000
mean     341.696970
std       66.044007
min       12.000000
25%      360.000000
50%      360.000000
75%      360.000000
max      480.000000
Name: loanterm, dtype: float64


In [9]:
train['appinC'] = pd.cut(train['appin'], [0, 3000, 5000, 200000], labels = [0, 1, 2])
train['coappinC'] = pd.cut(train['coappin'], [0, 501, 1501, 200000], labels = [0, 1, 2])
train['loanamC'] = pd.cut(train['loanam'], [0, 100, 130, 100000], labels = [0, 1, 2])
train['loantermC'] = pd.cut(train['loanterm'], [0, 200, 400, 100000], labels = [0, 1, 2])

test['appinC'] = pd.cut(test['appin'], [0, 3001, 5001, 200000], labels = [0, 1, 2])
test['coappinC'] = pd.cut(test['coappin'], [0, 501, 1501, 200000], labels = [0, 1, 2])
test['loanamC'] = pd.cut(test['loanam'], [0, 100, 131, 100000], labels = [0, 1, 2])
test['loantermC'] = pd.cut(test['loanterm'], [0, 201, 401, 100000], labels = [0, 1, 2])

train.head()

Unnamed: 0.1,Unnamed: 0,id,gender,married,family,education,selfemployed,appin,coappin,loanam,loanterm,credit,property,status,appinC,coappinC,loanamC,loantermC
0,1,LP001003,0.0,1.0,2.0,1,0.0,4583,1508.0,128.0,360.0,1.0,3,0,1,2.0,1,1
1,2,LP001005,0.0,1.0,1.0,1,1.0,3000,0.0,66.0,360.0,1.0,1,1,0,,0,1
2,3,LP001006,0.0,1.0,1.0,0,0.0,2583,2358.0,120.0,360.0,1.0,1,1,0,2.0,1,1
3,4,LP001008,0.0,0.0,1.0,1,0.0,6000,0.0,141.0,360.0,1.0,1,1,2,,2,1
4,5,LP001011,0.0,1.0,3.0,1,1.0,5417,4196.0,267.0,360.0,1.0,1,1,2,2.0,2,1


In [10]:
train.drop(['appin','coappin','loanam','loanterm'], axis = 'columns', inplace=True)
test.drop(['appin','coappin','loanam','loanterm'], axis = 'columns', inplace=True)

In [11]:
test.gender.replace({'Male' : 0, 'Female' : 1}, inplace=True)
test.married.replace({'Yes' : 1, 'No' : 0}, inplace=True)
test.family.replace({'0' : 1, '1' : 2, '2' : 3, '3+' : 4}, inplace=True)
test.education.replace({'Graduate' : 1, 'Not Graduate' : 0}, inplace=True)
test.property.replace({'Urban' : 1, 'Rural' : 3, 'Semiurban' : 2}, inplace=True)

test.head()

Unnamed: 0,id,gender,married,family,education,selfemployed,credit,property,appinC,coappinC,loanamC,loantermC
0,LP001015,0.0,1,1.0,1,No,1.0,1,2,,1,1
1,LP001022,0.0,1,2.0,1,No,1.0,1,1,1.0,1,1
2,LP001031,0.0,1,3.0,1,No,1.0,1,1,2.0,2,1
3,LP001035,0.0,1,3.0,1,No,,1,0,2.0,0,1
4,LP001051,0.0,0,1.0,0,No,1.0,1,1,,0,1


In [12]:
train.isnull().sum()

Unnamed: 0        0
id                0
gender            0
married           0
family            0
education         0
selfemployed      0
credit            0
property          0
status            0
appinC            0
coappinC        203
loanamC           0
loantermC         0
dtype: int64

In [13]:
test.isnull().sum()

id                0
gender           11
married           0
family           10
education         0
selfemployed     23
credit           29
property          0
appinC            2
coappinC        156
loanamC           5
loantermC         6
dtype: int64

In [15]:
train = train.fillna(0)
test = test.fillna(0)
print(test.isnull().sum())
print(train.isnull().sum())

id              0
gender          0
married         0
family          0
education       0
selfemployed    0
credit          0
property        0
appinC          0
coappinC        0
loanamC         0
loantermC       0
dtype: int64
Unnamed: 0      0
id              0
gender          0
married         0
family          0
education       0
selfemployed    0
credit          0
property        0
status          0
appinC          0
coappinC        0
loanamC         0
loantermC       0
dtype: int64


In [17]:
x = train.drop(['id','status'], axis='columns')
y = train.status

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score

k_fold = KFold(n_splits=10,shuffle=True,random_state=1)
scoring = 'accuracy'

#KNN
clfk = KNeighborsClassifier(n_neighbors=13)
scorek = cross_val_score(clfk,x,y,cv=k_fold,n_jobs=1,scoring=scoring)
print('KNN',round(np.mean(scorek),2))

#DecisionTree
clfd = DecisionTreeClassifier()
scored = cross_val_score(clfd,x,y,cv=k_fold,n_jobs=1,scoring=scoring)
print('DecisionTree',round(np.mean(scored),2))

#RandomForestClassifier
clfr = RandomForestClassifier(n_estimators=13)
scorer = cross_val_score(clfr,x,y,cv=k_fold,n_jobs=1,scoring=scoring)
print('RandomForestClassifier',round(np.mean(scorer),2))

#GaussianNB
clfn = GaussianNB()
scoren = cross_val_score(clfn,x,y,cv=k_fold,n_jobs=1,scoring=scoring)
print('GaussianNB',round(np.mean(scoren),2))

#SVC
clfs = SVC()
scores = cross_val_score(clfs,x,y,cv=k_fold,n_jobs=1,scoring=scoring)
print('SVC',round(np.mean(scores),2))

KNN 0.72
DecisionTree 0.7
RandomForestClassifier 0.76
GaussianNB 0.77
SVC 0.74


In [24]:
Submission = pd.read_csv('sample_submission_49d68Cx.csv')

Submissionk = pd.DataFrame({'Loan_ID' : Submission.Loan_ID,
                          'Loan_Status' : clfk.fit(x, y).predict(test.drop['id'])})

TypeError: 'method' object is not subscriptable