In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_input=pd.read_csv("Credit_Risk_Train_data.csv")
test_input=pd.read_csv("Credit_Risk_Validate_data.csv")

In [3]:
print(train_input.columns)
print(test_input.columns)

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'outcome'],
      dtype='object')


In [4]:
# Last colums has the diff  name in both,
#make the names same and merge them together so that the missing values will be filled
test_input.rename(columns={"outcome":"Loan_Status"},inplace=True)

In [5]:
data_all=pd.concat([train_input,test_input],axis=0)
data_all.shape

(981, 13)

In [6]:
data_all.reset_index(inplace=True,drop=True)
# reset ,index else merging will have the issues
print(data_all.tail())

      Loan_ID Gender Married Dependents     Education Self_Employed  \
976  LP002971   Male     Yes         3+  Not Graduate           Yes   
977  LP002975   Male     Yes          0      Graduate            No   
978  LP002980   Male      No          0      Graduate            No   
979  LP002986   Male     Yes          0      Graduate            No   
980  LP002989   Male      No          0      Graduate           Yes   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
976             4009             1777.0       113.0             360.0   
977             4158              709.0       115.0             360.0   
978             3250             1993.0       126.0             360.0   
979             5000             2393.0       158.0             360.0   
980             9200                0.0        98.0             180.0   

     Credit_History Property_Area Loan_Status  
976             1.0         Urban           Y  
977             1.0         Urban     

In [7]:
data_all.isnull().sum() # gives the missing value of all columns

Loan_ID               0
Gender               24
Married               3
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
data_all.shape

(981, 13)

In [9]:
# before proceeding to model building, lets fill the missing values
Counter(data_all['Gender'])

Counter({'Male': 775, 'Female': 182, nan: 24})

In [10]:
# getting the list of the indexes having nan values
gender_null=data_all[data_all['Gender'].isnull()].index.tolist()
print(gender_null)

[23, 126, 171, 188, 314, 334, 460, 467, 477, 507, 576, 588, 592, 636, 665, 720, 752, 823, 845, 859, 893, 910, 917, 932]


In [11]:
gender_null_M=gender_null[:12]
gender_null_F=gender_null[12:]
data_all['Gender'].iloc[gender_null_M]='Male'
data_all['Gender'].iloc[gender_null_F]='Female'

In [12]:
print(sum(data_all['Gender'].isnull()))
Counter(data_all['Gender'])

0


Counter({'Male': 787, 'Female': 194})

In [13]:
# lets clean nan values in married
print(Counter(data_all['Married']))

Counter({'Yes': 631, 'No': 347, nan: 3})


In [14]:
married_null=data_all[data_all['Married'].isnull()].index.tolist()
married_null

[104, 228, 435]

In [15]:
data_all['Married'].iloc[married_null]="Yes"


In [16]:
data_all.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [17]:
Counter(data_all['Dependents'])

Counter({'0': 545, '1': 160, '2': 160, '3+': 91, nan: 25})

In [18]:
pd.crosstab(data_all['Married'],data_all['Dependents'].isnull())

Dependents,False,True
Married,Unnamed: 1_level_1,Unnamed: 2_level_1
No,338,9
Yes,618,16


In [19]:
pd.crosstab(data_all['Dependents'],data_all['Married'])

Married,No,Yes
Dependents,Unnamed: 1_level_1,Unnamed: 2_level_1
0,276,269
1,36,124
2,14,146
3+,12,79


In [20]:
bachelor_nulldependent=data_all[(data_all['Married']=='No') & 
                               (data_all["Dependents"].isnull())].index.tolist()
print(bachelor_nulldependent)

[293, 332, 355, 597, 684, 752, 879, 916, 926]


In [21]:
data_all['Dependents'].iloc[bachelor_nulldependent]='0'

In [22]:
Counter(data_all["Dependents"])

Counter({'0': 554, '1': 160, '2': 160, '3+': 91, nan: 16})

In [23]:
pd.crosstab(data_all['Gender'],data_all["Dependents"])

Dependents,0,1,2,3+
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,135,34,13,11
Male,419,126,147,80


In [24]:
pd.crosstab(data_all['Gender'],data_all['Dependents'].isnull())

Dependents,False,True
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,193,1
Male,772,15


In [25]:
data_all['Gender'].iloc[data_all[data_all['Dependents'].isnull()].
                       index.tolist()]

102      Male
104      Male
120      Male
226      Male
228      Male
301      Male
335      Male
346      Male
435    Female
517      Male
571      Male
660      Male
725      Male
816      Male
861      Male
865      Male
Name: Gender, dtype: object

In [26]:
pd.crosstab((data_all['Gender']=='Male') & 
           (data_all['Married']=='Yes'), data_all['Dependents'])

Dependents,0,1,2,3+
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,319,49,23,16
True,235,111,137,75


In [27]:
data_all['Dependents'].iloc[data_all[data_all['Dependents'].isnull()].index.tolist()]='1'

In [28]:
data_all.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [29]:
Counter(data_all["Self_Employed"])

Counter({'No': 807, 'Yes': 119, nan: 55})

In [30]:
self_emp_null=data_all[data_all['Self_Employed'].isnull()].index.tolist()

In [31]:
data_all['Self_Employed'].iloc[self_emp_null]="No"

In [32]:
data_all.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [33]:
pd.crosstab(data_all['LoanAmount'].isnull(), data_all['Loan_Amount_Term'].isnull())

Loan_Amount_Term,False,True
LoanAmount,Unnamed: 1_level_1,Unnamed: 2_level_1
False,934,20
True,27,0


In [34]:
pd.crosstab(data_all['LoanAmount'].isnull(), data_all['Loan_Amount_Term'])

Loan_Amount_Term,6.0,12.0,36.0,60.0,84.0,120.0,180.0,240.0,300.0,350.0,360.0,480.0
LoanAmount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
False,1,2,3,3,7,4,64,7,20,1,800,22
True,0,0,0,0,0,0,2,1,0,0,23,1


In [35]:
data_all.groupby(data_all['Loan_Amount_Term'])['LoanAmount'].mean()

Loan_Amount_Term
6.0       95.000000
12.0     185.500000
36.0     117.666667
60.0     139.666667
84.0     121.142857
120.0     36.750000
180.0    131.125000
240.0    128.857143
300.0    166.250000
350.0    133.000000
360.0    144.420000
480.0    137.181818
Name: LoanAmount, dtype: float64

In [36]:
data_all['LoanAmount'][(data_all['LoanAmount'].isnull())
                      & (data_all['LoanAmount']==360)]=144

data_all['LoanAmount'][(data_all['LoanAmount'].isnull())
                      & (data_all['LoanAmount']==480)]=137

In [37]:
data_all['LoanAmount'][(data_all['LoanAmount'].isnull())]=130
                      

In [38]:
(data_all['Loan_Amount_Term']).value_counts()

360.0    823
180.0     66
480.0     23
300.0     20
240.0      8
84.0       7
120.0      4
36.0       3
60.0       3
12.0       2
350.0      1
6.0        1
Name: Loan_Amount_Term, dtype: int64

In [39]:
data_all['Loan_Amount_Term'][data_all['Loan_Amount_Term'].isnull()]=360

In [40]:
data_all.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [41]:
data_all['Credit_History'].value_counts()

1.0    754
0.0    148
Name: Credit_History, dtype: int64

In [42]:
pd.crosstab(data_all['Gender'],data_all['Credit_History'])

Credit_History,0.0,1.0
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,32,145
Male,116,609


In [43]:
pd.crosstab(data_all['Self_Employed'], data_all['Credit_History'])

Credit_History,0.0,1.0
Self_Employed,Unnamed: 1_level_1,Unnamed: 2_level_1
No,134,658
Yes,14,96


In [44]:
pd.crosstab(data_all['Education'], data_all['Credit_History'])

Credit_History,0.0,1.0
Education,Unnamed: 1_level_1,Unnamed: 2_level_1
Graduate,106,596
Not Graduate,42,158


In [45]:
pd.crosstab(data_all['Married'], data_all['Credit_History'])

Credit_History,0.0,1.0
Married,Unnamed: 1_level_1,Unnamed: 2_level_1
No,56,263
Yes,92,491


In [46]:
data_all['Credit_History'][data_all['Credit_History'].isnull()]=1

In [47]:
data_all.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [48]:
data_all.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,130.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [49]:
data_all_new=pd.get_dummies(data_all.drop(['Loan_ID'],axis=1), drop_first=True)

In [50]:
data_all_new.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,5849,0.0,130.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1


In [51]:
x=data_all_new.drop(['Loan_Status_Y'], axis=1)
y=data_all_new['Loan_Status_Y']

In [52]:
x.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,130.0,360.0,1.0,1,0,0,0,0,0,0,0,1
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1


In [53]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: Loan_Status_Y, dtype: uint8

### Train test split

In [54]:
from sklearn.model_selection import train_test_split

In [55]:
x_train,x_test , y_train, y_test =train_test_split(x,y)

In [56]:
x_train.shape

(735, 14)

In [57]:
x_test.shape

(246, 14)

## Data preprocessing

In [58]:
from sklearn.preprocessing import StandardScaler

In [59]:
scaler=StandardScaler()

In [60]:
scaler.fit(x)

StandardScaler()

In [61]:
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)

In [62]:
x_train[:5]

array([[-0.37995761, -0.34552179, -0.52618713,  0.2705276 ,  0.42151046,
        -2.01412538,  0.73980985, -0.46758266, -0.44145701, -0.31976115,
         1.87082869, -0.37155221,  1.34569248, -0.73158135],
       [-0.44372907,  0.14575904, -0.185589  ,  0.2705276 , -2.37242036,
         0.49649342,  0.73980985, -0.46758266, -0.44145701,  3.12733429,
         1.87082869, -0.37155221, -0.74311183, -0.73158135],
       [ 0.10544337, -0.5895062 ,  0.65280642,  0.2705276 ,  0.42151046,
         0.49649342,  0.73980985, -0.46758266,  2.26522626, -0.31976115,
        -0.53452248, -0.37155221, -0.74311183,  1.36690199],
       [-0.25487418, -0.5895062 , -0.32968821,  0.2705276 ,  0.42151046,
        -2.01412538,  0.73980985, -0.46758266,  2.26522626, -0.31976115,
         1.87082869, -0.37155221,  1.34569248, -0.73158135],
       [-0.48501363,  0.03867086, -0.23798871,  0.2705276 ,  0.42151046,
         0.49649342,  0.73980985,  2.13865931, -0.44145701, -0.31976115,
        -0.53452248, -0.37

## Training the model

In [63]:
from sklearn import svm

In [64]:
clf=svm.SVC(kernel='linear',C=1.0)
clf.fit(x_train,y_train)

SVC(kernel='linear')

In [65]:
predictions=clf.predict(x_test)

In [66]:
from sklearn.metrics import classification_report, confusion_matrix

In [67]:
print(confusion_matrix(y_test, predictions))

[[ 41  39]
 [  3 163]]


In [68]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.93      0.51      0.66        80
           1       0.81      0.98      0.89       166

    accuracy                           0.83       246
   macro avg       0.87      0.75      0.77       246
weighted avg       0.85      0.83      0.81       246

