In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [8]:
# Load the loan training data from train.csv (path is relative to the working directory).
train = pd.read_csv(filepath_or_buffer="train.csv")

In [9]:
# Preview the first five rows of the freshly loaded frame.
train.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


LoanAmount            Loan amount in thousands

Loan_Amount_Term      Term of loan in months

Credit_History        Credit history meets guidelines

Property_Area         Urban/ Semi Urban/ Rural

Loan_Status           Loan approved (Y/N)

In [10]:
# Number of rows in the training frame.
train.shape[0]

614

In [11]:
# Inspect the dtype pandas assigned to every column.
train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

Preparing the data for the ML Models

In [12]:
# Count the missing values (NaNs) in every column.
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Missing values (NaNs) are present in Gender, Married, Dependents, Self_Employed, LoanAmount, Loan_Amount_Term and Credit_History.


In [13]:
# Categorical columns that contain missing values.
NaColsCats = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
]

Looking at the value counts of each categorical column that has missing values (e.g. the number of males and females)

In [14]:
# Print how often each category occurs in every column listed in NaColsCats,
# separated by a blank line.
for x in NaColsCats:
    print(train[x].value_counts())
    print()

Male      489
Female    112
Name: Gender, dtype: int64

Yes    398
No     213
Name: Married, dtype: int64

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

No     500
Yes     82
Name: Self_Employed, dtype: int64



In [15]:
# Replace missing Gender entries with 'Male'.
train = train.fillna({'Gender': 'Male'})

In [16]:
# Replace missing Married entries with 'Yes'.
train = train.fillna({'Married': 'Yes'})

In [17]:
# Replace missing Dependents entries with the string '0' (the column still holds strings here).
train = train.fillna({'Dependents': '0'})

In [18]:
# Replace missing Self_Employed entries with 'No'.
train = train.fillna({'Self_Employed': 'No'})

Filling the remaining NaNs (in the numeric columns) with the column mean

In [19]:
# Fill the remaining NaNs with each numeric column's mean.
# numeric_only=True is needed because the frame still contains string columns
# (e.g. Loan_ID, Property_Area): pandas >= 2.0 raises a TypeError when asked to
# average those, whereas older versions silently skipped them. The result on
# older versions is unchanged.
# NOTE(review): Credit_History looks like a 0/1 flag, so a mean fill gives a
# fractional value there; consider the mode instead - confirm with the modelling plan.
train = train.fillna(train.mean(numeric_only=True))

In [20]:
# Re-count missing values per column after imputation.
train.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

#### Done with Nans 

In [21]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the string-valued categorical columns in place.
# One fitted LabelEncoder is kept per column, in the same order as `columns`,
# so the integer -> label mapping can be looked up afterwards.
columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
list_encoders = []
for c in columns:
    cEncoder = LabelEncoder()
    list_encoders.append(cEncoder)
    # Cast to str first so the encoder always sees a homogeneous string column.
    train[c] = cEncoder.fit_transform(train[c].astype('str'))

In [22]:
# Preview the first five rows after label encoding.
train.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,Y
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,Y
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,Y
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,Y


In [23]:
# Re-check the column dtypes after label encoding.
train.dtypes

Loan_ID               object
Gender                 int32
Married                int32
Dependents            object
Education              int32
Self_Employed          int32
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int32
Loan_Status           object
dtype: object

In [24]:
# Show the labels each encoder learned; a label's position in the array is its integer code.
for c, x in zip(columns, list_encoders):
    print('classes are ', x.classes_, ' in ', c)


classes are  ['Female' 'Male']  in  Gender
classes are  ['No' 'Yes']  in  Married
classes are  ['Graduate' 'Not Graduate']  in  Education
classes are  ['No' 'Yes']  in  Self_Employed
classes are  ['Rural' 'Semiurban' 'Urban']  in  Property_Area


Converting "3+" in Dependents to "3", because "3+" is not a valid number and the column cannot be cast to an integer while it is present

In [25]:
# Map the open-ended bucket '3+' to the plain string '3'.
train['Dependents'] = train['Dependents'].replace({'3+': '3'})

In [26]:
# Convert the Dependents column to integers, going through str first.
train = train.astype({'Dependents': str}).astype({'Dependents': int})

In [27]:
# Re-check the column dtypes after converting Dependents.
train.dtypes

Loan_ID               object
Gender                 int32
Married                int32
Dependents             int32
Education              int32
Self_Employed          int32
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int32
Loan_Status           object
dtype: object

Let us one-hot encode some of the columns, because assigning an ordered number to an unordered category does not make sense

Columns are Gender, Married, Education, Self_employed



I prefer using sklearn.preprocessing.OneHotEncoder instead of pd.get_dummies. This is because sklearn.preprocessing.OneHotEncoder gives us an encoder object: we can fit this object on the training set and then use the same object to transform the test set. On the other hand, pd.get_dummies returns a dataframe with encodings based only on the values in the dataframe we pass to it. This might be good for a quick analysis, but for an extended model-building project, where you train on a training set and later test on a test set, I would suggest using sklearn.preprocessing.OneHotEncoder.

In [28]:
# Preview the first five rows before one-hot encoding.
train.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,Y
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,Y
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,Y
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,Y


In [29]:
# Select the columns that will be one-hot encoded.
# NOTE(review): Property_Area has three classes but is not in this list - confirm that is intended.
col_to_one_hot = ['Gender', 'Married', 'Education', 'Self_Employed']
df_subset = train.loc[:, col_to_one_hot]
df_subset.head(n=5)

Unnamed: 0,Gender,Married,Education,Self_Employed
0,1,0,0,0
1,1,1,0,0
2,1,1,0,1
3,1,1,1,0
4,1,0,0,0


In [38]:
from sklearn.preprocessing import OneHotEncoder

# Build a one-hot encoder that returns a dense NumPy array.
# scikit-learn renamed the `sparse` keyword to `sparse_output` in 1.2 and removed
# the old name in 1.4. Try the new name first and fall back to the old one on
# releases that do not know it, so the cell runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(categories='auto', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(categories='auto', sparse=False)

# Fit the encoder on the selected columns and encode them in one step.
trainnp = onehot_encoder.fit_transform(df_subset)

In [39]:
# Dimensions of the one-hot encoded array.
np.shape(trainnp)

(614, 4)

In [42]:
# trainnp[4]   (disabled - the output shown for this cell is left over from an earlier run)

array([1., 0., 0., 0.])

In [37]:
# One-hot encoded values for the first row.
trainnp[0, :]

array([0., 1., 1., 0., 1., 0., 1., 0.])