In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Loading the test and train data into variables

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

### Viewing the train and test data

In [3]:
train.head(1)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y


In [4]:
test.head(1)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban


In [5]:
# adding a field source to identify the test and train data before concating both
train["source"] = "train"
test["source"] = "test"
data = pd.concat([train,test])


In [6]:
print("Shape of the data is:{}".format(data.shape))


Shape of the data is:(981, 14)


In [7]:
print("List of columns is: {}".format(list(data.columns)))

List of columns is: ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status', 'source']


### Printing the sum of null values of each columns

In [8]:
for i in data.columns:
    print("The number of null values in:{} == {}".format(i, data[i].isnull().sum()))

The number of null values in:Loan_ID == 0
The number of null values in:Gender == 24
The number of null values in:Married == 3
The number of null values in:Dependents == 25
The number of null values in:Education == 0
The number of null values in:Self_Employed == 55
The number of null values in:ApplicantIncome == 0
The number of null values in:CoapplicantIncome == 0
The number of null values in:LoanAmount == 27
The number of null values in:Loan_Amount_Term == 20
The number of null values in:Credit_History == 79
The number of null values in:Property_Area == 0
The number of null values in:Loan_Status == 367
The number of null values in:source == 0


### checking out the unique labels on the columns having missing values

In [9]:
null_cols = ['Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Gender', 'Married']

for i in null_cols:
    print("List of unique labels for {}:::{}".format(i, set(data[i])))

List of unique labels for Dependents:::{nan, '1', '2', '3+', '0'}
List of unique labels for Self_Employed:::{'No', nan, 'Yes'}
List of unique labels for Loan_Amount_Term:::{nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 12.0, nan, nan, nan, nan, nan, nan, 36.0, 6.0, 300.0, 180.0, 60.0, 84.0, 350.0, 480.0, 360.0, 240.0, 120.0}
List of unique labels for Gender:::{nan, 'Female', 'Male'}
List of unique labels for Married:::{'No', nan, 'Yes'}


#### Eliminating the Null values on columns by filling alternate values based on the each cols requirement 

In [10]:
data['Dependents'].fillna(data['Dependents'].mode()[0],inplace= True)
data['Self_Employed'].fillna(data['Self_Employed'].mode()[0],inplace= True)
data['Credit_History'].fillna(data['Credit_History'].mode()[0],inplace= True)

data['Married'].fillna(data['Credit_History'].mode()[0],inplace= True)
data['Gender'].fillna(data['Credit_History'].mode()[0],inplace= True)
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mode()[0])

In [11]:
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].median())

#### Identifying the unique values on each categorical columns

In [12]:

categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents']

for i in categorical_columns:
    print("List of unique values {}:{}".format(i, set(data[i])))

List of unique values Gender:{'Female', 'Male', 1.0}
List of unique values Married:{'No', 1.0, 'Yes'}
List of unique values Education:{'Graduate', 'Not Graduate'}
List of unique values Self_Employed:{'No', 'Yes'}
List of unique values Property_Area:{'Urban', 'Rural', 'Semiurban'}
List of unique values Dependents:{'0', '1', '2', '3+'}


#### Converting each categories into Integers

In [13]:
gender_values = {'Female' : 0, 'Male' : 1} 
married_values = {'No' : 0, 'Yes' : 1}
education_values = {'Graduate' : 0, 'Not Graduate' : 1}
employed_values = {'No' : 0, 'Yes' : 1}
property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}
dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
data.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values, \
                'Self_Employed': employed_values, 'Property_Area': property_values, 'Dependents': dependent_values}\
                , inplace=True)

In [14]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,source
0,LP001002,1.0,0.0,0,0,0,5849,0.0,126.0,360.0,1.0,1,Y,train
1,LP001003,1.0,1.0,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N,train
2,LP001005,1.0,1.0,0,0,1,3000,0.0,66.0,360.0,1.0,1,Y,train
3,LP001006,1.0,1.0,0,1,0,2583,2358.0,120.0,360.0,1.0,1,Y,train
4,LP001008,1.0,0.0,0,0,0,6000,0.0,141.0,360.0,1.0,1,Y,train


In [15]:
for i in data.columns:
    print("The number of null values in:{} == {}".format(i, data[i].isnull().sum()))

The number of null values in:Loan_ID == 0
The number of null values in:Gender == 0
The number of null values in:Married == 0
The number of null values in:Dependents == 0
The number of null values in:Education == 0
The number of null values in:Self_Employed == 0
The number of null values in:ApplicantIncome == 0
The number of null values in:CoapplicantIncome == 0
The number of null values in:LoanAmount == 0
The number of null values in:Loan_Amount_Term == 0
The number of null values in:Credit_History == 0
The number of null values in:Property_Area == 0
The number of null values in:Loan_Status == 367
The number of null values in:source == 0


#### Splitting  Train and Test data based on the source field

In [16]:
train_preprocessed = data[data["source"]=="train"]
test_preprocessed = data[data["source"]=="test"]

In [17]:
del train_preprocessed["source"]
train_preprocessed

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1.0,0.0,0,0,0,5849,0.0,126.0,360.0,1.0,1,Y
1,LP001003,1.0,1.0,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N
2,LP001005,1.0,1.0,0,0,1,3000,0.0,66.0,360.0,1.0,1,Y
3,LP001006,1.0,1.0,0,1,0,2583,2358.0,120.0,360.0,1.0,1,Y
4,LP001008,1.0,0.0,0,0,0,6000,0.0,141.0,360.0,1.0,1,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0.0,0.0,0,0,0,2900,0.0,71.0,360.0,1.0,0,Y
610,LP002979,1.0,1.0,3,0,0,4106,0.0,40.0,180.0,1.0,0,Y
611,LP002983,1.0,1.0,1,0,0,8072,240.0,253.0,360.0,1.0,1,Y
612,LP002984,1.0,1.0,2,0,0,7583,0.0,187.0,360.0,1.0,1,Y


#### Removing temporary source field

In [18]:
del test_preprocessed["source"]
test_preprocessed

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001015,1.0,1.0,0,0,0,5720,0.0,110.0,360.0,1.0,1,
1,LP001022,1.0,1.0,1,0,0,3076,1500.0,126.0,360.0,1.0,1,
2,LP001031,1.0,1.0,2,0,0,5000,1800.0,208.0,360.0,1.0,1,
3,LP001035,1.0,1.0,2,0,0,2340,2546.0,100.0,360.0,1.0,1,
4,LP001051,1.0,0.0,0,1,0,3276,0.0,78.0,360.0,1.0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,1.0,1.0,3,1,1,4009,1777.0,113.0,360.0,1.0,1,
363,LP002975,1.0,1.0,0,0,0,4158,709.0,115.0,360.0,1.0,1,
364,LP002980,1.0,0.0,0,0,0,3250,1993.0,126.0,360.0,1.0,2,
365,LP002986,1.0,1.0,0,0,0,5000,2393.0,158.0,360.0,1.0,0,


In [19]:
del test_preprocessed["Loan_Status"]
test_preprocessed

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,1.0,1.0,0,0,0,5720,0.0,110.0,360.0,1.0,1
1,LP001022,1.0,1.0,1,0,0,3076,1500.0,126.0,360.0,1.0,1
2,LP001031,1.0,1.0,2,0,0,5000,1800.0,208.0,360.0,1.0,1
3,LP001035,1.0,1.0,2,0,0,2340,2546.0,100.0,360.0,1.0,1
4,LP001051,1.0,0.0,0,1,0,3276,0.0,78.0,360.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,1.0,1.0,3,1,1,4009,1777.0,113.0,360.0,1.0,1
363,LP002975,1.0,1.0,0,0,0,4158,709.0,115.0,360.0,1.0,1
364,LP002980,1.0,0.0,0,0,0,3250,1993.0,126.0,360.0,1.0,2
365,LP002986,1.0,1.0,0,0,0,5000,2393.0,158.0,360.0,1.0,0


### Creating final Train and Test data to fit into the model

In [21]:
train_x = train_preprocessed.drop(columns=["Loan_ID","Loan_Status"])
train_x

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1.0,0.0,0,0,0,5849,0.0,126.0,360.0,1.0,1
1,1.0,1.0,1,0,0,4583,1508.0,128.0,360.0,1.0,0
2,1.0,1.0,0,0,1,3000,0.0,66.0,360.0,1.0,1
3,1.0,1.0,0,1,0,2583,2358.0,120.0,360.0,1.0,1
4,1.0,0.0,0,0,0,6000,0.0,141.0,360.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,0.0,0,0,0,2900,0.0,71.0,360.0,1.0,0
610,1.0,1.0,3,0,0,4106,0.0,40.0,180.0,1.0,0
611,1.0,1.0,1,0,0,8072,240.0,253.0,360.0,1.0,1
612,1.0,1.0,2,0,0,7583,0.0,187.0,360.0,1.0,1


In [22]:
test_x = test_preprocessed.drop(columns=["Loan_ID"])


In [23]:
train_y = train["Loan_Status"]


In [24]:
train_x.shape, test_x.shape, train_y.shape

((614, 11), (367, 11), (614,))

In [25]:
model = LogisticRegression()

### Fitting the processed data into the model

In [36]:
model.fit(train_x,train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
model.coef_

array([[ 1.60380885e-01,  5.18676834e-01, -2.51904645e-01,
        -4.45837340e-01,  5.26137605e-04,  2.12990775e-06,
        -5.81313660e-05, -1.49945243e-03, -5.82834187e-03,
         2.77945960e+00,  5.19306698e-01]])

In [38]:
model.intercept_

array([0.1086135])

In [29]:
train_pred = model.predict(train_x)
test_pred = model.predict(test_x)

In [30]:
print("accuracy_score ", accuracy_score(train_y, train_pred))

accuracy_score  0.8110749185667753


In [31]:
sample = pd.read_csv("sample_submission.csv")

In [32]:
sample.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,N
1,LP001022,N
2,LP001031,N
3,LP001035,N
4,LP001051,N


In [33]:
sample["Loan_Status"] = test_pred


In [34]:
sample.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y


In [35]:
sample.to_csv("loanApprovalPredictionSubmission.csv", index = False)