## Import libraries

In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Loading Dataset

In [2]:
data = pd.read_csv("Dataset/loan_data.csv")
## 
df = data.copy()

## Overviewing of Dataset

In [3]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [4]:
df.tail().T

Unnamed: 0,376,377,378,379,380
Loan_ID,LP002953,LP002974,LP002978,LP002979,LP002990
Gender,Male,Male,Female,Male,Female
Married,Yes,Yes,No,Yes,No
Dependents,3+,0,0,3+,0
Education,Graduate,Graduate,Graduate,Graduate,Graduate
Self_Employed,No,No,No,No,Yes
ApplicantIncome,5703,3232,2900,4106,4583
CoapplicantIncome,0.0,1950.0,0.0,0.0,0.0
LoanAmount,128.0,108.0,71.0,40.0,133.0
Loan_Amount_Term,360.0,360.0,360.0,180.0,360.0


In [5]:
df.axes

[RangeIndex(start=0, stop=381, step=1),
 Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
        'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
       dtype='object')]

In [6]:
df.empty

False

In [7]:
print(f'total number of rows: {df.shape[0]} => total number of columns: {df.shape[1]}')

total number of rows: 381 => total number of columns: 13


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


In [9]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [10]:
df.isnull().sum()

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

## Data Preprocessing
- Normalize the column names to lower case
- Drop the ID column
- Remove the (+) sign on the Dependants column
- Fill the NaN in the (Dependants, Credit_History, Loan_Amount, Gender, Self_Employed) columns
- Change the dtypes for (Gender, Married, Self_Employed) to categorical
- Replace categorical column(Gender, Married, Self_Employed and Loan-Status) with integers

In [11]:
df.columns = df.columns.str.lower()
df.columns

Index(['loan_id', 'gender', 'married', 'dependents', 'education',
       'self_employed', 'applicantincome', 'coapplicantincome', 'loanamount',
       'loan_amount_term', 'credit_history', 'property_area', 'loan_status'],
      dtype='object')

In [12]:
df = df.drop(['loan_id'], axis=1)

In [13]:
## remove + from the dependents column
df['dependents'] = df['dependents'].str.replace('+', '')

In [14]:
## convert the dependents column type to float
df['dependents'] = df['dependents'].astype('float')

In [15]:
df['dependents'].isnull().sum()

np.int64(8)

In [16]:
## fill the null values in the dependents column
df['dependents'] = df['dependents'].fillna(df['dependents'].mean())

In [17]:
## fill the null values in the credit_history column
df['credit_history'] = df['credit_history'].fillna(df['credit_history'].mean())

In [18]:
## fill the null values in the loan_amount_term column
df['loan_amount_term'] = df['loan_amount_term'].fillna(df['loan_amount_term'].mean())

In [19]:
## convert the dependents column type to integers
df['dependents'] = df['dependents'].astype('int64')

In [20]:
## convert the loan_amount_term column type to integer
df['loan_amount_term'] = df['loan_amount_term'].astype('int64')

In [21]:
## convert the applicantincome column type to float
df['applicantincome'] = df['applicantincome'].astype('float')

In [22]:
## convert the credit history column type to integer
df['credit_history'] = df['credit_history'].astype('int64')

In [23]:
## convert the gender column type to category
df['gender'] = df['gender'].astype('category')

In [24]:
## convert the astype of the gender from category to integer
df.gender = (df.gender == 'Male').astype(int)

In [25]:
## convert the married column type to category
df['married'] = df['married'].astype('category')

In [26]:
## convert the astype of the married from category to integer
df.married = (df.married == 'Yes').astype(int)

In [27]:
## convert the self employed column type to category
df['self_employed'] = df['self_employed'].astype('category')

In [28]:
## convert the astype of the self employed from category to integer
df.self_employed = (df.self_employed == 'No').astype(int)

In [29]:
## target variable 
df.gender.value_counts()

gender
1    291
0     90
Name: count, dtype: int64

In [30]:
## fill the missing value of the gender column to Male
df.gender = df.gender.fillna("Male")

In [31]:
df.self_employed.value_counts()

self_employed
1    325
0     56
Name: count, dtype: int64

In [32]:
## fill the missing value of the self employed column to No
df.self_employed = df.self_employed.fillna("No")

In [33]:
## confirming the null values
df.isnull().sum()

gender               0
married              0
dependents           0
education            0
self_employed        0
applicantincome      0
coapplicantincome    0
loanamount           0
loan_amount_term     0
credit_history       0
property_area        0
loan_status          0
dtype: int64

In [34]:
## confirming the dtypes
df.dtypes

gender                 int64
married                int64
dependents             int64
education             object
self_employed          int64
applicantincome      float64
coapplicantincome    float64
loanamount           float64
loan_amount_term       int64
credit_history         int64
property_area         object
loan_status           object
dtype: object

In [35]:
df.loan_status.head()

0    N
1    Y
2    Y
3    Y
4    Y
Name: loan_status, dtype: object

In [36]:
## convert the astype of the loan status from category to integer
df.loan_status = (df.loan_status == 'N').astype(int)

In [37]:
df.loan_status.head()

0    1
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [38]:
df.head()

Unnamed: 0,gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,property_area,loan_status
0,1,1,1,Graduate,1,4583.0,1508.0,128.0,360,1,Rural,1
1,1,1,0,Graduate,0,3000.0,0.0,66.0,360,1,Urban,0
2,1,1,0,Not Graduate,1,2583.0,2358.0,120.0,360,1,Urban,0
3,1,0,0,Graduate,1,6000.0,0.0,141.0,360,1,Urban,0
4,1,1,0,Not Graduate,1,2333.0,1516.0,95.0,360,1,Urban,0


In [39]:
## saving the cleaned loan dataset
df.to_csv("Dataset/cleaned_loan_status_dataset.csv")

## Loading Dataset to perform an (EDA)

In [40]:
## loading the cleaned loan status dataset
df = pd.read_csv("Dataset/cleaned_loan_status_dataset.csv")

## Exploratory Data Analysis
- Perform target variable analysis

In [41]:
## target variable 
df.loan_status.value_counts()

loan_status
0    271
1    110
Name: count, dtype: int64

In [42]:
numeric_cols = df.select_dtypes(exclude=[object])

corr_matrix = numeric_cols.corr()

corr_matrix['loan_status']

Unnamed: 0           0.036862
gender              -0.054757
married             -0.092473
dependents          -0.013701
self_employed        0.019105
applicantincome      0.010167
coapplicantincome   -0.009017
loanamount          -0.041220
loan_amount_term     0.046672
credit_history      -0.453699
loan_status          1.000000
Name: loan_status, dtype: float64

## Build a Validation Framework
- Devide the dataset into:
      - Training set (60%)
      - Validation set (20%)
      - Test set (20%)

In [43]:
## 
df_train_full , df_test = train_test_split(df, test_size=0.2, random_state=11) 
df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=11)


print(f'Training dataset: {len(df_train)}')
print(f'Validation dataset: {len(df_valid)}')
print(f'Test dataset: {len(df_test)}')

Training dataset: 228
Validation dataset: 76
Test dataset: 77


In [44]:
y_train = df_train['loan_status'].values
y_valid = df_valid['loan_status'].values
y_test = df_test['loan_status'].values

In [45]:
## Deleting all the loan_status columns from our traning, validation and the test dataset
del df_train['loan_status']
del df_valid['loan_status']
del df_test['loan_status']

## Feature Engineering 
- Dividing our data into numerical and categorical
- perform the one-hot encoding

In [46]:
numerical_features = ['dependents','applicantincome','coapplicantincome','loanamount','loan_amount_term','credit_history']

categorical_features = ['gender','married','education','self_employed','property_area']

In [47]:
## convert the dataframe into dict
train_dict = df_train[categorical_features + numerical_features].to_dict(orient='records')

valid_dict = df_valid[categorical_features + numerical_features].to_dict(orient='records')

In [48]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [49]:
X_train = dv.transform(train_dict)

X_valid = dv.transform(valid_dict)

## Training The Model

In [50]:
## fitting the training dataset to train the model
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

## Computing predicted truth against ground truth

In [51]:
y_valid_pred = model.predict_proba(X_valid)

In [52]:
y_valid_pred

array([[0.71107195, 0.28892805],
       [0.92313314, 0.07686686],
       [0.88376136, 0.11623864],
       [0.93878074, 0.06121926],
       [0.80230834, 0.19769166],
       [0.76260103, 0.23739897],
       [0.3494753 , 0.6505247 ],
       [0.95464795, 0.04535205],
       [0.26514758, 0.73485242],
       [0.16671078, 0.83328922],
       [0.75899369, 0.24100631],
       [0.66559324, 0.33440676],
       [0.92067664, 0.07932336],
       [0.71005293, 0.28994707],
       [0.85609394, 0.14390606],
       [0.4647433 , 0.5352567 ],
       [0.50844573, 0.49155427],
       [0.7497022 , 0.2502978 ],
       [0.93402048, 0.06597952],
       [0.94367929, 0.05632071],
       [0.61128842, 0.38871158],
       [0.44645578, 0.55354422],
       [0.88001365, 0.11998635],
       [0.85172355, 0.14827645],
       [0.63643667, 0.36356333],
       [0.64994498, 0.35005502],
       [0.15640786, 0.84359214],
       [0.78181322, 0.21818678],
       [0.70878074, 0.29121926],
       [0.93817819, 0.06182181],
       [0.

In [53]:
y_valid_pred = model.predict_proba(X_valid)[:, 1]

In [54]:
loan_status = y_valid_pred >= 0.5

In [55]:
(y_valid == loan_status).mean()

np.float64(0.8026315789473685)

In [56]:
acc_score = accuracy_score(y_valid, loan_status)
print(f'Validation Accuracy Score: {round(acc_score * 100, 1)}%')

Validation Accuracy Score: 80.3%


## Saving The Model

In [57]:
import pickle

In [58]:
## specifyging where to save the file
with open('loan_model.bin', 'wb') as f_out:
    ## save the model
    pickle.dump((dv,model), f_out)

## Loading The Model

In [59]:
with open('loan_model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

## Testing our model against applicant information

In [60]:
## applicant Data
applicant = {
 'customerid': 0,
 'gender': 1,
 'married': 0,
 'dependents': 2,
 'education': 'Graduate',
 'self_employed': 'Yes',    
 'applicantincome': 50083.0,
 'coapplicantincome': 10.0,
 'loanamount': 100.0,
 'loan_amount_term': 24,
 'credit_history': 0,
 'property_area': 'Rural', 
}

In [61]:
def predict_single(df, dv, model):
    X = dv.transform([applicant])
    y_pred = model.predict_proba(X)[:,1]
    return y_pred[0]

In [62]:
prediction = predict_single(applicant, dv, model)

In [63]:
## Applicant data prediction score
print(f'{prediction}')

0.9993241434558613


In [64]:
## Models's verdict
if prediction >= 0.5:
    print('Good Standing - Approved: "Applicant stand a higher chance paying back loan"')
else:
    print('Bad Standing - Rejected: "Applicant stand a higher chance defaulting payment"')

Good Standing - Approved: "Applicant stand a higher chance paying back loan"
