## Import libraries

In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

## Loading and Overview of the Dataset

In [2]:
## load the raw loan records from the CSV file
data = pd.read_csv("Dataset/loan_data.csv")
## work on a copy so that `data` keeps the records exactly as loaded
df = data.copy()

In [3]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [4]:
df.tail().T

Unnamed: 0,376,377,378,379,380
Loan_ID,LP002953,LP002974,LP002978,LP002979,LP002990
Gender,Male,Male,Female,Male,Female
Married,Yes,Yes,No,Yes,No
Dependents,3+,0,0,3+,0
Education,Graduate,Graduate,Graduate,Graduate,Graduate
Self_Employed,No,No,No,No,Yes
ApplicantIncome,5703,3232,2900,4106,4583
CoapplicantIncome,0.0,1950.0,0.0,0.0,0.0
LoanAmount,128.0,108.0,71.0,40.0,133.0
Loan_Amount_Term,360.0,360.0,360.0,180.0,360.0


In [5]:
print(f'total number of rows: {df.shape[0]} => total number of columns: {df.shape[1]}')

total number of rows: 381 => total number of columns: 13


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


In [7]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [8]:
df.isnull().sum()

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

## Data Preprocessing - Step 1
- Normalize the column names
- Dropping the ID column
- Removing the (+) sign from the Dependents column
- Filling the NaN values in the (Dependents, Credit_History, Loan_Amount_Term, Gender, Self_Employed) columns
- Encoding the target column (Loan_Status) as integers; the remaining categorical columns (Gender, Married, Self_Employed, Education, Property_Area) are one-hot encoded later, in the Feature Engineering section


In [9]:
df.columns = df.columns.str.lower()
df.columns

Index(['loan_id', 'gender', 'married', 'dependents', 'education',
       'self_employed', 'applicantincome', 'coapplicantincome', 'loanamount',
       'loan_amount_term', 'credit_history', 'property_area', 'loan_status'],
      dtype='object')

In [10]:
df = df.drop(['loan_id'], axis=1)

In [11]:
## remove the literal '+' from the dependents column (e.g. '3+' -> '3')
## regex=False makes the literal match explicit: '+' on its own is a regex
## quantifier, and the default value of `regex` differs between pandas versions
df['dependents'] = df['dependents'].str.replace('+', '', regex=False)

In [12]:
## convert the dependents column type to float (the column still contains NaN
## entries at this point, which a plain integer column cannot hold)
df['dependents'] = df['dependents'].astype('float')

In [13]:
df['dependents'].isnull().sum()

8

In [14]:
## fill the missing dependents values with the column mean
## NOTE(review): the mean is computed over the whole dataset, before the
## train/validation/test split further down, so the fill value also reflects
## rows that later end up in the validation and test sets
df['dependents'] = df['dependents'].fillna(df['dependents'].mean())

In [15]:
## fill the missing credit_history values with the column mean
## NOTE(review): the rows displayed earlier only show 1.0 in this column; if it
## is a 0/1 flag, the mean yields a fractional value -- confirm this is intended
df['credit_history'] = df['credit_history'].fillna(df['credit_history'].mean())

In [16]:
df['loan_amount_term'] = df['loan_amount_term'].fillna(df['loan_amount_term'].mean())

In [17]:
## distribution of gender values (the missing entries are filled with the
## most frequent value in the next cell)
df.gender.value_counts()

gender
Male      291
Female     85
Name: count, dtype: int64

In [18]:
df.gender = df.gender.fillna("Male")

In [19]:
## distribution of self_employed values (the missing entries are filled with
## the most frequent value in the next cell)
df.self_employed.value_counts()

self_employed
No     325
Yes     35
Name: count, dtype: int64

In [20]:
df.self_employed = df.self_employed.fillna("No")

In [21]:
df.isnull().sum()

gender               0
married              0
dependents           0
education            0
self_employed        0
applicantincome      0
coapplicantincome    0
loanamount           0
loan_amount_term     0
credit_history       0
property_area        0
loan_status          0
dtype: int64

In [22]:
df.dtypes

gender                object
married               object
dependents           float64
education             object
self_employed         object
applicantincome        int64
coapplicantincome    float64
loanamount           float64
loan_amount_term     float64
credit_history       float64
property_area         object
loan_status           object
dtype: object

In [23]:
df.loan_status.head()

0    N
1    Y
2    Y
3    Y
4    Y
Name: loan_status, dtype: object

In [24]:
## encode the target: 'Y' -> 1, anything else -> 0
## `isin(['Y', 1])` instead of `== 'Y'` keeps this cell idempotent: re-running
## it on the already-encoded 0/1 column leaves the values unchanged instead of
## silently turning every row into 0
df.loan_status = df.loan_status.isin(['Y', 1]).astype(int)

In [25]:
df.loan_status.head()

0    0
1    1
2    1
3    1
4    1
Name: loan_status, dtype: int32

In [26]:
df.head()

Unnamed: 0,gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,property_area,loan_status
0,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
1,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
2,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
3,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1
4,Male,Yes,0.0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,1


## Exploratory Data Analysis

In [27]:
## target variable 
df.loan_status.value_counts()

loan_status
1    271
0    110
Name: count, dtype: int64

In [28]:
## keep only the columns whose dtype is not object
numeric_cols = df.select_dtypes(exclude=[object])

## pairwise correlation between those columns
corr_matrix = numeric_cols.corr()

## correlation of each of those columns with the target
corr_matrix['loan_status']

dependents           0.006977
applicantincome     -0.010167
coapplicantincome    0.009017
loanamount           0.041220
loan_amount_term    -0.046807
credit_history       0.601881
loan_status          1.000000
Name: loan_status, dtype: float64

## Data Preprocessing - Step 2

## Build a Validation Framework


In [29]:
## hold out 20% of the rows as the test set, then split the remaining 80% again:
## 25% of it (i.e. 20% of all rows) becomes the validation set, the rest the training set
df_train_full , df_test = train_test_split(df, test_size=0.2, random_state=11) 
df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=11)


print(f'Training dataset: {len(df_train)}')
print(f'Validation dataset: {len(df_valid)}')
print(f'Test dataset: {len(df_test)}')

Training dataset: 228
Validation dataset: 76
Test dataset: 77


In [30]:
y_train = df_train['loan_status'].values
y_valid = df_valid['loan_status'].values
y_test = df_test['loan_status'].values

In [31]:
del df_train['loan_status']
del df_valid['loan_status']
del df_test['loan_status']

## Feature Engineering 
- Dividing our data into numerical and categorical
- perform the one-hot encoding

In [32]:
numerical_features = ['dependents','applicantincome','coapplicantincome','loanamount','loan_amount_term','credit_history']

categorical_features = ['gender','married','education','self_employed','property_area']

In [33]:
## convert the selected feature columns of each dataframe into a list of dicts
## (one dict per row), the input format used for the DictVectorizer below
train_dict = df_train[categorical_features + numerical_features].to_dict(orient='records')

valid_dict = df_valid[categorical_features + numerical_features].to_dict(orient='records')

In [34]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [35]:
X_train = dv.transform(train_dict)

X_valid = dv.transform(valid_dict)

## Training The Model

In [36]:
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [37]:
y_valid_pred = model.predict_proba(X_valid)

In [38]:
y_valid_pred

array([[0.29904901, 0.70095099],
       [0.05349974, 0.94650026],
       [0.09855407, 0.90144593],
       [0.05139747, 0.94860253],
       [0.22603381, 0.77396619],
       [0.24613695, 0.75386305],
       [0.86618202, 0.13381798],
       [0.03618916, 0.96381084],
       [0.35077405, 0.64922595],
       [0.94732957, 0.05267043],
       [0.2595941 , 0.7404059 ],
       [0.31193893, 0.68806107],
       [0.06307527, 0.93692473],
       [0.26280233, 0.73719767],
       [0.10301751, 0.89698249],
       [0.15404105, 0.84595895],
       [0.46010115, 0.53989885],
       [0.28356447, 0.71643553],
       [0.06021712, 0.93978288],
       [0.04764756, 0.95235244],
       [0.36715074, 0.63284926],
       [0.17712346, 0.82287654],
       [0.09271481, 0.90728519],
       [0.13391169, 0.86608831],
       [0.38108233, 0.61891767],
       [0.3513784 , 0.6486216 ],
       [0.48530569, 0.51469431],
       [0.23176625, 0.76823375],
       [0.27767248, 0.72232752],
       [0.05152199, 0.94847801],
       [0.

In [39]:
y_valid_pred = model.predict_proba(X_valid)[:, 1]

In [40]:
loan_status = y_valid_pred >= 0.5

In [41]:
(y_valid == loan_status).mean()

0.881578947368421

In [42]:
acc_score = accuracy_score(y_valid, loan_status)
print(f'Validation Accuracy Score: {round(acc_score * 100, 1)}%')

Validation Accuracy Score: 88.2%


## Saving The Model

In [43]:
import pickle

In [44]:
## specifying where to save the file
with open('loan-model.bin', 'wb') as f_out:
    ## save the vectorizer and the model together as a single tuple
    pickle.dump((dv,model), f_out)

## Loading The Model

In [45]:
## restore the (vectorizer, model) tuple from 'loan-model.bin'
## NOTE: unpickling can execute arbitrary code, so only load files you created or trust
with open('loan-model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [47]:
## applicant Data here
## NOTE(review): 'customerid' is not one of the numerical/categorical features
## the vectorizer was fitted on -- confirm whether this key is needed
applicant = {
 'customerid': 0,
 'gender': 'Male',
 'married': 'Yes',
 'dependents': 1.0,
 'education': 'Graduate',
 'self_employed': 'No',    
 'applicantincome': 4583,
 'coapplicantincome': 1508.0,
 'loanamount': 128.0,
 'loan_amount_term': 360.0,
 'credit_history': 1.0,
 'property_area': 'Rural', 
}

In [50]:
def predict_single(df, dv, model):
    """Return the class-1 probability predicted for a single applicant.

    Parameters
    ----------
    df : dict
        Feature-name -> value mapping describing one applicant (despite the
        parameter name, this is a single record, not a DataFrame).
    dv : object
        Fitted vectorizer exposing ``transform`` for a list of such records.
    model : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    The value in column 1 of ``model.predict_proba`` for the given record.
    """
    # Vectorize the record that was passed in. The previous version read the
    # notebook-level `applicant` variable here and ignored its first argument,
    # so every call scored the same applicant regardless of the input.
    X = dv.transform([df])
    y_pred = model.predict_proba(X)[:, 1]
    return y_pred[0]

In [51]:
prediction = predict_single(applicant, dv, model)

In [52]:
print(f'{prediction}')

0.7754906290999604


In [53]:
if prediction >= 0.5:
    print('verdict: Good standing - "Approve"')
else:
    print('verdict: Bad standing - "Reject"')

verdict: Good standing - "Approve"
