<a href="https://colab.research.google.com/github/ChandrashekharGhanokar/loan_status_prediction/blob/main/loan_status_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Gathering Dataset


In [6]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# loading data set
data=pd.read_csv('/content/train.csv')

#Data Preprocessing

In [8]:
# shape of a data
data.shape

(614, 13)

In [9]:
# five random rows of a dataframe
data.sample(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
167,LP001578,Male,Yes,0,Graduate,No,2439,3333.0,129.0,360.0,1.0,Rural,Y
546,LP002768,Male,No,0,Not Graduate,No,3358,0.0,80.0,36.0,1.0,Semiurban,N
502,LP002615,Male,Yes,2,Graduate,No,4865,5624.0,208.0,360.0,1.0,Semiurban,Y
315,LP002031,Male,Yes,1,Not Graduate,No,3399,1640.0,111.0,180.0,1.0,Urban,Y
375,LP002211,Male,Yes,0,Graduate,No,4817,923.0,120.0,180.0,1.0,Urban,Y


In [10]:
# display summary of a dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [11]:
# statistical summary of a dataframe
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [12]:
# Count the number of missing (null) values in each column
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [13]:
# Count the number of duplicated rows
data.duplicated().sum()

0

#Exploratory Data Analysis

In [14]:
# Calculate the percentage of missing values for each column in the DataFrame
missing_percentage = data.isnull().sum() * 100 / len(data)

In [15]:
# Drop the 'Loan_ID' column from the DataFrame as it is not needed for the analysis
data = data.drop('Loan_ID', axis=1)

In [16]:
data.head(1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y


In [17]:
columns = ['Gender','Dependents','LoanAmount','Loan_Amount_Term']

In [18]:
# Drop rows that contain missing values in the specified subset of columns
data = data.dropna(subset=columns)

In [19]:
# Calculate the percentage of missing values for each column in the DataFrame
missing_percentage = data.isnull().sum() * 100 / len(data)

In [20]:
# Get the most frequent (mode) value in the 'Self_Employed' column
most_frequent_self_employed = data['Self_Employed'].mode()[0]

In [21]:
# Fill missing values in 'Self_Employed' column with the most frequent (mode) value
data['Self_Employed'] = data['Self_Employed'].fillna(data['Self_Employed'].mode()[0])

In [22]:
# Calculate the percentage of missing values for each column in the DataFrame
missing_percentage = data.isnull().sum() * 100 / len(data)

In [23]:
# Get unique values in the 'Gender' column
unique_genders = data['Gender'].unique()

In [24]:
# Get unique values in the 'Self_Employed' column
unique_self_employed = data['Self_Employed'].unique()

In [25]:
# Get the mode (most frequent value) in the 'Credit_History' column
mode_credit_history = data['Credit_History'].mode()[0]

In [26]:
# Fill missing values in 'Credit_History' column with the mode (most frequent value)
data['Credit_History'] = data['Credit_History'].fillna(data['Credit_History'].mode()[0])

In [27]:
# Calculate the percentage of missing values for each column in the DataFrame
missing_percentage = data.isnull().sum() * 100 / len(data)

#Feature Engineering

In [28]:
# Sample 5 random rows from the DataFrame
sample_data = data.sample(5)

In [29]:
# Replace '3+' in the 'Dependents' column with '4'
data['Dependents'] = data['Dependents'].replace(to_replace="3+", value='4')

In [30]:
# Get unique values in the 'Dependents' column
unique_dependents = data['Dependents'].unique()

In [31]:
# Get unique values in the 'Loan_Status' column
unique_loan_status = data['Loan_Status'].unique()

In [32]:
# Convert categorical columns to numeric using mapping and convert to integer type
data['Gender'] = data['Gender'].map({'Male': 1, 'Female': 0}).astype('int')
data['Married'] = data['Married'].map({'Yes': 1, 'No': 0}).astype('int')
data['Education'] = data['Education'].map({'Graduate': 1, 'Not Graduate': 0}).astype('int')
data['Self_Employed'] = data['Self_Employed'].map({'Yes': 1, 'No': 0}).astype('int')
data['Property_Area'] = data['Property_Area'].map({'Rural': 0, 'Semiurban': 2, 'Urban': 1}).astype('int')
data['Loan_Status'] = data['Loan_Status'].map({'Y': 1, 'N': 0}).astype('int')

In [33]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1,1


# Model Training,Selection,and Evaluation

In [34]:
# Create feature matrix X by dropping the 'Loan_Status' column
X = data.drop('Loan_Status', axis=1)

In [35]:
# Create target variable y containing 'Loan_Status'
y = data['Loan_Status']

In [36]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,0
610,1,1,4,1,0,4106,0.0,40.0,180.0,1.0,0
611,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,1
612,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,1


In [37]:
y

1      0
2      1
3      1
4      1
5      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 553, dtype: int64

In [38]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1,1


In [39]:
cols = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']

In [40]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
X[cols]=st.fit_transform(X[cols])

In [41]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,-0.128694,-0.049699,-0.214368,0.279961,1.0,0
2,1,1,0,1,1,-0.394296,-0.545638,-0.952675,0.279961,1.0,1
3,1,1,0,0,0,-0.464262,0.229842,-0.309634,0.279961,1.0,1
4,1,0,0,1,0,0.109057,-0.545638,-0.059562,0.279961,1.0,1
5,1,1,2,1,1,0.011239,0.834309,1.440866,0.279961,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,-0.411075,-0.545638,-0.893134,0.279961,1.0,0
610,1,1,4,1,0,-0.208727,-0.545638,-1.262287,-2.468292,1.0,0
611,1,1,1,1,0,0.456706,-0.466709,1.274152,0.279961,1.0,1
612,1,1,2,1,0,0.374659,-0.545638,0.488213,0.279961,1.0,1


In [42]:
# Import necessary modules for train-test split, cross-validation, accuracy score, and numpy
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [43]:
model_df={}
def model_val(model,X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,
                                                   test_size=0.20,
                                                   random_state=42)
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(f"{model} accuracy is {accuracy_score(y_test,y_pred)}")

    score = cross_val_score(model,X,y,cv=5)
    print(f"{model} Avg cross val score is {np.mean(score)}")
    model_df[model]=round(np.mean(score)*100,2)

In [44]:
model_df

{}

#Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model_val(model,X,y)

LogisticRegression() accuracy is 0.8018018018018018
LogisticRegression() Avg cross val score is 0.8047829647829647


#SVC

In [46]:
from sklearn import svm
from sklearn.svm import SVC
model = svm.SVC()
model_val(model,X,y)

SVC() accuracy is 0.7927927927927928
SVC() Avg cross val score is 0.7938902538902539


#Decision Tree Classifier

In [47]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model_val(model,X,y)

DecisionTreeClassifier() accuracy is 0.7567567567567568
DecisionTreeClassifier() Avg cross val score is 0.7125470925470926


#Random Forest Classifier

In [48]:
from sklearn.ensemble import RandomForestClassifier
model =RandomForestClassifier()
model_val(model,X,y)

RandomForestClassifier() accuracy is 0.7747747747747747
RandomForestClassifier() Avg cross val score is 0.7848648648648648


#Gradient Boosting Classifier

In [49]:
from sklearn.ensemble import GradientBoostingClassifier
model =GradientBoostingClassifier()
model_val(model,X,y)

GradientBoostingClassifier() accuracy is 0.7927927927927928
GradientBoostingClassifier() Avg cross val score is 0.7776085176085176


#Hyperparameter Tuning

In [50]:
from sklearn.model_selection import RandomizedSearchCV

In [51]:
#Logistic Regression
log_reg_grid={"C":np.logspace(-4,4,20),
             "solver":['liblinear']}


In [52]:
rs_log_reg=RandomizedSearchCV(LogisticRegression(),
                   param_distributions=log_reg_grid,
                  n_iter=20,cv=5,verbose=True)

In [53]:
rs_log_reg.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [54]:
rs_log_reg.best_score_

0.8047829647829647

In [55]:
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': 0.23357214690901212}

In [56]:
#SVC
svc_grid = {'C':[0.25,0.50,0.75,1],"kernel":["linear"]}


In [57]:
rs_svc=RandomizedSearchCV(svm.SVC(),
                  param_distributions=svc_grid,
                   cv=5,
                   n_iter=20,
                  verbose=True)

In [58]:
rs_svc.fit(X,y)



Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [59]:
RandomizedSearchCV(cv=5, estimator=SVC(), n_iter=20,
                   param_distributions={'C': [0.25, 0.5, 0.75, 1],
                                        'kernel': ['linear']},
                   verbose=True)

In [60]:
rs_svc.best_score_

0.8066011466011467

In [61]:
rs_svc.best_params_

{'kernel': 'linear', 'C': 0.25}

In [62]:
#Random Forest Classifier
RandomForestClassifier()

In [63]:
rf_grid={'n_estimators':np.arange(10,1000,10),
  'max_features':['auto','sqrt'],
 'max_depth':[None,3,5,10,20,30],
 'min_samples_split':[2,5,20,50,100],
 'min_samples_leaf':[1,2,5,10]
 }

In [64]:
rs_rf=RandomizedSearchCV(RandomForestClassifier(),
                  param_distributions=rf_grid,
                   cv=5,
                   n_iter=20,
                  verbose=True)

In [65]:
rs_rf.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [66]:
rs_rf.best_score_

0.8066175266175266

In [67]:
rs_rf.best_params_

{'n_estimators': 850,
 'min_samples_split': 50,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 10}

LogisticRegression score Before Hyperparameter Tuning: 80.48

LogisticRegression score after Hyperparameter Tuning: 80.48
  
SVC score Before Hyperparameter Tuning: 79.38

SVC score after Hyperparameter Tuning: 80.6

RandomForestClassifier score Before Hyperparameter Tuning: 77.76

RandomForestClassifier score after Hyperparameter Tuning: 80.66

In [68]:
#Save The Model
X = data.drop('Loan_Status',axis=1)
y = data['Loan_Status']

In [69]:
rf = RandomForestClassifier(n_estimators=270,
 min_samples_split=5,
 min_samples_leaf=5,
 max_features='sqrt',
 max_depth=5)

In [70]:
rf.fit(X,y)

#Exporting the Model

In [71]:
import joblib

In [72]:
joblib.dump(rf,'loan_status_predict')

['loan_status_predict']

In [73]:
model = joblib.load('loan_status_predict')