In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [81]:
df=pd.read_csv("loan_prediction.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [82]:
df.drop("Loan_ID",axis=1,inplace=True)


In [83]:
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [84]:
df.Gender.fillna("Unknown",inplace=True)
df.Married.fillna(df.Married.mode()[0],inplace=True)
df.Dependents.fillna(df.Dependents.mode()[0],inplace=True)
df.Self_Employed.fillna("NA",inplace=True)
df.Loan_Amount_Term.fillna(df.Loan_Amount_Term.mode()[0],inplace=True)
df.Credit_History.fillna(df.Credit_History.mode()[0],inplace=True)

In [85]:
df.LoanAmount.fillna(df.LoanAmount.median(),inplace=True)
df.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [86]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

categorical=df.select_dtypes(include='object')

for i in categorical:
    df[i]=le.fit_transform(df[i])

df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,1,5849,0.0,128.0,360.0,1.0,2,1
1,1,1,1,0,1,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,0,2,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,1,1,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,0,1,6000,0.0,141.0,360.0,1.0,2,1


In [87]:
df.shape

(614, 12)

In [88]:
from scipy.stats import zscore
z = np.abs(zscore(df))
df = df[(z<3).all(axis = 1)]

df.shape




(577, 12)

# skewness

In [89]:
df.skew()

Gender              -0.991250
Married             -0.630211
Dependents           1.052106
Education            1.306588
Self_Employed        0.520568
ApplicantIncome      2.148522
CoapplicantIncome    1.350517
LoanAmount           1.151525
Loan_Amount_Term    -2.098806
Credit_History      -1.976043
Property_Area       -0.055332
Loan_Status         -0.822635
dtype: float64

In [90]:
# removing skewness using yeo-jonson method to get better prediction

skew= ['ApplicantIncome','CoapplicantIncome','LoanAmount']

from sklearn.preprocessing import PowerTransformer
scaler = PowerTransformer(method='yeo-johnson')

In [91]:
df[skew]=scaler.fit_transform(df[skew].values)
df[skew].head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
0,0.68178,-1.122446,0.000771
1,0.234783,0.744117,0.000771
2,-0.527201,-1.122446,-1.437336
3,-0.791972,0.895786,-0.153545
4,0.728848,-1.122446,0.23826


In [92]:
df.skew()

Gender              -0.991250
Married             -0.630211
Dependents           1.052106
Education            1.306588
Self_Employed        0.520568
ApplicantIncome      0.027981
CoapplicantIncome   -0.191876
LoanAmount           0.048425
Loan_Amount_Term    -2.098806
Credit_History      -1.976043
Property_Area       -0.055332
Loan_Status         -0.822635
dtype: float64

In [93]:

X=df.drop("Loan_Status",axis=1)
Y=df["Loan_Status"]

In [94]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

X_scale=sc.fit_transform(X)

# Machine Learning Model Building

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report


# Finding best Random state

In [96]:
maxAccu=0
maxRS=0
for i in range(1,250):
    X_train,X_test,Y_train,Y_test = train_test_split(X_scale,Y,test_size = 0.3, random_state=i)
    log_reg=LogisticRegression()
    log_reg.fit(X_train,Y_train)
    y_pred=log_reg.predict(X_test)
    acc=accuracy_score(Y_test,y_pred)
    if acc>maxAccu:
        maxAccu=acc
        maxRS=i

print('Best accuracy is' , maxAccu , 'on Random_state' , maxRS)


Best accuracy is 0.8850574712643678 on Random_state 12


# Logistics Regression Model

In [97]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state=12, test_size=.3)
log_reg=LogisticRegression()
log_reg.fit(X_train,Y_train)
y_pred=log_reg.predict(X_test)


In [99]:
print(classification_report(y_pred,Y_test))

              precision    recall  f1-score   support

           0       0.55      0.96      0.70        24
           1       0.99      0.87      0.93       150

    accuracy                           0.89       174
   macro avg       0.77      0.92      0.81       174
weighted avg       0.93      0.89      0.90       174



In [100]:
df.Loan_Status.value_counts()

Loan_Status
1    398
0    179
Name: count, dtype: int64

In [101]:
print(confusion_matrix(y_pred,Y_test))

[[ 23   1]
 [ 19 131]]


In [None]:
# Classification --> RF algo.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix
from scipy.stats import zscore # for outliers
from sklearn.model_selection import train_test_split