In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")


In [14]:
df=pd.read_csv("loan.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [15]:
df.columns=df.columns.str.strip().str.replace(" ","_").str.title()


In [20]:
#Analyzing Dataframe
print(df.shape)
print(df.isnull().sum())
print(df.duplicated().sum())
df['Loan_Status'].value_counts()

(614, 13)
Loan_Id               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
Applicantincome       0
Coapplicantincome     0
Loanamount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
0


Loan_Status
Y    422
N    192
Name: count, dtype: int64

In [21]:
#Dropping Null Values
df.dropna(inplace=True)

In [25]:
#Removing Loan_id Column not useful 
df.drop('Loan_Id', axis=1, inplace=True)

In [26]:
df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Applicantincome', 'Coapplicantincome', 'Loanamount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [32]:
df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})


In [33]:
#Checking Categorical Cols
categorical_cols=df.select_dtypes(include=['object', 'category']).columns.tolist()

In [34]:
print(categorical_cols)

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']


In [35]:
df_encoded=pd.get_dummies(df,columns=categorical_cols,drop_first=True)

In [36]:
print(df_encoded.shape)
print(df_encoded.columns.tolist())

(480, 15)
['Applicantincome', 'Coapplicantincome', 'Loanamount', 'Loan_Amount_Term', 'Credit_History', 'Loan_Status', 'Gender_Male', 'Married_Yes', 'Dependents_1', 'Dependents_2', 'Dependents_3+', 'Education_Not Graduate', 'Self_Employed_Yes', 'Property_Area_Semiurban', 'Property_Area_Urban']


In [48]:
x = df_encoded.drop('Loan_Status', axis=1)
y = df_encoded['Loan_Status']

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=42,)

scaler=StandardScaler()
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.transform(x_test)

In [53]:
print(x_train.shape)
print(x_test.shape)
print(y_train.value_counts(normalize=True)*100)


(384, 14)
(96, 14)
Loan_Status
1    69.270833
0    30.729167
Name: proportion, dtype: float64


## Logistic Regression

In [90]:
#training Model
logreg=LogisticRegression()
logreg.fit(x_train_scaled,y_train)

#Predict
y_pred_logreg=logreg.predict(x_test_scaled)

#Performance
print(confusion_matrix(y_test,y_pred_logreg))
print(classification_report(y_test,y_pred_logreg))
print("Training Accuracy",logreg_smote.score(x_train,y_train) * 100)
print("Test Accuracy",logreg_smote.score(x_test,y_test) * 100)



[[17 13]
 [ 3 63]]
              precision    recall  f1-score   support

           0       0.85      0.57      0.68        30
           1       0.83      0.95      0.89        66

    accuracy                           0.83        96
   macro avg       0.84      0.76      0.78        96
weighted avg       0.84      0.83      0.82        96

Training Accuracy 54.427083333333336
Test Accuracy 52.083333333333336


## Decision Tree Classifier

In [89]:
#Training
dts=DecisionTreeClassifier(random_state=42,max_depth=5,min_samples_split=10,min_samples_leaf=5)
dts.fit(x_train,y_train)

#Predict
y_pred_dts=dts.predict(x_test_scaled)

#Performance
print(confusion_matrix(y_test,y_pred_dts))
print(classification_report(y_test,y_pred_dts))
print("Training Accuracy",dts.score(x_train,y_train) * 100)
print("Test Accuracy",dts.score(x_test,y_test) * 100)



[[30  0]
 [66  0]]
              precision    recall  f1-score   support

           0       0.31      1.00      0.48        30
           1       0.00      0.00      0.00        66

    accuracy                           0.31        96
   macro avg       0.16      0.50      0.24        96
weighted avg       0.10      0.31      0.15        96

Training Accuracy 82.29166666666666
Test Accuracy 80.20833333333334


## Using SMOTE To Balanced Data

In [70]:
smote=SMOTE(random_state=42)
x_train_smote,y_train_smote=smote.fit_resample(x_train_scaled,y_train)
print(y_train_smote.value_counts())

Loan_Status
1    266
0    266
Name: count, dtype: int64


## Logistic Regression With SMOTE

In [87]:
#Training
logreg_smote=LogisticRegression(random_state=42)
logreg_smote.fit(x_train_smote,y_train_smote)


#Predict
y_pred_logreg_smote=logreg_smote.predict(x_test_scaled)

#Performance
print(confusion_matrix(y_test,y_pred_logreg_smote))
print(classification_report(y_test,y_pred_logreg_smote))
print("Training Accuracy",logreg_smote.score(x_train,y_train) * 100)
print("Test Accuracy",logreg_smote.score(x_test,y_test) * 100)




[[19 11]
 [13 53]]
              precision    recall  f1-score   support

           0       0.59      0.63      0.61        30
           1       0.83      0.80      0.82        66

    accuracy                           0.75        96
   macro avg       0.71      0.72      0.71        96
weighted avg       0.75      0.75      0.75        96

Training Accuracy 54.427083333333336
Test Accuracy 52.083333333333336


## Decison Tree With SMOTE

In [86]:
x_train_tree,y_train_tree=smote.fit_resample(x_train,y_train)

#Training
dts_smote= DecisionTreeClassifier(random_state=42,max_depth=5,min_samples_split=10,min_samples_leaf=5)
dts_smote.fit(x_train_tree,y_train_tree)

#Predict
y_pred_dts_smote=dts_smote.predict(x_test)

#Performance
print(confusion_matrix(y_test,y_pred_dts_smote))
print(classification_report(y_test,y_pred_dts_smote))
print("Training Accuracy",dts_smote.score(x_train,y_train) * 100)
print("Test Accuracy",dts_smote.score(x_test,y_test) * 100)


[[18 12]
 [12 54]]
              precision    recall  f1-score   support

           0       0.60      0.60      0.60        30
           1       0.82      0.82      0.82        66

    accuracy                           0.75        96
   macro avg       0.71      0.71      0.71        96
weighted avg       0.75      0.75      0.75        96

Training Accuracy 79.42708333333334
Test Accuracy 75.0
