# Loan Status Predictor

In [1]:
# pip install imbalanced-learn  (provides the `imblearn` module imported below)

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

# 1. DATA COLLECTION

In [None]:
# Read the loan CSV into a DataFrame and preview its first rows.
DATA_FILE = "data/bankloan.csv"
df = pd.read_csv(DATA_FILE)
df.head()

# 2. DATA CLEANING

In [None]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

In [None]:
# Remove the Loan_ID column in place (presumably a row identifier with no
# predictive value -- confirm), then preview the result.
df.drop(columns="Loan_ID", inplace=True)
df.head()

In [None]:
# Column dtypes after dropping Loan_ID.
df.dtypes

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Per-column non-null counts and dtypes.
df.info()

In [None]:
# Number of missing values in each column before imputation.
df.isna().sum()

In [None]:
# Re-check dtypes to tell categorical columns from numeric ones before filling.
df.dtypes

In [None]:
# Fill missing values in the categorical columns with fixed labels.
#
# Each column is reassigned explicitly instead of calling
# `df[col].fillna(..., inplace=True)`: that chained form acts on an
# intermediate object, is deprecated in recent pandas, and leaves `df`
# untouched once copy-on-write is enabled.
#
# NOTE(review): the fill labels are hard-coded rather than derived from the
# data (e.g. the column mode) -- confirm these are the intended defaults.
categorical_fill = {"Gender": "Female", "Married": "No", "Self_Employed": "Yes"}
for column, label in categorical_fill.items():
    df[column] = df[column].fillna(label)

In [None]:
# Fill missing values in the numeric columns with each column's own median.
#
# Explicit reassignment replaces the chained
# `df[col].fillna(..., inplace=True)` pattern, which is deprecated in recent
# pandas and has no effect on `df` under copy-on-write.
for column in ("Dependents", "LoanAmount", "Loan_Amount_Term", "Credit_History"):
    df[column] = df[column].fillna(df[column].median())

In [None]:
# Missing-value counts again, to verify the fills above.
df.isna().sum()

In [None]:
# Cast the two count/flag columns to plain integers.
for int_column in ("Dependents", "Credit_History"):
    df[int_column] = df[int_column].astype(int)

In [None]:
# Confirm the dtypes after the integer casts.
df.dtypes

In [None]:
# Checking for duplicates
duplicates_df = df[df.duplicated()]
duplicates_df.shape

In [None]:
# Dimensions of the frame at the end of the cleaning section.
df.shape

# 3. EXPLORATORY DATA ANALYSIS

## 3.1. NUMERICAL FEATURES

In [None]:
# Subset holding only the continuous numeric features used in the EDA plots.
numeric_columns = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]
df_num = df.loc[:, numeric_columns]

In [None]:
# One 50-bin histogram per numeric column; the trailing ';' suppresses the
# notebook's text output of the returned axes array.
df_num.hist(figsize=(20,20), bins=50);

### 3.1.1. Loan Amount

In [None]:
# Horizontal box plot of LoanAmount.
plt.figure(figsize=(12,7))

# Pass the series by keyword: seaborn 0.11 warns about a positional vector,
# and newer releases interpret the first positional argument as `data`,
# which changes the plot orientation.
sns.boxplot(x=df["LoanAmount"])

plt.show()

**OBSERVATION**:
- The data is skewed to the right.
- Q1 is 100.
- Maximum value is 700.

### 3.1.2. Applicant Income

In [None]:
# Horizontal box plot of ApplicantIncome.
plt.figure(figsize=(15,7))

# Pass the series by keyword: seaborn 0.11 warns about a positional vector,
# and newer releases interpret the first positional argument as `data`,
# which changes the plot orientation.
sns.boxplot(x=df["ApplicantIncome"])

plt.show()

**OBSERVATION**:
- The data is skewed to the right.
- Minimum value is 0.

### 3.1.3. Applicant Income vs Loan Amount

In [None]:
# Scatter of applicant income against requested loan amount.
plt.figure(figsize=(15,10))
scatter_ax = sns.scatterplot(data=df, x="ApplicantIncome", y="LoanAmount")
scatter_ax.set_xlabel("Applicant Income")
scatter_ax.set_ylabel("Loan Amount")
scatter_ax.set_title("Applicant Income vs Loan Amount")
plt.show()

### 3.1.4. Correlation

In [None]:
# Annotated heatmap of pairwise correlations between the numeric features.
plt.figure(figsize=(15,10))
correlations = df_num.corr()
sns.heatmap(correlations, annot=True, cmap="YlGnBu");

**OBSERVATION**:
- **Applicant Income** and **Loan Amount** have a positive correlation.

## 3.2. CATEGORICAL FEATURES

### 3.2.1. GENDER

In [None]:
# Count of applications per Gender value.
plt.figure(figsize=(10,5))
sns.set()
gender_ax = plt.gca()  # axes are created after sns.set(), as in plt.hist
gender_ax.hist(df["Gender"])
gender_ax.set_xlabel("Gender")
gender_ax.set_ylabel("Count")
plt.show()

### 3.2.2. MARRIED

In [None]:
# Count of applications per Married value.
plt.figure(figsize=(10,5))
sns.set()
married_ax = plt.gca()  # axes are created after sns.set(), as in plt.hist
married_ax.hist(df["Married"])
married_ax.set_xlabel("Married")
married_ax.set_ylabel("Count")
plt.show()

### 3.2.3. EDUCATION

In [None]:
# Count of applications per Education value.
plt.figure(figsize=(10,5))
sns.set()
education_ax = plt.gca()  # axes are created after sns.set(), as in plt.hist
education_ax.hist(df["Education"])
education_ax.set_xlabel("Education")
education_ax.set_ylabel("Count")
plt.show()

### 3.2.4. SELF-EMPLOYED

In [None]:
# Count of applications per Self_Employed value.
plt.figure(figsize=(10,5))
sns.set()
employed_ax = plt.gca()  # axes are created after sns.set(), as in plt.hist
employed_ax.hist(df["Self_Employed"])
employed_ax.set_xlabel("Self Employed")
employed_ax.set_ylabel("Count")
plt.show()

### 3.2.5. PROPERTY AREA

In [None]:
# Count of applications per Property_Area value.
plt.figure(figsize=(10,5))
area_ax = plt.gca()
area_ax.hist(df["Property_Area"])
area_ax.set_xlabel("Property Area")
area_ax.set_ylabel("Count")
plt.show()

### 3.2.6. DEPENDENTS

In [None]:
# Histogram of the number of dependents per application.
plt.figure(figsize=(10,5))
sns.set()
plt.hist(df.Dependents)
plt.xlabel("Dependents")
plt.ylabel("Count")

# Label the integer positions with words. The earlier `plt.xticks()` query
# and the intermediate `np.arange` tick call were dropped: their results
# were unused or immediately overwritten by this call.
plt.xticks([0, 1, 2, 3], ['Zero', 'One', 'Two', 'Three'], rotation=20)

plt.show()

### 3.2.7. CREDIT HISTORY

In [None]:
# Histogram of the Credit_History flag.
plt.figure(figsize=(10,5))
sns.set()
plt.hist(df.Credit_History)
plt.xlabel("Credit History")
plt.ylabel("Count")

# Label the two integer positions. The earlier `plt.xticks()` query and the
# intermediate `np.arange` tick call were dropped: their results were unused
# or immediately overwritten by this call.
plt.xticks([0, 1], ['No', 'Yes'], rotation=20)

plt.show()

### 3.2.8. LOAN STATUS

In [None]:
# Count of applications per Loan_Status value (the prediction target).
plt.figure(figsize=(10,5))
sns.set()
status_ax = plt.gca()  # axes are created after sns.set(), as in plt.hist
status_ax.hist(df["Loan_Status"])
status_ax.set_xlabel("Loan Status")
status_ax.set_ylabel("Count")
plt.show()

In [None]:
# Convert Categorical Features to Numeric
le = LabelEncoder()

# Apply le on categorical feature columns
df[['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']] = df[['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']].apply(lambda col: le.fit_transform(col))

# 4. BUILDING MODEL

## 4.1. Analysing Imbalanced Dataset

In [None]:
# Dimensions of the encoded frame before any resampling.
df.shape

In [None]:
# Class frequencies of the target, ordered by class label.
# Uses Series.value_counts because the top-level pd.value_counts function is
# deprecated in recent pandas releases.
loan_status = df["Loan_Status"].value_counts(sort=True).sort_index()
print(loan_status)

In [None]:
# Bar chart of the class frequencies before resampling.
status_ax = loan_status.plot(kind="bar", figsize=(12,7), color="maroon")
status_ax.set_title("Loan Status Histogram")
status_ax.set_xlabel("Loan Status")
status_ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Count rows per encoded target value: 1 is reported as approved and 0 as
# not approved by the print cell that follows.
approved_mask = df["Loan_Status"] == 1
rejected_mask = df["Loan_Status"] == 0
yes_status = int(approved_mask.sum())
no_status = int(rejected_mask.sum())
total_status = yes_status + no_status

In [None]:
# Report the class balance computed above.
print(f"Loan Approved: {yes_status}")
print(f"Loan Not Approved: {no_status}")
print(f"Total Loan Applications: {total_status}")

In [None]:
# Pie chart of the class balance before resampling.
plt.figure(figsize=(12,6))
# Use the counts computed from the data rather than hard-coded numbers, so
# the chart stays correct if the dataset or the cleaning steps change.
size = [yes_status, no_status]
labels = ['yes', 'no']
colors = ['maroon', 'grey']
explode = [0, 0.2]  # pull the 'no' slice out for emphasis

plt.pie(size, colors = colors, labels = labels, shadow = True, explode = explode, autopct = '%.2f%%')
plt.title('LOAN STATUS', fontsize = 20)
plt.legend()
plt.show()

## 4.2. Undersampling

In [None]:
# Creating an undersampler object
rus = RandomUnderSampler(random_state=2)

In [None]:
# Separate the target column from the feature columns.
target_column = "Loan_Status"
y = df[target_column]
X = df.drop(columns=target_column)

In [None]:
# Resample the features for training data and the target
X_sampled, y_sampled = rus.fit_sample(X,y)

In [None]:
# Revert resampled data into dataframe
X_sampled = pd.DataFrame(X_sampled)
y_sampled = pd.DataFrame(y_sampled)

In [None]:
# Dimensions of the resampled feature frame.
X_sampled.shape

In [None]:
# Dimensions of the resampled target frame.
y_sampled.shape

In [None]:
# Join the resampled features and target column-wise into a single frame.
undersampled_df = pd.concat([X_sampled, y_sampled], axis=1)

In [None]:
# Preview the first rows of the balanced frame.
undersampled_df.head()

In [None]:
# Dimensions of the balanced frame.
undersampled_df.shape

## 4.3. Data After Undersampling

In [None]:
# Class frequencies of the target after under-sampling, ordered by label.
# Uses Series.value_counts because the top-level pd.value_counts function is
# deprecated in recent pandas releases.
loan_status_sampled = undersampled_df["Loan_Status"].value_counts(sort=True).sort_index()
print(loan_status_sampled)

In [None]:
# Bar chart of the class frequencies after under-sampling.
sampled_ax = loan_status_sampled.plot(kind="bar", figsize=(12,7), color="darkgreen")
sampled_ax.set_title("Loan Status Histogram")
sampled_ax.set_xlabel("Loan Status")
sampled_ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Recount rows per encoded target value on the balanced frame: 1 is reported
# as approved and 0 as not approved by the print cell that follows.
approved_mask = undersampled_df["Loan_Status"] == 1
rejected_mask = undersampled_df["Loan_Status"] == 0
yes_status = int(approved_mask.sum())
no_status = int(rejected_mask.sum())
total_status = yes_status + no_status

In [None]:
# Report the class balance after under-sampling.
print(f"Loan Approved: {yes_status}")
print(f"Loan Not Approved: {no_status}")
print(f"Total Loan Applications: {total_status}")

In [None]:
# Pie chart of the class balance after under-sampling.
plt.figure(figsize=(12,6))
# Use the counts recomputed on the balanced frame rather than hard-coded
# numbers, so the chart always reflects the actual resampling result.
size = [yes_status, no_status]
labels = ['yes', 'no']
colors = ['darkgreen', 'grey']

plt.pie(size, colors = colors, labels = labels, shadow = True, autopct = '%.2f%%')
plt.title('LOAN STATUS', fontsize = 20)
plt.legend()
plt.show()

In [None]:
# Rebuild features/target from the balanced frame and hold out 20% of the
# rows for testing; the seed keeps the split reproducible.
target_col = "Loan_Status"
X = undersampled_df.drop(columns=target_col)
y = undersampled_df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

## 4.4. Machine Learning Algorithms

### 4.4.1. Random Forest Classifier

In [None]:
# Random forest on the training split. The seed matches the random_state=2
# used for the under-sampler and the train/test split, so the reported
# metrics are reproducible between runs.
rf = RandomForestClassifier(random_state=2)
rf.fit(X_train, y_train)

In [None]:
# Random forest predictions for the held-out test rows.
y_pred_rf = rf.predict(X_test)

### 4.4.2. KNN

In [None]:
# K-nearest-neighbours classifier with library-default settings.
# NOTE(review): no feature scaling is applied in the visible cells, and KNN
# is distance-based -- consider standardising the features first.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
# KNN predictions for the held-out test rows.
y_pred_knn = knn.predict(X_test)

### 4.4.3. Decision Tree

In [None]:
# Decision tree on the training split. The seed matches the random_state=2
# used for the under-sampler and the train/test split, so the reported
# metrics are reproducible between runs.
dt = DecisionTreeClassifier(random_state=2)
dt.fit(X_train, y_train)

In [None]:
# Decision tree predictions for the held-out test rows.
y_pred_dt = dt.predict(X_test)

### 4.4.4. SVC

In [None]:
# Support vector classifier with library-default settings.
# NOTE(review): no feature scaling is applied in the visible cells, and
# SVMs are sensitive to feature magnitudes -- consider standardising first.
svc = SVC()
svc.fit(X_train, y_train)

In [None]:
# SVC predictions for the held-out test rows.
y_pred_svc = svc.predict(X_test)

### 4.4.5. GaussianNB

In [None]:
# Gaussian naive Bayes classifier fitted on the training split.
gn = GaussianNB()
gn.fit(X_train, y_train)

In [None]:
# Gaussian naive Bayes predictions for the held-out test rows.
y_pred_gn = gn.predict(X_test)

## ACCURACY SCORE

In [None]:
# Maps a model's display name to its test accuracy; filled in the next cell.
scores = {}

In [None]:
# Score every model's test predictions and record them under display names.
predictions_by_model = {
    'Random Forest': y_pred_rf,
    'KNN': y_pred_knn,
    'Decision Tree': y_pred_dt,
    'SVC': y_pred_svc,
    'GaussianNB': y_pred_gn,
}
for model_name, predicted in predictions_by_model.items():
    scores[model_name] = accuracy_score(y_test, predicted)

# Keep the individual accuracy variables available under their usual names.
acc_rf = scores['Random Forest']
acc_knn = scores['KNN']
acc_dt = scores['Decision Tree']
acc_svc = scores['SVC']
acc_gn = scores['GaussianNB']
print(acc_rf, acc_knn, acc_dt, acc_svc, acc_gn)

In [None]:
# Bar chart comparing test accuracy across the models.
colors = ["purple", "green", "orange", "magenta","#CFC60E","#0FBBAE"]

# accuracy_score returns fractions in [0, 1]; convert to percentages so the
# bars match the "Accuracy %" label and the 0-100 tick range.
model_names = list(scores.keys())
accuracy_pct = [value * 100 for value in scores.values()]

sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
plt.yticks(np.arange(0,101,10))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
# Trim the palette to one colour per bar.
sns.barplot(x=model_names, y=accuracy_pct, palette=colors[:len(model_names)])
plt.show()

## CLASSIFICATION REPORT

In [None]:
# Random Forest
print(classification_report(y_test, y_pred_rf))

In [None]:
# KNN
print(classification_report(y_test, y_pred_knn))

In [None]:
# Decision Tree
print(classification_report(y_test, y_pred_dt))

In [None]:
# SVC
print(classification_report(y_test, y_pred_svc))

In [None]:
# GaussianNB
print(classification_report(y_test, y_pred_gn ))

## CONFUSION MATRIX

In [None]:
# Confusion matrix (true labels vs. predictions) for each model, in the
# order: random forest, KNN, decision tree, SVC, Gaussian NB.
cm_rf, cm_knn, cm_dt, cm_svc, cm_gn = (
    confusion_matrix(y_test, predicted)
    for predicted in (y_pred_rf, y_pred_knn, y_pred_dt, y_pred_svc, y_pred_gn)
)

In [None]:
# Draw all confusion matrices as annotated heatmaps on a 2x3 grid
# (the sixth slot is left empty).
plt.figure(figsize=(24,12))

plt.suptitle("Confusion Matrixes",fontsize=24)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

panels = [
    ("Random Forest Confusion Matrix", cm_rf),
    ("K Nearest Neighbors Confusion Matrix", cm_knn),
    ("Decision Tree Confusion Matrix", cm_dt),
    ("SVC Confusion Matrix", cm_svc),
    ("GaussianNB Confusion Matrix", cm_gn),
]

for slot, (panel_title, matrix) in enumerate(panels, start=1):
    plt.subplot(2, 3, slot)
    plt.title(panel_title, fontsize=20)
    sns.heatmap(matrix, annot=True, cmap="Blues", fmt="d", cbar=False, annot_kws={"size": 24})


## ROC CURVE