In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier


In [2]:
df=pd.read_csv("/content/loan_prediction.csv")

In [3]:
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df=df.drop("Loan_ID",axis=1)


In [5]:
# missing values
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
print(df.describe())

       ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
count       614.000000         614.000000  592.000000         600.00000   
mean       5403.459283        1621.245798  146.412162         342.00000   
std        6109.041673        2926.248369   85.587325          65.12041   
min         150.000000           0.000000    9.000000          12.00000   
25%        2877.500000           0.000000  100.000000         360.00000   
50%        3812.500000        1188.500000  128.000000         360.00000   
75%        5795.000000        2297.250000  168.000000         360.00000   
max       81000.000000       41667.000000  700.000000         480.00000   

       Credit_History  
count      564.000000  
mean         0.842199  
std          0.364878  
min          0.000000  
25%          1.000000  
50%          1.000000  
75%          1.000000  
max          1.000000  


In [7]:
# fill missing values in categorical columns with mode
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df['Married'].fillna(df['Married'].mode()[0], inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)


In [8]:
#Fill missing values in LoanAmount with the median
df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)

#Fill missing values in Loan_Amount_Term with the mode
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0], inplace=True)

#Fill missing values in Credit_History with the mode
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)




# **Exploratory Data Analysis**

In [13]:
# now let's have a look at the distribution of the loan status column
import plotly.express as px
loan_status_counts = df['Loan_Status'].value_counts()
fig = px.pie(loan_status_counts, names=loan_status_counts.index)
fig.show()

In [14]:
# now let's have a look at the dustribution of the gender column
gender_counts = df['Gender'].value_counts()
fig = px.bar(gender_counts, x=gender_counts.index,
             y=gender_counts.values,
             title='Gender Distribution')
fig.show()

In [16]:
#Now let’s have a look at the distribution of the martial status column:
married_count = df['Married'].value_counts()
fig_married = px.bar(married_count,
                     x=married_count.index,
                     y=married_count.values,
                     title='Marital Status Distribution')
fig_married.show()


In [17]:
#Now let’s have a look at the distribution of the education column:
education_count = df['Education'].value_counts()
fig_education = px.bar(education_count,
                       x=education_count.index,
                       y=education_count.values,
                       title='Education Distribution')
fig_education.show()



In [18]:
#Now let’s have a look at the distribution of the self-employment column:
self_employed_count = df['Self_Employed'].value_counts()
fig_self_employed = px.bar(self_employed_count,
                           x=self_employed_count.index,
                           y=self_employed_count.values,
                           title='Self-Employment Distribution')
fig_self_employed.show()

In [19]:
#Now let’s have a look at the distribution of the Applicant Income column:
fig_applicant_income = px.histogram(df,
                                    x='ApplicantIncome',
                                    title='Applicant Income Distribution')
fig_applicant_income.show()

In [20]:
#Now let’s have a look at the relationship between the income of the loan applicant and the loan status:
fig_applicant_income_loan_status = px.scatter(df,
                                              x='ApplicantIncome',
                                              y='Loan_Status',
                                              title='Applicant Income vs Loan Status')
fig_applicant_income_loan_status.show()

In [21]:
#The “ApplicantIncome” column contains outliers which need to be removed before moving further. Here’s how to remove the outliers:
df = df[df['ApplicantIncome'] < 20000]



In [22]:
#Now let’s have a look at the relationship between the income of the loan co-applicant and the loan status:
fig_coapplicant_income_loan_status = px.scatter(df,
                                               x='CoapplicantIncome',
                                               y='Loan_Status',
                                               title='Coapplicant Income vs Loan Status')
fig_coapplicant_income_loan_status



In [23]:
#The income of the loan co-applicant also contains outliers. Let’s remove the outliers from this column as well:
df = df[df['CoapplicantIncome'] < 10000]

In [24]:
#Now let’s have a look at the relationship between the loan amount and the loan status:
fig_loan_amount_loan_status = px.scatter(df,
                                         x='LoanAmount',
                                         y='Loan_Status',
                                         title='Loan Amount vs Loan Status')
fig_loan_amount_loan_status.show()

In [28]:
#Now let’s have a look at the relationship between credit history and loan status:
fig_credit_history = px.histogram(df,
                                  x='Credit_History',
                                  color='Loan_Status',
                                  title='Credit History vs Loan Status',
                                  barmode='group')
fig_credit_history.show()

In [29]:
#Now let’s have a look at the relationship between the property area and the loan status:
fig_property_area = px.histogram(df,
                                 x='Property_Area',
                                 color='Loan_Status',
                                 title='Property Area vs Loan Status',
                                 barmode='group')
fig_property_area.show()

# Data Preparation and Training Loan Approval Prediction Model
In this step, we will:


1.convert categorical columns into numerical ones;

2.split the data into training and test sets;

3.scale the numerical features;

4.train the loan approval prediction model.

In [30]:
#convert categorical columns to numerical using one-hot encoding
cat_cols= ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
df=pd.get_dummies(df, columns=cat_cols)

#split the dataset into features X and target y
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Scale the numerical columns using StandardScaler
scaler = StandardScaler()
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

from sklearn.svm import SVC
# Train the loan approval prediction model
model = SVC(random_state=42)
model.fit(X_train, y_train)

In [32]:
#make predictons on the test set:
y_pred = model.predict(X_test)
print(y_pred)

['Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'N' 'N' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'N' 'N' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N'
 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'N' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'N'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y']


In [41]:
models={
    "Logisitic Regression":LogisticRegression(),
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boost":GradientBoostingClassifier()
}
for name,model in models.items():
  model.fit(X_train,y_train)
  y_pred=model.predict(X_test)
  print(name)
  print(accuracy_score(y_test,y_pred))
  print(confusion_matrix(y_test,y_pred))
  print(classification_report(y_test,y_pred))
  print()







Logisitic Regression
0.7666666666666667
[[19 27]
 [ 1 73]]
              precision    recall  f1-score   support

           N       0.95      0.41      0.58        46
           Y       0.73      0.99      0.84        74

    accuracy                           0.77       120
   macro avg       0.84      0.70      0.71       120
weighted avg       0.81      0.77      0.74       120


Decision Tree
0.7166666666666667
[[27 19]
 [15 59]]
              precision    recall  f1-score   support

           N       0.64      0.59      0.61        46
           Y       0.76      0.80      0.78        74

    accuracy                           0.72       120
   macro avg       0.70      0.69      0.69       120
weighted avg       0.71      0.72      0.71       120


Random Forest
0.725
[[21 25]
 [ 8 66]]
              precision    recall  f1-score   support

           N       0.72      0.46      0.56        46
           Y       0.73      0.89      0.80        74

    accuracy                  

In [49]:
#Hyperparameter tuning
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}
logistic_params = {"C": np.logspace(-4, 4, 20),
                   "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]


                   }


randomcv_models = [
                   ("RF", RandomForestClassifier(), rf_params),
                   ("LR", LogisticRegression(), logistic_params)

                   ]

#Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])


Fitting 3 folds for each of 100 candidates, totalling 300 fits



`max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.



Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for RF -------------------
{'n_estimators': 500, 'min_samples_split': 8, 'max_features': 'auto', 'max_depth': 8}
---------------- Best Params for LR -------------------
{'solver': 'lbfgs', 'C': 0.615848211066026}


In [50]:
models={

           "Random Forest":RandomForestClassifier(n_estimators=1000,min_samples_split=8,max_features=8,max_depth=15),
           "Logistic Regression":LogisticRegression(C=0.615848211066026,solver="lbfgs")


}
for name,model in models.items():
  model.fit(X_train,y_train)
  y_pred=model.predict(X_test)
  print(name)
  print(accuracy_score(y_test,y_pred))
  print(confusion_matrix(y_test,y_pred))
  print(classification_report(y_test,y_pred))
  print()


Random Forest
0.75
[[21 25]
 [ 5 69]]
              precision    recall  f1-score   support

           N       0.81      0.46      0.58        46
           Y       0.73      0.93      0.82        74

    accuracy                           0.75       120
   macro avg       0.77      0.69      0.70       120
weighted avg       0.76      0.75      0.73       120


Logistic Regression
0.775
[[19 27]
 [ 0 74]]
              precision    recall  f1-score   support

           N       1.00      0.41      0.58        46
           Y       0.73      1.00      0.85        74

    accuracy                           0.78       120
   macro avg       0.87      0.71      0.72       120
weighted avg       0.84      0.78      0.75       120


