# Import Required Libraries

In [2]:
# Data Manipulation
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.impute import SimpleImputer
from flask import Flask, request, jsonify
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

# Load and Inspect the Dataset

In [3]:
# Read the loan dataset; the relative path is resolved against the
# notebook's working directory.
DATASET_PATH = 'loan_data.csv'
data = pd.read_csv(DATASET_PATH)

### Display basic information

In [15]:
# Plain-text preview of the first five rows.
print(data.head(n=5))

    Loan_ID  ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0  LP001003         0.707469           0.098695    0.812575             360.0   
1  LP001005        -0.408932          -0.546371   -1.376596             360.0   
2  LP001006        -0.703019           0.462294    0.530102             360.0   
3  LP001008         1.706799          -0.546371    1.271595             360.0   
4  LP001013        -0.879330           0.102118   -0.352629             360.0   

   Credit_History  Loan_Status  Gender_Male  Married_Yes  Dependents_1  \
0             1.0            0         True         True          True   
1             1.0            1         True         True         False   
2             1.0            1         True         True         False   
3             1.0            1         True        False         False   
4             1.0            1         True         True         False   

   Dependents_2  Dependents_3+  Education_Not Graduate  Self_Employe

In [16]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Loan_ID                  381 non-null    object 
 1   ApplicantIncome          381 non-null    float64
 2   CoapplicantIncome        381 non-null    float64
 3   LoanAmount               381 non-null    float64
 4   Loan_Amount_Term         381 non-null    float64
 5   Credit_History           381 non-null    float64
 6   Loan_Status              381 non-null    int64  
 7   Gender_Male              381 non-null    bool   
 8   Married_Yes              381 non-null    bool   
 9   Dependents_1             381 non-null    bool   
 10  Dependents_2             381 non-null    bool   
 11  Dependents_3+            381 non-null    bool   
 12  Education_Not Graduate   381 non-null    bool   
 13  Self_Employed_Yes        381 non-null    bool   
 14  Property_Area_Semiurban  3

In [17]:
# Summary statistics for the numeric columns.
summary_stats = data.describe()
print(summary_stats)

       ApplicantIncome  CoapplicantIncome    LoanAmount  Loan_Amount_Term  \
count     3.810000e+02       3.810000e+02  3.810000e+02        381.000000   
mean      1.282147e-16      -3.963001e-17 -1.049030e-16        341.417323   
std       1.001315e+00       1.001315e+00  1.001315e+00         67.625957   
min      -2.418877e+00      -5.463709e-01 -3.389221e+00         12.000000   
25%      -6.910297e-01      -5.463709e-01 -5.291748e-01        360.000000   
50%      -1.740860e-01      -1.258801e-01  1.770094e-01        360.000000   
75%       4.994218e-01       3.159989e-01  7.772660e-01        360.000000   
max       4.318317e+00       1.392784e+01  1.589378e+00        480.000000   

       Credit_History  Loan_Status  
count      381.000000   381.000000  
mean         0.850394     0.711286  
std          0.357154     0.453761  
min          0.000000     0.000000  
25%          1.000000     0.000000  
50%          1.000000     1.000000  
75%          1.000000     1.000000  
max       

In [18]:
# Number of missing entries per column (isna is the same check as isnull).
print(data.isna().sum())

Loan_ID                    0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Loan_Status                0
Gender_Male                0
Married_Yes                0
Dependents_1               0
Dependents_2               0
Dependents_3+              0
Education_Not Graduate     0
Self_Employed_Yes          0
Property_Area_Semiurban    0
Property_Area_Urban        0
dtype: int64


# Data Preprocessing
- Handle Missing Values
- Encode Categorical Variables
- Feature Scaling
- Split Dataset

In [5]:
# Fill missing categorical values with each column's most frequent value (mode).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data[column]: that chained in-place call acts on
# a temporary Series, which pandas 2.x reports with a FutureWarning and which
# leaves `data` unchanged once Copy-on-Write is active (default in pandas 3.0).
for column in ['Gender', 'Married', 'Dependents', 'Self_Employed']:
    data[column] = data[column].fillna(data[column].mode()[0])

In [6]:
# Fill missing numerical values with the column median.
# Assign the filled Series back rather than using a chained
# fillna(..., inplace=True): the in-place form operates on a temporary object
# and does not update `data` under pandas Copy-on-Write.
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].median())
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].median())

In [7]:
# Fill missing Credit_History with its most frequent value (mode).
# Assigned back explicitly; a chained fillna(..., inplace=True) on
# data['Credit_History'] would modify a temporary Series and is not
# guaranteed to update `data` (pandas Copy-on-Write).
data['Credit_History'] = data['Credit_History'].fillna(data['Credit_History'].mode()[0])

In [8]:
# Sanity check: per-column count of remaining missing values after imputation.
missing_per_column = data.isna().sum()
print(missing_per_column)

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


### Encode Categorical Variables

In [9]:
# Encode the target: 'Y' -> 1, 'N' -> 0.
# Series.map turns any value that is not a key of the mapping into NaN, so
# re-running this cell on the already-encoded column (values 1/0) would
# replace every entry with NaN.
loan_status_codes = {'Y': 1, 'N': 0}
data['Loan_Status'] = data['Loan_Status'].map(loan_status_codes)

In [10]:
# One-hot encode the remaining categorical columns. drop_first=True omits one
# indicator column per variable.
categorical_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Preview the encoded frame.
print(data.head(n=5))

    Loan_ID  ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0  LP001003             4583             1508.0         128             360.0   
1  LP001005             3000                0.0          66             360.0   
2  LP001006             2583             2358.0         120             360.0   
3  LP001008             6000                0.0         141             360.0   
4  LP001013             2333             1516.0          95             360.0   

   Credit_History  Loan_Status  Gender_Male  Married_Yes  Dependents_1  \
0             1.0            0         True         True          True   
1             1.0            1         True         True         False   
2             1.0            1         True         True         False   
3             1.0            1         True        False         False   
4             1.0            1         True         True         False   

   Dependents_2  Dependents_3+  Education_Not Graduate  Self_Employe

### Feature Scaling

In [11]:
# Standardize the income and loan-amount columns (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split performed later in the notebook, so test-set statistics influence the
# scaling. Consider fitting on the training rows only.
numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
scaler = StandardScaler()
scaler.fit(data[numeric_cols])
data[numeric_cols] = scaler.transform(data[numeric_cols])

### Split Dataset

In [12]:
# Features: every column except the row identifier and the label.
non_feature_cols = ['Loan_ID', 'Loan_Status']
X = data.drop(non_feature_cols, axis=1)
# Target: the encoded loan status.
y = data['Loan_Status']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Train a Model

In [13]:
# Fit a random forest with default hyperparameters and a fixed seed
# (fit returns the fitted estimator itself).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted class labels for the held-out rows.
y_pred = model.predict(X_test)

# Evaluate the Model

In [14]:
# Report hold-out metrics one at a time: compute, then print.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# ROC AUC is computed from the predicted probability of class 1 (second column).
positive_class_proba = model.predict_proba(X_test)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, positive_class_proba))

Accuracy: 0.8051948051948052
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.33      0.48        21
           1       0.80      0.98      0.88        56

    accuracy                           0.81        77
   macro avg       0.84      0.66      0.68        77
weighted avg       0.82      0.81      0.77        77

Confusion Matrix:
 [[ 7 14]
 [ 1 55]]
ROC AUC Score: 0.7827380952380952
