In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score

In [None]:
# Read the loan dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Loan.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01,45,39948,617,Employed,Master,22,13152,48,Married,...,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0,49.0
1,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,Single,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0
2,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,Married,...,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0,52.0
3,2018-01-04,58,69084,545,Employed,High School,34,37898,96,Single,...,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0,54.0
4,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,Married,...,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1,36.0


In [None]:

# Print the column index, non-null counts and dtypes of the loaded frame.
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ApplicationDate             20000 non-null  object 
 1   Age                         20000 non-null  int64  
 2   AnnualIncome                20000 non-null  int64  
 3   CreditScore                 20000 non-null  int64  
 4   EmploymentStatus            20000 non-null  object 
 5   EducationLevel              20000 non-null  object 
 6   Experience                  20000 non-null  int64  
 7   LoanAmount                  20000 non-null  int64  
 8   LoanDuration                20000 non-null  int64  
 9   MaritalStatus               20000 non-null  object 
 10  NumberOfDependents          20000 non-null  int64  
 11  HomeOwnershipStatus         20000 non-null  object 
 12  MonthlyDebtPayments         20000 non-null  int64  
 13  CreditCardUtilizationRate   200

In [None]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

Unnamed: 0,0
ApplicationDate,0
Age,0
AnnualIncome,0
CreditScore,0
EmploymentStatus,0
EducationLevel,0
Experience,0
LoanAmount,0
LoanDuration,0
MaritalStatus,0


In [None]:

# Categorical columns to one-hot encode.
categorical_columns = [
    'MaritalStatus',
    'EmploymentStatus',
    'EducationLevel',
    'HomeOwnershipStatus',
    'LoanPurpose',
]

# Expand each listed column into one indicator column per level; every level
# is kept (drop_first=False), so no reference category is removed.
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_columns,
    drop_first=False,
)

print(df_encoded.head())

  ApplicationDate  Age  AnnualIncome  CreditScore  Experience  LoanAmount  \
0      2018-01-01   45         39948          617          22       13152   
1      2018-01-02   38         39709          628          15       26045   
2      2018-01-03   47         40724          570          26       17627   
3      2018-01-04   58         69084          545          34       37898   
4      2018-01-05   37        103264          594          17        9184   

   LoanDuration  NumberOfDependents  MonthlyDebtPayments  \
0            48                   2                  183   
1            48                   1                  496   
2            36                   2                  902   
3            96                   1                  755   
4            36                   1                  274   

   CreditCardUtilizationRate  ...  EducationLevel_Master  \
0                   0.354418  ...                   True   
1                   0.087827  ...                  False

In [None]:
# Convert the boolean indicator columns of the encoded frame to 0/1 integers.
# Casting whole columns is vectorised and replaces DataFrame.applymap, which is
# deprecated since pandas 2.1 and invoked a Python lambda once per cell.
# Non-boolean columns are carried over unchanged.
indicator_columns = df_encoded.select_dtypes(include='bool').columns
df = df_encoded.astype({column: 'int64' for column in indicator_columns})

print(df.head())

  ApplicationDate  Age  AnnualIncome  CreditScore  Experience  LoanAmount  \
0      2018-01-01   45         39948          617          22       13152   
1      2018-01-02   38         39709          628          15       26045   
2      2018-01-03   47         40724          570          26       17627   
3      2018-01-04   58         69084          545          34       37898   
4      2018-01-05   37        103264          594          17        9184   

   LoanDuration  NumberOfDependents  MonthlyDebtPayments  \
0            48                   2                  183   
1            48                   1                  496   
2            36                   2                  902   
3            96                   1                  755   
4            36                   1                  274   

   CreditCardUtilizationRate  ...  EducationLevel_Master  \
0                   0.354418  ...                      1   
1                   0.087827  ...                      0

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric features to rescale with min-max normalisation.
columns_to_scale = [
    'Age',
    'AnnualIncome',
    'CreditScore',
    'Experience',
    'LoanAmount',
    'LoanDuration',
    'MonthlyDebtPayments',
    'DebtToIncomeRatio',
]

# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/validation/test split done in a later cell, so the min/max of held-out
# rows influence the training features — confirm this is acceptable.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[columns_to_scale])

# Write the scaled values back over the original columns.
df[columns_to_scale] = scaled_values

df.head()

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,Experience,LoanAmount,LoanDuration,NumberOfDependents,MonthlyDebtPayments,CreditCardUtilizationRate,...,EducationLevel_Master,HomeOwnershipStatus_Mortgage,HomeOwnershipStatus_Other,HomeOwnershipStatus_Own,HomeOwnershipStatus_Rent,LoanPurpose_Auto,LoanPurpose_Debt Consolidation,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other
0,2018-01-01,0.435484,0.053042,0.742547,0.360656,0.052348,0.333333,2,0.046358,0.354418,...,1,0,0,1,0,0,0,0,1,0
1,2018-01-02,0.322581,0.052534,0.772358,0.245902,0.123557,0.333333,1,0.155455,0.087827,...,0,1,0,0,0,0,1,0,0,0
2,2018-01-03,0.467742,0.054692,0.615176,0.42623,0.077064,0.222222,2,0.296968,0.137414,...,0,0,0,0,1,0,0,1,0,0
3,2018-01-04,0.645161,0.114989,0.547425,0.557377,0.189022,0.777778,1,0.24573,0.267587,...,0,1,0,0,0,0,0,0,1,0
4,2018-01-05,0.306452,0.18766,0.680217,0.278689,0.030432,0.222222,1,0.078076,0.320535,...,0,1,0,0,0,0,1,0,0,0


In [None]:
from sklearn.model_selection import train_test_split

# Target columns for the two tasks.
classification_target = 'LoanApproved'
regression_target = 'RiskScore'

# Feature matrix: every column except both targets and the application date.
non_feature_columns = [classification_target, regression_target, 'ApplicationDate']
X = df.drop(columns=non_feature_columns)
y_classification = df[classification_target]
y_regression = df[regression_target]

# --- Classification splits (stratified on the label at both steps) ---
# 20% is held out as the test set, then 25% of the remainder becomes the
# validation set, i.e. 60/20/20 train/validation/test overall.
X_trainval_class, X_test_class, y_trainval_class, y_test_class = train_test_split(
    X,
    y_classification,
    test_size=0.2,
    random_state=42,
    stratify=y_classification,
)
X_train_class, X_val_class, y_train_class, y_val_class = train_test_split(
    X_trainval_class,
    y_trainval_class,
    test_size=0.25,
    random_state=25,
    stratify=y_trainval_class,
)

# --- Regression splits (same proportions, no stratification) ---
# These use their own seeds, so the rows differ from the classification splits.
X_trainval_reg, X_test_reg, y_trainval_reg, y_test_reg = train_test_split(
    X,
    y_regression,
    test_size=0.2,
    random_state=30,
)
X_train_reg, X_val_reg, y_train_reg, y_val_reg = train_test_split(
    X_trainval_reg,
    y_trainval_reg,
    test_size=0.25,
    random_state=60,
)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression baseline on the regression training split
# (fit returns the estimator itself).
linear_reg = LinearRegression().fit(X_train_reg, y_train_reg)

# Score the fitted model on the validation split.
y_val_pred_reg_lr = linear_reg.predict(X_val_reg)
val_mse_lr = mean_squared_error(y_val_reg, y_val_pred_reg_lr)
val_r2_lr = r2_score(y_val_reg, y_val_pred_reg_lr)

print("Linear Regression Validation MSE:", val_mse_lr)
print("Linear Regression Validation R² Score:", val_r2_lr)


Linear Regression Validation MSE: 12.25511501332463
Linear Regression Validation R² Score: 0.7969821863671835


In [None]:
# Evaluate the fitted linear regression on the held-out regression test split.
y_test_pred_reg_lr = linear_reg.predict(X_test_reg)
test_mse_lr = mean_squared_error(y_test_reg, y_test_pred_reg_lr)
test_r2_lr = r2_score(y_test_reg, y_test_pred_reg_lr)

print("Linear Regression Test MSE:", test_mse_lr)
print("Linear Regression Test R² Score:", test_r2_lr)


Linear Regression Test MSE: 14.150967647433852
Linear Regression Test R² Score: 0.7720800995174577


In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Initialize Ridge Regression (alpha is the regularisation strength)
ridge_reg = Ridge(alpha=1.0)

# Train Ridge Regression on the regression training split
ridge_reg.fit(X_train_reg, y_train_reg)

# Validate the model. The validation predictions were previously computed but
# never scored; report them the same way the other regression cells do.
y_val_pred_reg_ridge = ridge_reg.predict(X_val_reg)
print("Ridge Regression Validation MSE:", mean_squared_error(y_val_reg, y_val_pred_reg_ridge))
print("Ridge Regression Validation R² Score:", r2_score(y_val_reg, y_val_pred_reg_ridge))

# Test the model on the held-out regression test split
y_test_pred_reg_ridge = ridge_reg.predict(X_test_reg)
print("Ridge Regression Test MSE:", mean_squared_error(y_test_reg, y_test_pred_reg_ridge))
print("Ridge Regression Test R² Score:", r2_score(y_test_reg, y_test_pred_reg_ridge))


Ridge Regression Test MSE: 14.160835703642249
Ridge Regression Test R² Score: 0.7719211615250171


In [None]:
# Score of the linear model on its own training split, shown as the cell
# output for comparison with the validation/test scores printed above.
linear_reg.score(X_train_reg, y_train_reg)

0.7857033920109311

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Hyper-parameters for the gradient-boosted regressor.
xgb_reg_params = {
    'n_estimators': 200,    # number of trees
    'max_depth': 6,         # maximum depth of each tree
    'learning_rate': 0.1,   # step size shrinkage
    'random_state': 70,
}
xgb_reg = XGBRegressor(**xgb_reg_params)

# Fit on the regression training split.
xgb_reg.fit(X_train_reg, y_train_reg)

# Validation metrics.
y_val_pred_reg_xgb = xgb_reg.predict(X_val_reg)
val_mse_xgb = mean_squared_error(y_val_reg, y_val_pred_reg_xgb)
val_r2_xgb = r2_score(y_val_reg, y_val_pred_reg_xgb)
print("XGBoost Validation MSE:", val_mse_xgb)
print("XGBoost Validation R² Score:", val_r2_xgb)

# Test metrics.
y_test_pred_reg_xgb = xgb_reg.predict(X_test_reg)
test_mse_xgb = mean_squared_error(y_test_reg, y_test_pred_reg_xgb)
test_r2_xgb = r2_score(y_test_reg, y_test_pred_reg_xgb)
print("XGBoost Test MSE:", test_mse_xgb)
print("XGBoost Test R² Score:", test_r2_xgb)


XGBoost Validation MSE: 5.424948668355845
XGBoost Validation R² Score: 0.910130487023387
XGBoost Test MSE: 5.90979577044919
XGBoost Test R² Score: 0.9048149852764883


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic regression baseline for the approval label; the iteration cap is
# raised to 1000 via max_iter.
log_reg = LogisticRegression(max_iter=1000, random_state=48)
log_reg.fit(X_train_class, y_train_class)

# Score on the classification validation split.
y_val_pred_class_lr = log_reg.predict(X_val_class)
val_accuracy_lr = accuracy_score(y_val_class, y_val_pred_class_lr)
val_report_lr = classification_report(y_val_class, y_val_pred_class_lr)

print("Logistic Regression Validation Accuracy:", val_accuracy_lr)
print("Classification Report:\n", val_report_lr)


Logistic Regression Validation Accuracy: 0.8705
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92      3044
           1       0.78      0.64      0.70       956

    accuracy                           0.87      4000
   macro avg       0.84      0.79      0.81      4000
weighted avg       0.87      0.87      0.87      4000



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyper-parameters and a fixed seed
# (fit returns the estimator itself).
rf_clf = RandomForestClassifier(random_state=77).fit(X_train_class, y_train_class)

# Score on the classification validation split.
y_val_pred_class_rf = rf_clf.predict(X_val_class)
val_accuracy_rf = accuracy_score(y_val_class, y_val_pred_class_rf)
val_report_rf = classification_report(y_val_class, y_val_pred_class_rf)

print("Random Forest Validation Accuracy:", val_accuracy_rf)
print("Classification Report:\n", val_report_rf)


Random Forest Validation Accuracy: 0.9245
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95      3044
           1       0.88      0.79      0.83       956

    accuracy                           0.92      4000
   macro avg       0.91      0.88      0.89      4000
weighted avg       0.92      0.92      0.92      4000



In [None]:
from xgboost import XGBClassifier

# Train XGBoost Classifier.
# The deprecated `use_label_encoder` flag is no longer passed: recent XGBoost
# releases ignore it, and the labels here are already integer-encoded 0/1.
xgb_clf = XGBClassifier(random_state=22, eval_metric='logloss')
xgb_clf.fit(X_train_class, y_train_class)

# Validate the model on the classification validation split
y_val_pred_class_xgb = xgb_clf.predict(X_val_class)
print("XGBoost Validation Accuracy:", accuracy_score(y_val_class, y_val_pred_class_xgb))
print("Classification Report:\n", classification_report(y_val_class, y_val_pred_class_xgb))


XGBoost Validation Accuracy: 0.95175
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      3044
           1       0.91      0.89      0.90       956

    accuracy                           0.95      4000
   macro avg       0.94      0.93      0.93      4000
weighted avg       0.95      0.95      0.95      4000



In [None]:
# Predict the held-out classification test split with each fitted classifier.
y_test_pred_class_lr = log_reg.predict(X_test_class)
y_test_pred_class_rf = rf_clf.predict(X_test_class)
y_test_pred_class_xgb = xgb_clf.predict(X_test_class)

# Report accuracy and the per-class report for every model, in the same order
# and with the same labels as before.
test_predictions_by_model = (
    ("Logistic Regression", y_test_pred_class_lr),
    ("Random Forest", y_test_pred_class_rf),
    ("XGBoost", y_test_pred_class_xgb),
)
for model_label, test_predictions in test_predictions_by_model:
    print(f"{model_label} Test Accuracy:", accuracy_score(y_test_class, test_predictions))
    print(f"{model_label} Test Report:\n", classification_report(y_test_class, test_predictions))

Logistic Regression Test Accuracy: 0.8765
Logistic Regression Test Report:
               precision    recall  f1-score   support

           0       0.90      0.95      0.92      3044
           1       0.79      0.65      0.72       956

    accuracy                           0.88      4000
   macro avg       0.85      0.80      0.82      4000
weighted avg       0.87      0.88      0.87      4000

Random Forest Test Accuracy: 0.92825
Random Forest Test Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95      3044
           1       0.89      0.80      0.84       956

    accuracy                           0.93      4000
   macro avg       0.91      0.88      0.90      4000
weighted avg       0.93      0.93      0.93      4000

XGBoost Test Accuracy: 0.95725
XGBoost Test Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.97      3044
           1       0.92      0.90      0.91