**Problem Statement:** Predict whether a loan application will be approved or rejected based on applicant details such as income, credit history, employment status, education, and loan amount.

**Dataset:** Indian Bank Loan Prediction dataset

**Datasource:** Kaaggle

In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score
from sklearn.metrics import recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import zipfile


In [None]:
#!pip install kaggle

In [None]:


#os.environ["KAGGLE_USERNAME"] = "saharehemant"
#os.environ["KAGGLE_KEY"] = "KGAT_2b152c052f57b6fbd48275a78b4bc3a8"

#export KAGGLE_API_TOKEN=KGAT_2b152c052f57b6fbd48275a78b4bc3a8
#kaggle competitions list

#!kaggle datasets download -d altruistdelhite04/loan-prediction-problem-dataset

#with zipfile.ZipFile("loan-prediction-problem-dataset.zip", 'r') as zip_ref:
 #   zip_ref.extractall()


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#df = pd.read_csv('/content/train.csv')
#print(df.shape)
#df.head()
#print(df.columns)

"""Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')"""

In [4]:
#Loading and preprocessing the load data from kaggle


df = pd.read_csv('train.csv')
df.head()
print(df.columns)

# handling missing value

df.fillna(df.mode().iloc[0], inplace=True)


# Encode categorical variables

le = LabelEncoder()

for col in df.select_dtypes(include='object').columns:
    df[col] = le.fit_transform(df[col])

X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


Training the 6 Models

In [5]:
#Logistic regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

#Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

#KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

#Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)

#Random Forrest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

#XGBoost (Ensemble)
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)




In [6]:
def evaluate_model(model):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }


In [7]:
results = {
    "Logistic Regression": evaluate_model(lr),
    "Decision Tree": evaluate_model(dt),
    "KNN": evaluate_model(knn),
    "Naive Bayes": evaluate_model(nb),
    "Random Forest": evaluate_model(rf),
    "XGBoost": evaluate_model(xgb)
}

comparison_df = pd.DataFrame(results).T
print(comparison_df)


                     Accuracy       AUC  Precision  Recall        F1       MCC
Logistic Regression  0.788618  0.752035   0.759615  0.9875  0.858696  0.535826
Decision Tree        0.731707  0.696948   0.783133  0.8125  0.797546  0.400951
KNN                  0.764228  0.656977   0.742857  0.9750  0.843243  0.468268
Naive Bayes          0.780488  0.726453   0.757282  0.9750  0.852459  0.508635
Random Forest        0.780488  0.757849   0.757282  0.9750  0.852459  0.508635
XGBoost              0.764228  0.740988   0.768421  0.9125  0.834286  0.455873


In [8]:
import joblib
import os

# Create the 'model' directory if it doesn't exist
os.makedirs('model', exist_ok=True)

joblib.dump(lr, "model/lr.pkl")
joblib.dump(dt, "model/dt.pkl")
joblib.dump(knn, "model/knn.pkl")
joblib.dump(nb, "model/nb.pkl")
joblib.dump(rf, "model/rf.pkl")
joblib.dump(xgb, "model/xgb.pkl")

['model/xgb.pkl']

Downloading the Models and manually uploading to Git

In [9]:
!zip -r model.zip model

from google.colab import files
files.download("model.zip")


  adding: model/ (stored 0%)
  adding: model/dt.pkl (deflated 76%)
  adding: model/knn.pkl (deflated 74%)
  adding: model/xgb.pkl (deflated 72%)
  adding: model/rf.pkl (deflated 83%)
  adding: model/nb.pkl (deflated 22%)
  adding: model/lr.pkl (deflated 31%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>