In [1]:
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
from sklearn.model_selection import train_test_split  # For splitting data into train and test sets
from sklearn.preprocessing import LabelEncoder  # For encoding categorical variables
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors classifier
from sklearn.naive_bayes import GaussianNB  # Naive Bayes classifier
from sklearn.tree import DecisionTreeClassifier  # Decision Tree classifier
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score  # Evaluation metrics
import warnings  # To suppress warnings

warnings.filterwarnings('ignore')


# Load dataset

In [2]:
df = pd.read_csv("loan_approval_dataset.csv")

# Clean column names by removing leading/trailing spaces

In [3]:
df.columns = df.columns.str.strip()

# Show actual column names

In [4]:
print("Actual columns:", df.columns.tolist())

Actual columns: ['Applicant_ID', 'Age', 'Income', 'Credit_Score', 'Loan_Amount', 'Loan_Term', 'Interest_Rate', 'Employment_Status', 'Debt_to_Income_Ratio', 'Marital_Status', 'Number_of_Dependents', 'Property_Ownership', 'Loan_Purpose', 'Previous_Defaults']


# Display basic info

In [5]:
print("Dataset Shape:", df.shape)
df.head()

Dataset Shape: (5000, 14)


Unnamed: 0,Applicant_ID,Age,Income,Credit_Score,Loan_Amount,Loan_Term,Interest_Rate,Employment_Status,Debt_to_Income_Ratio,Marital_Status,Number_of_Dependents,Property_Ownership,Loan_Purpose,Previous_Defaults
0,1,56,21920,639,452748,72,4.53,Unemployed,43.35,Married,2,Mortgage,Business,0
1,2,69,126121,655,257134,60,5.38,Unemployed,10.42,Divorced,0,Mortgage,Education,0
2,3,46,96872,467,226437,72,3.46,Self-Employed,45.39,Divorced,4,Mortgage,Car,0
3,4,32,101132,751,310480,12,14.0,Unemployed,8.58,Single,2,Mortgage,Business,0
4,5,60,22093,404,13070,12,9.13,Self-Employed,20.7,Divorced,1,Mortgage,Home,0


# Fill missing values using forward fill

In [6]:
df.fillna(method='ffill', inplace=True)

# Encode Categorical Variables

In [7]:
label_enc = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = label_enc.fit_transform(df[col])

# Feature and Target Separation

In [8]:
X = df.drop('Previous_Defaults', axis=1)
y = df['Previous_Defaults']

# Train-Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define Model Evaluation Function

In [10]:
def evaluate_model(model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(f"\nModel: {model_name}")
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("Confusion Matrix:\n", cm)

    return [model_name, acc, prec, rec]

# Initialize and Evaluate Models

In [11]:
# Initialize models
knn = KNeighborsClassifier(n_neighbors=5)
nb = GaussianNB()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

In [12]:
# Evaluate each model
results = []
results.append(evaluate_model(knn, "K-Nearest Neighbors"))
results.append(evaluate_model(nb, "Naive Bayes"))
results.append(evaluate_model(dt, "Decision Tree"))
results.append(evaluate_model(rf, "Random Forest"))


Model: K-Nearest Neighbors
Accuracy: 0.903
Precision: 0.16666666666666666
Recall: 0.010752688172043012
Confusion Matrix:
 [[902   5]
 [ 92   1]]

Model: Naive Bayes
Accuracy: 0.907
Precision: 0.0
Recall: 0.0
Confusion Matrix:
 [[907   0]
 [ 93   0]]

Model: Decision Tree
Accuracy: 0.816
Precision: 0.13008130081300814
Recall: 0.17204301075268819
Confusion Matrix:
 [[800 107]
 [ 77  16]]

Model: Random Forest
Accuracy: 0.907
Precision: 0.0
Recall: 0.0
Confusion Matrix:
 [[907   0]
 [ 93   0]]


# Compare Models

In [13]:
comparison_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall'])
print("\n--- Model Comparison ---")
comparison_df


--- Model Comparison ---


Unnamed: 0,Model,Accuracy,Precision,Recall
0,K-Nearest Neighbors,0.903,0.166667,0.010753
1,Naive Bayes,0.907,0.0,0.0
2,Decision Tree,0.816,0.130081,0.172043
3,Random Forest,0.907,0.0,0.0


# Bonus

In [14]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Build neural network model
dl_model = Sequential()
dl_model.add(Dense(64, input_dim=X_train_dl.shape[1], activation='relu'))
dl_model.add(Dense(32, activation='relu'))
dl_model.add(Dense(1, activation='sigmoid'))
dl_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
dl_model.fit(X_train_dl, y_train_dl, epochs=50, batch_size=16, verbose=0)

# Evaluate the model
y_pred_dl = (dl_model.predict(X_test_dl) > 0.5).astype("int32").flatten()

acc_dl = accuracy_score(y_test_dl, y_pred_dl)
prec_dl = precision_score(y_test_dl, y_pred_dl)
rec_dl = recall_score(y_test_dl, y_pred_dl)
cm_dl = confusion_matrix(y_test_dl, y_pred_dl)

print("\nModel: Deep Learning (Neural Network)")
print("Accuracy:", acc_dl)
print("Precision:", prec_dl)
print("Recall:", rec_dl)
print("Confusion Matrix:\n", cm_dl)

results.append(["Deep Learning", acc_dl, prec_dl, rec_dl])


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 

Model: Deep Learning (Neural Network)
Accuracy: 0.884
Precision: 0.17142857142857143
Recall: 0.06451612903225806
Confusion Matrix:
 [[878  29]
 [ 87   6]]


# Save Results to CSV

In [15]:
comparison_df.to_csv("model_comparison_results.csv", index=False)