In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Load the dataset
file_path = r"C:\Users\ganna\PycharmProjects\DS-ASSIGNMENT-2\loan_approval_dataset.csv"
#file_path = r"C:\Users\elena\Downloads\archive\loan_approval_dataset.csv"

columns = [
    "Applicant_ID", "Age", "Income", "Credit_Score", "Loan_Amount", "Loan_Term",
    "Interest_Rate", "Employment_Status", "Debt_to_Income_Ratio", "Marital_Status",
    "Number_of_Dependents", "Property_Ownership", "Loan_Purpose", "Previous_Defaults"
]
data = pd.read_csv(file_path, header=0, names=columns)

# Display the first few rows of the dataset
print(data.head())

   Applicant_ID  Age  Income  Credit_Score  Loan_Amount  Loan_Term  \
0             1   56   21920           639       452748         72   
1             2   69  126121           655       257134         60   
2             3   46   96872           467       226437         72   
3             4   32  101132           751       310480         12   
4             5   60   22093           404        13070         12   

   Interest_Rate Employment_Status  Debt_to_Income_Ratio Marital_Status  \
0           4.53        Unemployed                 43.35        Married   
1           5.38        Unemployed                 10.42       Divorced   
2           3.46     Self-Employed                 45.39       Divorced   
3          14.00        Unemployed                  8.58         Single   
4           9.13     Self-Employed                 20.70       Divorced   

   Number_of_Dependents Property_Ownership Loan_Purpose  Previous_Defaults  
0                     2           Mortgage     Busi

In [4]:
# Categorical Encoding
categorical_columns = ["Employment_Status", "Marital_Status","Property_Ownership", "Loan_Purpose"]
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


In [5]:
# Split the Data
X = data.drop(columns=['Loan_Purpose'])
y = data['Loan_Purpose']
print(X)
print(y)


      Applicant_ID  Age  Income  Credit_Score  Loan_Amount  Loan_Term  \
0                1   56   21920           639       452748         72   
1                2   69  126121           655       257134         60   
2                3   46   96872           467       226437         72   
3                4   32  101132           751       310480         12   
4                5   60   22093           404        13070         12   
...            ...  ...     ...           ...          ...        ...   
4995          4996   24  169594           755       299944         48   
4996          4997   66  162728           829        15886         24   
4997          4998   26  166965           468       477830         48   
4998          4999   53   36493           442       205981         60   
4999          5000   36  154704           336       183308         36   

      Interest_Rate  Employment_Status  Debt_to_Income_Ratio  Marital_Status  \
0              4.53                  2     

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# 1. Feature Scaling (Critical for KNN!)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Use the same scaler!

# 2. Initialize and train KNN (start with k=5)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# 3. Predict and evaluate
y_pred_knn = knn.predict(X_test_scaled)

# 4. Metrics
print("\n=== KNN Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))


=== KNN Results ===
Accuracy: 0.19
Confusion Matrix:
[[54 30 35 23 19]
 [55 45 44 29 33]
 [59 51 32 28 33]
 [70 53 40 28 37]
 [68 44 37 23 30]]
Classification Report:
              precision    recall  f1-score   support

           0       0.18      0.34      0.23       161
           1       0.20      0.22      0.21       206
           2       0.17      0.16      0.16       203
           3       0.21      0.12      0.16       228
           4       0.20      0.15      0.17       202

    accuracy                           0.19      1000
   macro avg       0.19      0.20      0.19      1000
weighted avg       0.19      0.19      0.18      1000



In [8]:
#NAIVE BAYES

# initialize the naive bayes classifier
nb_classifier = GaussianNB()

# train the classifier on the training data
nb_classifier.fit(X_train, y_train)

# make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.184
Confusion Matrix:
[[34 38 23 10 56]
 [54 44 28 19 61]
 [63 37 26 14 63]
 [54 60 28 18 68]
 [48 37 37 18 62]]
Classification Report:
              precision    recall  f1-score   support

           0       0.13      0.21      0.16       161
           1       0.20      0.21      0.21       206
           2       0.18      0.13      0.15       203
           3       0.23      0.08      0.12       228
           4       0.20      0.31      0.24       202

    accuracy                           0.18      1000
   macro avg       0.19      0.19      0.18      1000
weighted avg       0.19      0.18      0.18      1000



In [9]:
# DECISION TREE

# Initialize the Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training data
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_dt = dt_classifier.predict(X_test)

# Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt}")

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 0.217
Confusion Matrix:
[[34 38 23 10 56]
 [54 44 28 19 61]
 [63 37 26 14 63]
 [54 60 28 18 68]
 [48 37 37 18 62]]
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.14      0.15      0.14       161
           1       0.21      0.21      0.21       206
           2       0.21      0.18      0.20       203
           3       0.25      0.24      0.24       228
           4       0.25      0.29      0.27       202

    accuracy                           0.22      1000
   macro avg       0.21      0.21      0.21      1000
weighted avg       0.22      0.22      0.22      1000



In [14]:
# 1. Initialize Random Forest
# (Use 100 trees and set random_state for reproducibility)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 2. Train the model (no scaling needed for tree-based models!)
rf.fit(X_train, y_train)

# 3. Predict and evaluate
y_pred_rf = rf.predict(X_test)

# 4. Metrics
print("\n=== Random Forest Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))




=== Random Forest Results ===
Accuracy: 0.21
Confusion Matrix:
[[40 36 31 23 31]
 [43 39 43 34 47]
 [57 49 48 20 29]
 [48 50 48 32 50]
 [54 32 41 23 52]]
Classification Report:
              precision    recall  f1-score   support

           0       0.17      0.25      0.20       161
           1       0.19      0.19      0.19       206
           2       0.23      0.24      0.23       203
           3       0.24      0.14      0.18       228
           4       0.25      0.26      0.25       202

    accuracy                           0.21      1000
   macro avg       0.21      0.21      0.21      1000
weighted avg       0.22      0.21      0.21      1000

