In [2]:
import pandas as pd
import numpy as np

data = pd.read_csv('/content/data.csv')

print("Dataset shape:", data.shape)
print("Columns:", data.columns.tolist())
print(data.head())

Dataset shape: (517, 15)
Columns: ['Unnamed: 0', 'Loan_ID', 'Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'gender_Male', 'married_Yes', 'education_Not Graduate', 'property_area_Semiurban', 'property_area_Urban', 'self_employed_Yes', 'Loan_status_Y']
   Unnamed: 0   Loan_ID Dependents  ApplicantIncome  CoapplicantIncome  \
0           1  LP001003          1             4583             1508.0   
1           2  LP001005          0             3000                0.0   
2           3  LP001006          0             2583             2358.0   
3           4  LP001008          0             6000                0.0   
4           5  LP001011          2             5417             4196.0   

   LoanAmount  Loan_Amount_Term  Credit_History  gender_Male  married_Yes  \
0       128.0             360.0             1.0            1            1   
1        66.0             360.0             1.0            1            1   
2       120.0 

In [3]:
df = data.copy()

df = df.drop(['Unnamed: 0', 'Loan_ID'], axis=1)

df['Dependents'] = df['Dependents'].astype(str)
df['Dependents'] = df['Dependents'].replace('-3', np.nan)
df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce')

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

print("Missing values after imputation:")
print(df.isnull().sum())
print("\nData types:")
print(df.dtypes)
print("\nData shape after preprocessing:", df.shape)

Missing values after imputation:
Dependents                 0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
gender_Male                0
married_Yes                0
education_Not Graduate     0
property_area_Semiurban    0
property_area_Urban        0
self_employed_Yes          0
Loan_status_Y              0
dtype: int64

Data types:
Dependents                 float64
ApplicantIncome            float64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
gender_Male                float64
married_Yes                float64
education_Not Graduate     float64
property_area_Semiurban    float64
property_area_Urban        float64
self_employed_Yes          float64
Loan_status_Y              float64
dtype: object

Data shape after preprocessing: (517, 13)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop('Loan_status_Y', axis=1)
y = df['Loan_status_Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
print("y_train distribution:", y_train.value_counts().to_dict())
print("y_test distribution:", y_test.value_counts().to_dict())

Training set shape: (361, 12)
Testing set shape: (156, 12)
y_train distribution: {1.0: 251, 0.0: 110}
y_test distribution: {1.0: 109, 0.0: 47}


In [5]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=100,
    max_samples=0.8,
    max_features=0.8,
    random_state=42,
    n_jobs=-1
)

bagging_model.fit(X_train_scaled, y_train)

y_pred_bagging = bagging_model.predict(X_test_scaled)

from sklearn.metrics import accuracy_score, classification_report

bagging_accuracy = accuracy_score(y_test, y_pred_bagging)
print("Bagging (Random Forest) Test Accuracy:", bagging_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_bagging))

Bagging (Random Forest) Test Accuracy: 0.8076923076923077

Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      0.53      0.62        47
         1.0       0.82      0.93      0.87       109

    accuracy                           0.81       156
   macro avg       0.79      0.73      0.75       156
weighted avg       0.80      0.81      0.80       156



In [6]:
from sklearn.ensemble import AdaBoostClassifier

weak_learner = DecisionTreeClassifier(max_depth=2, random_state=42)

adaboost_model = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=100,
    learning_rate=1.0,
    random_state=42
)

adaboost_model.fit(X_train_scaled, y_train)

y_pred_adaboost = adaboost_model.predict(X_test_scaled)

adaboost_accuracy = accuracy_score(y_test, y_pred_adaboost)
print("AdaBoost Test Accuracy:", adaboost_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_adaboost))

AdaBoost Test Accuracy: 0.8205128205128205

Classification Report:
              precision    recall  f1-score   support

         0.0       0.79      0.55      0.65        47
         1.0       0.83      0.94      0.88       109

    accuracy                           0.82       156
   macro avg       0.81      0.74      0.76       156
weighted avg       0.82      0.82      0.81       156



In [7]:
from sklearn.ensemble import GradientBoostingClassifier

gradient_boost_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

gradient_boost_model.fit(X_train_scaled, y_train)

y_pred_gradient = gradient_boost_model.predict(X_test_scaled)

gradient_accuracy = accuracy_score(y_test, y_pred_gradient)
print("Gradient Boosting Test Accuracy:", gradient_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gradient))

Gradient Boosting Test Accuracy: 0.8012820512820513

Classification Report:
              precision    recall  f1-score   support

         0.0       0.72      0.55      0.63        47
         1.0       0.82      0.91      0.86       109

    accuracy                           0.80       156
   macro avg       0.77      0.73      0.75       156
weighted avg       0.79      0.80      0.79       156



In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score

models = {
    "Bagging (Random Forest)": y_pred_bagging,
    "AdaBoost": y_pred_adaboost,
    "Gradient Boosting": y_pred_gradient
}

print("MODEL COMPARISON")
print("=" * 60)

for name, predictions in models.items():
    acc = accuracy_score(y_test, predictions)
    prec = precision_score(y_test, predictions)
    rec = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    print(f"\n{name}:")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall:    {rec:.4f}")
    print(f"  F1-Score:  {f1:.4f}")

print("\n" + "=" * 60)
print("SUMMARY: Best model by metric")

best_accuracy = max(
    accuracy_score(y_test, y_pred_bagging),
    accuracy_score(y_test, y_pred_adaboost),
    accuracy_score(y_test, y_pred_gradient)
)

if best_accuracy == accuracy_score(y_test, y_pred_bagging):
    print("Best Accuracy: Bagging (Random Forest)")
elif best_accuracy == accuracy_score(y_test, y_pred_adaboost):
    print("Best Accuracy: AdaBoost")
else:
    print("Best Accuracy: Gradient Boosting")

MODEL COMPARISON

Bagging (Random Forest):
  Accuracy:  0.8077
  Precision: 0.8211
  Recall:    0.9266
  F1-Score:  0.8707

AdaBoost:
  Accuracy:  0.8205
  Precision: 0.8293
  Recall:    0.9358
  F1-Score:  0.8793

Gradient Boosting:
  Accuracy:  0.8013
  Precision: 0.8250
  Recall:    0.9083
  F1-Score:  0.8646

SUMMARY: Best model by metric
Best Accuracy: AdaBoost
