In [1]:
# Install XGBoost quietly
!pip install xgboost

# Core libraries
import pandas as pd
import numpy as np

# Scikit-learn tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score,
    f1_score, classification_report
)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb




In [6]:
from google.colab import files

# Upload the file
uploaded = files.upload()

# Guard against an empty upload: otherwise `df` would be undefined below
# (or, worse, silently left over from an earlier run of this cell)
if not uploaded:
    raise ValueError(
        "No file was uploaded; upload the ';'-separated CSV and re-run this cell."
    )

# When several files are selected, the last one is used. This matches the
# result of reading every file in turn and overwriting `df`, minus the
# redundant reads.
fn = list(uploaded.keys())[-1]
df = pd.read_csv(fn, sep=';')

print("Data shape:", df.shape)
df.head()


Saving bank-additional-full (1).csv to bank-additional-full (1) (1).csv
Data shape: (41199, 21)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56.0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1.0,999.0,0.0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57.0,services,married,high.school,unknown,no,no,telephone,may,mon,...,1.0,999.0,0.0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37.0,services,married,high.school,no,yes,no,telephone,may,mon,...,1.0,999.0,0.0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40.0,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1.0,999.0,0.0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56.0,services,married,high.school,no,no,yes,telephone,may,mon,...,1.0,999.0,0.0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [7]:
print("Initial shape:", df.shape)

# Flag every row that exactly repeats an earlier row, then count the flags
duplicate_mask = df.duplicated()
dups = duplicate_mask.sum()
print(f"Duplicates found: {dups}")
if dups > 0:
    # Keep only the first occurrence of each row
    df = df.loc[~duplicate_mask]
    print("Shape after dropping duplicates:", df.shape)

# Report only the columns that actually contain missing values
missing_counts = df.isna().sum()
print("\nMissing values per column:")
print(missing_counts.loc[missing_counts > 0])

Initial shape: (41199, 21)
Duplicates found: 15
Shape after dropping duplicates: (41184, 21)

Missing values per column:
age               2
job               3
marital           3
education         3
default           2
housing           1
loan              2
contact           2
month             1
day_of_week       1
duration          1
campaign          1
pdays             1
previous          1
poutcome          3
emp.var.rate      1
cons.price.idx    2
cons.conf.idx     1
euribor3m         1
nr.employed       1
y                 1
dtype: int64


In [8]:
# Rows without a target value cannot be used as labelled examples, so drop them
print("Shape before dropping missing target:", df.shape)
has_target = df['y'].notna()
df = df.loc[has_target].reset_index(drop=True)
print("Shape after dropping missing target :", df.shape)


Shape before dropping missing target: (41184, 21)
Shape after dropping missing target : (41183, 21)


In [9]:
# Define Features & Target
TARGET = 'y'  # adjust if your target column is named differently

# Split into features and label
# NOTE(review): X still contains 'duration'; presumably this value is only
# known after the call has taken place — confirm against the dataset
# documentation whether it should be excluded for a realistic model.
X = df.drop(columns=[TARGET])

# Encode the label as 0/1. The test is "already numeric?" rather than
# "dtype == object" so that text labels stored under a string or category
# dtype are still mapped instead of being passed through as raw text.
if pd.api.types.is_numeric_dtype(df[TARGET]):
    y = df[TARGET]
else:
    y = df[TARGET].map({'yes': 1, 'no': 0})
    # .map leaves NaN for every value that is not a key of the dict; fail
    # loudly here instead of letting NaN labels reach the split / models
    unmapped = y.isna()
    if unmapped.any():
        bad_labels = sorted(df.loc[unmapped, TARGET].astype(str).unique())
        raise ValueError(
            f"Unexpected values in target column {TARGET!r}: {bad_labels}"
        )
    y = y.astype('int64')

# Quick sanity check
print("Features shape:", X.shape)
print("Target shape  :", y.shape)


Features shape: (41183, 20)
Target shape  : (41183,)


In [10]:
# Partition the feature columns by dtype so each group can get its own
# preprocessing steps
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'category']

num_cols = list(X.select_dtypes(include=numeric_dtypes).columns)
cat_cols = list(X.select_dtypes(include=categorical_dtypes).columns)

print("Numerical columns   :", num_cols)
print("Categorical columns :", cat_cols)


Numerical columns   : ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
Categorical columns : ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']


In [11]:
# Build Preprocessing Pipelines
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numeric branch: fill gaps with the column median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_pipe = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipe = Pipeline(steps=categorical_steps)

# Route each column group through its branch; numeric output columns come
# first, followed by the one-hot encoded categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipe, num_cols),
        ('cat', categorical_pipe, cat_cols),
    ]
)


In [12]:
# Train/Test Split
from sklearn.model_selection import train_test_split

# 80/20 split, seeded for reproducibility and stratified on the label so
# both parts keep the same class balance
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("Training set:", X_train.shape, y_train.shape)
print("Test set    :", X_test.shape,  y_test.shape)


Training set: (32946, 20) (32946,)
Test set    : (8237, 20) (8237,)


In [13]:
#  Evaluation
from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score,
    f1_score, classification_report
)

def eval_model(name, model, X_test, y_test):
    """
    Print a summary of test-set metrics for a fitted binary classifier.

    Parameters
    ----------
    name : label printed in the header line of the summary.
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : features passed to the model to obtain predictions.
    y_test : true labels the predictions are scored against.

    Returns
    -------
    None. The metrics are only printed.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC AUC
    positive_scores = model.predict_proba(X_test)[:, 1]

    print(f"--- {name} ---")

    # (label, metric function, predictions it is scored on); each metric is
    # computed right before it is printed, in this order
    metric_rows = (
        ("Accuracy :", accuracy_score, predicted_labels),
        ("ROC AUC  :", roc_auc_score, positive_scores),
        ("Precision:", precision_score, predicted_labels),
        ("Recall   :", recall_score, predicted_labels),
        ("F1 Score :", f1_score, predicted_labels),
    )
    for label, metric_fn, scored in metric_rows:
        print(label, metric_fn(y_test, scored))

    report = classification_report(y_test, predicted_labels)
    print("\nClassification Report:\n", report)


In [14]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Logistic regression baseline, chained after the shared preprocessor
lr_clf = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)
lr_pipe = Pipeline(steps=[
    ('prep', preprocessor),
    ('clf', lr_clf),
])

# Fit on the training split
lr_pipe.fit(X_train, y_train)

# Score on the held-out split
eval_model("Logistic Regression", lr_pipe, X_test, y_test)


--- Logistic Regression ---
Accuracy : 0.9096758528590506
ROC AUC  : 0.9389967376073899
Precision: 0.6455696202531646
Recall   : 0.4396551724137931
F1 Score : 0.5230769230769231

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95      7309
           1       0.65      0.44      0.52       928

    accuracy                           0.91      8237
   macro avg       0.79      0.70      0.74      8237
weighted avg       0.90      0.91      0.90      8237



In [15]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Random forest (100 trees, all CPU cores), chained after the shared preprocessor
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_pipe = Pipeline(steps=[
    ('prep', preprocessor),
    ('clf', rf_clf),
])

# Fit on the training split
rf_pipe.fit(X_train, y_train)

# Score on the held-out split
eval_model("Random Forest", rf_pipe, X_test, y_test)


--- Random Forest ---
Accuracy : 0.9144105863785359
ROC AUC  : 0.9432753106703592
Precision: 0.6637298091042585
Recall   : 0.4870689655172414
F1 Score : 0.5618396519577378

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95      7309
           1       0.66      0.49      0.56       928

    accuracy                           0.91      8237
   macro avg       0.80      0.73      0.76      8237
weighted avg       0.91      0.91      0.91      8237



In [16]:
# Cell 11 – XGBoost with GridSearch
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Pipeline: preprocessing + XGB classifier
# (`use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter and emits a warning on every fit)
xgb_pipe = Pipeline([
    ('prep', preprocessor),
    ('clf',  xgb.XGBClassifier(
                 objective='binary:logistic',
                 eval_metric='auc',
                 random_state=42
             ))
])

# Hyperparameter grid: 2 x 2 x 2 = 8 candidate settings
param_grid = {
    'clf__n_estimators':   [100, 200],
    'clf__max_depth':      [3, 5],
    'clf__learning_rate':  [0.1, 0.01]
}

# Grid search: 3-fold cross-validation on the training split, ranked by ROC AUC
grid = GridSearchCV(
    xgb_pipe,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

# Train & tune
grid.fit(X_train, y_train)

# Best parameters
print("Best params:", grid.best_params_)

# Evaluate best model on the held-out test split
best_xgb = grid.best_estimator_
eval_model("XGBoost (Tuned)", best_xgb, X_test, y_test)


Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.



Best params: {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 100}
--- XGBoost (Tuned) ---
Accuracy : 0.9191453198980212
ROC AUC  : 0.9518732956770348
Precision: 0.6649874055415617
Recall   : 0.5689655172413793
F1 Score : 0.6132404181184669

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.95      7309
           1       0.66      0.57      0.61       928

    accuracy                           0.92      8237
   macro avg       0.81      0.77      0.78      8237
weighted avg       0.91      0.92      0.92      8237



In [17]:
# Cell 12 – Extract & display feature importances
import pandas as pd

# Recover the column names that the fitted preprocessor feeds to the model:
# numeric columns first, then the one-hot columns of the categorical branch
fitted_prep = best_xgb.named_steps['prep']
fitted_onehot = fitted_prep.named_transformers_['cat'].named_steps['onehot']

num_features = num_cols
cat_features = list(fitted_onehot.get_feature_names_out(cat_cols))
all_features = num_features + cat_features

# Pair each name with the classifier's importance score, highest first
importances = best_xgb.named_steps['clf'].feature_importances_
feat_imp_df = pd.DataFrame({'feature': all_features, 'importance': importances})
feat_imp_df = feat_imp_df.sort_values('importance', ascending=False)

# Show top 10
print(feat_imp_df.head(10))


             feature  importance
9        nr.employed    0.393895
1           duration    0.082166
62  poutcome_success    0.057070
7      cons.conf.idx    0.056045
5       emp.var.rate    0.029998
53         month_oct    0.028454
3              pdays    0.023089
8          euribor3m    0.022773
51         month_may    0.021313
6     cons.price.idx    0.015324


In [18]:

import joblib

# Persist the tuned pipeline (preprocessing step + XGBoost classifier) as a
# single artifact
# NOTE(review): the filename says "loan_approval"; confirm it matches what
# this model actually predicts before sharing the file.
model_filename = 'loan_approval_xgb_pipeline.joblib'
joblib.dump(best_xgb, filename=model_filename)
print(f"Saved model to {model_filename}")


Saved model to loan_approval_xgb_pipeline.joblib


In [19]:
from google.colab import files

# This will prompt a download of your .joblib file.
# Reuse `model_filename` from the save step so the two cannot drift apart.
files.download(model_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>