In [2]:
# 📌 Cell 1: Install dependencies (if needed)
!pip install -q pandas numpy scikit-learn

In [3]:
# 📌 Cell 2: Imports and Config
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Set seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# 📌 Cell 3: Define generation function
def generate_loan_data(n_samples=200_000):
    data = []
    for _ in range(n_samples):
        age = random.randint(18, 75)
        gender = random.choice(['male', 'female', 'other'])
        income = round(np.random.uniform(20000, 150000), 2)
        employment_type = random.choice(['salaried', 'self-employed'])
        company_reputation = random.choice(['good', 'bad', 'unknown'])
        credit_score = random.randint(300, 850)
        existing_loans_count = random.randint(0, 6)
        known_defaulter = random.choice([True, False])
        political_affiliation = random.choice(['none', 'member', 'holds_office'])  # instead of 'party_leader'
        loan_amount = round(np.random.uniform(5000, 100000), 2)
        loan_purpose = random.choice(['personal', 'home', 'auto', 'medical'])

        # Label logic
        label = 'approved'  # default

        # Hard rejection rules
        if known_defaulter:
            label = 'declined'
        elif political_affiliation == 'holds_office':
            label = 'declined'
        elif employment_type == 'self-employed' and company_reputation == 'bad':
            label = 'declined'
        elif age >= 65:
            label = 'declined'
        elif credit_score < 500:
            label = 'declined'
        # On-hold rules
        elif existing_loans_count >= 3:
            label = 'on-hold'
        # Probabilistic logic for remaining
        else:
            approval_prob = (credit_score - 300) / 550  # scale to 0–1
            income_factor = min(income / 100000, 1)
            combined_score = 0.6 * approval_prob + 0.4 * income_factor
            if combined_score > 0.7:
                label = 'approved'
            elif combined_score > 0.5:
                label = 'on-hold'
            else:
                label = 'declined'

        data.append([
            age, gender, income, employment_type, company_reputation,
            credit_score, existing_loans_count, known_defaulter,
            political_affiliation, loan_amount, loan_purpose, label
        ])

    columns = [
        'age', 'gender', 'income', 'employment_type', 'company_reputation',
        'credit_score', 'existing_loans_count', 'known_defaulter',
        'political_affiliation', 'loan_amount', 'loan_purpose', 'approval_status'
    ]
    return pd.DataFrame(data, columns=columns)

In [5]:
# 📌 Cell 4: Generate dataset
df = generate_loan_data()
print("✅ Dataset shape:", df.shape)
df.head()

✅ Dataset shape: (200000, 12)


Unnamed: 0,age,gender,income,employment_type,company_reputation,credit_score,existing_loans_count,known_defaulter,political_affiliation,loan_amount,loan_purpose,approval_status
0,58,male,68690.22,salaried,unknown,581,1,True,none,95317.86,personal,declined
1,61,other,115159.21,salaried,unknown,732,0,True,none,61872.56,home,declined
2,32,other,40282.42,salaried,unknown,503,5,False,none,19819.48,medical,on-hold
3,55,female,27550.87,salaried,good,732,2,False,none,87286.73,home,on-hold
4,66,female,98144.95,salaried,good,689,0,False,member,72266.89,auto,declined


In [6]:
# 📌 Cell 5: Encode categorical columns
categorical_cols = ['gender', 'employment_type', 'company_reputation',
                    'political_affiliation', 'loan_purpose']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store for inverse_transform later

# Encode target
target_le = LabelEncoder()
df['approval_status'] = target_le.fit_transform(df['approval_status'])



In [13]:
# 📌 Cell 6: Train-test split
X = df.drop(columns=['approval_status'])
y = df['approval_status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

print(f"✅ Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print("✅ Class distribution (train):\n", pd.Series(y_train).value_counts(normalize=True))

# 📌 Ensure all data is numeric (avoid object dtype issues)
X_train = X_train.copy().astype(np.float32)
X_test = X_test.copy().astype(np.float32)
y_train = y_train.copy().astype(np.int64)
y_test = y_test.copy().astype(np.int64)

✅ Train shape: (160000, 11), Test shape: (40000, 11)
✅ Class distribution (train):
 approval_status
1    0.862519
2    0.104869
0    0.032613
Name: proportion, dtype: float64


In [14]:
# 📌 Install required packages
!pip install -q pytorch-tabnet optuna



In [15]:
# 📌 Cell 8: Import Training Libraries
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.metrics import classification_report, confusion_matrix
import optuna


In [16]:
# 📌 Cell 9: Define Objective Function for Optuna
def objective(trial):
    params = {
        'n_d': trial.suggest_int("n_d", 8, 32),
        'n_a': trial.suggest_int("n_a", 8, 32),
        'n_steps': trial.suggest_int("n_steps", 3, 7),
        'gamma': trial.suggest_float("gamma", 1.0, 2.0),
        'lambda_sparse': trial.suggest_float("lambda_sparse", 1e-5, 1e-3),
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': dict(lr=trial.suggest_float("lr", 1e-3, 1e-1)),
        'mask_type': 'entmax'
    }

    clf = TabNetClassifier(**params, seed=SEED, verbose=0)
    clf.fit(
        X_train.values, y_train.values,
        eval_set=[(X_test.values, y_test.values)],
        eval_name=["valid"],
        eval_metric=["accuracy"],
        max_epochs=3,  # Keep low for quick tuning
        patience=2,
        batch_size=trial.suggest_categorical("batch_size", [1024, 2048]),
        virtual_batch_size=128,
        num_workers=0
    )

    preds = clf.predict(X_test.values)
    acc = (preds == y_test.values).mean()
    return acc


In [18]:
# 📌 Cell 10: Run Optuna Study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=3)

print("✅ Best trial:")
print("  Value (Accuracy):", study.best_trial.value)
print("  Params:", study.best_trial.params)


[I 2025-07-14 21:25:27,872] A new study created in memory with name: no-name-9b68cffc-884f-43d7-b09e-0de1b7a00d04


Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_valid_accuracy = 0.995


[I 2025-07-14 21:26:08,731] Trial 0 finished with value: 0.995 and parameters: {'n_d': 23, 'n_a': 30, 'n_steps': 4, 'gamma': 1.1146114568395187, 'lambda_sparse': 0.00014966411217314974, 'lr': 0.011799731897538857, 'batch_size': 1024}. Best is trial 0 with value: 0.995.


Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_valid_accuracy = 0.95365


[I 2025-07-14 21:26:35,339] Trial 1 finished with value: 0.95365 and parameters: {'n_d': 15, 'n_a': 8, 'n_steps': 3, 'gamma': 1.7244270172158478, 'lambda_sparse': 0.0006533422460236161, 'lr': 0.04939068550587974, 'batch_size': 2048}. Best is trial 0 with value: 0.995.


Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_valid_accuracy = 0.95142


[I 2025-07-14 21:27:06,890] Trial 2 finished with value: 0.951425 and parameters: {'n_d': 16, 'n_a': 13, 'n_steps': 4, 'gamma': 1.4081378385304189, 'lambda_sparse': 0.0007171803340786223, 'lr': 0.001972918252138111, 'batch_size': 2048}. Best is trial 0 with value: 0.995.


✅ Best trial:
  Value (Accuracy): 0.995
  Params: {'n_d': 23, 'n_a': 30, 'n_steps': 4, 'gamma': 1.1146114568395187, 'lambda_sparse': 0.00014966411217314974, 'lr': 0.011799731897538857, 'batch_size': 1024}


In [19]:
# 📌 Cell 11: Train Best Model & Evaluate
best_params = study.best_trial.params

# Final training on best config
final_model = TabNetClassifier(
    n_d=best_params['n_d'],
    n_a=best_params['n_a'],
    n_steps=best_params['n_steps'],
    gamma=best_params['gamma'],
    lambda_sparse=best_params['lambda_sparse'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=best_params['lr']),
    mask_type='entmax',
    seed=SEED,
    verbose=1
)

final_model.fit(
    X_train.values, y_train.values,
    eval_set=[(X_test.values, y_test.values)],
    eval_name=["valid"],
    eval_metric=["accuracy"],
    max_epochs=10,
    patience=5,
    batch_size=best_params['batch_size'],
    virtual_batch_size=128,
    num_workers=0
)

# Predict and evaluate
y_pred = final_model.predict(X_test.values)
print("✅ Classification Report:\n", classification_report(y_test, y_pred, target_names=target_le.classes_))




epoch 0  | loss: 0.09142 | valid_accuracy: 0.90762 |  0:00:11s
epoch 1  | loss: 0.02487 | valid_accuracy: 0.97735 |  0:00:21s
epoch 2  | loss: 0.02104 | valid_accuracy: 0.995   |  0:00:32s
epoch 3  | loss: 0.01825 | valid_accuracy: 0.99358 |  0:00:43s
epoch 4  | loss: 0.01836 | valid_accuracy: 0.99522 |  0:00:53s
epoch 5  | loss: 0.01868 | valid_accuracy: 0.99752 |  0:01:04s
epoch 6  | loss: 0.01716 | valid_accuracy: 0.99415 |  0:01:15s
epoch 7  | loss: 0.01715 | valid_accuracy: 0.9952  |  0:01:26s
epoch 8  | loss: 0.0154  | valid_accuracy: 0.9952  |  0:01:37s
epoch 9  | loss: 0.01569 | valid_accuracy: 0.99798 |  0:01:49s
epoch 10 | loss: 0.01397 | valid_accuracy: 0.99612 |  0:01:59s
epoch 11 | loss: 0.01606 | valid_accuracy: 0.99382 |  0:02:10s
epoch 12 | loss: 0.01514 | valid_accuracy: 0.99398 |  0:02:21s
epoch 13 | loss: 0.01404 | valid_accuracy: 0.99642 |  0:02:31s
epoch 14 | loss: 0.014   | valid_accuracy: 0.99495 |  0:02:42s

Early stopping occurred at epoch 14 with best_epoch = 



✅ Classification Report:
               precision    recall  f1-score   support

    approved       1.00      0.97      0.99      1304
    declined       1.00      1.00      1.00     34501
     on-hold       0.99      0.99      0.99      4195

    accuracy                           1.00     40000
   macro avg       1.00      0.99      0.99     40000
weighted avg       1.00      1.00      1.00     40000



In [20]:
#  📌 Cell 12: Install/Import XGBoost
!pip install -q xgboost

import xgboost as xgb


In [21]:
# 📌 Cell 13: Define XGBoost Optuna Objective
def xgb_objective(trial):
    param = {
        'verbosity': 0,
        'objective': 'multi:softmax',
        'num_class': len(target_le.classes_),
        'tree_method': 'gpu_hist',  # GPU training on T4
        'predictor': 'gpu_predictor',
        'eval_metric': 'mlogloss',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)

    booster = xgb.train(param, dtrain, num_boost_round=100,
                        evals=[(dvalid, 'validation')],
                        early_stopping_rounds=10,
                        verbose_eval=False)

    preds = booster.predict(dvalid)
    acc = (preds == y_test.values).mean()
    return acc


In [22]:
# 📌 Cell 14: Run XGBoost Tuning
xgb_study = optuna.create_study(direction="maximize")
xgb_study.optimize(xgb_objective, n_trials=10)

print("✅ Best XGBoost Accuracy:", xgb_study.best_trial.value)
print("✅ Best Params:", xgb_study.best_trial.params)


[I 2025-07-14 21:31:56,811] A new study created in memory with name: no-name-88470274-d94a-4257-966c-298712c1ec40
[I 2025-07-14 21:32:00,224] Trial 0 finished with value: 0.9982 and parameters: {'max_depth': 10, 'eta': 0.0905208544479054, 'gamma': 4.140466518963298, 'subsample': 0.9040231634815812, 'colsample_bytree': 0.6168353864741383}. Best is trial 0 with value: 0.9982.
[I 2025-07-14 21:32:01,596] Trial 1 finished with value: 0.99845 and parameters: {'max_depth': 9, 'eta': 0.1702590595653581, 'gamma': 4.584215025501665, 'subsample': 0.6153793880577558, 'colsample_bytree': 0.7049797008608685}. Best is trial 1 with value: 0.99845.
[I 2025-07-14 21:32:02,544] Trial 2 finished with value: 0.990425 and parameters: {'max_depth': 3, 'eta': 0.08621318312849344, 'gamma': 4.910590970713942, 'subsample': 0.9416848638643115, 'colsample_bytree': 0.7620553998514444}. Best is trial 1 with value: 0.99845.
[I 2025-07-14 21:32:04,394] Trial 3 finished with value: 0.998675 and parameters: {'max_depth

✅ Best XGBoost Accuracy: 0.9987
✅ Best Params: {'max_depth': 10, 'eta': 0.2689707361838444, 'gamma': 0.1373645531248069, 'subsample': 0.785759891651717, 'colsample_bytree': 0.992316738766722}


In [23]:
# 📌 Cell 15: Train Best XGBoost Model and Evaluate
best_xgb_params = xgb_study.best_trial.params
best_xgb_params.update({
    'objective': 'multi:softmax',
    'num_class': len(target_le.classes_),
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'eval_metric': 'mlogloss'
})

dtrain_final = xgb.DMatrix(X_train, label=y_train)
dtest_final = xgb.DMatrix(X_test, label=y_test)

final_xgb_model = xgb.train(best_xgb_params, dtrain_final, num_boost_round=100,
                            evals=[(dtest_final, 'test')],
                            early_stopping_rounds=10,
                            verbose_eval=False)

y_xgb_pred = final_xgb_model.predict(dtest_final)

print("✅ XGBoost Classification Report:\n", classification_report(y_test, y_xgb_pred, target_names=target_le.classes_))


✅ XGBoost Classification Report:
               precision    recall  f1-score   support

    approved       0.99      1.00      0.99      1304
    declined       1.00      1.00      1.00     34501
     on-hold       0.99      0.99      0.99      4195

    accuracy                           1.00     40000
   macro avg       0.99      1.00      1.00     40000
weighted avg       1.00      1.00      1.00     40000



In [24]:
# Cell 16: Compare Models and Select Best
from sklearn.metrics import accuracy_score, f1_score

# Calculate F1 scores
tabnet_f1 = f1_score(y_test, y_pred, average='weighted')
xgb_f1 = f1_score(y_test, y_xgb_pred, average='weighted')

# Select best
best_model_name = "TabNet" if tabnet_f1 >= xgb_f1 else "XGBoost"
print(f"✅ Best Model Based on F1 Score: {best_model_name}")
print(f"TabNet F1: {tabnet_f1:.4f} | XGBoost F1: {xgb_f1:.4f}")


✅ Best Model Based on F1 Score: XGBoost
TabNet F1: 0.9980 | XGBoost F1: 0.9987


In [25]:
# Cell 17: Inference on Sample Test Set
# Select test sample
sample_indices = np.random.choice(X_test.index, size=5, replace=False)
X_sample = X_test.loc[sample_indices]
y_true_sample = y_test.loc[sample_indices]

# Predict
if best_model_name == "TabNet":
    y_pred_sample = final_model.predict(X_sample.values)
else:
    d_sample = xgb.DMatrix(X_sample)
    y_pred_sample = final_xgb_model.predict(d_sample)

# Decode labels
decoded_preds = target_le.inverse_transform(y_pred_sample.astype(int))
decoded_truth = target_le.inverse_transform(y_true_sample.astype(int))

# Display predictions
for i in range(len(decoded_preds)):
    print(f"\nCustomer {i+1}")
    print(X_sample.iloc[i])
    print(f"🔮 Predicted: {decoded_preds[i]} | ✅ Actual: {decoded_truth[i]}")



Customer 1
age                         63.000000
gender                       1.000000
income                   64486.980469
employment_type              1.000000
company_reputation           1.000000
credit_score               674.000000
existing_loans_count         3.000000
known_defaulter              0.000000
political_affiliation        0.000000
loan_amount              54453.781250
loan_purpose                 1.000000
Name: 83991, dtype: float32
🔮 Predicted: declined | ✅ Actual: declined

Customer 2
age                          48.000000
gender                        1.000000
income                   116793.742188
employment_type               1.000000
company_reputation            1.000000
credit_score                486.000000
existing_loans_count          3.000000
known_defaulter               0.000000
political_affiliation         1.000000
loan_amount               44807.980469
loan_purpose                  3.000000
Name: 11010, dtype: float32
🔮 Predicted: declined | ✅ Actu

In [29]:
# 📌 Cell 18: Install & Load T5
!pip install -q transformers sentencepiece

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the T5 model (use 't5-small' for speed or 't5-base' if you're okay with a bit more memory)
tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small").to("cuda")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [30]:
# 📌 Cell 19: Define Summarization Function
def summarize_with_t5(input_text):
    prompt = "summarize: " + input_text.strip().replace("\n", " ")
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to("cuda")
    summary_ids = t5_model.generate(inputs.input_ids, max_new_tokens=50, do_sample=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [36]:
# 📌 Cell 20: Generate Summaries for Each Sample
def format_profile_as_text(row):
    gender_map = {0: "male", 1: "female", 2: "other"}
    employment_map = {0: "salaried", 1: "self-employed"}
    company_map = {0: "good", 1: "bad", 2: "unknown"}
    politics_map = {0: "none", 1: "member", 2: "holds office"}
    purpose_map = {0: "personal", 1: "home", 2: "auto", 3: "medical"}

    profile = f"""
    - Age: {int(row['age'])}
    - Gender: {gender_map.get(int(row['gender']), 'unknown')}
    - Income: ${row['income']:.2f}
    - Employment Type: {employment_map.get(int(row['employment_type']), 'unknown')}
    - Company Reputation: {company_map.get(int(row['company_reputation']), 'unknown')}
    - Credit Score: {int(row['credit_score'])}
    - Existing Loans: {int(row['existing_loans_count'])}
    - Known Defaulter: {"Yes" if row['known_defaulter'] else "No"}
    - Political Affiliation: {politics_map.get(int(row['political_affiliation']), 'unknown')}
    - Loan Amount: ${row['loan_amount']:.2f}
    - Loan Purpose: {purpose_map.get(int(row['loan_purpose']), 'unknown')}
    """
    return profile


# 🔁 Better summarization loop
for i in range(len(X_sample)):
    print(f"\n📋 Customer {i+1}")
    profile_text = format_profile_as_text(X_sample.iloc[i])
    prediction = decoded_preds[i]
    actual = decoded_truth[i]

    prompt_text = f"""
    A machine learning model reviewed the following loan applicant profile and predicted the loan status as **{prediction}**.
    Profile:
    {profile_text}

    Please summarize in simple, clear language why this decision might have been made.
    """

    summary = summarize_with_t5(prompt_text)
    print(f"🧾 Summary:\n{summary}")



📋 Customer 1
🧾 Summary:
gender: woman - income: $64486.98 - Employment Type: self-employed. 63 - Existing Loans: 3 - Known Defaulter: No. Lack of Political Af

📋 Customer 2
🧾 Summary:
model reviewed loan applicant profile of 48-year-old. predicted approval as **declined**.

📋 Customer 3
🧾 Summary:
model has predict a loan status which is **on-hold**. Age: 42.. income: $1141428.27. - Employment Type: salaried.

📋 Customer 4
🧾 Summary:
machines-researched applicant's profile was based on 'declined' status. employed as self-employed - Company Reputation: good - Credit Score: 585.

📋 Customer 5
🧾 Summary:
Model predicted to have a loan status of **declined**..
