<a href="https://colab.research.google.com/github/ASHOKEKUMAR-S/loan_process_agnetic_ai/blob/main/loan_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 📌 Cell 1: Install dependencies (if needed)
!pip install -q pandas numpy scikit-learn

In [2]:
# 📌 Cell 2: Imports and Config
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Set seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# 📌 Cell 3: Define generation function
def generate_loan_data(n_samples=200_000):
  # facker for data gen, Randmonseed
    data = []
    for _ in range(n_samples):
        age = random.randint(18, 75)
        gender = random.choice(['male', 'female', 'other'])
        income = round(np.random.uniform(20000, 150000), 2)
        employment_type = random.choice(['salaried', 'self-employed'])
        company_reputation = random.choice(['good', 'bad', 'unknown'])
        credit_score = random.randint(300, 850)
        existing_loans_count = random.randint(0, 6)
        known_defaulter = random.choice([True, False])
        political_affiliation = random.choice(['none', 'member', 'holds_office'])  # instead of 'party_leader'
        loan_amount = round(np.random.uniform(5000, 100000), 2)
        loan_purpose = random.choice(['personal', 'home', 'auto', 'medical'])

        # Label logic
        label = 'approved'  # default

        # Hard rejection rules
        if known_defaulter:
            label = 'declined'
        elif political_affiliation == 'holds_office':
            label = 'declined'
        elif employment_type == 'self-employed' and company_reputation == 'bad':
            label = 'declined'
        elif age >= 65:
            label = 'declined'
        elif credit_score < 500:
            label = 'declined'
        # On-hold rules
        elif existing_loans_count >= 3:
            label = 'on-hold'
        # Probabilistic logic for remaining
        else:
            approval_prob = (credit_score - 300) / 550  # scale to 0–1
            income_factor = min(income / 100000, 1)
            combined_score = 0.6 * approval_prob + 0.4 * income_factor
            if combined_score > 0.7:
                label = 'approved'
            elif combined_score > 0.5:
                label = 'on-hold'
            else:
                label = 'declined'

        data.append([
            age, gender, income, employment_type, company_reputation,
            credit_score, existing_loans_count, known_defaulter,
            political_affiliation, loan_amount, loan_purpose, label
        ])

    columns = [
        'age', 'gender', 'income', 'employment_type', 'company_reputation',
        'credit_score', 'existing_loans_count', 'known_defaulter',
        'political_affiliation', 'loan_amount', 'loan_purpose', 'approval_status'
    ]
    return pd.DataFrame(data, columns=columns)

In [4]:
# 📌 Cell 4: Generate dataset
df = generate_loan_data()
print("✅ Dataset shape:", df.shape)
df.head()

✅ Dataset shape: (200000, 12)


Unnamed: 0,age,gender,income,employment_type,company_reputation,credit_score,existing_loans_count,known_defaulter,political_affiliation,loan_amount,loan_purpose,approval_status
0,58,male,68690.22,salaried,unknown,581,1,True,none,95317.86,personal,declined
1,61,other,115159.21,salaried,unknown,732,0,True,none,61872.56,home,declined
2,32,other,40282.42,salaried,unknown,503,5,False,none,19819.48,medical,on-hold
3,55,female,27550.87,salaried,good,732,2,False,none,87286.73,home,on-hold
4,66,female,98144.95,salaried,good,689,0,False,member,72266.89,auto,declined


In [5]:
# 📌 Cell 5: Encode categorical columns
categorical_cols = ['gender', 'employment_type', 'company_reputation',
                    'political_affiliation', 'loan_purpose']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store for inverse_transform later

# Encode target
target_le = LabelEncoder()
df['approval_status'] = target_le.fit_transform(df['approval_status'])



In [6]:
# 📌 Cell 6: Train-test split
X = df.drop(columns=['approval_status'])
y = df['approval_status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

print(f"✅ Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print("✅ Class distribution (train):\n", pd.Series(y_train).value_counts(normalize=True))

# 📌 Ensure all data is numeric (avoid object dtype issues)
X_train = X_train.copy().astype(np.float32)
X_test = X_test.copy().astype(np.float32)
y_train = y_train.copy().astype(np.int64)
y_test = y_test.copy().astype(np.int64)

✅ Train shape: (160000, 11), Test shape: (40000, 11)
✅ Class distribution (train):
 approval_status
1    0.862519
2    0.104869
0    0.032613
Name: proportion, dtype: float64


In [None]:
# 📌 Cell 7: Install required packages
!pip install -q pytorch-tabnet optuna



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# 📌 Cell 8: Import Training Libraries
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.metrics import classification_report, confusion_matrix
import optuna


In [None]:
# 📌 Cell 9: Define Objective Function for Optuna
def objective(trial):
    params = {
        'n_d': trial.suggest_int("n_d", 8, 32),
        'n_a': trial.suggest_int("n_a", 8, 32),
        'n_steps': trial.suggest_int("n_steps", 3, 7),
        'gamma': trial.suggest_float("gamma", 1.0, 2.0),
        'lambda_sparse': trial.suggest_float("lambda_sparse", 1e-5, 1e-3),
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': dict(lr=trial.suggest_float("lr", 1e-3, 1e-1)),
        'mask_type': 'entmax'
    }

    clf = TabNetClassifier(**params, seed=SEED, verbose=0)
    clf.fit(
        X_train.values, y_train.values,
        eval_set=[(X_test.values, y_test.values)],
        eval_name=["valid"],
        eval_metric=["accuracy"],
        max_epochs=3,  # Keep low for quick tuning
        patience=2,
        batch_size=trial.suggest_categorical("batch_size", [1024, 2048]),
        virtual_batch_size=128,
        num_workers=0
    )

    preds = clf.predict(X_test.values)
    acc = (preds == y_test.values).mean()
    return acc


In [None]:
# 📌 Cell 10: Run Optuna Study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=3)

print("✅ Best trial:")
print("  Value (Accuracy):", study.best_trial.value)
print("  Params:", study.best_trial.params)


[I 2025-07-16 02:48:04,466] A new study created in memory with name: no-name-e3a73b7b-b67e-4ca8-b378-631880782297


Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_valid_accuracy = 0.9883


[I 2025-07-16 02:48:32,025] Trial 0 finished with value: 0.9883 and parameters: {'n_d': 31, 'n_a': 31, 'n_steps': 3, 'gamma': 1.5381504236721244, 'lambda_sparse': 0.0006952614269900663, 'lr': 0.024822082150953016, 'batch_size': 1024}. Best is trial 0 with value: 0.9883.


Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_valid_accuracy = 0.952


[I 2025-07-16 02:49:07,947] Trial 1 finished with value: 0.952 and parameters: {'n_d': 20, 'n_a': 24, 'n_steps': 6, 'gamma': 1.317138695881268, 'lambda_sparse': 0.00015897329841742816, 'lr': 0.09629045499524543, 'batch_size': 2048}. Best is trial 0 with value: 0.9883.


Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_valid_accuracy = 0.9902


[I 2025-07-16 02:49:37,444] Trial 2 finished with value: 0.9902 and parameters: {'n_d': 30, 'n_a': 26, 'n_steps': 3, 'gamma': 1.4669016815864184, 'lambda_sparse': 0.00010474742922881776, 'lr': 0.040392657833643306, 'batch_size': 1024}. Best is trial 2 with value: 0.9902.


✅ Best trial:
  Value (Accuracy): 0.9902
  Params: {'n_d': 30, 'n_a': 26, 'n_steps': 3, 'gamma': 1.4669016815864184, 'lambda_sparse': 0.00010474742922881776, 'lr': 0.040392657833643306, 'batch_size': 1024}


In [None]:
# 📌 Cell 11: Train Best Model & Evaluate
best_params = study.best_trial.params

# Final training on best config for cabnet
final_model = TabNetClassifier(
    n_d=best_params['n_d'],
    n_a=best_params['n_a'],
    n_steps=best_params['n_steps'],
    gamma=best_params['gamma'],
    lambda_sparse=best_params['lambda_sparse'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=best_params['lr']),
    mask_type='entmax',
    seed=SEED,
    verbose=1
)

final_model.fit(
    X_train.values, y_train.values,
    eval_set=[(X_test.values, y_test.values)],
    eval_name=["valid"],
    eval_metric=["accuracy"],
    max_epochs=10,
    patience=5,
    batch_size=best_params['batch_size'],
    virtual_batch_size=128,
    num_workers=0
)

# Predict and evaluate
y_pred = final_model.predict(X_test.values)
print("✅ Classification Report:\n", classification_report(y_test, y_pred, target_names=[str(c) for c in target_le.classes_]))



epoch 0  | loss: 0.08354 | valid_accuracy: 0.89438 |  0:00:07s
epoch 1  | loss: 0.0297  | valid_accuracy: 0.97618 |  0:00:14s
epoch 2  | loss: 0.02282 | valid_accuracy: 0.9902  |  0:00:22s
epoch 3  | loss: 0.02085 | valid_accuracy: 0.99652 |  0:00:30s
epoch 4  | loss: 0.02025 | valid_accuracy: 0.99445 |  0:00:37s
epoch 5  | loss: 0.01893 | valid_accuracy: 0.99512 |  0:00:45s
epoch 6  | loss: 0.01904 | valid_accuracy: 0.9954  |  0:00:52s
epoch 7  | loss: 0.01801 | valid_accuracy: 0.99235 |  0:01:00s
epoch 8  | loss: 0.01679 | valid_accuracy: 0.99155 |  0:01:08s

Early stopping occurred at epoch 8 with best_epoch = 3 and best_valid_accuracy = 0.99652




✅ Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1304
           1       1.00      1.00      1.00     34501
           2       0.99      0.98      0.98      4195

    accuracy                           1.00     40000
   macro avg       0.99      0.99      0.99     40000
weighted avg       1.00      1.00      1.00     40000



In [None]:
#  📌 Cell 12: Install/Import XGBoost
!pip install -q xgboost

import xgboost as xgb


In [None]:
# 📌 Cell 13: Define XGBoost Optuna Objective
def xgb_objective(trial):
    param = {
        'verbosity': 0,
        'objective': 'multi:softmax',
        'num_class': len(target_le.classes_),
        'tree_method': 'gpu_hist',  # GPU training on T4
        'predictor': 'gpu_predictor',
        'eval_metric': 'mlogloss',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)

    booster = xgb.train(param, dtrain, num_boost_round=100,
                        evals=[(dvalid, 'validation')],
                        early_stopping_rounds=10,
                        verbose_eval=False)

    preds = booster.predict(dvalid)
    acc = (preds == y_test.values).mean()
    return acc


In [None]:
# 📌 Cell 14: Run XGBoost Tuning
xgb_study = optuna.create_study(direction="maximize")
xgb_study.optimize(xgb_objective, n_trials=10)

print("✅ Best XGBoost Accuracy:", xgb_study.best_trial.value)
print("✅ Best Params:", xgb_study.best_trial.params)


[I 2025-07-16 02:58:05,299] A new study created in memory with name: no-name-d5783ec7-77ad-4384-9f87-87296c24505a
[I 2025-07-16 02:58:07,248] Trial 0 finished with value: 0.998675 and parameters: {'max_depth': 7, 'eta': 0.11687463268770469, 'gamma': 0.5158016776718938, 'subsample': 0.7028503950298204, 'colsample_bytree': 0.8369423306337174}. Best is trial 0 with value: 0.998675.
[I 2025-07-16 02:58:08,460] Trial 1 finished with value: 0.998725 and parameters: {'max_depth': 10, 'eta': 0.2544680763643283, 'gamma': 0.2173247672772899, 'subsample': 0.7823988065625935, 'colsample_bytree': 0.9822164535496257}. Best is trial 1 with value: 0.998725.
[I 2025-07-16 02:58:09,722] Trial 2 finished with value: 0.9985 and parameters: {'max_depth': 10, 'eta': 0.09208121463863221, 'gamma': 4.151662974496196, 'subsample': 0.7195454676201156, 'colsample_bytree': 0.803253168366983}. Best is trial 1 with value: 0.998725.
[I 2025-07-16 02:58:11,070] Trial 3 finished with value: 0.998425 and parameters: {'m

✅ Best XGBoost Accuracy: 0.998725
✅ Best Params: {'max_depth': 10, 'eta': 0.2544680763643283, 'gamma': 0.2173247672772899, 'subsample': 0.7823988065625935, 'colsample_bytree': 0.9822164535496257}


In [None]:
# 📌 Cell 15: Train Best XGBoost Model and Evaluate
best_xgb_params = xgb_study.best_trial.params
best_xgb_params.update({
    'objective': 'multi:softmax',
    'num_class': len(target_le.classes_),
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'eval_metric': 'mlogloss'
})

dtrain_final = xgb.DMatrix(X_train, label=y_train)
dtest_final = xgb.DMatrix(X_test, label=y_test)

final_xgb_model = xgb.train(best_xgb_params, dtrain_final, num_boost_round=100,
                            evals=[(dtest_final, 'test')],
                            early_stopping_rounds=10,
                            verbose_eval=False)

y_xgb_pred = final_xgb_model.predict(dtest_final)

print("✅ XGBoost Classification Report:\n", classification_report(y_test, y_xgb_pred, target_names=[str(c) for c in target_le.classes_]))

✅ XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1304
           1       1.00      1.00      1.00     34501
           2       0.99      0.99      0.99      4195

    accuracy                           1.00     40000
   macro avg       0.99      1.00      1.00     40000
weighted avg       1.00      1.00      1.00     40000



In [None]:
# Cell 16: Compare Models and Select Best
from sklearn.metrics import accuracy_score, f1_score

# Calculate F1 scores
tabnet_f1 = f1_score(y_test, y_pred, average='weighted')
xgb_f1 = f1_score(y_test, y_xgb_pred, average='weighted')

# Select best
best_model_name = "TabNet" if tabnet_f1 >= xgb_f1 else "XGBoost"
print(f"✅ Best Model Based on F1 Score: {best_model_name}")
print(f"TabNet F1: {tabnet_f1:.4f} | XGBoost F1: {xgb_f1:.4f}")


✅ Best Model Based on F1 Score: XGBoost
TabNet F1: 0.9965 | XGBoost F1: 0.9987


In [None]:
# Cell 17: Inference on Sample Test Set
# Select test sample
sample_indices = np.random.choice(X_test.index, size=5, replace=False)
X_sample = X_test.loc[sample_indices]
y_true_sample = y_test.loc[sample_indices]

# Predict
if best_model_name == "TabNet":
    y_pred_sample = final_model.predict(X_sample.values)
else:
    d_sample = xgb.DMatrix(X_sample)
    y_pred_sample = final_xgb_model.predict(d_sample)

# Decode labels
decoded_preds = target_le.inverse_transform(y_pred_sample.astype(int))
decoded_truth = target_le.inverse_transform(y_true_sample.astype(int))

# Display predictions
for i in range(len(decoded_preds)):
    print(f"\nCustomer {i+1}")
    print(X_sample.iloc[i])
    print(f"🔮 Predicted: {decoded_preds[i]} | ✅ Actual: {decoded_truth[i]}")



Customer 1
age                          37.000000
gender                        1.000000
income                   143976.265625
employment_type               1.000000
company_reputation            2.000000
credit_score                572.000000
existing_loans_count          3.000000
known_defaulter               1.000000
political_affiliation         2.000000
loan_amount               22474.580078
loan_purpose                  0.000000
Name: 178568, dtype: float32
🔮 Predicted: 1 | ✅ Actual: 1

Customer 2
age                         71.000000
gender                       0.000000
income                   92066.757812
employment_type              1.000000
company_reputation           2.000000
credit_score               302.000000
existing_loans_count         4.000000
known_defaulter              1.000000
political_affiliation        1.000000
loan_amount              20633.740234
loan_purpose                 1.000000
Name: 91963, dtype: float32
🔮 Predicted: 1 | ✅ Actual: 1

Customer 3
ag

In [None]:
# 📌 Cell 18: Install & Load T5
!pip install -q transformers sentencepiece

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the T5 model (use 't5-small' for speed or 't5-base' if you're okay with a bit more memory)
tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small").to("cuda")


In [None]:
# 📌 Cell 19: Define Summarization Function
def summarize_with_t5(input_text):
    prompt = "summarize: " + input_text.strip().replace("\n", " ")
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to("cuda")
    summary_ids = t5_model.generate(inputs.input_ids, max_new_tokens=50, do_sample=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# 📌 Cell 20: Generate Summaries for Each Sample
def format_profile_as_text(row):
    gender_map = {0: "male", 1: "female", 2: "other"}
    employment_map = {0: "salaried", 1: "self-employed"}
    company_map = {0: "good", 1: "bad", 2: "unknown"}
    politics_map = {0: "none", 1: "member", 2: "holds office"}
    purpose_map = {0: "personal", 1: "home", 2: "auto", 3: "medical"}

    profile = f"""
    - Age: {int(row['age'])}
    - Gender: {gender_map.get(int(row['gender']), 'unknown')}
    - Income: ${row['income']:.2f}
    - Employment Type: {employment_map.get(int(row['employment_type']), 'unknown')}
    - Company Reputation: {company_map.get(int(row['company_reputation']), 'unknown')}
    - Credit Score: {int(row['credit_score'])}
    - Existing Loans: {int(row['existing_loans_count'])}
    - Known Defaulter: {"Yes" if row['known_defaulter'] else "No"}
    - Political Affiliation: {politics_map.get(int(row['political_affiliation']), 'unknown')}
    - Loan Amount: ${row['loan_amount']:.2f}
    - Loan Purpose: {purpose_map.get(int(row['loan_purpose']), 'unknown')}
    """
    return profile


# 🔁 Better summarization loop
for i in range(len(X_sample)):
    print(f"\n📋 Customer {i+1}")
    profile_text = format_profile_as_text(X_sample.iloc[i])
    prediction = decoded_preds[i]
    actual = decoded_truth[i]

    prompt_text = f"""
    A machine learning model reviewed the following loan applicant profile and predicted the loan status as **{prediction}**.
    Profile:
    {profile_text}

    Please summarize in simple, clear language why this decision might have been made.
    """

    summary = summarize_with_t5(prompt_text)
    print(f"🧾 Summary:\n{summary}")



📋 Customer 1
🧾 Summary:
machine learning model predicted loan status *1**. age: 37. gender: female.

📋 Customer 2
🧾 Summary:
model predicts loan status to be **1**, however, depending on the applicant..

📋 Customer 3
🧾 Summary:
model examines an applicant profile and predicts loan status **1. average age - age: 32 - gender: other - income: $78798.41 - Employment Type: self-employed.

📋 Customer 4
🧾 Summary:
83-plus female-grade lender profile predicted loan status as **1**.

📋 Customer 5
🧾 Summary:
models predicted the loan status as **1**, with age: 3 or 32. status of personal Please explain in simple, clear language why this decision might have been made.


In [None]:
# 📌 Cell 6: Train-test split
X = df.drop(columns=['approval_status'])
y = df['approval_status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

print(f"✅ Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print("✅ Class distribution (train):\n", pd.Series(y_train).value_counts(normalize=True))

# 📌 Ensure all data is numeric (avoid object dtype issues)
X_train = X_train.copy().astype(np.float32)
X_test = X_test.copy().astype(np.float32)
y_train = y_train.copy().astype(np.int64)
y_test = y_test.copy().astype(np.int64)

✅ Train shape: (160000, 11), Test shape: (40000, 11)
✅ Class distribution (train):
 approval_status
1    0.862519
2    0.104869
0    0.032613
Name: proportion, dtype: float64


In [None]:
# 📌 Cell 10: Run Optuna Study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=3)

print("✅ Best trial:")
print("  Value (Accuracy):", study.best_trial.value)
print("  Params:", study.best_trial.params)

[I 2025-07-16 02:44:05,250] A new study created in memory with name: no-name-7fe90d9c-b201-4dd5-9424-95d1563ca23b


Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_valid_accuracy = 0.99325


[I 2025-07-16 02:44:45,711] Trial 0 finished with value: 0.99325 and parameters: {'n_d': 28, 'n_a': 19, 'n_steps': 4, 'gamma': 1.4075424422263965, 'lambda_sparse': 0.00023977751458180386, 'lr': 0.06300719232680903, 'batch_size': 1024}. Best is trial 0 with value: 0.99325.


Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_valid_accuracy = 0.97562


[I 2025-07-16 02:45:26,347] Trial 1 finished with value: 0.975625 and parameters: {'n_d': 30, 'n_a': 19, 'n_steps': 5, 'gamma': 1.9161090222249553, 'lambda_sparse': 0.0006497972108056005, 'lr': 0.09491456988344085, 'batch_size': 1024}. Best is trial 0 with value: 0.99325.


Stop training because you reached max_epochs = 3 with best_epoch = 2 and best_valid_accuracy = 0.99205


[I 2025-07-16 02:45:54,453] Trial 2 finished with value: 0.99205 and parameters: {'n_d': 15, 'n_a': 27, 'n_steps': 3, 'gamma': 1.8592964616182832, 'lambda_sparse': 0.00021629059310946525, 'lr': 0.026563450723760876, 'batch_size': 1024}. Best is trial 0 with value: 0.99325.


✅ Best trial:
  Value (Accuracy): 0.99325
  Params: {'n_d': 28, 'n_a': 19, 'n_steps': 4, 'gamma': 1.4075424422263965, 'lambda_sparse': 0.00023977751458180386, 'lr': 0.06300719232680903, 'batch_size': 1024}


In [None]:
# Cell 16: Compare Models and Select Best
from sklearn.metrics import accuracy_score, f1_score

# Calculate F1 scores
tabnet_f1 = f1_score(y_test, y_pred, average='weighted')
xgb_f1 = f1_score(y_test, y_xgb_pred, average='weighted')

# Select best
best_model_name = "TabNet" if tabnet_f1 >= xgb_f1 else "XGBoost"
print(f"✅ Best Model Based on F1 Score: {best_model_name}")
print(f"TabNet F1: {tabnet_f1:.4f} | XGBoost F1: {xgb_f1:.4f}")

✅ Best Model Based on F1 Score: XGBoost
TabNet F1: 0.9965 | XGBoost F1: 0.9987


In [None]:
# 📌 Cell 15: Train Best XGBoost Model and Evaluate
best_xgb_params = xgb_study.best_trial.params
best_xgb_params.update({
    'objective': 'multi:softmax',
    'num_class': len(target_le.classes_),
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'eval_metric': 'mlogloss'
})

dtrain_final = xgb.DMatrix(X_train, label=y_train)
dtest_final = xgb.DMatrix(X_test, label=y_test)

final_xgb_model = xgb.train(best_xgb_params, dtrain_final, num_boost_round=100,
                            evals=[(dtest_final, 'test')],
                            early_stopping_rounds=10,
                            verbose_eval=False)

y_xgb_pred = final_xgb_model.predict(dtest_final)

print("✅ XGBoost Classification Report:\n", classification_report(y_test, y_xgb_pred, target_names=target_le.classes_))

NameError: name 'xgb_study' is not defined

In [None]:
# 📌 Cell 14: Run XGBoost Tuning
xgb_study = optuna.create_study(direction="maximize")
xgb_study.optimize(xgb_objective, n_trials=10)

print("✅ Best XGBoost Accuracy:", xgb_study.best_trial.value)
print("✅ Best Params:", xgb_study.best_trial.params)

[I 2025-07-16 02:56:31,355] A new study created in memory with name: no-name-3807fdff-4a48-4dec-b23c-1ee5f06dbaff


NameError: name 'xgb_objective' is not defined

In [None]:
# 📌 Cell 15: Train Best XGBoost Model and Evaluate
best_xgb_params = xgb_study.best_trial.params
best_xgb_params.update({
    'objective': 'multi:softmax',
    'num_class': len(target_le.classes_),
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'eval_metric': 'mlogloss'
})

dtrain_final = xgb.DMatrix(X_train, label=y_train)
dtest_final = xgb.DMatrix(X_test, label=y_test)

final_xgb_model = xgb.train(best_xgb_params, dtrain_final, num_boost_round=100,
                            evals=[(dtest_final, 'test')],
                            early_stopping_rounds=10,
                            verbose_eval=False)

y_xgb_pred = final_xgb_model.predict(dtest_final)

print("✅ XGBoost Classification Report:\n", classification_report(y_test, y_xgb_pred, target_names=target_le.classes_))

ValueError: No trials are completed yet.

In [None]:
# Cell 16: Compare Models and Select Best
from sklearn.metrics import accuracy_score, f1_score

# Calculate F1 scores
tabnet_f1 = f1_score(y_test, y_pred, average='weighted')
xgb_f1 = f1_score(y_test, y_xgb_pred, average='weighted')

# Select best
best_model_name = "TabNet" if tabnet_f1 >= xgb_f1 else "XGBoost"
print(f"✅ Best Model Based on F1 Score: {best_model_name}")
print(f"TabNet F1: {tabnet_f1:.4f} | XGBoost F1: {xgb_f1:.4f}")

✅ Best Model Based on F1 Score: XGBoost
TabNet F1: 0.9965 | XGBoost F1: 0.9987


In [None]:
# Cell 17: Inference on Sample Test Set
# Select test sample
sample_indices = np.random.choice(X_test.index, size=5, replace=False)
X_sample = X_test.loc[sample_indices]
y_true_sample = y_test.loc[sample_indices]

# Predict
if best_model_name == "TabNet":
    y_pred_sample = final_model.predict(X_sample.values)
else:
    d_sample = xgb.DMatrix(X_sample)
    y_pred_sample = final_xgb_model.predict(d_sample)

# Decode labels
decoded_preds = target_le.inverse_transform(y_pred_sample.astype(int))
decoded_truth = target_le.inverse_transform(y_true_sample.astype(int))

# Display predictions
for i in range(len(decoded_preds)):
    print(f"\nCustomer {i+1}")
    print(X_sample.iloc[i])
    print(f"🔮 Predicted: {decoded_preds[i]} | ✅ Actual: {decoded_truth[i]}")


Customer 1
age                         63.000000
gender                       1.000000
income                   64486.980469
employment_type              1.000000
company_reputation           1.000000
credit_score               674.000000
existing_loans_count         3.000000
known_defaulter              0.000000
political_affiliation        0.000000
loan_amount              54453.781250
loan_purpose                 1.000000
Name: 83991, dtype: float32
🔮 Predicted: 1 | ✅ Actual: 1

Customer 2
age                          48.000000
gender                        1.000000
income                   116793.742188
employment_type               1.000000
company_reputation            1.000000
credit_score                486.000000
existing_loans_count          3.000000
known_defaulter               0.000000
political_affiliation         1.000000
loan_amount               44807.980469
loan_purpose                  3.000000
Name: 11010, dtype: float32
🔮 Predicted: 1 | ✅ Actual: 1

Customer 3
age

In [None]:
# 📌 Cell 18: Install & Load T5
!pip install -q transformers sentencepiece

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the T5 model (use 't5-small' for speed or 't5-base' if you're okay with a bit more memory)
tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small").to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# 📌 Cell 19: Define Summarization Function
def summarize_with_t5(input_text):
    prompt = "summarize: " + input_text.strip().replace("\n", " ")
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to("cuda")
    summary_ids = t5_model.generate(inputs.input_ids, max_new_tokens=50, do_sample=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# 📌 Cell 20: Generate Summaries for Each Sample
def format_profile_as_text(row):
    gender_map = {0: "male", 1: "female", 2: "other"}
    employment_map = {0: "salaried", 1: "self-employed"}
    company_map = {0: "good", 1: "bad", 2: "unknown"}
    politics_map = {0: "none", 1: "member", 2: "holds office"}
    purpose_map = {0: "personal", 1: "home", 2: "auto", 3: "medical"}

    profile = f"""
    - Age: {int(row['age'])}
    - Gender: {gender_map.get(int(row['gender']), 'unknown')}
    - Income: ${row['income']:.2f}
    - Employment Type: {employment_map.get(int(row['employment_type']), 'unknown')}
    - Company Reputation: {company_map.get(int(row['company_reputation']), 'unknown')}
    - Credit Score: {int(row['credit_score'])}
    - Existing Loans: {int(row['existing_loans_count'])}
    - Known Defaulter: {"Yes" if row['known_defaulter'] else "No"}
    - Political Affiliation: {politics_map.get(int(row['political_affiliation']), 'unknown')}
    - Loan Amount: ${row['loan_amount']:.2f}
    - Loan Purpose: {purpose_map.get(int(row['loan_purpose']), 'unknown')}
    """
    return profile


# 🔁 Better summarization loop
for i in range(len(X_sample)):
    print(f"\n📋 Customer {i+1}")
    profile_text = format_profile_as_text(X_sample.iloc[i])
    prediction = decoded_preds[i]
    actual = decoded_truth[i]

    prompt_text = f"""
    A machine learning model reviewed the following loan applicant profile and predicted the loan status as **{prediction}**.
    Profile:
    {profile_text}

    Please summarize in simple, clear language why this decision might have been made.
    """

    summary = summarize_with_t5(prompt_text)
    print(f"🧾 Summary:\n{summary}")

NameError: name 'X_sample' is not defined

In [None]:
# Cell 16: Compare Models and Select Best
from sklearn.metrics import accuracy_score, f1_score

# Calculate F1 scores
tabnet_f1 = f1_score(y_test, y_pred, average='weighted')
xgb_f1 = f1_score(y_test, y_xgb_pred, average='weighted')

# Select best
best_model_name = "TabNet" if tabnet_f1 >= xgb_f1 else "XGBoost"
print(f"✅ Best Model Based on F1 Score: {best_model_name}")
print(f"TabNet F1: {tabnet_f1:.4f} | XGBoost F1: {xgb_f1:.4f}")