In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
import warnings

# Blanket-silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# modelling libraries -- consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

# Raw competition data; paths are relative to the notebook's working directory.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() appended a stray "None" line.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB
None


In [10]:
# String-valued predictors (dtype `object` in train_df).
categorical_features = [
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
    'grade_subgrade',
]

# Model inputs: the five numeric columns followed by the categorical ones.
# `id` and the target `loan_paid_back` are deliberately left out.
features_to_use = [
    'annual_income',
    'debt_to_income_ratio',
    'credit_score',
    'loan_amount',
    'interest_rate',
    *categorical_features,
]

# Design matrix / target for the labelled data, and the matching test matrix.
X, y = train_df[features_to_use], train_df['loan_paid_back']
X_test = test_df[features_to_use]

In [11]:
def prepare_xgboost_features_simple(X, X_test=None, categorical_cols=None):
    """Integer-encode the categorical columns of ``X`` (and optionally ``X_test``).

    Every categorical column of ``X`` is cast to ``str`` and its distinct values
    are numbered ``0..k-1`` in sorted order. When ``X_test`` is supplied it is
    encoded with the mapping learned from ``X``; a value that never occurs in
    ``X`` is encoded as ``-1``. Both inputs are copied, never modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame the encoding is learned from.
    X_test : pandas.DataFrame, optional
        Second frame to encode with the mapping learned from ``X``.
    categorical_cols : list of str, optional
        Columns to encode. Defaults to the six categorical loan columns
        (gender, marital_status, education_level, employment_status,
        loan_purpose, grade_subgrade).

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        ``X_encoded`` when ``X_test`` is None, else ``(X_encoded, X_test_encoded)``.
    """
    if categorical_cols is None:
        categorical_cols = ['gender', 'marital_status', 'education_level',
                            'employment_status', 'loan_purpose', 'grade_subgrade']

    X_xgb = X.copy()
    X_test_xgb = X_test.copy() if X_test is not None else None

    for col in categorical_cols:
        # sort=True numbers the distinct labels in sorted order, i.e. the same
        # codes a label encoder fitted on this column would assign.
        codes, uniques = pd.factorize(X_xgb[col].astype(str), sort=True)
        X_xgb[col] = codes

        if X_test_xgb is not None:
            # One vectorised lookup per column instead of one transform call
            # per row; get_indexer reports labels absent from `uniques` as -1.
            X_test_xgb[col] = pd.Index(uniques).get_indexer(X_test_xgb[col].astype(str))

    if X_test_xgb is not None:
        return X_xgb, X_test_xgb
    return X_xgb

In [12]:
from lightgbm import LGBMRegressor

# Hold out 20% of the labelled rows; they drive early stopping for CatBoost and
# XGBoost and are the basis for every validation metric further down.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")

# CatBoost receives the raw string categoricals (declared via `cat_features`).
X_train_cat = X_train.copy()
X_val_cat = X_val.copy()
X_test_cat = X_test.copy()

# Label-encoded frames for XGBoost. Validation AND test are both encoded with
# the mapping learned from X_train: encoding X_test on its own would number its
# categories independently, so a code could mean a different category than the
# one the model was trained on whenever the two category sets differ.
X_train_xgb, X_val_xgb = prepare_xgboost_features_simple(X_train, X_val)
X_test_xgb = prepare_xgboost_features_simple(X_train, X_test)[1]

# LightGBM consumes the same encoding, so reuse it instead of recomputing.
X_train_lgb = X_train_xgb.copy()
X_val_lgb = X_val_xgb.copy()
X_test_lgb = X_test_xgb.copy()

# CatBoost model
cat_model = CatBoostRegressor(
    cat_features=categorical_features,
    random_state=42,
    verbose=False,
    iterations=800,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=3,
    random_strength=0.5,
    bagging_temperature=0.8,
    early_stopping_rounds=50
)

# XGBoost model
xgb_model = XGBRegressor(
    random_state=42,
    n_estimators=800,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.85,
    colsample_bytree=0.8,
    colsample_bylevel=0.8,
    reg_alpha=0.2,
    reg_lambda=0.3,
    gamma=0.1,
    eval_metric='rmse',
    early_stopping_rounds=50,
    verbosity=0
)

# LightGBM model
lgb_model = LGBMRegressor(
    random_state=42,
    n_estimators=800,
    learning_rate=0.03,
    max_depth=8,
    num_leaves=45,
    subsample=0.85,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; with the default of 0 the `subsample` setting is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.3,
    min_child_samples=25,
    min_child_weight=0.001,
    verbose=-1
)


# Model training
print("Training models...")

print("Training CatBoost...", end=" ")
cat_model.fit(X_train_cat, y_train, eval_set=[(X_val_cat, y_val)], verbose=False)
print("✓")

print("Training XGBoost...", end=" ")
xgb_model.fit(X_train_xgb, y_train, eval_set=[(X_val_xgb, y_val)], verbose=False)
print("✓")

# LightGBM is fitted without an eval_set, so it builds all 800 trees.
print("Training LightGBM...", end=" ")
lgb_model.fit(X_train_lgb, y_train)
print("✓")

print("\nAll models trained successfully!")

# Base predictions
cat_val_pred = cat_model.predict(X_val_cat)
cat_test_pred = cat_model.predict(X_test_cat)

xgb_val_pred = xgb_model.predict(X_val_xgb)
xgb_test_pred = xgb_model.predict(X_test_xgb)

lgb_val_pred = lgb_model.predict(X_val_lgb)
lgb_test_pred = lgb_model.predict(X_test_lgb)

print("\nPredictions ready for ensemble!")





Training samples: 475195
Validation samples: 118799
Training models...
Training CatBoost... Training models...
Training CatBoost... ✓
Training XGBoost... ✓
Training XGBoost... ✓
Training LightGBM... ✓
Training LightGBM... ✓

All models trained successfully!
✓

All models trained successfully!

Predictions ready for ensemble!

Predictions ready for ensemble!


In [13]:
from sklearn.linear_model import Ridge

from sklearn.model_selection import cross_val_predict

print("Creating stacking ensemble...")

# Getting predictions from all models
cat_train_pred = cat_model.predict(X_train_cat)
cat_val_pred = cat_model.predict(X_val_cat)
cat_test_pred = cat_model.predict(X_test_cat)

xgb_train_pred = xgb_model.predict(X_train_xgb)
xgb_val_pred = xgb_model.predict(X_val_xgb)
xgb_test_pred = xgb_model.predict(X_test_xgb)

lgb_train_pred = lgb_model.predict(X_train_lgb)
lgb_val_pred = lgb_model.predict(X_val_lgb)
lgb_test_pred = lgb_model.predict(X_test_lgb)

# Create stacking features (use all 3 models): one column per base model.
# level1_train is kept for the shape report only -- see the note below.
level1_train = np.column_stack([cat_train_pred, xgb_train_pred, lgb_train_pred])
level1_val = np.column_stack([cat_val_pred, xgb_val_pred, lgb_val_pred])
level1_test = np.column_stack([cat_test_pred, xgb_test_pred, lgb_test_pred])

print("Stacking feature shapes:")
print(f"Train: {level1_train.shape}")
print(f"Val: {level1_val.shape}")
print(f"Test: {level1_test.shape}")

# Train meta-model
# The base models were fitted on (X_train, y_train), so their predictions on
# those rows are in-sample. A meta-model fitted on them would learn to trust
# whichever base model memorised the training data best. It is therefore
# fitted on the held-out validation predictions instead.
# NOTE(review): CatBoost/XGBoost used this split for early stopping, so these
# predictions are presumably still mildly optimistic; K-fold out-of-fold base
# predictions would remove that too, at the cost of refitting every base model.
meta_model = Ridge(alpha=0.1)

# Make stacking predictions
# Validation rows get out-of-fold meta predictions (each row is scored by a
# Ridge that was not fitted on it), so downstream validation metrics for the
# stack are not computed on the meta-model's own training rows.
stacking_val_pred = cross_val_predict(meta_model, level1_val, y_val, cv=5)

# cross_val_predict works on clones; fit the final meta-model on every
# held-out row before scoring the test set.
meta_model.fit(level1_val, y_val)
stacking_test_pred = meta_model.predict(level1_test)

print("Stacking ensemble training completed!")

Creating stacking ensemble...
Stacking feature shapes:
Train: (475195, 3)
Val: (118799, 3)
Test: (254569, 3)
Stacking ensemble training completed!
Stacking feature shapes:
Train: (475195, 3)
Val: (118799, 3)
Test: (254569, 3)
Stacking ensemble training completed!


In [None]:
# Validation RMSE for the three base models and the stacked model, computed
# against the same held-out target `y_val`.
cat_val_rmse, xgb_val_rmse, lgb_val_rmse, stack_val_rmse = (
    np.sqrt(mean_squared_error(y_val, val_pred))
    for val_pred in (cat_val_pred, xgb_val_pred, lgb_val_pred, stacking_val_pred)
)

# Validation MAE for the three base models.
cat_mae, xgb_mae, lgb_mae = (
    mean_absolute_error(y_val, val_pred)
    for val_pred in (cat_val_pred, xgb_val_pred, lgb_val_pred)
)

# Combined score (RMSE + MAE)
def combined_score(rmse, mae, rmse_weight=0.7, mae_weight=0.3):
    """Blend RMSE and MAE into a single lower-is-better score.

    Parameters
    ----------
    rmse : float
        Root-mean-squared error of a model.
    mae : float
        Mean absolute error of the same model.
    rmse_weight : float, default 0.7
        Multiplier applied to ``rmse``; the default weights RMSE more heavily.
    mae_weight : float, default 0.3
        Multiplier applied to ``mae``.

    Returns
    -------
    float
        ``rmse_weight * rmse + mae_weight * mae``.
    """
    return rmse_weight * rmse + mae_weight * mae

# The stacked model needs its own MAE: scoring it with mae=0 dropped the MAE
# term entirely, which understated its score relative to the base models and
# inflated its ensemble weight.
stack_mae = mean_absolute_error(y_val, stacking_val_pred)

cat_score = combined_score(cat_val_rmse, cat_mae)
xgb_score = combined_score(xgb_val_rmse, xgb_mae)
lgb_score = combined_score(lgb_val_rmse, lgb_mae)
stack_score = combined_score(stack_val_rmse, stack_mae)

# Performance-based weights with exponential decay (better models get much higher weight)
models_scores = {
    'CatBoost': cat_score,
    'XGBoost': xgb_score,
    'LightGBM': lgb_score,
    'Stacking': stack_score
}

# Exponential weighting (emphasizes differences between models):
# weight = exp(-5 * score), so a lower (better) score gives a larger weight.
weights = {name: np.exp(-score * 5) for name, score in models_scores.items()}

# Normalize weights so they sum to 1.
total_weight = sum(weights.values())
weights = {name: raw / total_weight for name, raw in weights.items()}

print("\nSmart Model Weights:")
for name, weight in weights.items():
    print(f"{name}: {weight:.3f}")

# Super ensemble with smart weighting: the same convex combination is applied
# to the validation predictions and to the test predictions.
super_ensemble_val = (
    weights['CatBoost'] * cat_val_pred +
    weights['XGBoost'] * xgb_val_pred +
    weights['LightGBM'] * lgb_val_pred +
    weights['Stacking'] * stacking_val_pred
)

super_ensemble_test = (
    weights['CatBoost'] * cat_test_pred +
    weights['XGBoost'] * xgb_test_pred +
    weights['LightGBM'] * lgb_test_pred +
    weights['Stacking'] * stacking_test_pred
)