In [1]:
import pickle
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the pre-processed train and test tables.
train_df = pd.read_csv('../data/processed/train_processed.csv')
test_df = pd.read_csv('../data/processed/test_processed.csv')

# 'Tm' is the regression target; 'id' is dropped from the feature matrices.
y = train_df['Tm']
X = train_df.drop(columns=['id', 'Tm'])
X_test = test_df.drop(columns=['id'])

# Put the test features in exactly the training column order
# (raises KeyError if the test table lacks a training column).
X_test = X_test[X.columns]

# Replace every non-alphanumeric character in a column name with "_".
# Computed once and applied to both frames so train/test names cannot drift.
# NOTE(review): presumably needed because LightGBM rejects special characters
# in feature names -- confirm.
safe_columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in X.columns]

# Two distinct raw names can collapse to the same sanitized name
# (e.g. "a-b" and "a_b"); fail loudly here instead of training on
# ambiguous, duplicated feature names.
safe_index = pd.Index(safe_columns)
collisions = safe_index[safe_index.duplicated()].unique().tolist()
if collisions:
    raise ValueError(f"Sanitized feature names collide: {collisions}")

X.columns = safe_columns
X_test.columns = safe_columns

print(f"Data Loaded. Train shape: {X.shape}, Test shape: {X_test.shape}")

Data Loaded. Train shape: (2662, 322), Test shape: (666, 322)


In [None]:
# Base learners and the meta-learner used by the stacking ensemble.
print("Defining Base Models for Stacking")

# LightGBM hyper-parameters (long-decimal values look like the result of a
# tuning run -- TODO confirm provenance).
# NOTE(review): 'subsample' is set but no 'subsample_freq'/'bagging_freq' is
# given; per the LightGBM docs bagging is only active when the frequency is
# non-zero, so 'subsample' may have no effect here -- confirm against the
# tuning setup before changing.
lgb_params_tuned = dict(
    objective='regression_l1',
    metric='mae',
    n_estimators=2000,
    learning_rate=0.05445820875847297,
    max_depth=11,
    num_leaves=28,
    subsample=0.8096640142442519,
    colsample_bytree=0.7222682997643721,
    lambda_l1=0.07754417154115359,
    lambda_l2=3.330947586954466e-07,
    verbose=-1,
    n_jobs=-1,
    seed=42,
)

# scikit-learn histogram gradient boosting settings.
histgb_params = dict(
    max_iter=500,
    max_depth=10,
    learning_rate=0.05,
    random_state=42,
)

# XGBoost settings (histogram tree method, silent logging).
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=8,
    tree_method='hist',
    verbosity=0,
    random_state=42,
    n_jobs=-1,
)

# Instantiate each base learner, then register it under the short name the
# stacking ensemble will use for it.
lgbm_model = lgb.LGBMRegressor(**lgb_params_tuned)
hist_model = HistGradientBoostingRegressor(**histgb_params)
xgb_model = xgb.XGBRegressor(**xgb_params)

estimators = [
    ('lgbm', lgbm_model),
    ('hist', hist_model),
    ('xgb', xgb_model),
]

# Final estimator that combines the base learners' predictions.
meta_model = RidgeCV()

Defining Base Models for Stacking


In [4]:
print("Building and Evaluating the Stackin Regressor")

# Stacked ensemble: the base learners' cross-validated predictions (inner
# cv=5) are the inputs on which the final estimator is trained.
# NOTE(review): both the stack and several base learners request n_jobs=-1;
# this nested parallelism may oversubscribe CPU cores -- confirm on the
# target machine.
stack = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1
)

# Outer CV used only to estimate generalisation error of the whole stack.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Take the fold total from the splitter so the progress message can never
# disagree with n_splits.
n_folds = kf.get_n_splits()
mae_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Starting Fold {fold+1}/{n_folds}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # fit() re-trains the whole stack from scratch on this fold's training rows.
    stack.fit(X_train, y_train)

    y_pred_log = stack.predict(X_val)

    # Score on the exponentiated scale.
    # NOTE(review): np.exp assumes the 'Tm' column in the processed CSV is
    # stored as log(Tm) -- confirm against the preprocessing step (and that
    # it was log, not log1p).
    mae = mean_absolute_error(np.exp(y_val), np.exp(y_pred_log))
    mae_scores.append(mae)
    print(f"Fold {fold+1} MAE: {mae:.5f}")

# Summarise across folds (np.std is the population standard deviation, ddof=0).
mean_mae = np.mean(mae_scores)
std_mae = np.std(mae_scores)
print("\nStacking Ensemble CV Finished")
print(f"Average MAE: {mean_mae:.5f} (+/- {std_mae:.5f})")

Building and Evaluating the Stackin Regressor
Starting Fold 1/5
Fold 1 MAE: 27.82745
Starting Fold 2/5
Fold 2 MAE: 26.86074
Starting Fold 3/5
Fold 3 MAE: 26.75183
Starting Fold 4/5
Fold 4 MAE: 28.01798
Starting Fold 5/5
Fold 5 MAE: 27.23825

Stacking Ensemble CV Finished
Average MAE: 27.33925 (+/- 0.50661)


In [None]:
# 4. Train the final model on all data and generate the submission.
MODEL_PATH = '../models/stacking_model.pkl'

# Create the output directory *before* the expensive fit: otherwise a missing
# '../models' folder would raise FileNotFoundError only after training has
# finished, discarding the fitted model.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

print("\n--- 4. Training Final Model on All Data ---")
stack.fit(X, y)
print("Final model training complete.")

# Save the trained model.
# Pickle files can execute arbitrary code on load; only unpickle this file
# from a trusted location.
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(stack, f)
print(f"Model saved to {MODEL_PATH}")

print("\n--- Generating Predictions for Submission ---")
test_predictions_log = stack.predict(X_test)
# Exponentiate the predictions before writing them out.
# NOTE(review): assumes the model was trained on log(Tm) -- confirm against
# the preprocessing step.
test_predictions = np.exp(test_predictions_log)

# X_test was derived from test_df without reordering rows, so ids and
# predictions line up positionally.
submission_df = pd.DataFrame({'id': test_df['id'], 'Tm': test_predictions})
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print("Top 5 rows of submission file:")
print(submission_df.head())



--- 4. Training Final Model on All Data ---
Final model training complete.
Model saved to ../models/stacking_model.pkl

--- Generating Predictions for Submission ---

Submission file 'submission.csv' created successfully!
Top 5 rows of submission file:
     id          Tm
0  1022  363.963334
1  1146  338.906736
2    79  193.900438
3  2279  194.612145
4  1342  227.035622
