In [21]:
# SECTION 1: IMPORT LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor



In [22]:

# SECTION 2: LOAD DATA
# Inputs are plain CSV files expected in the notebook's working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")
# NOTE(review): the doubled ".csv.csv" extension is kept as-is — confirm it matches the file on disk.
transactions = pd.read_csv(filepath_or_buffer="transaction.csv.csv")

In [23]:

# SECTION 3: BASIC FEATURE ENGINEERING
# Calendar features derived from the journey date ("doj"). Both frames are
# modified in place so the new columns are visible to the cells that follow.
for df in (train, test):
    df["doj"] = pd.to_datetime(df["doj"])
    journey = df["doj"].dt
    df["dayofweek"] = journey.dayofweek
    df["month"] = journey.month
    df["day"] = journey.day
    # pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are Sat/Sun.
    df["weekend"] = df["dayofweek"].isin([5, 6]).astype(int)

# Route popularity: per (srcid, destid) mean / std / row count of the target,
# computed on train only and then attached to both train and test.
route_keys = ['srcid', 'destid']
route_cols = ['route_mean', 'route_std', 'route_count']
route_stats = (
    train.groupby(route_keys)['final_seatcount']
    .agg(['mean', 'std', 'count'])
    .rename(columns={'mean': 'route_mean', 'std': 'route_std', 'count': 'route_count'})
    .reset_index()
)

# Remove route_* columns left by an earlier run of this cell; otherwise the
# merges below would produce suffixed duplicates.
train = train.drop(columns=route_cols, errors="ignore")
test = test.drop(columns=route_cols, errors="ignore")

train = train.merge(route_stats, how='left', on=route_keys)
test = test.merge(route_stats, how='left', on=route_keys)

In [24]:

# SECTION 4: ADD TRANSACTIONAL INFO
# Attaches the cumulative seat/search counts recorded at selected "dbd" values
# to train and test, flags rows that have no dbd-15 record, and imputes gaps.
# NOTE(review): "dbd" presumably means "days before departure" — confirm.
transactions["doj"] = pd.to_datetime(transactions["doj"])  # ensure datetime type
train["doj"] = pd.to_datetime(train["doj"])
test["doj"] = pd.to_datetime(test["doj"])

route_keys = ["srcid", "destid"]
merge_keys = ["doj"] + route_keys
dbd_values = [1, 3, 7, 15]
trans_metrics = ["cumsum_seatcount", "cumsum_searchcount"]
trans_cols = [f"{metric}_dbd{dbd_val}" for dbd_val in dbd_values for metric in trans_metrics]

# Re-run safety (same idea as the route_* drop in SECTION 3): remove columns
# produced by an earlier run of this cell, otherwise the merges below would
# create "_x"/"_y" suffixed duplicates and the later lookups would break.
train = train.drop(columns=trans_cols, errors="ignore")
test = test.drop(columns=trans_cols, errors="ignore")

# Multi-dbd aggregation: one pair of columns per dbd value.
for dbd_val in dbd_values:
    temp = transactions[transactions["dbd"] == dbd_val]
    agg = temp.groupby(merge_keys)[trans_metrics].sum().reset_index()
    agg.columns = merge_keys + [f"{col}_dbd{dbd_val}" for col in trans_metrics]
    train = train.merge(agg, on=merge_keys, how="left")
    test = test.merge(agg, on=merge_keys, how="left")

# Add missing data indicators for dbd15. This must happen before imputation,
# while NaN still means "no transaction row matched".
# Direct indexing (instead of .get) raises a clear KeyError if the column is absent.
train["is_missing_trans"] = train["cumsum_seatcount_dbd15"].isna().astype(int)
test["is_missing_trans"] = test["cumsum_seatcount_dbd15"].isna().astype(int)

# Impute transactional values with group averages where possible.
# The route-level averages come from train only and are applied to BOTH frames,
# so a given route is imputed with the same value in train and in test.
for col_name in trans_cols:
    # groupby.mean skips NaN, so this is the mean of the observed values per route.
    route_avg = train.groupby(route_keys)[col_name].mean().rename("route_avg").reset_index()

    train[col_name] = train[col_name].fillna(
        train.groupby(route_keys)[col_name].transform("mean")
    )

    # route_avg has one row per route, so the left merge keeps test's row order
    # and row count; re-attach test's index so fillna aligns row by row.
    test_route_avg = pd.Series(
        test[route_keys].merge(route_avg, on=route_keys, how="left")["route_avg"].to_numpy(),
        index=test.index,
    )
    # Routes without a usable train average fall back to the overall train mean.
    test[col_name] = test[col_name].fillna(test_route_avg).fillna(train[col_name].mean())

# Fill remaining missing values
for col in ["route_mean", "route_std", "route_count"] + trans_cols:
    if col in train.columns:
        train[col] = train[col].fillna(0)
    if col in test.columns:
        test[col] = test[col].fillna(0)


In [25]:

# SECTION 5: MODEL STACKING FUNCTION

def blend_models(X, y, X_test, n_folds=5, models=None):
    """Produce out-of-fold and test predictions for a set of base models.

    Every model is refit on each TimeSeriesSplit training fold. Its predictions
    on the matching validation fold are written into the out-of-fold matrix,
    and its predictions on ``X_test`` are averaged over the folds.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` positional indexing (e.g. a DataFrame
        and a Series) with the same number of rows. TimeSeriesSplit splits by
        row position, so rows are expected to be in chronological order.
    X_test : feature matrix to predict on; must expose ``.shape``.
    n_folds : int, number of TimeSeriesSplit splits.
    models : optional list of unfitted regressors with ``fit``/``predict``.
        Defaults to one XGBRegressor and one LGBMRegressor.

    Returns
    -------
    (oof_preds, test_preds) : arrays of shape ``(len(X), n_models)`` and
        ``(len(X_test), n_models)``.

    Notes
    -----
    ``oof_preds`` starts as zeros and only validation rows are overwritten.
    TimeSeriesSplit never uses the earliest rows for validation, so those rows
    stay at 0. Callers fitting a meta-model should exclude them.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` have different lengths.
    """
    if len(X) != len(y):
        raise ValueError(f"X has {len(X)} rows but y has {len(y)}")

    if models is None:
        models = [
            XGBRegressor(n_estimators=500, learning_rate=0.03, max_depth=8, subsample=0.9, colsample_bytree=0.9, random_state=42),
            # LightGBM only applies `subsample` when subsample_freq > 0 (its
            # default of 0 disables bagging), so the frequency is set explicitly.
            LGBMRegressor(n_estimators=500, learning_rate=0.03, max_depth=8, subsample=0.9, subsample_freq=1, colsample_bytree=0.9, random_state=42),
        ]

    # The splits depend only on the number of rows, so compute them once and
    # reuse the same folds for every model.
    splits = list(TimeSeriesSplit(n_splits=n_folds).split(X))

    oof_preds = np.zeros((X.shape[0], len(models)))
    test_preds = np.zeros((X_test.shape[0], len(models)))

    for i, model in enumerate(models):
        test_fold_preds = np.zeros((X_test.shape[0], len(splits)))
        for j, (train_idx, val_idx) in enumerate(splits):
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr = y.iloc[train_idx]

            model.fit(X_tr, y_tr)
            oof_preds[val_idx, i] = model.predict(X_val)
            test_fold_preds[:, j] = model.predict(X_test)

        # Average this model's test predictions across folds.
        test_preds[:, i] = test_fold_preds.mean(axis=1)

    return oof_preds, test_preds


In [26]:
# SECTION 6: MODEL EXECUTION
features = [
    "dayofweek", "month", "day", "weekend",
    "route_mean", "route_std", "route_count",
    "cumsum_seatcount_dbd1", "cumsum_searchcount_dbd1",
    "cumsum_seatcount_dbd3", "cumsum_searchcount_dbd3",
    "cumsum_seatcount_dbd7", "cumsum_searchcount_dbd7",
    "cumsum_seatcount_dbd15", "cumsum_searchcount_dbd15",
    "is_missing_trans"
]

# blend_models uses TimeSeriesSplit, which splits purely by row position, so
# the training rows must be in chronological order for the folds to mean
# "train on the past, validate on the future". A stable sort is a no-op when
# train is already ordered by doj. X and y are taken from the same sorted
# frame, so they stay aligned with each other and with oof_preds.
train_sorted = train.sort_values("doj", kind="stable")

X = train_sorted[features]
# The target is modelled on the log1p scale.
y = np.log1p(train_sorted["final_seatcount"])
X_test = test[features]

oof_preds, test_preds = blend_models(X, y, X_test)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2276
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 14
[LightGBM] [Info] Start training from score 7.365986
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2292
[LightGBM] [Info] Number of data points in the train set: 22400, number of used features: 14
[LightGBM] [Info] Start training from score 7.304197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 33600, number of used features: 14
[LightGBM] [Info] Start tra

In [27]:
# SECTION 7: META MODEL
# blend_models initialises oof_preds with zeros, and TimeSeriesSplit never
# validates the earliest rows, so those rows still hold placeholder zeros in
# every column. Fitting the meta-model on them would pair real targets with
# fake all-zero inputs, so only rows with an out-of-fold prediction are used.
# (A validated row predicted as exactly 0.0 by every model is dropped too.)
has_oof = np.any(oof_preds != 0, axis=1)
if not has_oof.any():
    raise ValueError("oof_preds contains no out-of-fold predictions to fit the meta model on")

meta_model = Ridge(alpha=1.0)
meta_model.fit(oof_preds[has_oof], np.asarray(y)[has_oof])
# y was log1p-transformed, so expm1 maps predictions back to the original scale.
final_predictions = np.expm1(meta_model.predict(test_preds))

# The submission file is written by the SECTION 8 cell that follows; the
# identical export code that used to be repeated here was removed.


In [28]:
# SECTION 8: EXPORT SUBMISSION
# One row per test route_key; predictions are floored at zero before writing.
clipped_counts = pd.Series(final_predictions, index=test.index).clip(lower=0)
submission = test[["route_key"]].assign(final_seatcount=clipped_counts)
submission.to_csv("submission_final_boosted.csv", index=False)


In [30]:

# SECTION 9: OPTIONAL FEATURE IMPORTANCE
# (Only works for tree-based models like LGBM)
def plot_importance(model, feature_names, title="Feature Importance"):
    """Draw a bar chart of a fitted model's feature importances, largest first.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of names, one per importance value, in the same
        order as the columns the model was trained on.
    title : str, figure title.

    Raises
    ------
    ValueError
        If ``feature_names`` and ``model.feature_importances_`` differ in
        length (the bars could not be labelled correctly).
    """
    # np.asarray allows fancy indexing even if the attribute is a plain list.
    importances = np.asarray(model.feature_importances_)
    feature_names = list(feature_names)
    if len(feature_names) != len(importances):
        raise ValueError(
            f"got {len(feature_names)} feature names for {len(importances)} importances"
        )

    indices = np.argsort(importances)[::-1]  # descending importance
    positions = np.arange(len(importances))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title(title)
    ax.bar(positions, importances[indices])
    ax.set_xticks(positions)
    ax.set_xticklabels([feature_names[i] for i in indices], rotation=90)
    ax.set_xlabel("Feature")
    ax.set_ylabel("Importance")
    fig.tight_layout()
    plt.show()

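# Example (pass any fitted tree model; the `models` list inside blend_models is local to that function and is not available here):
# plot_importance(fitted_lgbm, features, title="LGBM Feature Importance")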
