In [None]:
# !pip install lightgbm holidays

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import holidays
from datetime import timedelta

# Read the three competition files: labelled rows, rows to score, submission template.
train = pd.read_csv("train.csv")
test = pd.read_csv("test_8gqdJqH.csv")
submission = pd.read_csv("sample_submission_TQv3O0x.csv")

# Keep only the transaction rows whose dbd equals 15 and attach them to both frames.
transactions = pd.read_csv("transactions.csv")
trans_15 = transactions.loc[transactions['dbd'].eq(15)]
route_day_keys = ['doj', 'srcid', 'destid']
# Inner join: training rows without a matching dbd=15 transaction row are dropped.
train = train.merge(trans_15, on=route_day_keys, how='inner')
# Left join: test rows without a match are kept, with NaN in the transaction columns.
test = test.merge(trans_15, on=route_day_keys, how='left')


# ---------------------- CALENDAR FEATURES ----------------------
def enrich_calendar(df):
    """Add calendar-derived feature columns based on the ``doj`` column.

    ``df`` is modified in place (``doj`` is converted to datetime and the new
    columns are added) and the same object is returned.

    Columns added: ``month``, ``dayofweek`` (Monday=0), ``is_weekend``,
    ``is_month_start``, ``is_month_end``, ``is_holiday``, ``is_pre_holiday``
    (the next day is a holiday), ``is_post_holiday`` (the previous day is a
    holiday) and ``is_school_vacation``. All ``is_*`` columns are 0/1 integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``doj`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the calendar columns added.
    """
    df['doj'] = pd.to_datetime(df['doj'])
    doj = df['doj']

    df['month'] = doj.dt.month
    df['dayofweek'] = doj.dt.dayofweek
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    df['is_month_start'] = doj.dt.is_month_start.astype(int)
    df['is_month_end'] = doj.dt.is_month_end.astype(int)

    one_day = pd.Timedelta(days=1)
    # Compare on calendar days so a time-of-day component cannot hide a match.
    doj_day = doj.dt.normalize()

    # Always cover the originally hard-coded years, plus every year touched by
    # the data or by its +/-1 day neighbours, so all three holiday flags are
    # answered from the same calendar.
    holiday_years = {2023, 2024, 2025}
    for days in (doj_day - one_day, doj_day, doj_day + one_day):
        holiday_years.update(int(year) for year in days.dt.year.dropna().unique())
    ind_holidays = holidays.India(years=sorted(holiday_years))

    # The holiday mapping is keyed by datetime.date objects. Passing it straight
    # to Series.isin on a datetime64 column depends on implicit casting that
    # pandas has deprecated (it emits a FutureWarning), so convert the keys to
    # datetime64 explicitly and compare like with like.
    holiday_days = pd.to_datetime(list(ind_holidays.keys()))

    df['is_holiday'] = doj_day.isin(holiday_days).astype(int)
    df['is_pre_holiday'] = (doj_day + one_day).isin(holiday_days).astype(int)
    df['is_post_holiday'] = (doj_day - one_day).isin(holiday_days).astype(int)
    # Months flagged as vacation are a hard-coded modelling choice.
    df['is_school_vacation'] = df['month'].isin([5, 6, 10, 11, 12]).astype(int)
    return df

# Run the calendar enrichment on the training frame first, then on the test frame.
train, test = map(enrich_calendar, (train, test))

  df['is_holiday'] = df['doj'].isin(ind_holidays).astype(int)
  df['is_holiday'] = df['doj'].isin(ind_holidays).astype(int)


In [None]:
# ---------------------- ENCODE CATEGORICAL ----------------------
# Each source/destination pair shares ONE encoder so that a given region (or
# tier) gets the same integer code on both sides. With a separate encoder per
# column, equal codes in srcid_* and destid_* would not have to mean equal
# values, which would make region_match and tier_diff below meaningless.
cat_col_groups = [
    ['srcid_region', 'destid_region'],
    ['srcid_tier', 'destid_tier'],
]
cat_cols = [col for group in cat_col_groups for col in group]
for group in cat_col_groups:
    le = LabelEncoder()
    # Fit on every value present in either column of either frame, so that
    # transforming the test frame cannot fail on a label absent from train.
    le.fit(pd.concat(
        [frame[col].astype(str) for frame in (train, test) for col in group],
        ignore_index=True,
    ))
    for col in group:
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))

# ---------------------- FEATURE ENGINEERING ----------------------
for frame in (train, test):
    # The +1 on both sides keeps the ratio finite when cumsum_seatcount is 0.
    frame['search_per_seat'] = (frame['cumsum_searchcount'] + 1) / (frame['cumsum_seatcount'] + 1)
    # Distance between label codes. Codes follow the sorted order of the
    # stringified tier values, so this is an ordinal distance only if that
    # order matches the tier ranking - TODO confirm against the raw values.
    frame['tier_diff'] = (frame['srcid_tier'] - frame['destid_tier']).abs()
    # 1 when source and destination fall in the same region (shared codes).
    frame['region_match'] = (frame['srcid_region'] == frame['destid_region']).astype(int)

# ROUTE STATS
# Mean / standard deviation / median of the target for every (srcid, destid)
# pair in the training frame, joined back onto both frames. Routes that do not
# occur in train get NaN in the test frame (left join).
# NOTE(review): the statistics use every training row, including the rows that
# are later held out by train_test_split, so the validation score may be
# optimistic - confirm whether out-of-fold statistics are wanted.
route_keys = ['srcid', 'destid']
route_stats = (
    train.groupby(route_keys)['final_seatcount']
    .agg(
        route_mean_seatcount='mean',
        route_std_seatcount='std',
        route_median_seatcount='median',
    )
    .reset_index()
)
train = train.merge(route_stats, on=route_keys, how='left')
test = test.merge(route_stats, on=route_keys, how='left')

# Lag/Momentum/Volatility Features
def create_lag_features(df, group_cols, value_col, lags):
    """Return a copy of ``df`` with per-group lagged versions of ``value_col``.

    For every ``k`` in ``lags`` a column named ``{value_col}_lag_{k}`` is
    added, holding the value found ``k`` rows earlier within the same
    ``group_cols`` group (in the frame's current row order). Rows without
    ``k`` predecessors in their group get NaN. The input frame is not
    modified.
    """
    lagged = df.copy()
    per_group = lagged.groupby(group_cols)[value_col]
    new_columns = {f'{value_col}_lag_{k}': per_group.shift(k) for k in lags}
    return lagged.assign(**new_columns)

def create_rolling_features(df, group_cols, value_col, windows):
    """Return a copy of ``df`` with per-group rolling mean/std of ``value_col``.

    For every window size ``w`` in ``windows`` two columns are added:
    ``{value_col}_rolling_mean_{w}`` and ``{value_col}_rolling_std_{w}``.
    Each window ends at (and includes) the current row and runs over the
    frame's current row order within a ``group_cols`` group; rows with fewer
    than ``w`` observations so far get NaN. The input frame is not modified.
    """
    rolled = df.copy()
    per_group = rolled.groupby(group_cols)[value_col]
    prefix = f'{value_col}_rolling'
    for span in windows:
        rolled[f'{prefix}_mean_{span}'] = per_group.transform(
            lambda series: series.rolling(window=span).mean()
        )
        rolled[f'{prefix}_std_{span}'] = per_group.transform(
            lambda series: series.rolling(window=span).std()
        )
    return rolled

# The lag, growth and rolling features all mean "earlier rows of the same
# route", so they only make sense when each route's rows are in journey-date
# order. The frames come out of several merges, so that order is not guaranteed.
# Compute everything on a doj-ordered view and then put the rows back in their
# original positions (the test frame must stay aligned with the submission
# file). The sort is stable, so data already in date order gives the same result
# as before.
def _in_doj_order(frame):
    """Return ``frame`` ordered by ``doj`` plus the permutation that undoes it."""
    order = np.argsort(frame['doj'].to_numpy(), kind='stable')
    # argsort of a permutation is its inverse permutation.
    return frame.iloc[order], np.argsort(order, kind='stable')


def _add_growth_features(ordered):
    """Add lag-1/lag-3 columns and the derived growth columns for both counters."""
    route = ['srcid', 'destid']
    ordered = create_lag_features(ordered, route, 'cumsum_searchcount', [1, 3])
    ordered = create_lag_features(ordered, route, 'cumsum_seatcount', [1, 3])
    for lag in (1, 3):
        ordered[f'search_growth_{lag}d'] = (
            ordered['cumsum_searchcount'] - ordered[f'cumsum_searchcount_lag_{lag}']
        )
        ordered[f'seat_growth_{lag}d'] = (
            ordered['cumsum_seatcount'] - ordered[f'cumsum_seatcount_lag_{lag}']
        )
    return ordered


train_ordered, train_restore = _in_doj_order(train)
train_ordered = _add_growth_features(train_ordered)
# NOTE(review): this is a rolling std of the target that includes the current
# row, and it exists on the training frame only; it is dropped from the model
# inputs before fitting further down.
train_ordered = create_rolling_features(train_ordered, ['srcid', 'destid'], 'final_seatcount', [7])
train_ordered['seatcount_volatility'] = train_ordered['final_seatcount_rolling_std_7']
train = train_ordered.iloc[train_restore]

test_ordered, test_restore = _in_doj_order(test)
test = _add_growth_features(test_ordered).iloc[test_restore]



 #FINAL FEATURE SET
# Identifiers, encoded categoricals, raw counters and their simple combinations.
base_features = [
    'srcid', 'destid', 'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier',
    'cumsum_seatcount', 'cumsum_searchcount', 'search_per_seat',
    'tier_diff', 'region_match',
]

# Columns derived from the journey date.
calendar_features = [
    'month', 'dayofweek', 'is_weekend', 'is_month_start', 'is_month_end',
    'is_holiday', 'is_pre_holiday', 'is_post_holiday', 'is_school_vacation',
]

# Per-route aggregates of the target.
route_features = [
    'route_mean_seatcount', 'route_std_seatcount', 'route_median_seatcount',
]

# Lag / momentum / volatility columns.
momentum_features = [
    'search_growth_1d', 'seat_growth_1d',
    'search_growth_3d', 'seat_growth_3d',
    'seatcount_volatility',
]

# Full feature list, in the order the groups are declared above.
features = [*base_features, *calendar_features, *route_features, *momentum_features]

# Column the model is trained to predict.
target = 'final_seatcount'

In [None]:
# 'seatcount_volatility' is built from the target and only exists on the
# training frame, so it cannot be fed to the model at prediction time.
features_for_training = [f for f in features if f != 'seatcount_volatility']

# NOTE(review): this is a random 80/20 split of dated rows; a split by journey
# date may give a more honest estimate - confirm what the evaluation expects.
X_train, X_val, y_train, y_val = train_test_split(train[features_for_training], train[target], test_size=0.2, random_state=42)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.02,
    'num_leaves': 128,
    'max_depth': 12,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 5,
    'lambda_l1': 3.0,
    'lambda_l2': 6.0,
    'verbosity': -1,
    'random_state': 42
}

lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val)

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    num_boost_round=3000,
    callbacks=[
        # early_stopping's `verbose` is an on/off flag, not a logging period;
        # the every-100-rounds progress log is requested via log_evaluation.
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
    ]
)

# ---------------------- EVALUATE ----------------------
val_preds = model.predict(X_val, num_iteration=model.best_iteration)
rmse = mean_squared_error(y_val, val_preds)
rmse = np.sqrt(rmse)
print(f"✅ FINAL VALIDATION RMSE: {rmse:.2f}")

# ---------------------- PREDICT & SUBMIT ----------------------
test_preds = model.predict(test[features_for_training], num_iteration=model.best_iteration)
# Predictions are written into the submission frame by position, so the two
# must have the same number of rows (a merge that duplicated test rows would
# break this).
if len(test_preds) != len(submission):
    raise ValueError(
        f"test has {len(test_preds)} rows but the submission template has {len(submission)}"
    )
# A seat count cannot be negative, so floor the rounded predictions at zero.
submission['final_seatcount'] = np.clip(np.round(test_preds), 0, None).astype(int)
submission.to_csv("final_submission_lag_boosted.csv", index=False)
print("📦 Submission saved as: final_submission_lag_boosted.csv")

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2618]	training's rmse: 233.112	valid_1's rmse: 376.979
✅ FINAL VALIDATION RMSE: 376.98
📦 Submission saved as: final_submission_lag_boosted.csv
