In [3]:
# demand_forecast.py

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from lightgbm import early_stopping, log_evaluation


# Load data
# train.csv and test.csv are both read with `doj` parsed as a datetime column;
# transactions.csv additionally gets `doi` parsed.
train, test = (
    pd.read_csv(csv_path, parse_dates=["doj"])
    for csv_path in ("data/train.csv", "data/test.csv")
)
transactions = pd.read_csv(
    "data/transactions.csv",
    parse_dates=["doj", "doi"],
)

# --- Feature Engineering ---
def create_date_features(df, date_col):
    """Add calendar columns derived from ``df[date_col]`` to ``df`` in place.

    Writes ``year``, ``month``, ``day`` and ``weekday`` (read from the
    column's ``.dt`` accessor) plus ``is_weekend``, which is 1 where
    ``weekday`` is 5 or 6 and 0 otherwise.

    Args:
        df: DataFrame to extend; it is modified in place.
        date_col: Name of a datetime-like column in ``df``.

    Returns:
        The same ``df`` object, with the five columns added.
    """
    calendar = df[date_col].dt
    for part in ('year', 'month', 'day', 'weekday'):
        df[part] = getattr(calendar, part)

    # pandas numbers weekdays Monday=0 ... Sunday=6, so 5/6 are Sat/Sun.
    weekend_mask = df['weekday'].isin([5, 6])
    df['is_weekend'] = weekend_mask.astype(int)
    return df

# Derive the calendar columns from the journey date (`doj`) on both frames.
train, test = (create_date_features(frame, "doj") for frame in (train, test))

# Filter 15 days before journey
# Keeps only rows whose `dbd` equals 15 (presumably "days before departure" —
# confirm against the data dictionary).
transactions_15 = transactions.loc[transactions['dbd'].eq(15)]

# Aggregate cumsum features
# One row per (doj, srcid, destid) holding the maximum cumulative seat and
# search counts of that snapshot, already under their final feature names.
agg_15 = (
    transactions_15
    .groupby(['doj', 'srcid', 'destid'])
    .agg(
        seats_15_days_before=('cumsum_seatcount', 'max'),
        search_15_days_before=('cumsum_searchcount', 'max'),
    )
    .reset_index()
)

# Merge with train/test
# Left joins: rows without a matching snapshot keep NaN in the new columns.
train = train.merge(agg_15, on=['doj', 'srcid', 'destid'], how='left')
test = test.merge(agg_15, on=['doj', 'srcid', 'destid'], how='left')

# Region and tier info
# Build a route -> region/tier lookup with exactly one row per (srcid, destid).
# De-duplicating on all six columns is not enough: if any region/tier value for
# a route is recorded inconsistently (e.g. missing on some rows), several rows
# survive and the left merges below silently multiply train/test rows, which
# in turn duplicates route_key entries in the submission.
region_tier = transactions[
    ['srcid', 'srcid_region', 'srcid_tier', 'destid', 'destid_region', 'destid_tier']
].drop_duplicates(subset=['srcid', 'destid'])

# validate='many_to_one' makes pandas raise instead of fanning out rows if the
# lookup ever stops being unique on the merge keys.
train = pd.merge(train, region_tier, on=['srcid', 'destid'], how='left',
                 validate='many_to_one')
test = pd.merge(test, region_tier, on=['srcid', 'destid'], how='left',
                validate='many_to_one')

# Label encode categorical columns
# The encoder is fit on the union of train and test values for each column.
# Fitting on train alone makes `le.transform` raise ValueError as soon as test
# contains a category that train does not — including the 'nan' string that
# astype(str) produces for rows left unmatched by the earlier left merges.
cat_cols = ['srcid_region', 'srcid_tier', 'destid_region', 'destid_tier']
le = LabelEncoder()
for col in cat_cols:
    train_vals = train[col].astype(str)
    test_vals = test[col].astype(str)
    # Refit per column; after the loop `le` holds the last column's classes.
    le.fit(pd.concat([train_vals, test_vals], ignore_index=True))
    train[col] = le.transform(train_vals)
    test[col] = le.transform(test_vals)

# Define features
# Model inputs: calendar fields, the two 15-day snapshot aggregates, and the
# encoded region/tier columns for both ends of the route. The 'year' column is
# computed earlier but is not part of this list.
features = [
    'month',
    'day',
    'weekday',
    'is_weekend',
    'seats_15_days_before',
    'search_15_days_before',
    'srcid_region',
    'srcid_tier',
    'destid_region',
    'destid_tier',
]

# Design matrices and target, selected by column label.
X = train.loc[:, features]
y = train.loc[:, 'final_seatcount']
X_test = test.loc[:, features]

# --- Model Training ---
# 5-fold shuffled cross-validation. Every fold fits a fresh LGBMRegressor,
# writes its validation predictions into `oof`, and adds 1/n_splits of its
# test prediction to `preds`, so `preds` ends up as the fold average.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(train))
preds = np.zeros(len(test))

# NOTE(review): the captured run log reports "Did not meet early stopping" at
# iteration 1000 for the first fold, so n_estimators is the binding limit there.
lgbm_params = {
    'learning_rate': 0.03,
    'n_estimators': 1000,
    'max_depth': 6,
    'num_leaves': 31,
    'random_state': 42,
}

for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    # Callbacks are rebuilt per fold so each fit starts with fresh callbacks.
    fold_callbacks = [
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100),
    ]

    model = LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=fold_callbacks,
    )

    # Out-of-fold slice for this fold, then this fold's share of the test mean.
    oof[val_idx] = model.predict(X_val)
    preds += model.predict(X_test) / kf.n_splits

# --- Evaluation ---
# Root-mean-squared error of the out-of-fold predictions against the target.
oof_mse = mean_squared_error(y, oof)
rmse = np.sqrt(oof_mse)
print(f"\n✅ CV RMSE: {rmse:.4f}")

# --- Final Submission ---
# `final_seatcount` is presumably a non-negative count (confirm against the
# data dictionary), but an unconstrained regressor can predict below zero.
# Clip at 0 before rounding so the file never contains negative seat counts;
# against a non-negative target this can only reduce the error.
submission = pd.DataFrame({
    'route_key': test['route_key'],
    'final_seatcount': np.round(np.clip(preds, 0, None)).astype(int)
})
submission.to_csv("final_submission.csv", index=False)
print("✅ Submission saved to final_submission.csv")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000601 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 592
[LightGBM] [Info] Number of data points in the train set: 53760, number of used features: 10
[LightGBM] [Info] Start training from score 2003.632533
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 460890
[200]	valid_0's l2: 377730
[300]	valid_0's l2: 343922
[400]	valid_0's l2: 326827
[500]	valid_0's l2: 315534
[600]	valid_0's l2: 305607
[700]	valid_0's l2: 297212
[800]	valid_0's l2: 290796
[900]	valid_0's l2: 287121
[1000]	valid_0's l2: 284390
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 284390
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000645 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e