<a href="https://colab.research.google.com/github/DevilNReality/RedBus-Hackathon/blob/main/Code%20File%20/%20RedBus_Hackathon_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip '/content/train_JDXlpm8.zip'

Archive:  /content/train_JDXlpm8.zip
   creating: train/
  inflating: train/train.csv         
  inflating: train/transactions.csv  


# **STEP 1: Imports and Setup**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the four input tables. The two files extracted from the training
# archive are read together; the test set and the submission template sit
# directly under /content.
train, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train/train.csv',
                     '/content/train/transactions.csv')
)
test = pd.read_csv('/content/test_8gqdJqH.csv')
sample_submission = pd.read_csv('/content/sample_submission_TQv3O0x.csv')

# **STEP 3: Transactions Feature Extraction (dbd = 15 only)**

In [None]:
# Keep only the transaction rows recorded at dbd == 15
# (presumably "days before departure" -- TODO confirm against the data dictionary).
tx_dbd15 = transactions[transactions['dbd'] == 15].copy()

# Derive unique route key for joining: "<doj>_<srcid>_<destid>"
tx_dbd15['route_key'] = tx_dbd15['doj'].astype(str) + "_" + \
                        tx_dbd15['srcid'].astype(str) + "_" + \
                        tx_dbd15['destid'].astype(str)

# Keep only useful columns.
# `rename` returns a new frame here instead of mutating a column subset in
# place, which is the pattern pandas flags with SettingWithCopyWarning (and the
# notebook silences all warnings, so that flag would never be seen).
# `drop_duplicates` guarantees one row per route_key so the later left merges
# into train/test can never multiply rows; it is a no-op when keys are
# already unique.
tx_features = (
    tx_dbd15[['route_key', 'cumsum_seatcount', 'cumsum_searchcount']]
    .rename(columns={
        'cumsum_seatcount': 'cumsum_seats_dbd15',
        'cumsum_searchcount': 'cumsum_search_dbd15'
    })
    .drop_duplicates(subset='route_key')
    .reset_index(drop=True)
)

# **STEP 4: Process Train and Test DataFrames**

In [None]:
# Build the same "<doj>_<srcid>_<destid>" join key on both frames. This runs
# before `doj` is parsed to datetime, so the key uses the raw column values.
for frame in (train, test):
    doj_txt = frame['doj'].astype(str)
    src_txt = frame['srcid'].astype(str)
    dest_txt = frame['destid'].astype(str)
    frame['route_key'] = doj_txt + "_" + src_txt + "_" + dest_txt

# Left joins keep every train/test row; rows without a matching key in
# tx_features get NaN in the two dbd15 columns.
train = train.merge(tx_features, how='left', on='route_key')
test = test.merge(tx_features, how='left', on='route_key')

# Calendar features from the journey date, plus the search-to-seat ratio.
for frame in (train, test):
    frame['doj'] = pd.to_datetime(frame['doj'])
    journey = frame['doj'].dt
    frame['month'] = journey.month
    frame['weekday'] = journey.weekday
    # weekday 5 and 6 are Saturday and Sunday
    frame['is_weekend'] = frame['weekday'].isin([5, 6]).astype(int)
    frame['day_of_month'] = journey.day
    # "month end" is defined here as the 25th onwards
    frame['is_month_end'] = (frame['day_of_month'] >= 25).astype(int)
    # +1 in the denominator avoids division by zero when no seats are booked yet
    seats_plus_one = frame['cumsum_seats_dbd15'] + 1
    frame['search_seat_ratio'] = frame['cumsum_search_dbd15'] / seats_plus_one

# Per-route (srcid, destid) summary of the target, computed on the training set.
# NOTE(review): these statistics use every training row, including rows that
# are later held out for validation, so the validation RMSE reported further
# down is optimistic. `route_median_demand` is computed but is not in the
# feature lists used by the models below.
demand_by_route = train.groupby(['srcid', 'destid'])['final_seatcount']
route_stats = demand_by_route.agg([
    ('route_avg_demand', 'mean'),
    ('route_median_demand', 'median'),
    ('route_std_demand', 'std'),
]).reset_index()

route_cols = ['srcid', 'destid']
train = train.merge(route_stats, how='left', on=route_cols)
test = test.merge(route_stats, how='left', on=route_cols)

# **STEP 5: Model Training**

## **LightGBM MODEL**

In [None]:
# LightGBM baseline: fit on an 80/20 split with early stopping and report the
# validation RMSE.
# Early stopping is configured through callbacks (the
# `early_stopping_rounds=` / `verbose=` fit arguments that were commented out
# below are not accepted by recent LightGBM releases).
from lightgbm import early_stopping, log_evaluation

# Select features
features = [ 'srcid', 'destid','month', 'weekday', 'is_weekend',
    'day_of_month', 'is_month_end', 'cumsum_seats_dbd15', 'cumsum_search_dbd15',
    'search_seat_ratio', 'route_avg_demand', 'route_std_demand']
target = 'final_seatcount'

# Handle missing values (rows without a dbd15 record or route statistics get 0)
train[features] = train[features].fillna(0)
test[features] = test[features].fillna(0)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    train[features], train[target], test_size=0.2, random_state=42
)

# Initialize model; n_estimators is an upper bound, early stopping picks the
# actual number of trees.
model = LGBMRegressor(n_estimators=1000, learning_rate=0.05, random_state=42)

# Train with early stopping: stop once validation RMSE has not improved for 50
# rounds, and log the metric every 100 rounds.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100),
    ],
)

# Evaluate.
# NOTE: the stopping point is chosen on this same validation set, so the
# reported RMSE is slightly optimistic.
val_preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"Validation RMSE: {rmse:.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 53760, number of used features: 12
[LightGBM] [Info] Start training from score 2003.632533
Validation RMSE: 374.3534


## **ENSEMBLING - XGB & LGBM**

In [None]:
from xgboost import XGBRegressor

# Same feature set and target as the LightGBM baseline cell.
features = [
    'srcid', 'destid',
    'month', 'weekday', 'is_weekend', 'day_of_month', 'is_month_end',
    'cumsum_seats_dbd15', 'cumsum_search_dbd15', 'search_seat_ratio',
    'route_avg_demand', 'route_std_demand',
]
target = 'final_seatcount'

# Missing feature values are replaced by 0 in both frames.
for frame in (train, test):
    frame[features] = frame[features].fillna(0)

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    train[features], train[target], test_size=0.2, random_state=42
)

# Fit both boosters on the training split (fit() returns the estimator itself).
lgb_model = LGBMRegressor(
    n_estimators=1000, learning_rate=0.05, random_state=42
).fit(X_train, y_train)
xgb_model = XGBRegressor(
    n_estimators=1000, learning_rate=0.05, random_state=42
).fit(X_train, y_train)

lgb_val_preds = lgb_model.predict(X_val)
xgb_val_preds = xgb_model.predict(X_val)

# Equal-weight average of the two models' validation predictions.
ensemble_preds = (lgb_val_preds + xgb_val_preds) / 2
ensemble_mse = mean_squared_error(y_val, ensemble_preds)
rmse = np.sqrt(ensemble_mse)
print(f"Ensemble Validation RMSE: {rmse:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 53760, number of used features: 12
[LightGBM] [Info] Start training from score 2003.632533
Ensemble Validation RMSE: 371.3349


## **ENSEMBLING - LGBM & CATBOOST**

In [None]:
! pip install catboost -q
from catboost import CatBoostRegressor, Pool

# Identify categorical columns (by name or index)
categorical_features = ['srcid', 'destid', 'month', 'weekday', 'is_weekend',
                        'day_of_month', 'is_month_end']

# Train CatBoost
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    loss_function='RMSE',
    verbose=100
)

cat_model.fit(
    X_train, y_train,
    cat_features=categorical_features,
    eval_set=(X_val, y_val),
    use_best_model=True
)

# CatBoost predictions
cat_val_preds = cat_model.predict(X_val)

# Ensemble: LightGBM + CatBoost
ensemble_preds = (lgb_val_preds + cat_val_preds) / 2
rmse = np.sqrt(mean_squared_error(y_val, ensemble_preds))
print(f"LightGBM + CatBoost RMSE: {rmse:.4f}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h0:	learn: 1154.6195386	test: 1160.8980117	best: 1160.8980117 (0)	total: 145ms	remaining: 2m 24s
100:	learn: 503.5354780	test: 501.4171058	best: 501.4171058 (100)	total: 6.19s	remaining: 55.1s
200:	learn: 450.1682101	test: 445.6440330	best: 445.6440330 (200)	total: 12.4s	remaining: 49.2s
300:	learn: 422.7660435	test: 419.4982165	best: 419.4982165 (300)	total: 17.8s	remaining: 41.4s
400:	learn: 402.8267513	test: 396.6880180	best: 396.6880180 (400)	total: 24.3s	remaining: 36.3s
500:	learn: 390.3588980	test: 382.8787183	best: 382.8787183 (500)	total: 29.3s	remaining: 29.2s
600:	learn: 382.0595359	test: 375.3897491	best: 375.3897491 (600)	total: 35.8s	remaining: 23.8s
700:	learn: 376.9988230	test: 370.7445337	best: 370.7445337 (700)	total: 40.7s	remaining: 17.4s
800:	learn: 372.5155703	test: 367.3171320	best: 367.3171320 (800)	total: 46.8s	remaining: 11.6s
900:	learn: 

## **ENSEMBLING - XGB & CATBOOST**

In [None]:
# Predict on validation set from both models
xgb_val_preds = xgb_model.predict(X_val)
cat_val_preds = cat_model.predict(X_val)

# Average
ensemble_preds = (xgb_val_preds + cat_val_preds) / 2

# Evaluate
rmse = np.sqrt(mean_squared_error(y_val, ensemble_preds))
print(f"XGBoost + CatBoost Ensemble RMSE: {rmse:.4f}")

XGBoost + CatBoost Ensemble RMSE: 356.8689


## **ENSEMBLING - XGB & CATBOOST & LGBM**

In [None]:
# Equal-weight average of all three models' validation predictions.
stacked_sum = lgb_val_preds + xgb_val_preds + cat_val_preds
triple_preds = stacked_sum / 3

triple_mse = mean_squared_error(y_val, triple_preds)
rmse = np.sqrt(triple_mse)
print(f"Triple Ensemble RMSE: {rmse:.4f}")

Triple Ensemble RMSE: 358.9779


# **STEP 6: Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Unfitted CatBoost estimator used as the template for the search.
# Note: this rebinds `cat_model`, replacing the fitted model from the
# LGBM & CatBoost ensembling cell.
cat_model = CatBoostRegressor(loss_function='RMSE', verbose=0, random_seed=42)

# Search space sampled by RandomizedSearchCV.
cat_params = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5, 7, 10],
    'iterations': [500, 800, 1000]
}

# random_state pins which 10 candidates are sampled, so the selected
# hyperparameters (and every number derived from them) are reproducible
# across runs.
cat_search = RandomizedSearchCV(
    cat_model, cat_params, n_iter=10,
    scoring='neg_root_mean_squared_error', cv=3, verbose=1, n_jobs=-1,
    random_state=42
)
# Extra keyword arguments to fit() are forwarded to the estimator's fit().
cat_search.fit(X_train, y_train, cat_features=categorical_features)

# best_estimator_ is the winning configuration refit on all of X_train.
best_cat_model = cat_search.best_estimator_
cat_val_preds = best_cat_model.predict(X_val)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Unfitted XGBoost estimator used as the template for the search.
# Note: this rebinds `xgb_model`, replacing the fitted model from the
# XGB & LGBM ensembling cell.
xgb_model = XGBRegressor(random_state=42)

# Search space sampled by RandomizedSearchCV.
xgb_params = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [500, 800, 1000]
}

# random_state pins which 10 candidates are sampled, so the selected
# hyperparameters (and every number derived from them) are reproducible
# across runs.
xgb_search = RandomizedSearchCV(
    xgb_model, xgb_params, n_iter=10,
    scoring='neg_root_mean_squared_error', cv=3, verbose=1, n_jobs=-1,
    random_state=42
)
xgb_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on all of X_train.
best_xgb_model = xgb_search.best_estimator_
xgb_val_preds = best_xgb_model.predict(X_val)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Equal-weight blend of the tuned CatBoost and tuned XGBoost validation
# predictions, scored with RMSE against the held-out targets.
ensemble_preds = (cat_val_preds + xgb_val_preds) / 2
tuned_mse = mean_squared_error(y_val, ensemble_preds)
rmse = np.sqrt(tuned_mse)
print(f"Tuned XGB + CatBoost RMSE: {rmse:.4f}")

Tuned XGB + CatBoost RMSE: 333.7984


# **STEP 7: Final Prediction & Submission**

In [None]:
# Score the test rows with both tuned models, using the same feature columns
# the models were trained on.
test_matrix = test[features]
xgb_test_preds = best_xgb_model.predict(test_matrix)
cat_test_preds = best_cat_model.predict(test_matrix)

# Equal-weight average of the two models is the submitted prediction.
final_preds = (xgb_test_preds + cat_test_preds) / 2

# Write the predictions into the submission template (assigned positionally,
# so the template's rows are taken to be in the same order as `test`) and save
# without the index column.
sample_submission['final_seatcount'] = final_preds
sample_submission.to_csv('submission.csv', index=False)

print("✅ Submission saved as submission.csv")

✅ Submission saved as submission.csv
