# Colab Modeling Notebook

Ce notebook permet d'installer les dépendances, de vérifier les données, de faire une CV rapide et de générer une soumission avec LightGBM (et CatBoost en option).

Prrequis ct: le repo est clon dans Colab et le dossier `data/` contient:
- `data/train.csv`, `data/test.csv`
- `data/external/traffic/traffic_hourly_agg.parquet`
- `data/external/weather/meteostat_hourly_from_daily.parquet`
- `data/external/calendar/zone_c_holidays.csv`
- `data/external/france_lockdowns.csv`
- (optionnel) `data/external/events/events.csv`



In [None]:
# Install minimal deps (Colab)
# Each package is probed by importing it; pip is only invoked when the import
# fails, so re-running the cell is cheap once everything is installed.
import importlib
import subprocess
import sys

pkgs = ["pandas","numpy","scikit-learn","lightgbm","catboost","pyarrow","workalendar"]
# pip distribution name -> importable module name, for the cases where the two
# differ. Without this, "scikit-learn" is never importable under its pip name
# and would be reinstalled on every run.
import_names = {"scikit-learn": "sklearn"}
for p in pkgs:
    # Strip an optional "==version" pin before resolving the module name.
    dist_name = p.split("==")[0]
    try:
        importlib.import_module(import_names.get(dist_name, dist_name))
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", p])
        # Let the running interpreter see the freshly installed package.
        importlib.invalidate_caches()
print("Ready.")



In [None]:
# Verify data paths
# Prints one line per expected input file so missing data is visible before
# any heavy work starts. Nothing is raised here; the cell is a report only.
from pathlib import Path
import os
base = Path('.')
# Required inputs, matching the prerequisites listed at the top of the notebook.
paths = [
    base/'data/train.csv',
    base/'data/test.csv',
    base/'data/external/traffic/traffic_hourly_agg.parquet',
    base/'data/external/weather/meteostat_hourly_from_daily.parquet',
    base/'data/external/calendar/zone_c_holidays.csv',
    base/'data/external/france_lockdowns.csv',
]
# Inputs the prerequisites mark as optional; reported with a distinct label.
optional_paths = [
    base/'data/external/events/events.csv',
]
for p in paths:
    print(p, 'OK' if p.exists() else 'MISSING')
for p in optional_paths:
    print(p, 'OK' if p.exists() else 'MISSING (optional)')



In [None]:
# Quick EDA
# Load the training CSV (parsing the `id` column as dates) and print two
# summaries: describe() statistics and the per-column share of missing values.
import pandas as pd

train = pd.read_csv('data/train.csv', parse_dates=['id'])

summary_stats = train.describe()
print(summary_stats)

missing_share = train.isna().mean()
print(missing_share)



In [None]:
# CV and train utilities
# Builds one feature table over train and test together, then splits it back
# into `train_feat` / `test_feat` and records the feature column names.
# Relies on `pd` having been imported by an earlier cell.
import numpy as np
from pathlib import Path
from feature_engineering import build_features, TARGET_COLUMNS
from winsorize_targets import winsorize
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.metrics import mean_absolute_error

# Training rows: read, pass through winsorize(), then flag as train.
train_df = winsorize(pd.read_csv('data/train.csv', parse_dates=['id']))
train_df['is_train'] = 1

# Test rows: read, flag as test, and add every target column filled with NaN.
test_df = pd.read_csv('data/test.csv', parse_dates=['id'])
test_df['is_train'] = 0
test_df = test_df.assign(**dict.fromkeys(TARGET_COLUMNS, np.nan))

# Features are computed on the stacked frame (train rows first, then test).
full = pd.concat([train_df, test_df], ignore_index=True)
full_feat = build_features(full)

# Everything except the id, the split flag and the targets is a feature.
non_features = {'id', 'is_train', *TARGET_COLUMNS}
feat_cols = [col for col in full_feat.columns if col not in non_features]

is_train_row = full_feat['is_train'] == 1
is_test_row = full_feat['is_train'] == 0
train_feat = full_feat[is_train_row].reset_index(drop=True)
test_feat = full_feat[is_test_row].reset_index(drop=True)
print('Features:', len(feat_cols))



In [None]:
# LightGBM: single hold-out validation with early stopping, then write submission
# For each target, the model is trained on all complete rows except the last
# `val` ones, early-stopped on those held-out rows, and then used directly to
# predict the test set. The model is NOT refit on the full training data.
# Relies on `np`, `pd`, `train_feat`, `test_feat`, `feat_cols`, `TARGET_COLUMNS`,
# `LGBMRegressor`, `early_stopping` and `log_evaluation` from earlier cells.
from pathlib import Path
from sklearn.model_selection import TimeSeriesSplit

# Size of the hold-out tail: 24*7 rows (one week if rows are hourly and
# contiguous). The previous min(24*7, max(24*7, n//10)) always evaluated to
# this same value.
val = 24 * 7

preds = {"id": test_feat["id"].dt.strftime("%Y-%m-%d %H")}
for target in TARGET_COLUMNS:
    y = train_feat[target].values
    X = train_feat[feat_cols]
    # Keep only rows where the target and every feature are present.
    valid_mask = ~np.isnan(y) & X.notna().all(axis=1).values
    Xv, yv = X[valid_mask], y[valid_mask]

    n = len(Xv)
    if n <= val:
        # Without this guard a negative or zero split index silently produces
        # a nonsensical or empty training set.
        raise ValueError(
            f"Target {target!r}: only {n} complete training rows; "
            f"need more than {val} for the hold-out split"
        )

    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0
    # (default 0), so row subsampling is probably inactive here; confirm intent.
    model = LGBMRegressor(n_estimators=2000, learning_rate=0.03, num_leaves=63,
                          subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1,
                          reg_lambda=0.1, random_state=42, n_jobs=-1)
    # Positional split: the tail of the frame is the validation set.
    # Assumes train_feat rows are in chronological order; TODO confirm.
    tr = n - val
    X_tr, y_tr = Xv.iloc[:tr], yv[:tr]
    X_va, y_va = Xv.iloc[tr:], yv[tr:]
    model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='l1', callbacks=[early_stopping(50), log_evaluation(0)])
    preds[target] = model.predict(test_feat[feat_cols])

sub = pd.DataFrame(preds)
sub = sub[["id"] + TARGET_COLUMNS]
# Create the output directory first; to_csv does not create missing parents.
out_path = Path('submissions/submission_colab_lgbm.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(out_path, index=False)
print('Wrote submissions/submission_colab_lgbm.csv')



In [None]:
# CatBoost training and submission
# For each target, the model is trained on all complete rows except the last
# `val` ones, the best iteration is selected on those held-out rows
# (use_best_model=True), and the model is then used to predict the test set.
# Relies on `np`, `pd`, `train_feat`, `test_feat`, `feat_cols` and
# `TARGET_COLUMNS` from earlier cells.
from pathlib import Path
from catboost import CatBoostRegressor, Pool

# Size of the hold-out tail: 24*7 rows (one week if rows are hourly and
# contiguous). The previous min(24*7, max(24*7, n//10)) always evaluated to
# this same value.
val = 24 * 7

preds_cb = {"id": test_feat["id"].dt.strftime("%Y-%m-%d %H")}
for target in TARGET_COLUMNS:
    y = train_feat[target].values
    X = train_feat[feat_cols]
    # Keep only rows where the target and every feature are present.
    valid_mask = ~np.isnan(y) & X.notna().all(axis=1).values
    Xv, yv = X[valid_mask], y[valid_mask]

    n = len(Xv)
    if n <= val:
        # Without this guard a negative or zero split index silently produces
        # a nonsensical or empty training set.
        raise ValueError(
            f"Target {target!r}: only {n} complete training rows; "
            f"need more than {val} for the hold-out split"
        )

    model = CatBoostRegressor(
        iterations=2000, learning_rate=0.03, depth=8, l2_leaf_reg=3.0,
        loss_function="MAE", eval_metric="MAE", random_seed=42,
        verbose=False, allow_writing_files=False
    )
    # Positional split: the tail of the frame is the validation set.
    # Assumes train_feat rows are in chronological order; TODO confirm.
    tr = n - val
    X_tr, y_tr = Xv.iloc[:tr], yv[:tr]
    X_va, y_va = Xv.iloc[tr:], yv[tr:]
    model.fit(Pool(X_tr, y_tr), eval_set=Pool(X_va, y_va), use_best_model=True)
    preds_cb[target] = model.predict(test_feat[feat_cols])

sub_cb = pd.DataFrame(preds_cb)
sub_cb = sub_cb[["id"] + TARGET_COLUMNS]
# Create the output directory first; to_csv does not create missing parents.
out_path_cb = Path("submissions/submission_colab_catboost.csv")
out_path_cb.parent.mkdir(parents=True, exist_ok=True)
sub_cb.to_csv(out_path_cb, index=False)
print("Wrote submissions/submission_colab_catboost.csv")
