## Import libraries

In [None]:
# Schema-agnostic baseline with safe preprocessing + LightGBM
import os, gc, warnings, re
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

from lightgbm import LGBMRegressor

## Config

In [None]:
# Input file locations.
# NOTE(review): these are absolute paths at the filesystem root — confirm they
# match where the CSV files are actually mounted in the runtime.
TRAIN_PATH = "/Train.csv"
TEST_PATH  = "/Test.csv"
SAMPLE_SUB_PATH = "/SampleSubmission.csv"
# Name of the regression target column that must be present in the training file.
TARGET_COL = "target"
# Seed shared by the KFold splitters and the LightGBM model.
RANDOM_STATE = 42
# Number of cross-validation folds (used for both KFold and TimeSeriesSplit).
N_SPLITS = 5

## Utils

In [None]:
def find_id_col(df):
    """Return the name of the column of ``df`` that most looks like a row identifier.

    Resolution order:

    1. a column whose name is exactly "id" (any letter case);
    2. the first column whose name ends in an "id" token, e.g. ``flight_id``,
       ``Flight ID``, ``flight-id``, ``flightId`` or ``flightID``;
    3. the first column of the frame.

    Names that merely end with the letters "id" (``paid``, ``valid``) are not
    treated as identifiers.

    Raises:
        IndexError: if ``df`` has no columns at all.
    """
    # 1) exact match on "id"
    for c in df.columns:
        if str(c).lower() == "id":
            return c

    # 2) "id" as the last token of the name. The previous pattern `\bid\b$`
    #    missed snake_case names because "_" counts as a word character, and
    #    it could never match camelCase names.
    separated = re.compile(r'(?:^|[^A-Za-z0-9])id$', flags=re.IGNORECASE)
    camel_case = re.compile(r'[a-z0-9]I[dD]$')
    for c in df.columns:
        name = str(c)
        if separated.search(name) or camel_case.search(name):
            return c

    # 3) last resort: the first column
    return df.columns[0]

def try_parse_datetime(series: pd.Series) -> pd.Series:
    """Return ``series`` parsed as UTC datetimes when it looks like a date column.

    The parsed series is returned when at least 70% of its entries are valid
    timestamps after coercion; otherwise the original series is returned
    unchanged.

    Numeric (integer, float, boolean) series are never treated as datetimes:
    ``pd.to_datetime`` would interpret plain numbers as offsets from the Unix
    epoch, so every numeric feature would "parse" successfully and be
    mistaken for a timestamp.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series

    parsed = pd.to_datetime(series, errors="coerce", utc=True)
    success_rate = parsed.notna().mean()
    if success_rate >= 0.7:
        return parsed
    return series

def add_datetime_features(df, col):
    """Add calendar and cyclic features derived from datetime column ``col``.

    New columns are written into ``df`` in place (nothing is returned), each
    named ``{col}_<suffix>``: year, month, day, dow, hour, is_weekend, plus
    sine/cosine encodings of the month and the hour.
    """
    parts = df[col].dt
    month = parts.month
    hour = parts.hour
    weekday = parts.dayofweek

    # plain calendar components
    df[f"{col}_year"] = parts.year
    df[f"{col}_month"] = month
    df[f"{col}_day"] = parts.day
    df[f"{col}_dow"] = weekday
    df[f"{col}_hour"] = hour
    # dayofweek 5 and 6 are flagged as weekend
    df[f"{col}_is_weekend"] = weekday.ge(5).astype("int8")

    # cyclic encodings: month shifted to start at 0 over a period of 12,
    # hour over a period of 24
    month_angle = 2 * np.pi * (month - 1) / 12
    hour_angle = 2 * np.pi * hour / 24
    df[f"{col}_month_sin"] = np.sin(month_angle)
    df[f"{col}_month_cos"] = np.cos(month_angle)
    df[f"{col}_hour_sin"] = np.sin(hour_angle)
    df[f"{col}_hour_cos"] = np.cos(hour_angle)

def frequency_encode(train_col: pd.Series, test_col: pd.Series):
    """Encode each value by how often it occurs in ``train_col``.

    Counts are learned from the training column only. Missing values are
    encoded with the number of missing entries in the training column, and
    test values never seen in training are encoded as 0.

    Both inputs are converted to object dtype first, so the result is always
    numeric. Mapping a categorical series directly can yield a categorical
    result (when every category has a distinct count), which is not a usable
    numeric feature and makes the subsequent ``fillna(0)`` fail.

    Returns:
        (train_fe, test_fe): two numeric Series indexed like their inputs.
    """
    train_obj = train_col.astype(object)
    test_obj = test_col.astype(object)

    counts = train_obj.value_counts(dropna=True)
    na_count = int(train_obj.isna().sum())

    def _encode(values):
        encoded = values.map(counts)
        # Missing inputs get the training missing-count; anything else that
        # failed to map was unseen in training and becomes 0.
        return encoded.where(values.notna(), na_count).fillna(0)

    return _encode(train_obj), _encode(test_obj)

def cv_target_encode(X, y, X_test, col, n_splits=5, noise_std=0.0):
    """Leakage-safe KFold target (mean) encoding of column ``col``.

    Each training row is encoded with the target mean of its category computed
    on the *other* folds only. Test rows are encoded with category means from
    the full training set. Categories without a mean (unseen or missing) fall
    back to the global target mean.

    Args:
        X: training features (DataFrame) containing ``col``.
        y: training target as a pandas Series, positionally aligned with ``X``;
            it does not need to have a name.
        X_test: test features (DataFrame) containing ``col``.
        col: name of the categorical column to encode.
        n_splits: number of shuffled KFold splits.
        noise_std: if > 0, standard deviation of Gaussian noise added to the
            out-of-fold encodings (seeded with ``RANDOM_STATE``).

    Returns:
        (oof, test_enc): two float numpy arrays, one value per train/test row.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    # Work on plain object values so categorical inputs map to floats.
    keys = X[col].astype(object)
    global_mean = y.mean()

    oof = np.full(len(X), global_mean, dtype=float)
    for tr_idx, val_idx in kf.split(X):
        # Group the target by key values directly: no join on y.name needed.
        fold_means = y.iloc[tr_idx].groupby(keys.iloc[tr_idx].to_numpy()).mean()
        encoded = keys.iloc[val_idx].map(fold_means).fillna(global_mean)
        oof[val_idx] = encoded.to_numpy(dtype=float)

    full_means = y.groupby(keys.to_numpy()).mean()
    test_enc = X_test[col].astype(object).map(full_means).fillna(global_mean)

    if noise_std > 0:
        # Seeded generator keeps the noisy encoding reproducible.
        rng = np.random.default_rng(RANDOM_STATE)
        oof = oof + rng.normal(0, noise_std, size=len(oof))
    return oof, test_enc.to_numpy(dtype=float)

## Load

In [None]:
# Read the raw training and test tables.
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

# Fail loudly (also under `python -O`, where asserts are stripped) if the target is missing.
if TARGET_COL not in train.columns:
    raise KeyError(f"'{TARGET_COL}' not found in Train.csv!")

# Heuristic guess of the ID column in the test set; refined below when the
# sample submission names the ID column explicitly.
id_col_test = find_id_col(test)

# Try infer id column from sample submission, else from Train.csv
if os.path.exists(SAMPLE_SUB_PATH):
    sample_sub = pd.read_csv(SAMPLE_SUB_PATH)
    id_col = sample_sub.columns[0]
    # The submission's own ID column is the authoritative choice when the
    # test set actually contains it.
    if id_col in test.columns:
        id_col_test = id_col
else:
    # find_id_col always returns one of train's columns, so no membership
    # check is needed here.
    id_col = find_id_col(train)

# Keep IDs (train falls back to a positional index if the ID column is absent)
train_ids = train[id_col] if id_col in train.columns else pd.Series(range(len(train)))
test_ids  = test[id_col_test]

# Regression target as float
y = train[TARGET_COL].astype(float)

## Identify columns and parse datetimes

In [None]:
from pandas.api.types import is_datetime64_any_dtype

In [None]:
# Exclude target and any id-like columns
# NOTE(review): `\bid\b$` does not match underscore-joined names such as
# `user_id` ("_" is a word character), so those columns stay in the feature set.
drop_cols = {TARGET_COL}
for c in train.columns:
    if c.lower() == "id" or re.search(r'\bid\b$', c, flags=re.IGNORECASE):
        drop_cols.add(c)

feature_cols = [c for c in train.columns if c not in drop_cols]

# Raw feature frames; the test frame keeps only the feature columns it actually has.
dt_candidates = []
X_train_raw = train[feature_cols].copy()
X_test_raw  = test[[c for c in feature_cols if c in test.columns]].copy()

# Tentative datetime detection on object (string) columns only.
# Integer columns must NOT be probed: pd.to_datetime reads plain integers as
# nanoseconds since the Unix epoch, so every integer feature would parse
# "successfully", be converted to 1970 timestamps and then be replaced by
# constant date parts.
for c in feature_cols:
    if X_train_raw[c].dtype != "O":
        continue
    parsed = try_parse_datetime(X_train_raw[c])
    if not is_datetime64_any_dtype(parsed):
        continue
    # The column parsed as a datetime on train: apply the same parsing to test.
    parsed_test = pd.to_datetime(X_test_raw[c], errors="coerce", utc=True)
    X_train_raw[c] = parsed
    X_test_raw[c]  = parsed_test
    dt_candidates.append(c)

## Feature Engineering

In [None]:
# Feature-engineering cell: turns the raw train/test feature frames into fully
# numeric matrices (X_train_proc / X_test_proc) with identical columns.

# 1) Date/time features: expand every detected datetime column into calendar
#    and cyclic parts, on both frames.
for c in dt_candidates:
    add_datetime_features(X_train_raw, c)
    add_datetime_features(X_test_raw, c)

# Option: drop original datetime columns to avoid leakage via exact timestamps
X_train_raw.drop(columns=dt_candidates, inplace=True)
X_test_raw.drop(columns=[c for c in dt_candidates if c in X_test_raw.columns], inplace=True)

# 2) Basic text/route lengths for object cols (non-datetime).
#    Values are stringified first, so missing values also receive a length.
obj_cols = [c for c in X_train_raw.columns if X_train_raw[c].dtype == "O"]
for c in obj_cols:
    X_train_raw[f"{c}_len"] = X_train_raw[c].astype(str).str.len()
    X_test_raw[f"{c}_len"]  = X_test_raw[c].astype(str).str.len()

# 3) Frequency encoding for medium/high-cardinality categoricals
# Decide which to frequency-encode vs ordinal-encode:
# more than 20 distinct values (missing counted as a value) -> frequency-encode,
# otherwise -> ordinal-encode.
cat_cols = [c for c in X_train_raw.columns if X_train_raw[c].dtype == "O"]
med_hi_cat = []
low_cat    = []
for c in cat_cols:
    nunique = X_train_raw[c].nunique(dropna=False)
    if nunique > 20:
        med_hi_cat.append(c)
    else:
        low_cat.append(c)

# Frequencies are learned from the training column; the encoded values are
# stored in a new `{c}_freq` column on each frame.
for c in med_hi_cat:
    tr_fe, te_fe = frequency_encode(X_train_raw[c].astype("category"), X_test_raw[c].astype("category"))
    X_train_raw[f"{c}_freq"] = tr_fe
    X_test_raw[f"{c}_freq"]  = te_fe

# 4) Safe target encoding for very high-cardinality cols (optional, off by default)
# If desired, uncomment to apply to (say) top two biggest categoricals.
# high_card_sorted = sorted(med_hi_cat, key=lambda col: X_train_raw[col].nunique(dropna=False), reverse=True)[:2]
# for c in high_card_sorted:
#     tr_te, te_te = cv_target_encode(X_train_raw, y, X_test_raw, c, n_splits=N_SPLITS, noise_std=0.0)
#     X_train_raw[f"{c}_te"] = tr_te
#     X_test_raw[f"{c}_te"]  = te_te

# After engineered features, remove raw object columns; we'll ordinal-encode the small ones.
# Work on copies so X_train_raw / X_test_raw keep the un-imputed engineered columns.
X_train_proc = X_train_raw.copy()
X_test_proc  = X_test_raw.copy()

# Keep low-cardinality categoricals for ordinal encoding
to_oe = low_cat

# Fill obvious NA before encoding:
#  - low-cardinality categoricals get the explicit "__NA__" level;
#  - numeric columns get the TRAIN median (the same value is reused for test);
#  - medium/high-cardinality object columns are left untouched here and are
#    dropped further below.
for c in X_train_proc.columns:
    if c in to_oe:
        X_train_proc[c] = X_train_proc[c].astype("object").fillna("__NA__")
        X_test_proc[c]  = X_test_proc[c].astype("object").fillna("__NA__")
    elif pd.api.types.is_numeric_dtype(X_train_proc[c]):
        med = X_train_proc[c].median()
        X_train_proc[c] = X_train_proc[c].fillna(med)
        if c in X_test_proc.columns:
            X_test_proc[c]  = X_test_proc[c].fillna(med)

# Ordinal encode low-cardinality categoricals (consistent fit on combined).
# The encoder is fitted on train and test stacked together, then the encoded
# rows are split back by position (train rows first).
if to_oe:
    oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1, dtype="int64")
    comb = pd.concat([X_train_proc[to_oe], X_test_proc[to_oe]], axis=0)
    comb_enc = oe.fit_transform(comb)
    X_train_proc[to_oe] = comb_enc[:len(X_train_proc)]
    X_test_proc[to_oe]  = comb_enc[len(X_train_proc):]

# Drop raw medium/high-cardinality object cols (we kept *_freq and *_len)
X_train_proc.drop(columns=med_hi_cat, inplace=True, errors="ignore")
X_test_proc.drop(columns=[c for c in med_hi_cat if c in X_test_proc.columns], inplace=True, errors="ignore")

# Final safety alignment: give test exactly the train columns, in train order;
# any column missing from test is created and filled with 0.
X_test_proc = X_test_proc.reindex(columns=X_train_proc.columns, fill_value=0)

# Optional clipping of extreme targets (robust to outliers)
# y = y.clip(lower=0, upper=np.percentile(y, 99.5))


## CV Strategy (time-aware if possible)

In [None]:
# Time-aware CV setup: when at least one datetime column was detected, build a
# chronological row ordering from the FIRST detected column and prepare a
# TimeSeriesSplit; otherwise leave `time_split` as None.
time_split = None
if dt_candidates:
    # The original timestamp column was dropped, so the ordering key is rebuilt
    # from the derived year/month/day/hour features of the anchor column.
    anchor = dt_candidates[0]
    base_cols = [f"{anchor}_{part}" for part in ("year", "month", "day", "hour")]
    has_all = set(base_cols).issubset(X_train_raw.columns)
    if has_all:
        # (default for missing values, positional weight) per component; the
        # weights keep year > month > day > hour when summed into one integer.
        fill_defaults = (1970, 1, 1, 0)
        weights = (10_000_000, 100_000, 1_000, 1)
        order_key = sum(
            X_train_raw[name].fillna(default).astype(int) * weight
            for name, default, weight in zip(base_cols, fill_defaults, weights)
        )
        # Row positions sorted by the reconstructed timestamp key.
        order = np.argsort(order_key.values)
        time_split = TimeSeriesSplit(n_splits=N_SPLITS)

## LightGBM Model & CV

In [None]:
# Notebook shell magic: upgrade LightGBM in the current environment.
# NOTE(review): lightgbm is already imported earlier in this notebook, so an
# upgrade installed here presumably only takes effect after the runtime is
# restarted — confirm.
!pip install --upgrade lightgbm



In [None]:
# Print which LightGBM build is active (version and install location).
# This import also binds the `lightgbm` module name that the training loop
# relies on for `lightgbm.early_stopping`.
import lightgbm
print(lightgbm.__version__)
print(lightgbm.__file__)

4.6.0
/usr/local/lib/python3.11/dist-packages/lightgbm/__init__.py


In [None]:
# Notebook shell magic: upgrade scikit-learn in the current environment.
# NOTE(review): sklearn modules are already imported earlier in this notebook,
# so the upgrade presumably needs a runtime restart to take effect — confirm.
!pip install --upgrade scikit-learn



In [None]:
# LightGBM hyper-parameters shared by every fold.
lgb_params = dict(
    n_estimators=5000,          # upper bound; early stopping picks the real count
    learning_rate=0.03,
    subsample=0.8,
    subsample_freq=1,           # row subsampling is only active when the frequency is > 0
    colsample_bytree=0.8,
    num_leaves=63,
    max_depth=-1,
    reg_alpha=0.0,
    reg_lambda=2.0,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

n_rows = len(X_train_proc)
oof = np.zeros(n_rows)
# Tracks which rows actually received an out-of-fold prediction:
# TimeSeriesSplit never validates on the earliest chunk of rows.
has_oof = np.zeros(n_rows, dtype=bool)
test_pred = np.zeros(len(X_test_proc))
rmses = []

if time_split is not None:
    splitter = time_split
    # Chronological row order, so that every fold trains on the past and
    # validates on the future. Without it the split would follow file order.
    idx_seq = np.asarray(order)
else:
    splitter = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    idx_seq = np.arange(n_rows)

n_folds = splitter.get_n_splits()

for fold, (tr_pos, val_pos) in enumerate(splitter.split(idx_seq)):
    # The splitter yields positions inside idx_seq; translate them to row positions.
    tr_idx, val_idx = idx_seq[tr_pos], idx_seq[val_pos]
    X_tr, X_val = X_train_proc.iloc[tr_idx], X_train_proc.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    model = LGBMRegressor(**lgb_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="rmse",
        # stop once the validation RMSE has not improved for 50 rounds
        callbacks=[lightgbm.early_stopping(stopping_rounds=50)]
    )

    val_pred = model.predict(X_val, num_iteration=model.best_iteration_)
    oof[val_idx] = val_pred
    has_oof[val_idx] = True
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    rmses.append(rmse)
    print(f"Fold {fold+1}/{n_folds} RMSE: {rmse:.4f}")

    # Test prediction is the plain average over the fold models.
    test_pred += model.predict(X_test_proc, num_iteration=model.best_iteration_) / n_folds

# Score only rows that were validated at least once; rows never held out
# would otherwise be scored against the zero placeholder.
cv_rmse = np.sqrt(mean_squared_error(y.to_numpy()[has_oof], oof[has_oof]))
print(f"OOF RMSE: {cv_rmse:.4f} | Fold RMSEs: {np.round(rmses, 4)}")

# Non-negative delays (optional but reasonable)
test_pred = np.clip(test_pred, a_min=0, a_max=None)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001264 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 603
[LightGBM] [Info] Number of data points in the train set: 17973, number of used features: 21
[LightGBM] [Info] Start training from score 39.043065
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[11]	valid_0's rmse: 91.8894	valid_0's l2: 8443.66
Fold 1/5 RMSE: 91.8894
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 661
[LightGBM] [Info] Number of data points in the train set: 35945, number of used features: 24
[LightGBM] [Info] Start training from score 34.224426
Training until validation scores don't improve for 50 rounds
Early stopping, best 

## Submission

In [None]:
# Assemble the submission: one row per test ID with its predicted target.
submission = pd.DataFrame({"id": test_ids})   # Must mirror Test.csv IDs
submission["target"] = test_pred
# Write without the index so the file holds exactly the two columns.
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")

Saved submission.csv


In [None]:
# Colab-only helper: trigger a browser download of the saved submission file.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this cell elsewhere.
from google.colab import files
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>