In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import lightgbm as lgbm
from lightgbm import LGBMRegressor
import optuna
import gc
import warnings
import traceback
from collections import defaultdict
import math

# Import helper functions
from utils import (
    smape,
    deg_to_sin,
    deg_to_cos,
    sincos_to_deg,
    convert_units,
    create_geo_clusters,
    create_time_features,
    create_lag_rolling_features_advanced,
    select_features
)

In [None]:
# 02_model_training.ipynb

# ==============================================================================
# Imports (all imports live in the first cell above)
# ==============================================================================


# ==============================================================================
# Configuration
# ==============================================================================
# --- Suppress Warnings ---
# Register an "ignore" filter for each warning category, in the same order as
# they were listed originally.
for _ignored_category in (
    optuna.exceptions.ExperimentalWarning,
    FutureWarning,
    UserWarning,
    pd.errors.PerformanceWarning,
):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Disable pandas' chained-assignment (SettingWithCopy) warning.
pd.set_option("mode.chained_assignment", None)

# --- Reproducibility & Control ---
GLOBAL_SEED = 42                 # base seed; run i of the main loop uses GLOBAL_SEED + i
np.random.seed(GLOBAL_SEED)
N_SEEDS = 3                      # number of seed runs in the main loop
OPTUNA_TRIALS = 100              # Optuna trials per target
N_SPLITS_TSCV = 5                # folds for TimeSeriesSplit inside the Optuna objective
N_SPLITS_FEAT_SELECT = 5         # NOTE(review): not referenced in this notebook section
TOP_N_FEATURES = 150             # passed to select_features as the feature budget

# --- Data & Targets ---
TARGET_WIND_DIR = "Wind_Direction"
TARGETS_NORMAL = ["Avg_Temperature", "Radiation", "Rain_Amount", "Wind_Speed"]
# Submission columns: the regular targets followed by the wind direction.
TARGET_COLS = [*TARGETS_NORMAL, TARGET_WIND_DIR]
# Targets that are trained on log1p(y) and mapped back with expm1.
LOG_TRANSFORM_TARGETS = ["Rain_Amount", "Radiation", "Wind_Speed"]

# --- Feature Engineering ---
KELVIN_THRESHOLD = 100           # NOTE(review): not referenced in this notebook section
N_CLUSTERS = 10                  # passed to create_geo_clusters
# Columns handed to create_lag_rolling_features_advanced (when present in train).
LAG_ROLL_INPUT_COLS = [
    "Avg_Temperature", "Avg_Feels_Like_Temperature",
    "Radiation",
    "Rain_Amount", "Rain_Duration",
    "Wind_Speed",
    "Temperature_Range", "Feels_Like_Temperature_Range",
    "Evapotranspiration",
]
# Candidate categorical columns; only those present in the feature set are used.
CATEGORICAL_FEATURES_BASE = [
    "kingdom",
    "geo_cluster",
    "month",
    "dayofweek",
    "year",
    "quarter",
]

# ==============================================================================
# Main Execution
# ==============================================================================
# --- Load Data ---
# Read the raw train/test frames and the sample submission once; every seed run
# below works on copies of these originals.
print("Loading initial data...")
try:
    train_df_orig = pd.read_csv("train.csv")
    test_df_orig = pd.read_csv("test.csv")
    sample_submission = pd.read_csv("sample_submission.csv")
except FileNotFoundError as e:
    print(f"Error loading data: {e}. Exiting.")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel
    # to shut down rather than stopping with a clean error, and in a script it
    # would report success (status 0) despite the failure.
    raise

if train_df_orig.empty or test_df_orig.empty:
    # Nothing to train on or predict for; stop with an explicit error.
    raise ValueError("Error: Input CSV is empty. Exiting.")

# Out-of-fold predictions per seed: {seed: DataFrame indexed like the training rows}.
oof_preds_all_seeds = {}
# Test predictions per target: {target: [one prediction array per seed]}.
test_preds_all_seeds = defaultdict(list)

# The LightGBM pruning callback is optional: depending on the installed Optuna
# version it may need the separate integration package. Import it defensively
# and fall back to plain early stopping when it is not available.
try:
    from optuna.integration import LightGBMPruningCallback
    OPTUNA_INTEGRATION_AVAILABLE = True
except ImportError:
    LightGBMPruningCallback = None
    OPTUNA_INTEGRATION_AVAILABLE = False

# Main training loop: for each seed, rebuild the features, then for each target
# select features, tune LightGBM with Optuna, store out-of-fold predictions and
# predict the test set with a model refit on all training rows.
for seed_run in range(N_SEEDS):
    current_seed = GLOBAL_SEED + seed_run
    print(f"\n{'='*25} Running Seed {seed_run+1}/{N_SEEDS} (Seed: {current_seed}) {'='*25}")
    np.random.seed(current_seed)
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # --- Reload and Preprocess Data ---
    # Work on fresh copies so every seed run starts from the untouched frames.
    train_df = train_df_orig.copy()
    test_df = test_df_orig.copy()

    # NOTE(review): only the training frame goes through convert_units — confirm
    # the test frame has no columns that need the same conversion.
    train_df = convert_units(train_df)
    train_df, test_df = create_geo_clusters(train_df, test_df, N_CLUSTERS, current_seed)
    train_df = create_time_features(train_df)
    test_df = create_time_features(test_df)

    if train_df.empty or test_df.empty:
        print(f"DF empty after time features. Skipping seed {current_seed}.")
        continue

    # Wind Dir Handling
    # The circular target is modelled as two regular regression targets (sin, cos).
    train_df[f'{TARGET_WIND_DIR}_sin'] = deg_to_sin(train_df[TARGET_WIND_DIR])
    train_df[f'{TARGET_WIND_DIR}_cos'] = deg_to_cos(train_df[TARGET_WIND_DIR])
    targets_with_sincos = TARGETS_NORMAL + [f'{TARGET_WIND_DIR}_sin', f'{TARGET_WIND_DIR}_cos']

    # Advanced Features
    lag_roll_inputs_this_run = [c for c in LAG_ROLL_INPUT_COLS if c in train_df.columns]
    train_df = create_lag_rolling_features_advanced(train_df, lag_roll_inputs_this_run)
    test_df = create_lag_rolling_features_advanced(test_df, lag_roll_inputs_this_run)

    # --- Define Full Feature Set ---
    # Only columns present in both frames (and not targets/identifiers) are usable.
    exclude_cols = TARGET_COLS + targets_with_sincos + ['ID', 'date', 'latitude', 'longitude']
    initial_features = [f for f in train_df.columns if f not in exclude_cols and f in test_df.columns]
    categorical_features = [f for f in CATEGORICAL_FEATURES_BASE if f in initial_features]

    # Align Categoricals
    # Give train and test the same category set so the category codes match.
    for cat_col in categorical_features:
        all_cats = pd.concat([train_df[cat_col].astype(str), test_df[cat_col].astype(str)]).unique()
        train_df[cat_col] = pd.Categorical(train_df[cat_col].astype(str), categories=all_cats)
        test_df[cat_col] = pd.Categorical(test_df[cat_col].astype(str), categories=all_cats)

    # --- Data for Modeling ---
    try:
        X = train_df[initial_features].copy()
        Y = train_df[targets_with_sincos].copy()
        X_test_full = test_df[initial_features].copy()
    except KeyError as e:
        print(f"Error preparing data arrays: Missing columns {e}. Skipping seed.")
        continue

    oof_df_seed = pd.DataFrame(index=X.index, columns=targets_with_sincos)

    for target in targets_with_sincos:
        print(f"\n===== Processing Target: {target} (Seed: {current_seed}) =====")
        gc.collect()
        y_target = Y[target].copy()
        X_target = X.copy()

        # Drop rows whose target is missing.
        if y_target.isnull().any():
            valid_indices = y_target.notna()
            X_target = X_target.loc[valid_indices]
            y_target = y_target.loc[valid_indices]
            if X_target.empty:
                print("  No data left. Skipping.")
                continue

        # Target Transformation
        # Default to the identity so targets that are not log-transformed still
        # get a well-defined training target and inverse transform (previously
        # y_target_transformed was undefined or stale for those targets).
        is_log_transformed = False
        y_target_transformed = y_target
        inv_tf = lambda x: x
        if target in LOG_TRANSFORM_TARGETS:
            # NOTE(review): the 1e-6 offset is not undone by expm1, so the
            # round-trip is off by 1e-6 when it is applied — confirm this is intended.
            y_target_transformed = np.log1p(y_target + 1e-6) if (y_target <= 0).any() else np.log1p(y_target)
            inv_tf = np.expm1
            is_log_transformed = True

        # Feature Selection
        selected_features = select_features(X_target, y_target_transformed, initial_features, TOP_N_FEATURES, categorical_features, current_seed)
        # Explicit copies: both frames are modified in place below.
        X_train_fs = X_target[selected_features].copy()
        X_test_fs = X_test_full[selected_features].copy()

        # Imputation & Scaling
        # Both transformers are fitted on the training rows only and then
        # applied to the test rows.
        numerical_features_selected = X_train_fs.select_dtypes(include=np.number).columns.tolist()
        if numerical_features_selected:
            imputer = SimpleImputer(strategy='median')
            X_train_fs[numerical_features_selected] = imputer.fit_transform(X_train_fs[numerical_features_selected])
            X_test_fs[numerical_features_selected] = imputer.transform(X_test_fs[numerical_features_selected])

            scaler = StandardScaler()
            X_train_fs[numerical_features_selected] = scaler.fit_transform(X_train_fs[numerical_features_selected])
            X_test_fs[numerical_features_selected] = scaler.transform(X_test_fs[numerical_features_selected])

        cats_selected = [c for c in categorical_features if c in selected_features]
        for cat_col in cats_selected:
            all_cats = pd.concat([X_train_fs[cat_col].astype(str), X_test_fs[cat_col].astype(str)]).unique()
            X_train_fs[cat_col] = pd.Categorical(X_train_fs[cat_col].astype(str), categories=all_cats)
            X_test_fs[cat_col] = pd.Categorical(X_test_fs[cat_col].astype(str), categories=all_cats)

        # Categorical feature spec for LightGBM; defined here (not inside the
        # objective) because the final fit below needs it as well.
        lgbm_cats_obj = [c for c in cats_selected if c in X_train_fs.columns] or 'auto'

        # Optuna Tuning
        def objective(trial):
            """Return the mean SMAPE over the time-series folds for one trial.

            Samples LightGBM hyper-parameters from ``trial``, fits one model per
            fold with early stopping, scores predictions in original target
            units, and stores the (unclipped) out-of-fold predictions on the
            trial under the ``"oof_predictions"`` user attribute.
            """
            params = {
                'objective': 'regression_l1',
                'metric': 'mae',
                'verbosity': -1,
                'n_jobs': -1,
                'seed': current_seed,
                'boosting_type': 'gbdt',
                'n_estimators': trial.suggest_int('n_estimators', 500, 4000, step=100),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05),
                'num_leaves': trial.suggest_int('num_leaves', 20, 150),
                'max_depth': trial.suggest_int('max_depth', 5, 16),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.05),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.05),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
                'subsample_freq': trial.suggest_int('subsample_freq', 0, 7),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 50)
            }

            # NOTE(review): TimeSeriesSplit splits by row position, so this
            # assumes the training rows are in chronological order — confirm.
            tscv = TimeSeriesSplit(n_splits=N_SPLITS_TSCV)
            scores = []
            # Rows that never fall into a validation fold stay NaN.
            oof_preds_fold = np.full(len(X_train_fs), np.nan)

            for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train_fs)):
                X_tr, X_val = X_train_fs.iloc[train_idx], X_train_fs.iloc[val_idx]
                y_tr, y_val = y_target_transformed.iloc[train_idx], y_target_transformed.iloc[val_idx]
                y_val_orig = y_target.iloc[val_idx]

                model = LGBMRegressor(**params)
                callbacks = [lgbm.early_stopping(100, verbose=False)]
                if OPTUNA_INTEGRATION_AVAILABLE:
                    callbacks.append(LightGBMPruningCallback(trial, 'l1'))

                model.fit(
                    X_tr, y_tr,
                    eval_set=[(X_val, y_val)],
                    eval_metric='mae',
                    callbacks=callbacks,
                    categorical_feature=lgbm_cats_obj
                )

                # Predictions are mapped back to original units before storing/scoring.
                preds_val_tf = model.predict(X_val)
                preds_val_orig = inv_tf(preds_val_tf)
                oof_preds_fold[val_idx] = preds_val_orig

                # Clip to the physically valid range for scoring only.
                if target in LOG_TRANSFORM_TARGETS or target == "Radiation":
                    preds_val_orig = np.clip(preds_val_orig, 0, None)
                if target.endswith('_sin') or target.endswith('_cos'):
                    preds_val_orig = np.clip(preds_val_orig, -1, 1)

                scores.append(smape(y_val_orig, preds_val_orig))

            trial.set_user_attr("oof_predictions", oof_preds_fold)
            return np.mean(scores)

        pruner = optuna.pruners.MedianPruner(n_startup_trials=15, n_warmup_steps=30, interval_steps=10)
        # Seed the sampler so the hyper-parameter search is repeatable per seed run.
        sampler = optuna.samplers.TPESampler(seed=current_seed)
        study = optuna.create_study(direction='minimize', pruner=pruner, sampler=sampler)
        study.optimize(objective, n_trials=OPTUNA_TRIALS, show_progress_bar=True, catch=(Exception,))

        # study.best_trial raises when no trial completed (all failed or were
        # pruned); skip the target with a clear message instead of crashing.
        completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
        if not completed_trials:
            print(f"  No completed Optuna trials for {target}. Skipping.")
            continue

        # Store best OOF
        best_oof = study.best_trial.user_attrs.get("oof_predictions")
        if best_oof is not None:
            valid_indices = ~np.isnan(best_oof)
            oof_df_seed.loc[X_target.index[valid_indices], target] = best_oof[valid_indices]

        # Train Final Model
        # NOTE(review): the final model uses the tuned n_estimators without early
        # stopping, whereas tuning scored early-stopped models — confirm intended.
        final_params = study.best_trial.params
        final_params.update({
            'objective': 'regression_l1',
            'metric': 'mae',
            'verbosity': -1,
            'n_jobs': -1,
            'seed': current_seed,
            'boosting_type': 'gbdt'
        })
        final_model = LGBMRegressor(**final_params)
        final_model.fit(X_train_fs, y_target_transformed, categorical_feature=lgbm_cats_obj)

        # Predict Test
        # Stored unclipped in original units; clipping happens at aggregation time.
        test_preds_tf = final_model.predict(X_test_fs)
        test_preds_orig = inv_tf(test_preds_tf)
        test_preds_all_seeds[target].append(test_preds_orig)

        del final_model, study, X_target, y_target, X_train_fs, X_test_fs
        gc.collect()

    oof_preds_all_seeds[current_seed] = oof_df_seed

# ==============================================================================
# Aggregation & Final Submission
# ==============================================================================
# The banner needs the f-prefix; without it the literal "{'='*25}" was printed.
print(f"\n{'='*25} Ensembling & Final Submission {'='*25}")
final_test_preds_agg = {}
expected_len = len(test_df_orig)

# Average each target's test predictions across seeds, ignoring any prediction
# array whose length does not match the test set; fall back to zeros if none remain.
for target, seed_preds in test_preds_all_seeds.items():
    valid_preds = [p for p in seed_preds if len(p) == expected_len]
    final_test_preds_agg[target] = np.mean(valid_preds, axis=0) if valid_preds else np.zeros(expected_len)

# Convert Wind Dir
# Rebuild the angle from the averaged sin/cos components (clipped to [-1, 1]);
# a missing component defaults to 0.
wind_dir = sincos_to_deg(
    np.clip(final_test_preds_agg.get(f'{TARGET_WIND_DIR}_sin', 0), -1, 1),
    np.clip(final_test_preds_agg.get(f'{TARGET_WIND_DIR}_cos', 0), -1, 1)
)
final_test_preds_agg[TARGET_WIND_DIR] = wind_dir

# Post-process
# Clamp every submission column to its valid range; targets with no
# predictions at all are filled with 0.
for target in TARGET_COLS:
    if target in final_test_preds_agg:
        preds = final_test_preds_agg[target]
        if target in ["Rain_Amount", "Radiation", "Wind_Speed"]:
            # Non-negative quantities; values below 1e-4 are snapped to exactly 0.
            preds = np.clip(preds, 0, None)
            preds[preds < 1e-4] = 0
        if target == TARGET_WIND_DIR:
            preds = np.clip(preds, 0, 360)
        if target == "Avg_Temperature":
            preds = np.clip(preds, -50, 60)
        final_test_preds_agg[target] = preds
    else:
        final_test_preds_agg[target] = 0

# Create Submission
submission_df = pd.DataFrame({'ID': test_df_orig['ID']})
for col in TARGET_COLS:
    submission_df[col] = final_test_preds_agg.get(col, 0)

# Follow the row order and column order of the sample submission; IDs without
# a prediction end up as 0 after the left merge.
final_submission = sample_submission[['ID']].merge(submission_df, on='ID', how='left')
final_submission = final_submission.fillna(0)[sample_submission.columns]
final_submission.to_csv("final_submission.csv", index=False)
print("Submission file created successfully.")