In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import lightgbm as lgbm
from lightgbm import LGBMRegressor
import optuna
import gc
import warnings
import traceback
from collections import defaultdict
import math
try:
    import optuna.integration.lightgbm as lgb_optuna
    OPTUNA_INTEGRATION_AVAILABLE = True
except ImportError:
    print("WARNING: `optuna-integration[lightgbm]` not found. Pruning callback disabled.")
    print("Install using: pip install optuna-integration[lightgbm]")
    OPTUNA_INTEGRATION_AVAILABLE = False

# Import helper functions
from utils import (
    smape,
    deg_to_sin,
    deg_to_cos,
    sincos_to_deg,
    convert_units,
    create_geo_clusters,
    create_time_features,
    create_lag_rolling_features_advanced,
    select_features
)

In [None]:
# --- Warning filters ---
# Registered in the original order; every call installs an "ignore" rule for one category.
warnings.filterwarnings(action="ignore", category=optuna.exceptions.ExperimentalWarning)
warnings.filterwarnings(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", category=UserWarning)
# Silences the DataFrame fragmentation warning.
warnings.filterwarnings(action="ignore", category=pd.errors.PerformanceWarning)
# Turn off the chained-assignment check.
pd.set_option("mode.chained_assignment", None)

# --- Reproducibility & run control ---
GLOBAL_SEED = 42        # base seed; each seed run adds its run index to this
N_SEEDS = 3             # number of seed runs
OPTUNA_TRIALS = 100     # Optuna trials per target
N_SPLITS_TSCV = 5       # folds used by TimeSeriesSplit inside the objective
TOP_N_FEATURES = 150    # passed to select_features
np.random.seed(GLOBAL_SEED)

# --- Data & targets ---
TARGET_WIND_DIR = "Wind_Direction"
TARGETS_NORMAL = ["Avg_Temperature", "Radiation", "Rain_Amount", "Wind_Speed"]
# All submission targets: the directly modelled ones followed by wind direction.
TARGET_COLS = TARGETS_NORMAL + [TARGET_WIND_DIR]
# Targets that are modelled on a log1p scale.
LOG_TRANSFORM_TARGETS = ["Rain_Amount", "Radiation", "Wind_Speed"]

# --- Feature engineering ---
N_CLUSTERS = 10
LAG_ROLL_INPUT_COLS = [
    "Avg_Temperature",
    "Avg_Feels_Like_Temperature",
    "Radiation",
    "Rain_Amount",
    "Rain_Duration",
    "Wind_Speed",
    "Temperature_Range",
    "Feels_Like_Temperature_Range",
    "Evapotranspiration",
]
CATEGORICAL_FEATURES_BASE = [
    'kingdom',
    'geo_cluster',
    'month',
    'dayofweek',
    'year',
    'quarter',
]


In [None]:
# --- Load Data ---
# Abort by raising SystemExit with a message rather than calling the interactive `exit()`
# helper: a bare `exit()` reports success (status 0) even though loading failed, whereas
# SystemExit with a string writes the message to stderr and yields a non-zero status.
try:
    train_df_orig = pd.read_csv("../data/train.csv")
    test_df_orig = pd.read_csv("../data/test.csv")
    sample_submission = pd.read_csv("../data/sample_submission.csv")
except FileNotFoundError as e:
    raise SystemExit(f"Error loading data: {e}. Exiting.") from e

# A CSV that parses to zero rows cannot be used for training or prediction.
if train_df_orig.empty or test_df_orig.empty:
    raise SystemExit("Error: Input CSV is empty. Exiting.")


# Accumulators filled across all seed runs.
oof_preds_all_seeds = dict()  # seed -> out-of-fold prediction frame
test_preds_all_seeds = defaultdict(list)  # target -> list with one test prediction array per seed

for seed_run in range(N_SEEDS):
    current_seed = GLOBAL_SEED + seed_run

    print(f"\n{'='*25} Running Seed {seed_run+1}/{N_SEEDS} (Seed: {current_seed}) {'='*25}")

    np.random.seed(current_seed)
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # --- Start from fresh copies of the raw frames, then engineer features ---
    test_df = test_df_orig.copy()
    train_df = convert_units(train_df_orig.copy())  # only the training frame is unit-converted
    train_df, test_df = create_geo_clusters(train_df, test_df, N_CLUSTERS, current_seed)
    train_df, test_df = create_time_features(train_df), create_time_features(test_df)

    if train_df.empty or test_df.empty:
        print(f"DF empty after time features. Skipping seed {current_seed}.")
        continue

    # Wind direction is modelled through its sine and cosine components.
    wind_sin_col = f'{TARGET_WIND_DIR}_sin'
    wind_cos_col = f'{TARGET_WIND_DIR}_cos'
    train_df[wind_sin_col] = deg_to_sin(train_df[TARGET_WIND_DIR])
    train_df[wind_cos_col] = deg_to_cos(train_df[TARGET_WIND_DIR])
    targets_with_sincos = TARGETS_NORMAL + [wind_sin_col, wind_cos_col]

    # Lag / rolling features, restricted to the input columns the training frame has.
    lag_roll_inputs_this_run = [col for col in LAG_ROLL_INPUT_COLS if col in train_df.columns]
    train_df = create_lag_rolling_features_advanced(train_df, lag_roll_inputs_this_run)
    test_df = create_lag_rolling_features_advanced(test_df, lag_roll_inputs_this_run)

    # --- Define Full Feature Set ---
    print("\nDefining full feature set...")

    all_cols = train_df.columns.tolist()
    exclude_cols = TARGET_COLS + targets_with_sincos + ['ID', 'date', 'latitude', 'longitude']
    # A feature must not be a target/identifier column and must exist in the test frame too.
    initial_features = sorted(
        col for col in all_cols
        if col not in exclude_cols and col in test_df.columns
    )
    categorical_features = [col for col in CATEGORICAL_FEATURES_BASE if col in initial_features]

    print(f"Initial feature set size: {len(initial_features)}")

    # Give train and test the same category set for every categorical feature.
    print("Aligning final categorical dtypes...")
    for cat_col in categorical_features:
        if not (cat_col in train_df.columns and cat_col in test_df.columns):
            continue

        train_df[cat_col] = train_df[cat_col].astype('category')
        test_df[cat_col] = test_df[cat_col].astype('category')

        train_as_str = train_df[cat_col].astype(str)
        test_as_str = test_df[cat_col].astype(str)
        all_cats = pd.concat([train_as_str, test_as_str]).unique()

        train_df[cat_col] = pd.Categorical(train_as_str, categories=all_cats, ordered=False)
        test_df[cat_col] = pd.Categorical(test_as_str, categories=all_cats, ordered=False)

    # --- Data for Modeling ---
    try:
        X = train_df[initial_features].copy()
        Y = train_df[targets_with_sincos].copy()
        X_test_full = test_df[initial_features].copy()
    except KeyError as e:
        print(f"Error preparing data arrays: Missing columns {e}. Skipping seed.")
        continue

    # Out-of-fold frame for this seed: one all-NaN column per modelled target, indexed like X.
    oof_df_seed = pd.DataFrame(np.nan, index=X.index, columns=targets_with_sincos)

    print("\n--- Starting Model Training Loop (Per Target) ---")
    targets_processed_this_seed = []

    for target in targets_with_sincos:
        print(f"\n===== Processing Target: {target} (Seed: {current_seed}) =====")

        gc.collect()
        y_target = Y[target].copy()
        X_target = X.copy()

        # Drop rows whose target value is missing and keep training on the rest.
        # The target is skipped only when no rows remain; the `continue` must stay
        # inside the emptiness check, otherwise every target containing a single
        # missing value would be skipped entirely.
        if y_target.isnull().any():
            valid_indices = y_target.notna()
            X_target = X_target.loc[valid_indices]
            y_target = y_target.loc[valid_indices]

            if X_target.empty:
                print(f"  No data left. Skipping.")
                continue

        # Target Transformation
        is_log_transformed = False; 
        inv_tf = lambda x: x

        if target in LOG_TRANSFORM_TARGETS:

            if (y_target <= 0).any(): 
                y_target_transformed = np.log1p(y_target + 1e-6)

            else: y_target_transformed = np.log1p(y_target)

            inv_tf = np.expm1; is_log_transformed = True; print("  Log1p Transform Applied.")

        else: y_target_transformed = y_target

        # Feature Selection
        selected_features = select_features(X_target, y_target_transformed, initial_features, TOP_N_FEATURES, categorical_features, current_seed)

        if not selected_features: 
            print("  Error: No features selected. Skipping target."); 
            continue

        X_train_fs = X_target[selected_features]; X_test_fs = X_test_full[selected_features].copy()

        # Imputation & Scaling
        print(f"  Processing selected features ({len(selected_features)})...")

        numerical_features_selected = X_train_fs.select_dtypes(include=np.number).columns.tolist()
        imputer, scaler = None, None

        if numerical_features_selected:

            if X_train_fs[numerical_features_selected].isnull().any().any():

                print("    Imputing..."); 
                imputer = SimpleImputer(strategy='median'); 
                X_train_fs[numerical_features_selected] = imputer.fit_transform(X_train_fs[numerical_features_selected]); 
                X_test_fs[numerical_features_selected] = imputer.transform(X_test_fs[numerical_features_selected])

            print("    Scaling..."); 
            scaler = StandardScaler(); 
            X_train_fs[numerical_features_selected] = scaler.fit_transform(X_train_fs[numerical_features_selected]); 
            X_test_fs[numerical_features_selected] = scaler.transform(X_test_fs[numerical_features_selected])

        cats_selected = [c for c in categorical_features if c in selected_features]
        for cat_col in cats_selected:
             all_cats = pd.concat([X_train_fs[cat_col].astype(str), X_test_fs[cat_col].astype(str)]).unique()
             X_train_fs[cat_col] = pd.Categorical(X_train_fs[cat_col].astype(str), categories=all_cats, ordered=False)
             X_test_fs[cat_col] = pd.Categorical(X_test_fs[cat_col].astype(str), categories=all_cats, ordered=False)

        if X_train_fs.isnull().any().any() or X_test_fs.isnull().any().any(): 
            print("CRITICAL WARNING: NaNs detected AFTER imputation/scaling. Skipping target."); 
            continue

        # Optuna Tuning
        print(f"  Starting Optuna tuning ({OPTUNA_TRIALS} trials, Data: {X_train_fs.shape})...")

        def objective(trial):
            """Score one LightGBM hyperparameter configuration with time-series CV.

            This is a closure over the current target's state (`X_train_fs`,
            `y_target_transformed`, `y_target`, `inv_tf`, `target`, `current_seed`,
            `categorical_features`).

            For each `TimeSeriesSplit` fold a fresh `LGBMRegressor` is fitted on the
            transformed target with early stopping on the validation part. Predictions
            are mapped back with `inv_tf` and scored with `smape` against the
            untransformed `y_target`.

            Side effect: the per-row validation predictions (original scale, before
            clipping) are stored on the trial as the user attribute "oof_predictions".
            Rows that never fall into a validation fold stay NaN.

            Returns:
                Mean of the per-fold `smape` scores.

            Raises:
                optuna.exceptions.TrialPruned: when there are fewer rows than
                    `N_SPLITS_TSCV + 1`, when any exception occurs while fitting or
                    scoring (it is printed with its traceback first), or when no fold
                    produced a score.
            """
            # TimeSeriesSplit needs more samples than splits.
            min_samples_needed = N_SPLITS_TSCV + 1

            if len(X_train_fs) < min_samples_needed: 
                raise optuna.exceptions.TrialPruned(f"Samples ({len(X_train_fs)}) < N_SPLITS+1")
            
            # Fixed settings (L1 objective / MAE metric, seed, gbdt) plus the tuned search space.
            params = { 'objective': 'regression_l1', 'metric': 'mae', 'verbosity': -1, 'n_jobs': -1, 'seed': 
                      current_seed, 'boosting_type': 'gbdt', 'n_estimators': 
                      trial.suggest_int('n_estimators', 500, 4000, step=100), 'learning_rate': 
                      trial.suggest_float('learning_rate', 0.005, 0.05), 'num_leaves': 
                      trial.suggest_int('num_leaves', 20, 150), 'max_depth': 
                      trial.suggest_int('max_depth', 5, 16), 'subsample': 
                      trial.suggest_float('subsample', 0.5, 1.0, step=0.05), 'colsample_bytree': 
                      trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.05), 'reg_alpha': 
                      trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True), 'reg_lambda': 
                      trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True), 'subsample_freq': 
                      trial.suggest_int('subsample_freq', 0, 7), 'min_child_samples': 
                      trial.suggest_int('min_child_samples', 5, 50) }
            
            tscv = TimeSeriesSplit(n_splits=N_SPLITS_TSCV); scores = []
            # Explicit categorical column list, or LightGBM's 'auto' when none were selected.
            lgbm_cats_obj = [c for c in categorical_features if c in X_train_fs.columns] or 'auto'
            # Positional OOF buffer aligned with the rows of X_train_fs.
            oof_preds_fold = np.full(len(X_train_fs), np.nan)

            try:
                for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train_fs)):

                    if len(val_idx) == 0: 
                        continue

                    # Fit on the transformed target; keep the untransformed one for scoring.
                    X_tr, X_val = X_train_fs.iloc[train_idx], X_train_fs.iloc[val_idx]; 
                    y_tr, y_val = y_target_transformed.iloc[train_idx], y_target_transformed.iloc[val_idx]; 
                    y_val_orig = y_target.iloc[val_idx]

                    model = LGBMRegressor(**params)
                    # Early stopping after 100 rounds without improvement on the eval set.
                    callbacks_list = [lgbm.early_stopping(100, verbose=False)]

                    # The Optuna pruning callback (if importable) is placed first in the list.
                    # NOTE(review): the same trial-level callback is attached in every fold.
                    if OPTUNA_INTEGRATION_AVAILABLE: 
                        callbacks_list.insert(0, lgb_optuna.LightGBMPruningCallback(trial, 'l1'))

                    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric='mae', callbacks=callbacks_list, categorical_feature=lgbm_cats_obj)
                    # Predict on the transformed scale, then map back to the original scale.
                    preds_val_tf = model.predict(X_val); preds_val_orig = inv_tf(preds_val_tf)
                    oof_preds_fold[val_idx] = preds_val_orig # Store OOF (unclipped)

                    # Clip for scoring
                    if target in LOG_TRANSFORM_TARGETS or target == "Radiation": 
                        preds_val_orig = np.clip(preds_val_orig, 0, None)

                    # Sine/cosine components are bounded to [-1, 1].
                    if target.endswith('_sin'): 
                        preds_val_orig = np.clip(preds_val_orig, -1, 1)

                    if target.endswith('_cos'): 
                        preds_val_orig = np.clip(preds_val_orig, -1, 1)

                    # Folds with NaNs in predictions or truth contribute no score.
                    if np.isnan(preds_val_orig).any() or np.isnan(y_val_orig).any(): 
                        continue

                    scores.append(smape(y_val_orig, preds_val_orig))

            # Let genuine pruning signals through untouched.
            except optuna.exceptions.TrialPruned as e: raise
            
            # Any other failure is logged and reported to Optuna as a pruned trial.
            except Exception as e: print(f" Objective Error: {e}"); traceback.print_exc(); raise optuna.exceptions.TrialPruned(f"Obj Err: {e}")

            if not scores: raise optuna.exceptions.TrialPruned("No valid scores.")

            trial.set_user_attr("oof_predictions", oof_preds_fold) # Store full OOF array for trial
            return np.mean(scores) # Return sMAPE

        pruner = optuna.pruners.MedianPruner(n_startup_trials=15, n_warmup_steps=30, interval_steps=10)
        study = optuna.create_study(direction='minimize', pruner=pruner)

        try: 
            study.optimize(objective, n_trials=OPTUNA_TRIALS, show_progress_bar=True, catch=(Exception,))
            
        except Exception as e: print(f" Optuna optimize failed: {e}"); continue
        
        if study.best_trial is None: 
            print(f" No best trial found. Skipping."); 
            continue

        print(f"  Best CV sMAPE: {study.best_value:.4f}")

        # Store best OOF
        best_oof = study.best_trial.user_attrs.get("oof_predictions")

        if best_oof is not None:

            valid_oof_indices = ~np.isnan(best_oof)
            original_indices_for_oof = X_train_fs.index[valid_oof_indices]
            valid_oof_values = best_oof[valid_oof_indices]

            oof_df_seed.loc[original_indices_for_oof, target] = valid_oof_values

        else: 
            print("  Warning: Could not retrieve OOF predictions from best trial.")

        # Train Final Model
        print(f"  Training final model...")

        final_params = study.best_trial.params; 
        final_params.update({'objective': 'regression_l1', 'metric': 'mae', 'verbosity': -1, 'n_jobs': -1, 'seed': current_seed, 'boosting_type': 'gbdt'})
        final_model = LGBMRegressor(**final_params); lgbm_cats_final = [c for c in categorical_features if c in X_train_fs.columns] or 'auto'

        try: 
            final_model.fit(X_train_fs, y_target_transformed, categorical_feature=lgbm_cats_final)
        except Exception as e: 
            print(f"  Error training final model: {e}"); 
            continue

        # Predict & Store Test
        try:
            print(f"  Predicting test data ({X_test_fs.shape})...")

            test_preds_tf = final_model.predict(X_test_fs)
            test_preds_orig = inv_tf(test_preds_tf)

            if len(test_preds_orig) == len(test_df): 
                test_preds_all_seeds[target].append(test_preds_orig); 
                targets_processed_this_seed.append(target)

            else: 
                print(f"  Error: Pred length ({len(test_preds_orig)}) != Test length ({len(test_df)}).")

        except Exception as e: print(f"  Error predicting: {e}")

        del final_model, study, X_target, y_target, X_train_fs, X_test_fs, y_target_transformed; gc.collect()

    # Keep this seed's out-of-fold frame under its seed value.
    oof_preds_all_seeds.update({current_seed: oof_df_seed})

    print(f"--- Finished Seed {seed_run+1}/{N_SEEDS} (Processed: {len(targets_processed_this_seed)} targets) ---")

    # Drop the per-seed frames so the next run starts from the raw copies again.
    del train_df, test_df
    del X, Y, X_test_full
    gc.collect()

In [None]:
# --- Ensemble Test Predictions ---
# Averages the per-seed test predictions of every target, converts the wind-direction
# sine/cosine components back to degrees, and applies range post-processing.
final_test_preds_agg = {}

if not test_preds_all_seeds:
    # Raise SystemExit with a message (non-zero exit status, message on stderr) instead
    # of calling the interactive `exit()` helper, which would report success.
    raise SystemExit("CRITICAL ERROR: No predictions generated. Cannot create submission.")

common_targets = set(test_preds_all_seeds.keys())

print(f"Targets with predictions: {common_targets}")

expected_len = len(test_df_orig)  # Use original test length

for target in common_targets:
    # Only full-length numpy arrays take part in the average.
    valid_preds = [
        p for p in test_preds_all_seeds[target]
        if isinstance(p, np.ndarray) and len(p) == expected_len
    ]
    if not valid_preds:
        print(f"Warning: No valid predictions for {target}. Filling with 0.")
        final_test_preds_agg[target] = np.zeros(expected_len)
        continue
    print(f"  Averaging {len(valid_preds)} predictions for {target}...")
    final_test_preds_agg[target] = np.mean(np.array(valid_preds), axis=0)

# --- Convert Wind Dir ---
wind_sin_key = f'{TARGET_WIND_DIR}_sin'
wind_cos_key = f'{TARGET_WIND_DIR}_cos'
if wind_sin_key in final_test_preds_agg and wind_cos_key in final_test_preds_agg:
    print("Converting Wind Dir sin/cos to degrees...")
    # Remove the helper components from the dict while bounding them to [-1, 1].
    wind_sin = np.clip(final_test_preds_agg.pop(wind_sin_key), -1.0, 1.0)
    wind_cos = np.clip(final_test_preds_agg.pop(wind_cos_key), -1.0, 1.0)
    final_test_preds_agg[TARGET_WIND_DIR] = sincos_to_deg(wind_sin, wind_cos)
else:
    print(f"Warning: Sin/Cos preds for {TARGET_WIND_DIR} not found. Filling Dir with 0.")
    # Use a full-length array (not a scalar) so every entry of the dict has the same shape.
    final_test_preds_agg[TARGET_WIND_DIR] = np.zeros(expected_len)

# --- Post-Processing ---
print("Post-processing final predictions...")
for target in TARGET_COLS:  # Use original target list
    if target not in final_test_preds_agg:
        print(f"Warning: {target} missing in final dict. Filling with 0.")
        final_test_preds_agg[target] = np.zeros(expected_len)
        continue

    preds = final_test_preds_agg[target]
    if target in ["Rain_Amount", "Radiation", "Wind_Speed"]:
        # These targets are floored at zero; tiny positive values are snapped to exactly zero.
        preds = np.clip(preds, 0, None)
        preds[preds < 1e-4] = 0
    if target == TARGET_WIND_DIR:
        # Angles are periodic: wrap into the 0-360 range instead of clipping, so that an
        # out-of-range angle keeps its direction rather than collapsing to the boundary.
        # NOTE(review): this is a no-op if sincos_to_deg already returns values in [0, 360).
        preds = np.mod(preds, 360.0)
    if target == "Avg_Temperature":
        preds = np.clip(preds, -50, 60)
    final_test_preds_agg[target] = preds

In [None]:
# --- Create Submission ---
# Assemble the prediction columns next to the test IDs, align the rows and columns with
# the sample submission, and write the CSV.
print("\nCreating final submission file...")
try:
    submission_df = pd.DataFrame({'ID': test_df_orig['ID']})

    # Anything that is not a full-length numpy array is replaced by a zero vector.
    pred_columns = {}
    for key, values in final_test_preds_agg.items():
        is_full_array = isinstance(values, np.ndarray) and len(values) == expected_len
        pred_columns[key] = values if is_full_array else np.zeros(expected_len)
    pred_df = pd.DataFrame(pred_columns)

    for col in TARGET_COLS:
        if col not in pred_df.columns:
            print(f"Warning: Target '{col}' final prediction missing. Filling with 0.")
            submission_df[col] = 0.0
        else:
            submission_df[col] = pred_df[col].values

    # Merge with sample submission structure for final format check
    # NOTE: sample_submission is defined globally at the start
    final_submission = sample_submission[['ID']].merge(submission_df, on='ID', how='left')
    if final_submission.isnull().any().any():
        print("Warning: NaNs after final merge. Filling with 0.")
        final_submission = final_submission.fillna(0)
    # Ensure column order
    final_submission = final_submission[sample_submission.columns]

    submission_filename = "submission_advanced_v1_fixed_oof.csv"
    final_submission.to_csv(submission_filename, index=False)
    print(f"Submission file created: {submission_filename}")
    print(final_submission.head())
except NameError as e:
    # Raised when a required variable (e.g. sample_submission) was never defined.
    print(f"Error creating submission: Variable not defined? {e}")
except Exception as e:
    print(f"Error creating/saving submission file: {e}")
    traceback.print_exc()


print("\nPipeline Finished.")