# Model Selection for SLP Prediction

This notebook performs model selection to predict the `slp` column using various machine learning algorithms with time series cross-validation.


In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')


## 1. Load and Prepare Data


In [68]:
# Load the dataset
df = pd.read_csv('dataset/data_v2_full.csv', sep=';', decimal=',')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

Dataset shape: (3560, 26)

Columns: ['date', 'entry', 'rlm', 'slp', 'day_of_year', 'day_of_week', 'holiday', 'weathercode', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise', 'sunset', 'daylight_duration', 'sunshine_duration', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'winddirection_10m_dominant', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration']


Unnamed: 0,date,entry,rlm,slp,day_of_year,day_of_week,holiday,weathercode,temperature_2m_max,temperature_2m_min,...,daylight_duration,sunshine_duration,rain_sum,snowfall_sum,precipitation_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
0,2016-01-01 00:00:00+00:00,4501250.0,2565526.0,1935724.0,1,4,1,3,4.3,0.1,...,27973.34,17706.46,0.0,0.0,0.0,9.0,18.4,208,2.64,0.21
1,2016-01-02 00:00:00+00:00,5448037.0,3037879.0,2410158.0,2,5,0,3,2.5,-5.3,...,28047.14,2501.06,0.0,0.0,0.0,24.0,48.2,97,1.34,0.5
2,2016-01-03 00:00:00+00:00,6472305.0,3462336.0,3009970.0,3,6,0,3,-5.4,-8.4,...,28127.21,21285.25,0.0,0.0,0.0,21.5,44.3,102,3.38,0.68
3,2016-01-04 00:00:00+00:00,7766598.0,4565146.0,3201452.0,4,0,0,3,-3.5,-7.9,...,28213.34,9701.3,0.0,0.0,0.0,18.9,39.2,97,2.47,0.61
4,2016-01-05 00:00:00+00:00,7842385.0,4672447.0,3169938.0,5,1,0,73,-5.2,-6.4,...,28305.35,0.0,0.0,2.45,14.0,16.2,33.5,84,1.0,0.46


In [69]:
# Remove 'rlm' and/or 'entry' columns if present
to_remove = [col for col in ['rlm', 'entry'] if col in df.columns]
if to_remove:
    df_clean = df.drop(columns=to_remove)
    print(f"Removed columns: {to_remove}")
else:
    df_clean = df.copy()
    print("Columns 'rlm' and 'entry' not found, dataset unchanged.")

print(f"Dataset shape after removing rlm and entry (if present): {df_clean.shape}")
print(f"\nRemaining columns: {df_clean.columns.tolist()}")


Removed columns: ['rlm', 'entry']
Dataset shape after removing rlm and entry (if present): (3560, 24)

Remaining columns: ['date', 'slp', 'day_of_year', 'day_of_week', 'holiday', 'weathercode', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise', 'sunset', 'daylight_duration', 'sunshine_duration', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'winddirection_10m_dominant', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration']


In [70]:
# Parse date and sort by date (important for time series)
df_clean['date'] = pd.to_datetime(df_clean['date'])
df_clean = df_clean.sort_values('date').reset_index(drop=True)

# Check for missing values
print("Missing values per column:")
print(df_clean.isnull().sum())


Missing values per column:
date                          0
slp                           0
day_of_year                   0
day_of_week                   0
holiday                       0
weathercode                   0
temperature_2m_max            0
temperature_2m_min            0
temperature_2m_mean           0
apparent_temperature_max      0
apparent_temperature_min      0
apparent_temperature_mean     0
sunrise                       0
sunset                        0
daylight_duration             0
sunshine_duration             0
rain_sum                      0
snowfall_sum                  0
precipitation_hours           0
windspeed_10m_max             0
windgusts_10m_max             0
winddirection_10m_dominant    0
shortwave_radiation_sum       0
et0_fao_evapotranspiration    0
dtype: int64


In [71]:
# Separate features and target
X = df_clean.drop(columns=['date', 'slp'])
y = df_clean['slp']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {X.columns.tolist()}")


Features shape: (3560, 22)
Target shape: (3560,)

Feature columns: ['day_of_year', 'day_of_week', 'holiday', 'weathercode', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise', 'sunset', 'daylight_duration', 'sunshine_duration', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'winddirection_10m_dominant', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration']


## 2. Scale Features

In [72]:
# Check feature ranges before scaling
print("Feature statistics before scaling:")
X.describe().T[['min', 'max', 'mean', 'std']]


Feature statistics before scaling:


Unnamed: 0,min,max,mean,std
day_of_year,1.0,366.0,179.662079,104.39644
day_of_week,0.0,6.0,3.000562,2.000983
holiday,0.0,1.0,0.050562,0.219132
weathercode,0.0,75.0,33.560955,27.22375
sunrise,9960.0,26400.0,17927.696629,5540.841244
sunset,53760.0,70560.0,62434.162921,5664.918279
winddirection_10m_dominant,0.0,360.0,204.194101,88.948244


In [73]:
# Apply StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

print("Feature statistics after scaling:")
X_scaled.describe().T[['min', 'max', 'mean', 'std']]


Feature statistics after scaling:


Unnamed: 0,min,max,mean,std
day_of_year,-1.711621,1.785158,3.1934500000000005e-17,1.00014
day_of_week,-1.499754,1.499193,1.763571e-17,1.00014
holiday,-0.230769,4.333333,-5.239255e-17,1.00014
weathercode,-1.232955,1.522379,1.217503e-16,1.00014
temperature_2m_max,-2.574805,2.520517,6.386901000000001e-17,1.00014
temperature_2m_min,-3.692157,2.36807,1.5967250000000003e-17,1.00014
temperature_2m_mean,-2.85094,2.41537,-3.1934500000000005e-17,1.00014
apparent_temperature_max,-2.574036,2.417775,-6.386901000000001e-17,1.00014
apparent_temperature_min,-3.368855,2.294788,6.786082000000001e-17,1.00014
apparent_temperature_mean,-2.601246,2.314395,-3.1934500000000005e-17,1.00014


## 3. Time Series Split

In [74]:
# Use TimeSeriesSplit for proper time series cross-validation
# This ensures we always train on past data and test on future data
tscv = TimeSeriesSplit(n_splits=10)

# Visualize the splits
print("Time Series Cross-Validation Splits:")
print("="*50)
for fold, (train_idx, test_idx) in enumerate(tscv.split(X_scaled)):
    train_dates = df_clean.iloc[train_idx]['date']
    test_dates = df_clean.iloc[test_idx]['date']
    print(f"Fold {fold + 1}:")
    print(f"  Train: {train_dates.min().date()} to {train_dates.max().date()} ({len(train_idx)} samples)")
    print(f"  Test:  {test_dates.min().date()} to {test_dates.max().date()} ({len(test_idx)} samples)")
    print()


Time Series Cross-Validation Splits:
Fold 1:
  Train: 2016-01-01 to 2016-11-25 (330 samples)
  Test:  2016-11-26 to 2017-10-14 (323 samples)

Fold 2:
  Train: 2016-01-01 to 2017-10-14 (653 samples)
  Test:  2017-10-15 to 2018-09-02 (323 samples)

Fold 3:
  Train: 2016-01-01 to 2018-09-02 (976 samples)
  Test:  2018-09-03 to 2019-07-22 (323 samples)

Fold 4:
  Train: 2016-01-01 to 2019-07-22 (1299 samples)
  Test:  2019-07-23 to 2020-06-09 (323 samples)

Fold 5:
  Train: 2016-01-01 to 2020-06-09 (1622 samples)
  Test:  2020-06-10 to 2021-04-28 (323 samples)

Fold 6:
  Train: 2016-01-01 to 2021-04-28 (1945 samples)
  Test:  2021-04-29 to 2022-03-17 (323 samples)

Fold 7:
  Train: 2016-01-01 to 2022-03-17 (2268 samples)
  Test:  2022-03-18 to 2023-02-03 (323 samples)

Fold 8:
  Train: 2016-01-01 to 2023-02-03 (2591 samples)
  Test:  2023-02-04 to 2023-12-23 (323 samples)

Fold 9:
  Train: 2016-01-01 to 2023-12-23 (2914 samples)
  Test:  2023-12-24 to 2024-11-10 (323 samples)

Fold 10:
  T

## 4. Define Models

In [75]:
# Define different types of models with standard parameters
models = {
    # Linear Models
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    
    # Tree-based Models
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, n_jobs=-1, verbosity=0),
    'LightGBM': LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
    
    # Distance-based Models
    'K-Nearest Neighbors': KNeighborsRegressor(),
    
    # Support Vector Machine (tuned for large target values)
    'SVR': SVR(C=1e6, epsilon=1e4, kernel='rbf'),
}

print(f"Total models to evaluate: {len(models)}")
for name in models.keys():
    print(f"  - {name}")


Total models to evaluate: 12
  - Linear Regression
  - Ridge Regression
  - Lasso Regression
  - ElasticNet
  - Decision Tree
  - Random Forest
  - Gradient Boosting
  - AdaBoost
  - XGBoost
  - LightGBM
  - K-Nearest Neighbors
  - SVR


## 5. Train and Test Models

In [76]:
def evaluate_model(model, X, y, tscv):
    """
    Evaluate a model using time series cross-validation.
    Returns average metrics across all folds.
    """
    rmse_scores = []
    mae_scores = []
    r2_scores = []
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        rmse_scores.append(rmse)
        mae_scores.append(mae)
        r2_scores.append(r2)
    
    return {
        'RMSE_mean': np.mean(rmse_scores),
        'RMSE_std': np.std(rmse_scores),
        'MAE_mean': np.mean(mae_scores),
        'MAE_std': np.std(mae_scores),
        'R2_mean': np.mean(r2_scores),
        'R2_std': np.std(r2_scores),
    }


In [77]:
# Train and evaluate all models
results = {}

print("Training and evaluating models...")
print("="*60)

for name, model in models.items():
    print(f"Training: {name}...", end=" ")
    try:
        metrics = evaluate_model(model, X_scaled, y, tscv)
        results[name] = metrics
        print(f"Done! (R² = {metrics['R2_mean']:.4f})")
    except Exception as e:
        print(f"Error: {str(e)}")
        results[name] = {'RMSE_mean': np.nan, 'MAE_mean': np.nan, 'R2_mean': np.nan}

print("\nAll models trained!")


Training and evaluating models...
Training: Linear Regression... Done! (R² = 0.9325)
Training: Ridge Regression... Done! (R² = 0.9309)
Training: Lasso Regression... Done! (R² = 0.9297)
Training: ElasticNet... Done! (R² = 0.8901)
Training: Decision Tree... Done! (R² = 0.9086)
Training: Random Forest... Done! (R² = 0.9436)
Training: Gradient Boosting... Done! (R² = 0.9445)
Training: AdaBoost... Done! (R² = 0.9045)
Training: XGBoost... Done! (R² = 0.9400)
Training: LightGBM... Done! (R² = 0.9425)
Training: K-Nearest Neighbors... Done! (R² = 0.9169)
Training: SVR... Done! (R² = 0.9375)

All models trained!


## 6. Results


In [78]:
# Create results DataFrame
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('R2_mean', ascending=False)

# Format for display
results_display = results_df.copy()
results_display['RMSE'] = results_display.apply(lambda x: f"{x['RMSE_mean']:.2f} ± {x['RMSE_std']:.2f}", axis=1)
results_display['MAE'] = results_display.apply(lambda x: f"{x['MAE_mean']:.2f} ± {x['MAE_std']:.2f}", axis=1)
results_display['R²'] = results_display.apply(lambda x: f"{x['R2_mean']:.4f} ± {x['R2_std']:.4f}", axis=1)

print("\n" + "="*80)
print("MODEL COMPARISON RESULTS (sorted by R² score)")
print("="*80)
print("\nMetrics averaged over 10-fold Time Series Cross-Validation:")
print()
results_display[['RMSE', 'MAE', 'R²']]



MODEL COMPARISON RESULTS (sorted by R² score)

Metrics averaged over 10-fold Time Series Cross-Validation:



Unnamed: 0,RMSE,MAE,R²
Gradient Boosting,149300.89 ± 52638.92,93745.44 ± 20765.09,0.9445 ± 0.0527
Random Forest,151011.19 ± 52096.58,93510.39 ± 23190.61,0.9436 ± 0.0518
LightGBM,153162.05 ± 51768.32,95474.51 ± 22173.84,0.9425 ± 0.0537
XGBoost,157403.40 ± 51227.39,97547.66 ± 22438.46,0.9400 ± 0.0523
SVR,164884.57 ± 48499.36,99052.21 ± 20729.36,0.9375 ± 0.0498
Linear Regression,171162.15 ± 46925.39,122936.46 ± 20948.28,0.9325 ± 0.0535
Ridge Regression,173556.09 ± 46609.34,125098.16 ± 20619.37,0.9309 ± 0.0546
Lasso Regression,175458.86 ± 46282.41,126674.33 ± 20528.26,0.9297 ± 0.0544
K-Nearest Neighbors,195363.38 ± 41367.32,127734.73 ± 17614.96,0.9169 ± 0.0542
Decision Tree,203515.62 ± 44449.83,123192.79 ± 17798.91,0.9086 ± 0.0544


In [79]:
# Summary statistics
print("\n" + "="*80)
print("SUMMARY")
print("="*80)

best_model = results_df['R2_mean'].idxmax()
best_r2 = results_df.loc[best_model, 'R2_mean']
best_rmse = results_df.loc[best_model, 'RMSE_mean']
best_mae = results_df.loc[best_model, 'MAE_mean']

print(f"\nBest Model: {best_model}")
print(f"   - R² Score: {best_r2:.4f}")
print(f"   - RMSE: {best_rmse:.2f}")
print(f"   - MAE: {best_mae:.2f}")

print(f"\nTarget variable (slp) statistics:")
print(f"   - Mean: {y.mean():.2f}")
print(f"   - Std: {y.std():.2f}")
print(f"   - Min: {y.min():.2f}")
print(f"   - Max: {y.max():.2f}")



SUMMARY

Best Model: Gradient Boosting
   - R² Score: 0.9445
   - RMSE: 149300.89
   - MAE: 93745.44

Target variable (slp) statistics:
   - Mean: 988192.36
   - Std: 752107.47
   - Min: -435171.56
   - Max: 3341410.76


In [80]:
# Visual comparison (text-based bar chart)
print("\n" + "="*80)
print("R² SCORE COMPARISON")
print("="*80 + "\n")

max_bar_length = 50
max_r2 = results_df['R2_mean'].max()

for model_name in results_df.index:
    r2 = results_df.loc[model_name, 'R2_mean']
    if r2 > 0:
        bar_length = int((r2 / max_r2) * max_bar_length)
        bar = '█' * bar_length
    else:
        bar_length = 0
        bar = ''
    print(f"{model_name:25s} | {bar} {r2:.4f}")



R² SCORE COMPARISON

Gradient Boosting         | ██████████████████████████████████████████████████ 0.9445
Random Forest             | █████████████████████████████████████████████████ 0.9436
LightGBM                  | █████████████████████████████████████████████████ 0.9425
XGBoost                   | █████████████████████████████████████████████████ 0.9400
SVR                       | █████████████████████████████████████████████████ 0.9375
Linear Regression         | █████████████████████████████████████████████████ 0.9325
Ridge Regression          | █████████████████████████████████████████████████ 0.9309
Lasso Regression          | █████████████████████████████████████████████████ 0.9297
K-Nearest Neighbors       | ████████████████████████████████████████████████ 0.9169
Decision Tree             | ████████████████████████████████████████████████ 0.9086
AdaBoost                  | ███████████████████████████████████████████████ 0.9045
ElasticNet                | ██████████████████