# Model Selection for SLP Prediction

This notebook performs model selection to predict the `slp` column using various machine learning algorithms with time series cross-validation.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')


## 1. Load and Prepare Data


In [2]:
# Load the dataset
dataset = 'dataset/df_clean_rf.csv'
df = pd.read_csv(dataset, sep=';', decimal=',')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

Dataset shape: (3560, 9)

Columns: ['idx', 'date', 'day_of_year', 'slp', 'apparent_temperature_mean', 'temperature_2m_mean', 'apparent_temperature_max', 'temperature_2m_max', 'sunrise']


Unnamed: 0,idx,date,day_of_year,slp,apparent_temperature_mean,temperature_2m_mean,apparent_temperature_max,temperature_2m_max,sunrise
0,0,2016-01-01 00:00:00+00:00,1,1935724.47983604,-0.4,2.4,2.2,4.3,26400
1,1,2016-01-02 00:00:00+00:00,2,2410157.74830502,-6.9,-1.5,-0.7,2.5,26340
2,2,2016-01-03 00:00:00+00:00,3,3009969.53122598,-13.2,-6.8,-11.8,-5.4,26340
3,3,2016-01-04 00:00:00+00:00,4,3201452.44485292,-11.5,-5.7,-9.1,-3.5,26340
4,4,2016-01-05 00:00:00+00:00,5,3169937.6386627,-11.3,-5.8,-10.6,-5.2,26340


In [3]:
# Remove 'rlm' and/or 'entry' columns if present
to_remove = [col for col in ['rlm', 'entry'] if col in df.columns]
if to_remove:
    df_clean = df.drop(columns=to_remove)
    print(f"Removed columns: {to_remove}")
else:
    df_clean = df.copy()
    print("Columns 'rlm' and 'entry' not found, dataset unchanged.")

print(f"Dataset shape after removing rlm and entry (if present): {df_clean.shape}")
print(f"\nRemaining columns: {df_clean.columns.tolist()}")


Columns 'rlm' and 'entry' not found, dataset unchanged.
Dataset shape after removing rlm and entry (if present): (3560, 9)

Remaining columns: ['idx', 'date', 'day_of_year', 'slp', 'apparent_temperature_mean', 'temperature_2m_mean', 'apparent_temperature_max', 'temperature_2m_max', 'sunrise']


In [4]:
# Parse date and sort by date (important for time series)
df_clean['date'] = pd.to_datetime(df_clean['date'])
df_clean = df_clean.sort_values('date').reset_index(drop=True)


In [5]:
# Separate features and target
if dataset == 'dataset/df_clean_stat.csv':
    X = df_clean.drop(columns=['date', "Residential (SLP)"])
    y = df_clean['Residential (SLP)']
else:
    X = df_clean.drop(columns=['date', 'slp'])
    y = df_clean['slp']
    

print(y)


# Define feature types for proper preprocessing
boolean_cols = ['holiday']
categorical_cols = ['weathercode']  # Leave as-is for tree-based models
cyclical_cols = ['day_of_week', 'day_of_year', 'winddirection_10m_dominant']

# All other columns are continuous and should be scaled
continuous_cols = [col for col in X.columns 
                   if col not in boolean_cols + categorical_cols + cyclical_cols]

# Convert continuous columns to numeric (CSV has mixed decimal formats: some use ',' some use '.')
for col in continuous_cols:
    X[col] = pd.to_numeric(X[col], errors='coerce')

y = pd.to_numeric(y, errors='coerce')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nBoolean columns (not scaled): {boolean_cols}")
print(f"Categorical columns (not scaled): {categorical_cols}")
print(f"Cyclical columns (will be sin/cos encoded): {cyclical_cols}")
print(f"Continuous columns (will be scaled): {continuous_cols}")


0       1935724.47983604
1       2410157.74830502
2       3009969.53122598
3       3201452.44485292
4        3169937.6386627
              ...       
3555           555932.07
3556          474066.306
3557          516819.667
3558          539540.291
3559          641966.023
Name: slp, Length: 3560, dtype: object
Features shape: (3560, 7)
Target shape: (3560,)

Boolean columns (not scaled): ['holiday']
Categorical columns (not scaled): ['weathercode']
Cyclical columns (will be sin/cos encoded): ['day_of_week', 'day_of_year', 'winddirection_10m_dominant']
Continuous columns (will be scaled): ['idx', 'apparent_temperature_mean', 'temperature_2m_mean', 'apparent_temperature_max', 'temperature_2m_max', 'sunrise']


## 2. Feature Preprocessing

- **Boolean features** (holiday): Left unchanged (0/1)
- **Categorical features** (weathercode): Left unchanged (tree-based models handle them well)
- **Cyclical features** (day_of_week, day_of_year, winddirection): Sin/cos encoding to preserve circular nature
- **Continuous features**: StandardScaler normalization

In [6]:
# Apply cyclical encoding for cyclical features
# This preserves the circular nature of these variables (e.g., day 365 is close to day 1)

def cyclical_encode(df, col, max_val):
    """Encode a cyclical feature using sine and cosine transformation."""
    df[f'{col}_sin'] = np.sin(2 * np.pi * df[col] / max_val)
    df[f'{col}_cos'] = np.cos(2 * np.pi * df[col] / max_val)
    return df

# Create a copy and apply cyclical encoding
X_encoded = X.copy()

if dataset == 'dataset/data_v2_full.csv':
    # Encode day_of_week (0-6, period=7)
    X_encoded = cyclical_encode(X_encoded, 'day_of_week', 7)
    
    # Encode wind direction (0-360 degrees, period=360)
    X_encoded = cyclical_encode(X_encoded, 'winddirection_10m_dominant', 360)

    # Drop original cyclical columns (replaced by sin/cos versions)
    X_encoded = X_encoded.drop(columns=cyclical_cols)

    
# Encode day_of_year (1-366, period=366)
X_encoded = cyclical_encode(X_encoded, 'day_of_year', 366)

print(f"Shape after cyclical encoding: {X_encoded.shape}")
print(f"\nNew cyclical features added:")
for col in cyclical_cols:
    print(f"  {col} -> {col}_sin, {col}_cos")
print(f"\nAll features: {X_encoded.columns.tolist()}")


Shape after cyclical encoding: (3560, 9)

New cyclical features added:
  day_of_week -> day_of_week_sin, day_of_week_cos
  day_of_year -> day_of_year_sin, day_of_year_cos
  winddirection_10m_dominant -> winddirection_10m_dominant_sin, winddirection_10m_dominant_cos

All features: ['idx', 'day_of_year', 'apparent_temperature_mean', 'temperature_2m_mean', 'apparent_temperature_max', 'temperature_2m_max', 'sunrise', 'day_of_year_sin', 'day_of_year_cos']


In [7]:
# Check feature ranges before scaling (continuous features only)
print("Continuous feature statistics before scaling:")
X_encoded[continuous_cols].describe().T[['min', 'max', 'mean', 'std']]


Continuous feature statistics before scaling:


Unnamed: 0,min,max,mean,std
idx,0.0,3559.0,1779.5,1027.827807
apparent_temperature_mean,-14.7,29.6,8.742556,9.013316
temperature_2m_mean,-10.2,29.1,11.075225,7.463579
apparent_temperature_max,-13.0,36.7,12.627893,9.957706
temperature_2m_max,-6.7,36.1,14.928006,8.401043
sunrise,9960.0,26400.0,17927.696629,5540.841244


In [8]:
# Apply StandardScaler ONLY to continuous features
scaler = StandardScaler()

# Start with the encoded data
X_scaled = X_encoded.copy()

# Scale only continuous columns
X_scaled[continuous_cols] = scaler.fit_transform(X_encoded[continuous_cols])

print("Feature preprocessing summary:")
print(f"  - Boolean features (unchanged): {boolean_cols}")
print(f"  - Categorical features (unchanged): {categorical_cols}")
print(f"  - Cyclical features (sin/cos encoded): {[f'{c}_sin, {c}_cos' for c in cyclical_cols]}")
print(f"  - Continuous features (standardized): {len(continuous_cols)} columns")
print(f"\nFinal feature matrix shape: {X_scaled.shape}")
print(f"\nContinuous feature statistics after scaling:")
X_scaled[continuous_cols].describe().T[['min', 'max', 'mean', 'std']]


Feature preprocessing summary:
  - Boolean features (unchanged): ['holiday']
  - Categorical features (unchanged): ['weathercode']
  - Cyclical features (sin/cos encoded): ['day_of_week_sin, day_of_week_cos', 'day_of_year_sin, day_of_year_cos', 'winddirection_10m_dominant_sin, winddirection_10m_dominant_cos']
  - Continuous features (standardized): 6 columns

Final feature matrix shape: (3560, 9)

Continuous feature statistics after scaling:


Unnamed: 0,min,max,mean,std
idx,-1.731564,1.731564,0.0,1.00014
apparent_temperature_mean,-2.601246,2.314395,-3.1934500000000005e-17,1.00014
temperature_2m_mean,-2.85094,2.41537,-3.1934500000000005e-17,1.00014
apparent_temperature_max,-2.574036,2.417775,-6.386901000000001e-17,1.00014
temperature_2m_max,-2.574805,2.520517,6.386901000000001e-17,1.00014
sunrise,-1.438196,1.529279,1.91607e-16,1.00014


In [9]:
# Export preprocessed data to CSV
export_df = X_scaled.copy()
export_df['slp'] = y.values
export_df['date'] = df_clean['date'].values

# Reorder columns to put date first
cols = ['date', 'slp'] + [col for col in export_df.columns if col not in ['date', 'slp']]
export_df = export_df[cols]

# Save to dataset folder
# export_df.to_csv('dataset/data_v2_step_5.csv', sep=';', decimal=',', index=False)
# print(f"Exported preprocessed data to 'dataset/data_v2_step_5.csv'")
# print(f"Shape: {export_df.shape}")


## 3. Time Series Split

In [10]:
# Use TimeSeriesSplit for proper time series cross-validation
# This ensures we always train on past data and test on future data
tscv = TimeSeriesSplit(n_splits=10)

# Visualize the splits
print("Time Series Cross-Validation Splits:")
print("="*50)
for fold, (train_idx, test_idx) in enumerate(tscv.split(X_scaled)):
    train_dates = df_clean.iloc[train_idx]['date']
    test_dates = df_clean.iloc[test_idx]['date']
    print(f"Fold {fold + 1}:")
    print(f"  Train: {train_dates.min().date()} to {train_dates.max().date()} ({len(train_idx)} samples)")
    print(f"  Test:  {test_dates.min().date()} to {test_dates.max().date()} ({len(test_idx)} samples)")
    print()


Time Series Cross-Validation Splits:
Fold 1:
  Train: 2016-01-01 to 2016-11-25 (330 samples)
  Test:  2016-11-26 to 2017-10-14 (323 samples)

Fold 2:
  Train: 2016-01-01 to 2017-10-14 (653 samples)
  Test:  2017-10-15 to 2018-09-02 (323 samples)

Fold 3:
  Train: 2016-01-01 to 2018-09-02 (976 samples)
  Test:  2018-09-03 to 2019-07-22 (323 samples)

Fold 4:
  Train: 2016-01-01 to 2019-07-22 (1299 samples)
  Test:  2019-07-23 to 2020-06-09 (323 samples)

Fold 5:
  Train: 2016-01-01 to 2020-06-09 (1622 samples)
  Test:  2020-06-10 to 2021-04-28 (323 samples)

Fold 6:
  Train: 2016-01-01 to 2021-04-28 (1945 samples)
  Test:  2021-04-29 to 2022-03-17 (323 samples)

Fold 7:
  Train: 2016-01-01 to 2022-03-17 (2268 samples)
  Test:  2022-03-18 to 2023-02-03 (323 samples)

Fold 8:
  Train: 2016-01-01 to 2023-02-03 (2591 samples)
  Test:  2023-02-04 to 2023-12-23 (323 samples)

Fold 9:
  Train: 2016-01-01 to 2023-12-23 (2914 samples)
  Test:  2023-12-24 to 2024-11-10 (323 samples)

Fold 10:
  T

## 4. Define Models

In [11]:
# Define different types of models with standard parameters
models = {
    # Linear Models
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    
    # Tree-based Models
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, n_jobs=-1, verbosity=0),
    'LightGBM': LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
    
    # Distance-based Models
    'K-Nearest Neighbors': KNeighborsRegressor(),
    
    # Support Vector Machine (tuned for large target values)
    'SVR': SVR(C=1e6, epsilon=1e4, kernel='rbf'),
}

print(f"Total models to evaluate: {len(models)}")
for name in models.keys():
    print(f"  - {name}")


Total models to evaluate: 12
  - Linear Regression
  - Ridge Regression
  - Lasso Regression
  - ElasticNet
  - Decision Tree
  - Random Forest
  - Gradient Boosting
  - AdaBoost
  - XGBoost
  - LightGBM
  - K-Nearest Neighbors
  - SVR


## 5. Train and Test Models

In [12]:
def evaluate_model(model, X, y, tscv):
    """
    Evaluate a model using time series cross-validation.
    Returns average metrics across all folds.
    """
    rmse_scores = []
    mae_scores = []
    r2_scores = []
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        rmse_scores.append(rmse)
        mae_scores.append(mae)
        r2_scores.append(r2)
    
    return {
        'RMSE_mean': np.mean(rmse_scores),
        'RMSE_std': np.std(rmse_scores),
        'MAE_mean': np.mean(mae_scores),
        'MAE_std': np.std(mae_scores),
        'R2_mean': np.mean(r2_scores),
        'R2_std': np.std(r2_scores),
    }


In [13]:
# Train and evaluate all models
results = {}

print("Training and evaluating models...")
print("="*60)

for name, model in models.items():
    print(f"Training: {name}...", end=" ")
    try:
        metrics = evaluate_model(model, X_scaled, y, tscv)
        results[name] = metrics
        print(f"Done! (R² = {metrics['R2_mean']:.4f})")
    except Exception as e:
        print(f"Error: {str(e)}")
        results[name] = {'RMSE_mean': np.nan, 'MAE_mean': np.nan, 'R2_mean': np.nan}

print("\nAll models trained!")


Training and evaluating models...
Training: Linear Regression... Done! (R² = 0.9058)
Training: Ridge Regression... Done! (R² = 0.9049)
Training: Lasso Regression... Done! (R² = 0.9031)
Training: ElasticNet... Done! (R² = 0.8899)
Training: Decision Tree... Done! (R² = 0.9010)
Training: Random Forest... Done! (R² = 0.9451)
Training: Gradient Boosting... Done! (R² = 0.9256)
Training: AdaBoost... Done! (R² = 0.8799)
Training: XGBoost... Done! (R² = 0.9361)
Training: LightGBM... Done! (R² = 0.9429)
Training: K-Nearest Neighbors... Done! (R² = 0.8638)
Training: SVR... Done! (R² = 0.8158)

All models trained!


## 6. Results


In [14]:
# Create results DataFrame
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('R2_mean', ascending=False)

# Format for display
results_display = results_df.copy()
results_display['RMSE'] = results_display.apply(lambda x: f"{x['RMSE_mean']:.2f} ± {x['RMSE_std']:.2f}", axis=1)
results_display['MAE'] = results_display.apply(lambda x: f"{x['MAE_mean']:.2f} ± {x['MAE_std']:.2f}", axis=1)
results_display['R²'] = results_display.apply(lambda x: f"{x['R2_mean']:.4f} ± {x['R2_std']:.4f}", axis=1)

print("\n" + "="*80)
print("MODEL COMPARISON RESULTS (sorted by R² score)")
print("="*80)
print("\nMetrics averaged over 10-fold Time Series Cross-Validation:")
print()
results_display[['RMSE', 'MAE', 'R²']]



MODEL COMPARISON RESULTS (sorted by R² score)

Metrics averaged over 10-fold Time Series Cross-Validation:



Unnamed: 0,RMSE,MAE,R²
Random Forest,152040.00 ± 50329.68,93843.47 ± 18183.40,0.9451 ± 0.0479
LightGBM,160321.47 ± 46285.34,102866.45 ± 19313.56,0.9429 ± 0.0444
XGBoost,169770.89 ± 43109.88,107645.66 ± 19283.99,0.9361 ± 0.0460
Gradient Boosting,181226.98 ± 64470.66,124235.50 ± 46908.46,0.9256 ± 0.0560
Linear Regression,208749.20 ± 46205.16,154654.10 ± 25749.47,0.9058 ± 0.0616
Ridge Regression,209963.50 ± 46083.98,155820.68 ± 26145.88,0.9049 ± 0.0619
Lasso Regression,213767.44 ± 43468.16,159543.09 ± 25122.42,0.9031 ± 0.0596
Decision Tree,209595.57 ± 59306.30,129155.74 ± 28549.11,0.9010 ± 0.0637
ElasticNet,229550.00 ± 40087.90,175387.64 ± 25232.69,0.8899 ± 0.0600
AdaBoost,221981.65 ± 88196.64,176581.08 ± 77602.76,0.8799 ± 0.1002


In [15]:
# Summary statistics
print("\n" + "="*80)
print("SUMMARY")
print("="*80)

best_model = results_df['R2_mean'].idxmax()
best_r2 = results_df.loc[best_model, 'R2_mean']
best_rmse = results_df.loc[best_model, 'RMSE_mean']
best_mae = results_df.loc[best_model, 'MAE_mean']

print(f"\nBest Model: {best_model}")
print(f"   - R² Score: {best_r2:.4f}")
print(f"   - RMSE: {best_rmse:.2f}")
print(f"   - MAE: {best_mae:.2f}")

print(f"\nTarget variable (slp) statistics:")
print(f"   - Mean: {y.mean():.2f}")
print(f"   - Std: {y.std():.2f}")
print(f"   - Min: {y.min():.2f}")
print(f"   - Max: {y.max():.2f}")



SUMMARY

Best Model: Random Forest
   - R² Score: 0.9451
   - RMSE: 152040.00
   - MAE: 93843.47

Target variable (slp) statistics:
   - Mean: 988192.36
   - Std: 752107.47
   - Min: -435171.56
   - Max: 3341410.76


In [16]:
# Visual comparison (text-based bar chart)
print("\n" + "="*80)
print("R² SCORE COMPARISON")
print("="*80 + "\n")

max_bar_length = 50
max_r2 = results_df['R2_mean'].max()

for model_name in results_df.index:
    r2 = results_df.loc[model_name, 'R2_mean']
    if r2 > 0:
        bar_length = int((r2 / max_r2) * max_bar_length)
        bar = '█' * bar_length
    else:
        bar_length = 0
        bar = ''
    print(f"{model_name:25s} | {bar} {r2:.4f}")



R² SCORE COMPARISON

Random Forest             | ██████████████████████████████████████████████████ 0.9451
LightGBM                  | █████████████████████████████████████████████████ 0.9429
XGBoost                   | █████████████████████████████████████████████████ 0.9361
Gradient Boosting         | ████████████████████████████████████████████████ 0.9256
Linear Regression         | ███████████████████████████████████████████████ 0.9058
Ridge Regression          | ███████████████████████████████████████████████ 0.9049
Lasso Regression          | ███████████████████████████████████████████████ 0.9031
Decision Tree             | ███████████████████████████████████████████████ 0.9010
ElasticNet                | ███████████████████████████████████████████████ 0.8899
AdaBoost                  | ██████████████████████████████████████████████ 0.8799
K-Nearest Neighbors       | █████████████████████████████████████████████ 0.8638
SVR                       | ████████████████████████████████

## 7. Optimal Training Timespan Analysis

This section determines the optimal amount of historical data for predicting one year ahead.
We use the last year of data as the test set and vary the training period from 1 year to all available historical data.


In [17]:
# Define the test period: last 1 year of data
test_end_date = df_clean['date'].max()
test_start_date = test_end_date - pd.DateOffset(years=1)

# Create test set mask
test_mask = df_clean['date'] > test_start_date
X_test_final = X_scaled[test_mask]
y_test_final = y[test_mask]

print(f"Test period: {test_start_date.date()} to {test_end_date.date()}")
print(f"Test set size: {len(X_test_final)} samples")

# Available training data (everything before test period)
train_available_mask = df_clean['date'] <= test_start_date
train_start_date = df_clean[train_available_mask]['date'].min()
train_end_date = df_clean[train_available_mask]['date'].max()

print(f"\nAvailable training period: {train_start_date.date()} to {train_end_date.date()}")
total_train_years = (train_end_date - train_start_date).days / 365.25
print(f"Total available training data: {total_train_years:.1f} years ({train_available_mask.sum()} samples)")


Test period: 2024-09-30 to 2025-09-30
Test set size: 364 samples

Available training period: 2016-01-01 to 2024-09-30
Total available training data: 8.7 years (3196 samples)


In [18]:
# Define the top 3 models based on previous results
top_models = {
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
    'Random Forest': RandomForestRegressor(random_state=42),
}

# Calculate max years available for training
max_years = int(total_train_years)
print(f"Testing training periods from 1 to {max_years} years\n")

# Store results for each training period
timespan_results = {model_name: {'years': [], 'rmse': [], 'mae': [], 'r2': []} 
                    for model_name in top_models.keys()}

print("Evaluating models with different training timespans...")
print("=" * 70)

for n_years in range(1, max_years + 1):
    # Define training period: n_years before test_start_date
    train_period_start = test_start_date - pd.DateOffset(years=n_years)
    
    # Create training mask for this period
    train_mask = (df_clean['date'] > train_period_start) & (df_clean['date'] <= test_start_date)
    
    X_train = X_scaled[train_mask]
    y_train = y[train_mask]
    
    print(f"\n{n_years} year(s) of training data: {train_period_start.date()} to {test_start_date.date()} ({len(X_train)} samples)")
    
    for model_name, model in top_models.items():
        # Clone the model to avoid refitting issues
        from sklearn.base import clone
        model_clone = clone(model)
        
        # Train and predict
        model_clone.fit(X_train, y_train)
        y_pred = model_clone.predict(X_test_final)
        
        # Calculate metrics
        rmse = np.sqrt(mean_squared_error(y_test_final, y_pred))
        mae = mean_absolute_error(y_test_final, y_pred)
        r2 = r2_score(y_test_final, y_pred)
        
        # Store results
        timespan_results[model_name]['years'].append(n_years)
        timespan_results[model_name]['rmse'].append(rmse)
        timespan_results[model_name]['mae'].append(mae)
        timespan_results[model_name]['r2'].append(r2)
        
        print(f"  {model_name}: R² = {r2:.4f}, RMSE = {rmse:.2f}")

print("\n" + "=" * 70)
print("Training timespan analysis complete!")


Testing training periods from 1 to 8 years

Evaluating models with different training timespans...

1 year(s) of training data: 2023-09-30 to 2024-09-30 (366 samples)


  Gradient Boosting: R² = 0.9526, RMSE = 149933.52
  LightGBM: R² = 0.9497, RMSE = 154465.58
  Random Forest: R² = 0.9519, RMSE = 151130.17

2 year(s) of training data: 2022-09-30 to 2024-09-30 (731 samples)
  Gradient Boosting: R² = 0.9366, RMSE = 173521.18
  LightGBM: R² = 0.9468, RMSE = 158885.38
  Random Forest: R² = 0.9456, RMSE = 160723.22

3 year(s) of training data: 2021-09-30 to 2024-09-30 (1096 samples)
  Gradient Boosting: R² = 0.9247, RMSE = 189011.42
  LightGBM: R² = 0.9522, RMSE = 150695.53
  Random Forest: R² = 0.9345, RMSE = 176343.35

4 year(s) of training data: 2020-09-30 to 2024-09-30 (1461 samples)
  Gradient Boosting: R² = 0.9005, RMSE = 217300.43
  LightGBM: R² = 0.9574, RMSE = 142238.43
  Random Forest: R² = 0.9288, RMSE = 183796.38

5 year(s) of training data: 2019-09-30 to 2024-09-30 (1827 samples)
  Gradient Boosting: R² = 0.8936, RMSE = 224719.24
  LightGBM: R² = 0.9466, RMSE = 159255.92
  Random Forest: R² = 0.9128, RMSE = 203470.25

6 year(s) of training da

In [19]:
# Create a comprehensive results DataFrame
timespan_df_list = []
for model_name, results in timespan_results.items():
    for i in range(len(results['years'])):
        timespan_df_list.append({
            'Model': model_name,
            'Training Years': results['years'][i],
            'RMSE': results['rmse'][i],
            'MAE': results['mae'][i],
            'R²': results['r2'][i]
        })

timespan_df = pd.DataFrame(timespan_df_list)

# Pivot table for R² scores
r2_pivot = timespan_df.pivot(index='Training Years', columns='Model', values='R²')
rmse_pivot = timespan_df.pivot(index='Training Years', columns='Model', values='RMSE')

print("\n" + "=" * 80)
print("R² SCORES BY TRAINING TIMESPAN")
print("=" * 80)
print(r2_pivot.round(4).to_string())

print("\n" + "=" * 80)
print("RMSE BY TRAINING TIMESPAN")
print("=" * 80)
print(rmse_pivot.round(2).to_string())



R² SCORES BY TRAINING TIMESPAN
Model           Gradient Boosting  LightGBM  Random Forest
Training Years                                            
1                          0.9526    0.9497         0.9519
2                          0.9366    0.9468         0.9456
3                          0.9247    0.9522         0.9345
4                          0.9005    0.9574         0.9288
5                          0.8936    0.9466         0.9128
6                          0.8912    0.9462         0.9139
7                          0.8808    0.9622         0.9130
8                          0.8885    0.9459         0.9154

RMSE BY TRAINING TIMESPAN
Model           Gradient Boosting   LightGBM  Random Forest
Training Years                                             
1                       149933.52  154465.58      151130.17
2                       173521.18  158885.38      160723.22
3                       189011.42  150695.53      176343.35
4                       217300.43  142238.43      1

In [20]:
# Find optimal training timespan for each model
print("\n" + "=" * 80)
print("OPTIMAL TRAINING TIMESPAN ANALYSIS")
print("=" * 80)

optimal_timespans = {}
for model_name in top_models.keys():
    model_data = timespan_df[timespan_df['Model'] == model_name]
    best_idx = model_data['R²'].idxmax()
    best_row = timespan_df.loc[best_idx]
    optimal_timespans[model_name] = {
        'years': int(best_row['Training Years']),
        'r2': best_row['R²'],
        'rmse': best_row['RMSE'],
        'mae': best_row['MAE']
    }
    print(f"\n{model_name}:")
    print(f"  Optimal training period: {optimal_timespans[model_name]['years']} year(s)")
    print(f"  Best R²: {optimal_timespans[model_name]['r2']:.4f}")
    print(f"  RMSE: {optimal_timespans[model_name]['rmse']:.2f}")
    print(f"  MAE: {optimal_timespans[model_name]['mae']:.2f}")

# Calculate average R² for each training timespan across all models
avg_r2_by_years = timespan_df.groupby('Training Years')['R²'].mean()
optimal_years_overall = avg_r2_by_years.idxmax()
optimal_r2_overall = avg_r2_by_years.max()

print("\n" + "-" * 80)
print("OVERALL RECOMMENDATION (averaged across all 3 models)")
print("-" * 80)
print(f"\nOptimal training timespan: {optimal_years_overall} year(s)")
print(f"Average R² score: {optimal_r2_overall:.4f}")



OPTIMAL TRAINING TIMESPAN ANALYSIS

Gradient Boosting:
  Optimal training period: 1 year(s)
  Best R²: 0.9526
  RMSE: 149933.52
  MAE: 93256.77

LightGBM:
  Optimal training period: 7 year(s)
  Best R²: 0.9622
  RMSE: 133944.93
  MAE: 87833.06

Random Forest:
  Optimal training period: 1 year(s)
  Best R²: 0.9519
  RMSE: 151130.17
  MAE: 100635.41

--------------------------------------------------------------------------------
OVERALL RECOMMENDATION (averaged across all 3 models)
--------------------------------------------------------------------------------

Optimal training timespan: 1 year(s)
Average R² score: 0.9514


In [21]:
# Visual comparison of R² scores by training timespan
print("\n" + "=" * 80)
print("R² SCORE BY TRAINING TIMESPAN (Visual)")
print("=" * 80 + "\n")

for model_name in top_models.keys():
    print(f"\n{model_name}:")
    model_data = timespan_df[timespan_df['Model'] == model_name].sort_values('Training Years')
    
    max_r2 = model_data['R²'].max()
    for _, row in model_data.iterrows():
        years = int(row['Training Years'])
        r2 = row['R²']
        bar_length = int((r2 / max_r2) * 40) if r2 > 0 else 0
        bar = '█' * bar_length
        marker = ' ← BEST' if r2 == max_r2 else ''
        print(f"  {years:2d} year(s) | {bar} {r2:.4f}{marker}")

# Average across models
print(f"\nAverage (all models):")
for years in sorted(avg_r2_by_years.index):
    r2 = avg_r2_by_years[years]
    bar_length = int((r2 / avg_r2_by_years.max()) * 40) if r2 > 0 else 0
    bar = '█' * bar_length
    marker = ' ← BEST' if years == optimal_years_overall else ''
    print(f"  {years:2d} year(s) | {bar} {r2:.4f}{marker}")



R² SCORE BY TRAINING TIMESPAN (Visual)


Gradient Boosting:
   1 year(s) | ████████████████████████████████████████ 0.9526 ← BEST
   2 year(s) | ███████████████████████████████████████ 0.9366
   3 year(s) | ██████████████████████████████████████ 0.9247
   4 year(s) | █████████████████████████████████████ 0.9005
   5 year(s) | █████████████████████████████████████ 0.8936
   6 year(s) | █████████████████████████████████████ 0.8912
   7 year(s) | ████████████████████████████████████ 0.8808
   8 year(s) | █████████████████████████████████████ 0.8885

LightGBM:
   1 year(s) | ███████████████████████████████████████ 0.9497
   2 year(s) | ███████████████████████████████████████ 0.9468
   3 year(s) | ███████████████████████████████████████ 0.9522
   4 year(s) | ███████████████████████████████████████ 0.9574
   5 year(s) | ███████████████████████████████████████ 0.9466
   6 year(s) | ███████████████████████████████████████ 0.9462
   7 year(s) | ████████████████████████████████████████ 0.9622 ←

In [22]:
# Final summary
print("\n" + "=" * 80)
print("FINAL SUMMARY: OPTIMAL TRAINING TIMESPAN")
print("=" * 80)

print(f"""
Test Period: {test_start_date.date()} to {test_end_date.date()} ({len(y_test_final)} days)

Results by Model:
""")

for model_name, opt in optimal_timespans.items():
    print(f"  • {model_name}: {opt['years']} year(s) → R² = {opt['r2']:.4f}")

print(f"""
════════════════════════════════════════════════════════════════════════════════
RECOMMENDATION: Use {optimal_years_overall} year(s) of historical data for training
                when predicting 1 year ahead.
                
                This achieves an average R² of {optimal_r2_overall:.4f} across the
                top 3 models (Random Forest, Gradient Boosting, LightGBM).
════════════════════════════════════════════════════════════════════════════════
""")



FINAL SUMMARY: OPTIMAL TRAINING TIMESPAN

Test Period: 2024-09-30 to 2025-09-30 (364 days)

Results by Model:

  • Gradient Boosting: 1 year(s) → R² = 0.9526
  • LightGBM: 7 year(s) → R² = 0.9622
  • Random Forest: 1 year(s) → R² = 0.9519

════════════════════════════════════════════════════════════════════════════════
RECOMMENDATION: Use 1 year(s) of historical data for training
                when predicting 1 year ahead.
                
                This achieves an average R² of 0.9514 across the
                top 3 models (Random Forest, Gradient Boosting, LightGBM).
════════════════════════════════════════════════════════════════════════════════



In [23]:
# Export data with optimal training timespan + test timespan
# Define the optimal training period start date
optimal_train_start = test_start_date - pd.DateOffset(years=optimal_years_overall)

# Create mask for optimal training + test data
export_mask = df_clean['date'] > optimal_train_start

# Create export dataframe with scaled features
export_df_optimal = X_scaled[export_mask].copy()
export_df_optimal['slp'] = y[export_mask].values
export_df_optimal['date'] = df_clean.loc[export_mask, 'date'].values

# Reorder columns to put date first
cols = ['date', 'slp'] + [col for col in export_df_optimal.columns if col not in ['date', 'slp']]
export_df_optimal = export_df_optimal[cols]

# Save to CSV
export_path = 'dataset/data_v3_rf.csv'
export_df_optimal.to_csv(export_path, sep=';', decimal=',', index=False)

print(f"Exported data to '{export_path}'")
print(f"\nData period: {optimal_train_start.date()} to {test_end_date.date()}")
print(f"  - Training period: {optimal_train_start.date()} to {test_start_date.date()} ({optimal_years_overall} years)")
print(f"  - Test period: {test_start_date.date()} to {test_end_date.date()} (1 year)")
print(f"\nTotal samples: {len(export_df_optimal)}")
print(f"Shape: {export_df_optimal.shape}")

Exported data to 'dataset/data_v3_rf.csv'

Data period: 2023-09-30 to 2025-09-30
  - Training period: 2023-09-30 to 2024-09-30 (1 years)
  - Test period: 2024-09-30 to 2025-09-30 (1 year)

Total samples: 730
Shape: (730, 11)
