# Model Selection for SLP Prediction

This notebook performs model selection to predict the `slp` column using various machine learning algorithms with time series cross-validation.


In [341]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')


## 1. Load and Prepare Data


In [342]:
# Load the dataset
df = pd.read_csv('dataset/data_v2_full.csv', sep=';', decimal=',')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

Dataset shape: (3560, 26)

Columns: ['date', 'entry', 'rlm', 'slp', 'day_of_year', 'day_of_week', 'holiday', 'weathercode', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise', 'sunset', 'daylight_duration', 'sunshine_duration', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'winddirection_10m_dominant', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration']


Unnamed: 0,date,entry,rlm,slp,day_of_year,day_of_week,holiday,weathercode,temperature_2m_max,temperature_2m_min,...,daylight_duration,sunshine_duration,rain_sum,snowfall_sum,precipitation_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
0,2016-01-01 00:00:00+00:00,4501250.0,2565526.0,1935724.0,1,4,1,3,4.3,0.1,...,27973.34,17706.46,0.0,0.0,0.0,9.0,18.4,208,2.64,0.21
1,2016-01-02 00:00:00+00:00,5448037.0,3037879.0,2410158.0,2,5,0,3,2.5,-5.3,...,28047.14,2501.06,0.0,0.0,0.0,24.0,48.2,97,1.34,0.5
2,2016-01-03 00:00:00+00:00,6472305.0,3462336.0,3009970.0,3,6,0,3,-5.4,-8.4,...,28127.21,21285.25,0.0,0.0,0.0,21.5,44.3,102,3.38,0.68
3,2016-01-04 00:00:00+00:00,7766598.0,4565146.0,3201452.0,4,0,0,3,-3.5,-7.9,...,28213.34,9701.3,0.0,0.0,0.0,18.9,39.2,97,2.47,0.61
4,2016-01-05 00:00:00+00:00,7842385.0,4672447.0,3169938.0,5,1,0,73,-5.2,-6.4,...,28305.35,0.0,0.0,2.45,14.0,16.2,33.5,84,1.0,0.46


In [343]:
# Remove 'rlm' and/or 'entry' columns if present
to_remove = [col for col in ['rlm', 'entry'] if col in df.columns]
if to_remove:
    df_clean = df.drop(columns=to_remove)
    print(f"Removed columns: {to_remove}")
else:
    df_clean = df.copy()
    print("Columns 'rlm' and 'entry' not found, dataset unchanged.")

print(f"Dataset shape after removing rlm and entry (if present): {df_clean.shape}")
print(f"\nRemaining columns: {df_clean.columns.tolist()}")


Removed columns: ['rlm', 'entry']
Dataset shape after removing rlm and entry (if present): (3560, 24)

Remaining columns: ['date', 'slp', 'day_of_year', 'day_of_week', 'holiday', 'weathercode', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise', 'sunset', 'daylight_duration', 'sunshine_duration', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'winddirection_10m_dominant', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration']


In [344]:
# Parse date and sort by date (important for time series)
df_clean['date'] = pd.to_datetime(df_clean['date'])
df_clean = df_clean.sort_values('date').reset_index(drop=True)


In [345]:
# Separate features and target
X = df_clean.drop(columns=['date', 'slp'])
y = df_clean['slp']

# Define feature types for proper preprocessing
boolean_cols = ['holiday']
categorical_cols = ['weathercode']  # Leave as-is for tree-based models
cyclical_cols = ['day_of_week', 'day_of_year', 'winddirection_10m_dominant']

# All other columns are continuous and should be scaled
continuous_cols = [col for col in X.columns 
                   if col not in boolean_cols + categorical_cols + cyclical_cols]

# Convert continuous columns to numeric (CSV has mixed decimal formats: some use ',' some use '.')
for col in continuous_cols:
    X[col] = pd.to_numeric(X[col], errors='coerce')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nBoolean columns (not scaled): {boolean_cols}")
print(f"Categorical columns (not scaled): {categorical_cols}")
print(f"Cyclical columns (will be sin/cos encoded): {cyclical_cols}")
print(f"Continuous columns (will be scaled): {continuous_cols}")


Features shape: (3560, 22)
Target shape: (3560,)

Boolean columns (not scaled): ['holiday']
Categorical columns (not scaled): ['weathercode']
Cyclical columns (will be sin/cos encoded): ['day_of_week', 'day_of_year', 'winddirection_10m_dominant']
Continuous columns (will be scaled): ['temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise', 'sunset', 'daylight_duration', 'sunshine_duration', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration']


## 2. Feature Preprocessing

- **Boolean features** (holiday): Left unchanged (0/1)
- **Categorical features** (weathercode): Left unchanged (tree-based models handle them well)
- **Cyclical features** (day_of_week, day_of_year, winddirection): Sin/cos encoding to preserve circular nature
- **Continuous features**: StandardScaler normalization

In [346]:
# Apply cyclical encoding for cyclical features
# This preserves the circular nature of these variables (e.g., day 365 is close to day 1)

def cyclical_encode(df, col, max_val):
    """Encode a cyclical feature using sine and cosine transformation."""
    df[f'{col}_sin'] = np.sin(2 * np.pi * df[col] / max_val)
    df[f'{col}_cos'] = np.cos(2 * np.pi * df[col] / max_val)
    return df

# Create a copy and apply cyclical encoding
X_encoded = X.copy()

# Encode day_of_week (0-6, period=7)
X_encoded = cyclical_encode(X_encoded, 'day_of_week', 7)

# Encode day_of_year (1-366, period=366)
X_encoded = cyclical_encode(X_encoded, 'day_of_year', 366)

# Encode wind direction (0-360 degrees, period=360)
X_encoded = cyclical_encode(X_encoded, 'winddirection_10m_dominant', 360)

# Drop original cyclical columns (replaced by sin/cos versions)
X_encoded = X_encoded.drop(columns=cyclical_cols)

print(f"Shape after cyclical encoding: {X_encoded.shape}")
print(f"\nNew cyclical features added:")
for col in cyclical_cols:
    print(f"  {col} -> {col}_sin, {col}_cos")
print(f"\nAll features: {X_encoded.columns.tolist()}")


Shape after cyclical encoding: (3560, 25)

New cyclical features added:
  day_of_week -> day_of_week_sin, day_of_week_cos
  day_of_year -> day_of_year_sin, day_of_year_cos
  winddirection_10m_dominant -> winddirection_10m_dominant_sin, winddirection_10m_dominant_cos

All features: ['holiday', 'weathercode', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise', 'sunset', 'daylight_duration', 'sunshine_duration', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration', 'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin', 'day_of_year_cos', 'winddirection_10m_dominant_sin', 'winddirection_10m_dominant_cos']


In [347]:
# Check feature ranges before scaling (continuous features only)
print("Continuous feature statistics before scaling:")
X_encoded[continuous_cols].describe().T[['min', 'max', 'mean', 'std']]


Continuous feature statistics before scaling:


Unnamed: 0,min,max,mean,std
temperature_2m_max,-6.7,36.1,14.928006,8.401043
temperature_2m_min,-17.5,23.1,7.235309,6.70036
temperature_2m_mean,-10.2,29.1,11.075225,7.463579
apparent_temperature_max,-13.0,36.7,12.627893,9.957706
apparent_temperature_min,-22.8,23.5,4.740225,8.176099
apparent_temperature_mean,-14.7,29.6,8.742556,9.013316
sunrise,9960.0,26400.0,17927.696629,5540.841244
sunset,53760.0,70560.0,62434.162921,5664.918279
daylight_duration,27607.04,60523.94,44506.771744,11157.694989
sunshine_duration,0.0,55333.02,27306.354107,17105.95547


In [348]:
# Apply StandardScaler ONLY to continuous features
scaler = StandardScaler()

# Start with the encoded data
X_scaled = X_encoded.copy()

# Scale only continuous columns
X_scaled[continuous_cols] = scaler.fit_transform(X_encoded[continuous_cols])

print("Feature preprocessing summary:")
print(f"  - Boolean features (unchanged): {boolean_cols}")
print(f"  - Categorical features (unchanged): {categorical_cols}")
print(f"  - Cyclical features (sin/cos encoded): {[f'{c}_sin, {c}_cos' for c in cyclical_cols]}")
print(f"  - Continuous features (standardized): {len(continuous_cols)} columns")
print(f"\nFinal feature matrix shape: {X_scaled.shape}")
print(f"\nContinuous feature statistics after scaling:")
X_scaled[continuous_cols].describe().T[['min', 'max', 'mean', 'std']]


Feature preprocessing summary:
  - Boolean features (unchanged): ['holiday']
  - Categorical features (unchanged): ['weathercode']
  - Cyclical features (sin/cos encoded): ['day_of_week_sin, day_of_week_cos', 'day_of_year_sin, day_of_year_cos', 'winddirection_10m_dominant_sin, winddirection_10m_dominant_cos']
  - Continuous features (standardized): 17 columns

Final feature matrix shape: (3560, 25)

Continuous feature statistics after scaling:


Unnamed: 0,min,max,mean,std
temperature_2m_max,-2.574805,2.520517,6.386901000000001e-17,1.00014
temperature_2m_min,-3.692157,2.36807,1.5967250000000003e-17,1.00014
temperature_2m_mean,-2.85094,2.41537,-3.1934500000000005e-17,1.00014
apparent_temperature_max,-2.574036,2.417775,-6.386901000000001e-17,1.00014
apparent_temperature_min,-3.368855,2.294788,6.786082000000001e-17,1.00014
apparent_temperature_mean,-2.601246,2.314395,-3.1934500000000005e-17,1.00014
sunrise,-1.438196,1.529279,1.91607e-16,1.00014
sunset,-1.531422,1.434615,-5.029685e-16,1.00014
daylight_duration,-1.514838,1.435728,2.794269e-17,1.00014
sunshine_duration,-1.596531,1.638646,-1.437053e-16,1.00014


In [349]:
# Export preprocessed data to CSV
export_df = X_scaled.copy()
export_df['slp'] = y.values
export_df['date'] = df_clean['date'].values

# Reorder columns to put date first
cols = ['date', 'slp'] + [col for col in export_df.columns if col not in ['date', 'slp']]
export_df = export_df[cols]

# Save to dataset folder
export_df.to_csv('dataset/data_v2_step_5.csv', sep=';', decimal=',', index=False)
print(f"Exported preprocessed data to 'dataset/data_v2_step_5.csv'")
print(f"Shape: {export_df.shape}")


Exported preprocessed data to 'dataset/data_v2_step_5.csv'
Shape: (3560, 27)


## 3. Time Series Split

In [350]:
# Fixed train/test split: train on past data, test on newest 1 year
# Test set: last 1 year of data
# Train set: all data before the test period

test_end = df_clean['date'].max()
test_start = test_end - pd.DateOffset(years=1)

# Create train/test masks
train_mask_main = df_clean['date'] <= test_start
test_mask_main = df_clean['date'] > test_start

# Get indices
train_idx_main = df_clean[train_mask_main].index.tolist()
test_idx_main = df_clean[test_mask_main].index.tolist()

# Create train/test sets
X_train_main = X_scaled.iloc[train_idx_main]
X_test_main = X_scaled.iloc[test_idx_main]
y_train_main = y.iloc[train_idx_main]
y_test_main = y.iloc[test_idx_main]

print("Time Series Train/Test Split:")
print("=" * 60)
print(f"\nTrain set (past data):")
print(f"  Period: {df_clean.iloc[train_idx_main]['date'].min().date()} to {df_clean.iloc[train_idx_main]['date'].max().date()}")
print(f"  Samples: {len(train_idx_main)}")

print(f"\nTest set (newest 1 year):")
print(f"  Period: {df_clean.iloc[test_idx_main]['date'].min().date()} to {df_clean.iloc[test_idx_main]['date'].max().date()}")
print(f"  Samples: {len(test_idx_main)}")


Time Series Train/Test Split:

Train set (past data):
  Period: 2016-01-01 to 2024-09-30
  Samples: 3196

Test set (newest 1 year):
  Period: 2024-10-01 to 2025-09-30
  Samples: 364


## 4. Define Models

In [351]:
# Define different types of models with standard parameters
models = {
    # Linear Models
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    
    # Tree-based Models
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, n_jobs=-1, verbosity=0),
    'LightGBM': LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
    
    # Distance-based Models
    'K-Nearest Neighbors': KNeighborsRegressor(),
    
    # Support Vector Machine (tuned for large target values)
    'SVR': SVR(C=1e6, epsilon=1e4, kernel='rbf'),
}

print(f"Total models to evaluate: {len(models)}")
for name in models.keys():
    print(f"  - {name}")


Total models to evaluate: 12
  - Linear Regression
  - Ridge Regression
  - Lasso Regression
  - ElasticNet
  - Decision Tree
  - Random Forest
  - Gradient Boosting
  - AdaBoost
  - XGBoost
  - LightGBM
  - K-Nearest Neighbors
  - SVR


## 5. Train and Test Models

In [352]:
def evaluate_model_single_split(model, X_train, X_test, y_train, y_test):
    """
    Evaluate a model using a single train/test split.
    Train on past data, test on newest data.
    """
    from sklearn.base import clone
    model_clone = clone(model)
    
    # Train the model
    model_clone.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model_clone.predict(X_test)
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
    }


In [353]:
# Train and evaluate all models using single train/test split
results = {}

print("Training and evaluating models...")
print("=" * 60)
print(f"Train: {len(X_train_main)} samples | Test: {len(X_test_main)} samples (newest 1 year)")
print("=" * 60)

for name, model in models.items():
    print(f"Training: {name}...", end=" ")
    try:
        metrics = evaluate_model_single_split(model, X_train_main, X_test_main, y_train_main, y_test_main)
        results[name] = metrics
        print(f"Done! (R² = {metrics['R2']:.4f})")
    except Exception as e:
        print(f"Error: {str(e)}")
        results[name] = {'RMSE': np.nan, 'MAE': np.nan, 'R2': np.nan}

print("\nAll models trained!")


Training and evaluating models...
Train: 3196 samples | Test: 364 samples (newest 1 year)
Training: Linear Regression... Done! (R² = 0.9529)
Training: Ridge Regression... Done! (R² = 0.9541)
Training: Lasso Regression... Done! (R² = 0.9516)
Training: ElasticNet... Done! (R² = 0.9179)
Training: Decision Tree... Done! (R² = 0.8393)
Training: Random Forest... Done! (R² = 0.9658)
Training: Gradient Boosting... Done! (R² = 0.9726)
Training: AdaBoost... Done! (R² = 0.9365)
Training: XGBoost... Done! (R² = 0.9532)
Training: LightGBM... Done! (R² = 0.9686)
Training: K-Nearest Neighbors... Done! (R² = 0.9492)
Training: SVR... Done! (R² = 0.9466)

All models trained!


## 6. Results


In [354]:
# Create results DataFrame
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('R2', ascending=False)

# Format for display
results_display = results_df.copy()
results_display['RMSE_fmt'] = results_display['RMSE'].apply(lambda x: f"{x:.2f}")
results_display['MAE_fmt'] = results_display['MAE'].apply(lambda x: f"{x:.2f}")
results_display['R²_fmt'] = results_display['R2'].apply(lambda x: f"{x:.4f}")

print("\n" + "=" * 80)
print("MODEL COMPARISON RESULTS (sorted by R² score)")
print("=" * 80)
print(f"\nTrain/Test Split: Train on past data, Test on newest 1 year")
print()
results_display[['RMSE_fmt', 'MAE_fmt', 'R²_fmt']].rename(columns={'RMSE_fmt': 'RMSE', 'MAE_fmt': 'MAE', 'R²_fmt': 'R²'})



MODEL COMPARISON RESULTS (sorted by R² score)

Train/Test Split: Train on past data, Test on newest 1 year



Unnamed: 0,RMSE,MAE,R²
Gradient Boosting,113948.63,84081.41,0.9726
LightGBM,122178.16,86431.19,0.9686
Random Forest,127367.3,87380.37,0.9658
Ridge Regression,147543.92,118234.78,0.9541
XGBoost,149041.34,97857.87,0.9532
Linear Regression,149468.96,118886.27,0.9529
Lasso Regression,151636.33,122385.1,0.9516
K-Nearest Neighbors,155227.98,112004.32,0.9492
SVR,159204.14,128672.9,0.9466
AdaBoost,173621.09,142553.2,0.9365


In [355]:
# Summary statistics
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)

best_model = results_df['R2'].idxmax()
best_r2 = results_df.loc[best_model, 'R2']
best_rmse = results_df.loc[best_model, 'RMSE']
best_mae = results_df.loc[best_model, 'MAE']

print(f"\nBest Model: {best_model}")
print(f"   - R² Score: {best_r2:.4f}")
print(f"   - RMSE: {best_rmse:.2f}")
print(f"   - MAE: {best_mae:.2f}")

print(f"\nTarget variable (slp) statistics:")
print(f"   - Mean: {y.mean():.2f}")
print(f"   - Std: {y.std():.2f}")
print(f"   - Min: {y.min():.2f}")
print(f"   - Max: {y.max():.2f}")



SUMMARY

Best Model: Gradient Boosting
   - R² Score: 0.9726
   - RMSE: 113948.63
   - MAE: 84081.41

Target variable (slp) statistics:
   - Mean: 988192.36
   - Std: 752107.47
   - Min: -435171.56
   - Max: 3341410.76


In [356]:
# Visual comparison (text-based bar chart)
print("\n" + "=" * 80)
print("R² SCORE COMPARISON")
print("=" * 80 + "\n")

max_bar_length = 50
max_r2 = results_df['R2'].max()

for model_name in results_df.index:
    r2 = results_df.loc[model_name, 'R2']
    if r2 > 0:
        bar_length = int((r2 / max_r2) * max_bar_length)
        bar = '█' * bar_length
    else:
        bar_length = 0
        bar = ''
    print(f"{model_name:25s} | {bar} {r2:.4f}")



R² SCORE COMPARISON

Gradient Boosting         | ██████████████████████████████████████████████████ 0.9726
LightGBM                  | █████████████████████████████████████████████████ 0.9686
Random Forest             | █████████████████████████████████████████████████ 0.9658
Ridge Regression          | █████████████████████████████████████████████████ 0.9541
XGBoost                   | █████████████████████████████████████████████████ 0.9532
Linear Regression         | ████████████████████████████████████████████████ 0.9529
Lasso Regression          | ████████████████████████████████████████████████ 0.9516
K-Nearest Neighbors       | ████████████████████████████████████████████████ 0.9492
SVR                       | ████████████████████████████████████████████████ 0.9466
AdaBoost                  | ████████████████████████████████████████████████ 0.9365
ElasticNet                | ███████████████████████████████████████████████ 0.9179
Decision Tree             | █████████████████████

## 7. Optimal Training Timespan Analysis

This section determines the optimal amount of historical data for predicting one year ahead.
We use the last year of data as the test set and vary the training period from 1 year to all available historical data.


In [357]:
# Define the test period: last 1 year of data
test_end_date = df_clean['date'].max()
test_start_date = test_end_date - pd.DateOffset(years=1)

# Create test set mask
test_mask = df_clean['date'] > test_start_date
X_test_final = X_scaled[test_mask]
y_test_final = y[test_mask]

print(f"Test period: {test_start_date.date()} to {test_end_date.date()}")
print(f"Test set size: {len(X_test_final)} samples")

# Available training data (everything before test period)
train_available_mask = df_clean['date'] <= test_start_date
train_start_date = df_clean[train_available_mask]['date'].min()
train_end_date = df_clean[train_available_mask]['date'].max()

print(f"\nAvailable training period: {train_start_date.date()} to {train_end_date.date()}")
total_train_years = (train_end_date - train_start_date).days / 365.25
print(f"Total available training data: {total_train_years:.1f} years ({train_available_mask.sum()} samples)")


Test period: 2024-09-30 to 2025-09-30
Test set size: 364 samples

Available training period: 2016-01-01 to 2024-09-30
Total available training data: 8.7 years (3196 samples)


In [358]:
# Define the top 3 models based on previous results
top_models = {
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
    'Random Forest': RandomForestRegressor(random_state=42),
}

# Calculate max years available for training
max_years = int(total_train_years)
print(f"Testing training periods from 1 to {max_years} years\n")

# Store results for each training period
timespan_results = {model_name: {'years': [], 'rmse': [], 'mae': [], 'r2': []} 
                    for model_name in top_models.keys()}

print("Evaluating models with different training timespans...")
print("=" * 70)

for n_years in range(1, max_years + 1):
    # Define training period: n_years before test_start_date
    train_period_start = test_start_date - pd.DateOffset(years=n_years)
    
    # Create training mask for this period
    train_mask = (df_clean['date'] > train_period_start) & (df_clean['date'] <= test_start_date)
    
    X_train = X_scaled[train_mask]
    y_train = y[train_mask]
    
    print(f"\n{n_years} year(s) of training data: {train_period_start.date()} to {test_start_date.date()} ({len(X_train)} samples)")
    
    for model_name, model in top_models.items():
        # Clone the model to avoid refitting issues
        from sklearn.base import clone
        model_clone = clone(model)
        
        # Train and predict
        model_clone.fit(X_train, y_train)
        y_pred = model_clone.predict(X_test_final)
        
        # Calculate metrics
        rmse = np.sqrt(mean_squared_error(y_test_final, y_pred))
        mae = mean_absolute_error(y_test_final, y_pred)
        r2 = r2_score(y_test_final, y_pred)
        
        # Store results
        timespan_results[model_name]['years'].append(n_years)
        timespan_results[model_name]['rmse'].append(rmse)
        timespan_results[model_name]['mae'].append(mae)
        timespan_results[model_name]['r2'].append(r2)
        
        print(f"  {model_name}: R² = {r2:.4f}, RMSE = {rmse:.2f}")

print("\n" + "=" * 70)
print("Training timespan analysis complete!")


Testing training periods from 1 to 8 years

Evaluating models with different training timespans...

1 year(s) of training data: 2023-09-30 to 2024-09-30 (366 samples)
  Gradient Boosting: R² = 0.9528, RMSE = 149755.45
  LightGBM: R² = 0.9498, RMSE = 154429.61
  Random Forest: R² = 0.9642, RMSE = 130379.30

2 year(s) of training data: 2022-09-30 to 2024-09-30 (731 samples)
  Gradient Boosting: R² = 0.9741, RMSE = 110844.72
  LightGBM: R² = 0.9754, RMSE = 108016.84
  Random Forest: R² = 0.9746, RMSE = 109777.40

3 year(s) of training data: 2021-09-30 to 2024-09-30 (1096 samples)
  Gradient Boosting: R² = 0.9766, RMSE = 105484.76
  LightGBM: R² = 0.9738, RMSE = 111470.16
  Random Forest: R² = 0.9745, RMSE = 110079.05

4 year(s) of training data: 2020-09-30 to 2024-09-30 (1461 samples)
  Gradient Boosting: R² = 0.9746, RMSE = 109892.19
  LightGBM: R² = 0.9700, RMSE = 119244.25
  Random Forest: R² = 0.9677, RMSE = 123747.13

5 year(s) of training data: 2019-09-30 to 2024-09-30 (1827 samples

In [359]:
# Create a comprehensive results DataFrame
timespan_df_list = []
for model_name, results in timespan_results.items():
    for i in range(len(results['years'])):
        timespan_df_list.append({
            'Model': model_name,
            'Training Years': results['years'][i],
            'RMSE': results['rmse'][i],
            'MAE': results['mae'][i],
            'R²': results['r2'][i]
        })

timespan_df = pd.DataFrame(timespan_df_list)

# Pivot table for R² scores
r2_pivot = timespan_df.pivot(index='Training Years', columns='Model', values='R²')
rmse_pivot = timespan_df.pivot(index='Training Years', columns='Model', values='RMSE')

print("\n" + "=" * 80)
print("R² SCORES BY TRAINING TIMESPAN")
print("=" * 80)
print(r2_pivot.round(4).to_string())

print("\n" + "=" * 80)
print("RMSE BY TRAINING TIMESPAN")
print("=" * 80)
print(rmse_pivot.round(2).to_string())



R² SCORES BY TRAINING TIMESPAN
Model           Gradient Boosting  LightGBM  Random Forest
Training Years                                            
1                          0.9528    0.9498         0.9642
2                          0.9741    0.9754         0.9746
3                          0.9766    0.9738         0.9745
4                          0.9746    0.9700         0.9677
5                          0.9691    0.9603         0.9566
6                          0.9723    0.9653         0.9641
7                          0.9732    0.9652         0.9690
8                          0.9724    0.9639         0.9660

RMSE BY TRAINING TIMESPAN
Model           Gradient Boosting   LightGBM  Random Forest
Training Years                                             
1                       149755.45  154429.61      130379.30
2                       110844.72  108016.84      109777.40
3                       105484.76  111470.16      110079.05
4                       109892.19  119244.25      1

In [360]:
# Find optimal training timespan for each model
print("\n" + "=" * 80)
print("OPTIMAL TRAINING TIMESPAN ANALYSIS")
print("=" * 80)

optimal_timespans = {}
for model_name in top_models.keys():
    model_data = timespan_df[timespan_df['Model'] == model_name]
    best_idx = model_data['R²'].idxmax()
    best_row = timespan_df.loc[best_idx]
    optimal_timespans[model_name] = {
        'years': int(best_row['Training Years']),
        'r2': best_row['R²'],
        'rmse': best_row['RMSE'],
        'mae': best_row['MAE']
    }
    print(f"\n{model_name}:")
    print(f"  Optimal training period: {optimal_timespans[model_name]['years']} year(s)")
    print(f"  Best R²: {optimal_timespans[model_name]['r2']:.4f}")
    print(f"  RMSE: {optimal_timespans[model_name]['rmse']:.2f}")
    print(f"  MAE: {optimal_timespans[model_name]['mae']:.2f}")

# Calculate average R² for each training timespan across all models
avg_r2_by_years = timespan_df.groupby('Training Years')['R²'].mean()
optimal_years_overall = avg_r2_by_years.idxmax()
optimal_r2_overall = avg_r2_by_years.max()

print("\n" + "-" * 80)
print("OVERALL RECOMMENDATION (averaged across all 3 models)")
print("-" * 80)
print(f"\nOptimal training timespan: {optimal_years_overall} year(s)")
print(f"Average R² score: {optimal_r2_overall:.4f}")



OPTIMAL TRAINING TIMESPAN ANALYSIS

Gradient Boosting:
  Optimal training period: 3 year(s)
  Best R²: 0.9766
  RMSE: 105484.76
  MAE: 71462.74

LightGBM:
  Optimal training period: 2 year(s)
  Best R²: 0.9754
  RMSE: 108016.84
  MAE: 72961.57

Random Forest:
  Optimal training period: 2 year(s)
  Best R²: 0.9746
  RMSE: 109777.40
  MAE: 73192.08

--------------------------------------------------------------------------------
OVERALL RECOMMENDATION (averaged across all 3 models)
--------------------------------------------------------------------------------

Optimal training timespan: 3 year(s)
Average R² score: 0.9749


In [361]:
# Visual comparison of R² scores by training timespan
print("\n" + "=" * 80)
print("R² SCORE BY TRAINING TIMESPAN (Visual)")
print("=" * 80 + "\n")

for model_name in top_models.keys():
    print(f"\n{model_name}:")
    model_data = timespan_df[timespan_df['Model'] == model_name].sort_values('Training Years')
    
    max_r2 = model_data['R²'].max()
    for _, row in model_data.iterrows():
        years = int(row['Training Years'])
        r2 = row['R²']
        bar_length = int((r2 / max_r2) * 40) if r2 > 0 else 0
        bar = '█' * bar_length
        marker = ' ← BEST' if r2 == max_r2 else ''
        print(f"  {years:2d} year(s) | {bar} {r2:.4f}{marker}")

# Average across models
print(f"\nAverage (all models):")
for years in sorted(avg_r2_by_years.index):
    r2 = avg_r2_by_years[years]
    bar_length = int((r2 / avg_r2_by_years.max()) * 40) if r2 > 0 else 0
    bar = '█' * bar_length
    marker = ' ← BEST' if years == optimal_years_overall else ''
    print(f"  {years:2d} year(s) | {bar} {r2:.4f}{marker}")



R² SCORE BY TRAINING TIMESPAN (Visual)


Gradient Boosting:
   1 year(s) | ███████████████████████████████████████ 0.9528
   2 year(s) | ███████████████████████████████████████ 0.9741
   3 year(s) | ████████████████████████████████████████ 0.9766 ← BEST
   4 year(s) | ███████████████████████████████████████ 0.9746
   5 year(s) | ███████████████████████████████████████ 0.9691
   6 year(s) | ███████████████████████████████████████ 0.9723
   7 year(s) | ███████████████████████████████████████ 0.9732
   8 year(s) | ███████████████████████████████████████ 0.9724

LightGBM:
   1 year(s) | ██████████████████████████████████████ 0.9498
   2 year(s) | ████████████████████████████████████████ 0.9754 ← BEST
   3 year(s) | ███████████████████████████████████████ 0.9738
   4 year(s) | ███████████████████████████████████████ 0.9700
   5 year(s) | ███████████████████████████████████████ 0.9603
   6 year(s) | ███████████████████████████████████████ 0.9653
   7 year(s) | ██████████████████████████████

In [362]:
# Final summary
print("\n" + "=" * 80)
print("FINAL SUMMARY: OPTIMAL TRAINING TIMESPAN")
print("=" * 80)

print(f"""
Test Period: {test_start_date.date()} to {test_end_date.date()} ({len(y_test_final)} days)

Results by Model:
""")

for model_name, opt in optimal_timespans.items():
    print(f"  • {model_name}: {opt['years']} year(s) → R² = {opt['r2']:.4f}")

print(f"""
════════════════════════════════════════════════════════════════════════════════
RECOMMENDATION: Use {optimal_years_overall} year(s) of historical data for training
                when predicting 1 year ahead.
                
                This achieves an average R² of {optimal_r2_overall:.4f} across the
                top 3 models (Random Forest, Gradient Boosting, LightGBM).
════════════════════════════════════════════════════════════════════════════════
""")



FINAL SUMMARY: OPTIMAL TRAINING TIMESPAN

Test Period: 2024-09-30 to 2025-09-30 (364 days)

Results by Model:

  • Gradient Boosting: 3 year(s) → R² = 0.9766
  • LightGBM: 2 year(s) → R² = 0.9754
  • Random Forest: 2 year(s) → R² = 0.9746

════════════════════════════════════════════════════════════════════════════════
RECOMMENDATION: Use 3 year(s) of historical data for training
                when predicting 1 year ahead.
                
                This achieves an average R² of 0.9749 across the
                top 3 models (Random Forest, Gradient Boosting, LightGBM).
════════════════════════════════════════════════════════════════════════════════



In [363]:
# Export data with optimal training timespan + test timespan
# Define the optimal training period start date
optimal_train_start = test_start_date - pd.DateOffset(years=optimal_years_overall)

# Create mask for optimal training + test data
export_mask = df_clean['date'] > optimal_train_start

# Create export dataframe with scaled features
export_df_optimal = X_scaled[export_mask].copy()
export_df_optimal['slp'] = y[export_mask].values
export_df_optimal['date'] = df_clean.loc[export_mask, 'date'].values

# Reorder columns to put date first
cols = ['date', 'slp'] + [col for col in export_df_optimal.columns if col not in ['date', 'slp']]
export_df_optimal = export_df_optimal[cols]

# Save to CSV
export_path = 'dataset/data_v2_step_5_and_4.csv'
export_df_optimal.to_csv(export_path, sep=';', decimal=',', index=False)

print(f"Exported data to '{export_path}'")
print(f"\nData period: {optimal_train_start.date()} to {test_end_date.date()}")
print(f"  - Training period: {optimal_train_start.date()} to {test_start_date.date()} ({optimal_years_overall} years)")
print(f"  - Test period: {test_start_date.date()} to {test_end_date.date()} (1 year)")
print(f"\nTotal samples: {len(export_df_optimal)}")
print(f"Shape: {export_df_optimal.shape}")

Exported data to 'dataset/data_v2_step_5_and_4.csv'

Data period: 2021-09-30 to 2025-09-30
  - Training period: 2021-09-30 to 2024-09-30 (3 years)
  - Test period: 2024-09-30 to 2025-09-30 (1 year)

Total samples: 1460
Shape: (1460, 27)


## 8. Model Selection on Optimal Timespan Dataset

Now we repeat the model selection process (Steps 4-6) using only the optimal training timespan + test data exported to `data_v2_step_5_and_4.csv`.


In [364]:
# Load the optimal timespan dataset
df_optimal = pd.read_csv('dataset/data_v2_step_5_and_4.csv', sep=';', decimal=',')

print(f"Optimal timespan dataset shape: {df_optimal.shape}")
print(f"\nColumns: {df_optimal.columns.tolist()}")

# Parse date and sort
df_optimal['date'] = pd.to_datetime(df_optimal['date'])
df_optimal = df_optimal.sort_values('date').reset_index(drop=True)

# Separate features and target
X_opt = df_optimal.drop(columns=['date', 'slp'])
y_opt = df_optimal['slp']

print(f"\nFeatures shape: {X_opt.shape}")
print(f"Target shape: {y_opt.shape}")
print(f"\nDate range: {df_optimal['date'].min().date()} to {df_optimal['date'].max().date()}")


Optimal timespan dataset shape: (1460, 27)

Columns: ['date', 'slp', 'holiday', 'weathercode', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise', 'sunset', 'daylight_duration', 'sunshine_duration', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration', 'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin', 'day_of_year_cos', 'winddirection_10m_dominant_sin', 'winddirection_10m_dominant_cos']

Features shape: (1460, 25)
Target shape: (1460,)

Date range: 2021-10-01 to 2025-09-30


In [365]:
# Fixed train/test split for optimal dataset: train on past data, test on newest 1 year
test_end_opt = df_optimal['date'].max()
test_start_opt = test_end_opt - pd.DateOffset(years=1)

# Create train/test masks
train_mask_opt = df_optimal['date'] <= test_start_opt
test_mask_opt = df_optimal['date'] > test_start_opt

# Get indices
train_idx_opt = df_optimal[train_mask_opt].index.tolist()
test_idx_opt = df_optimal[test_mask_opt].index.tolist()

# Create train/test sets
X_train_opt = X_opt.iloc[train_idx_opt]
X_test_opt = X_opt.iloc[test_idx_opt]
y_train_opt = y_opt.iloc[train_idx_opt]
y_test_opt = y_opt.iloc[test_idx_opt]

print("Time Series Train/Test Split (Optimal Dataset):")
print("=" * 60)
print(f"\nTrain set (past data):")
print(f"  Period: {df_optimal.iloc[train_idx_opt]['date'].min().date()} to {df_optimal.iloc[train_idx_opt]['date'].max().date()}")
print(f"  Samples: {len(train_idx_opt)}")

print(f"\nTest set (newest 1 year):")
print(f"  Period: {df_optimal.iloc[test_idx_opt]['date'].min().date()} to {df_optimal.iloc[test_idx_opt]['date'].max().date()}")
print(f"  Samples: {len(test_idx_opt)}")


Time Series Train/Test Split (Optimal Dataset):

Train set (past data):
  Period: 2021-10-01 to 2024-09-30
  Samples: 1096

Test set (newest 1 year):
  Period: 2024-10-01 to 2025-09-30
  Samples: 364


### 8.1 Define Models (on Optimal Dataset)


In [366]:
# Define the same models for comparison on optimal dataset
models_opt = {
    # Linear Models
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    
    # Tree-based Models
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, n_jobs=-1, verbosity=0),
    'LightGBM': LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
    
    # Distance-based Models
    'K-Nearest Neighbors': KNeighborsRegressor(),
    
    # Support Vector Machine
    'SVR': SVR(C=1e6, epsilon=1e4, kernel='rbf'),
}

print(f"Total models to evaluate: {len(models_opt)}")
for name in models_opt.keys():
    print(f"  - {name}")


Total models to evaluate: 12
  - Linear Regression
  - Ridge Regression
  - Lasso Regression
  - ElasticNet
  - Decision Tree
  - Random Forest
  - Gradient Boosting
  - AdaBoost
  - XGBoost
  - LightGBM
  - K-Nearest Neighbors
  - SVR


### 8.2 Train and Test Models (on Optimal Dataset)


In [367]:
# Train and evaluate all models on optimal dataset using single train/test split
results_opt = {}

print("Training and evaluating models on optimal timespan dataset...")
print("=" * 60)
print(f"Train: {len(X_train_opt)} samples | Test: {len(X_test_opt)} samples (newest 1 year)")
print("=" * 60)

for name, model in models_opt.items():
    print(f"Training: {name}...", end=" ")
    try:
        metrics = evaluate_model_single_split(model, X_train_opt, X_test_opt, y_train_opt, y_test_opt)
        results_opt[name] = metrics
        print(f"Done! (R² = {metrics['R2']:.4f})")
    except Exception as e:
        print(f"Error: {str(e)}")
        results_opt[name] = {'RMSE': np.nan, 'MAE': np.nan, 'R2': np.nan}

print("\nAll models trained!")


Training and evaluating models on optimal timespan dataset...
Train: 1096 samples | Test: 364 samples (newest 1 year)
Training: Linear Regression... Done! (R² = 0.9717)
Training: Ridge Regression... Done! (R² = 0.9731)
Training: Lasso Regression... Done! (R² = 0.9699)
Training: ElasticNet... Done! (R² = 0.9320)
Training: Decision Tree... Done! (R² = 0.9448)
Training: Random Forest... Done! (R² = 0.9745)
Training: Gradient Boosting... Done! (R² = 0.9766)
Training: AdaBoost... Done! (R² = 0.9397)
Training: XGBoost... Done! (R² = 0.9715)
Training: LightGBM... Done! (R² = 0.9741)
Training: K-Nearest Neighbors... Done! (R² = 0.9546)
Training: SVR... Done! (R² = 0.9497)

All models trained!


### 8.3 Results (on Optimal Dataset)


In [368]:
# Create results DataFrame for optimal dataset
results_opt_df = pd.DataFrame(results_opt).T
results_opt_df = results_opt_df.sort_values('R2', ascending=False)

# Format for display
results_opt_display = results_opt_df.copy()
results_opt_display['RMSE_fmt'] = results_opt_display['RMSE'].apply(lambda x: f"{x:.2f}")
results_opt_display['MAE_fmt'] = results_opt_display['MAE'].apply(lambda x: f"{x:.2f}")
results_opt_display['R²_fmt'] = results_opt_display['R2'].apply(lambda x: f"{x:.4f}")

print("\n" + "=" * 80)
print("MODEL COMPARISON RESULTS - OPTIMAL TIMESPAN DATASET (sorted by R² score)")
print("=" * 80)
print(f"\nDataset: {len(X_opt)} samples ({optimal_years_overall} years training + 1 year test)")
print(f"Train/Test Split: Train on past data, Test on newest 1 year")
print()
results_opt_display[['RMSE_fmt', 'MAE_fmt', 'R²_fmt']].rename(columns={'RMSE_fmt': 'RMSE', 'MAE_fmt': 'MAE', 'R²_fmt': 'R²'})



MODEL COMPARISON RESULTS - OPTIMAL TIMESPAN DATASET (sorted by R² score)

Dataset: 1460 samples (3 years training + 1 year test)
Train/Test Split: Train on past data, Test on newest 1 year



Unnamed: 0,RMSE,MAE,R²
Gradient Boosting,105484.76,71462.74,0.9766
Random Forest,110079.05,75278.12,0.9745
LightGBM,110803.48,74066.48,0.9741
Ridge Regression,112924.49,88289.2,0.9731
Linear Regression,115904.58,88654.02,0.9717
XGBoost,116334.73,77569.96,0.9715
Lasso Regression,119610.81,94925.42,0.9699
K-Nearest Neighbors,146833.02,98389.01,0.9546
SVR,154547.69,124787.12,0.9497
Decision Tree,161897.56,110685.31,0.9448


In [369]:
# Summary statistics for optimal dataset
print("\n" + "=" * 80)
print("SUMMARY - OPTIMAL TIMESPAN DATASET")
print("=" * 80)

best_model_opt = results_opt_df['R2'].idxmax()
best_r2_opt = results_opt_df.loc[best_model_opt, 'R2']
best_rmse_opt = results_opt_df.loc[best_model_opt, 'RMSE']
best_mae_opt = results_opt_df.loc[best_model_opt, 'MAE']

print(f"\nBest Model: {best_model_opt}")
print(f"   - R² Score: {best_r2_opt:.4f}")
print(f"   - RMSE: {best_rmse_opt:.2f}")
print(f"   - MAE: {best_mae_opt:.2f}")

print(f"\nTarget variable (slp) statistics:")
print(f"   - Mean: {y_opt.mean():.2f}")
print(f"   - Std: {y_opt.std():.2f}")
print(f"   - Min: {y_opt.min():.2f}")
print(f"   - Max: {y_opt.max():.2f}")



SUMMARY - OPTIMAL TIMESPAN DATASET

Best Model: Gradient Boosting
   - R² Score: 0.9766
   - RMSE: 105484.76
   - MAE: 71462.74

Target variable (slp) statistics:
   - Mean: 913692.63
   - Std: 705517.94
   - Min: -435171.56
   - Max: 2846941.16


In [370]:
# Visual comparison for optimal dataset
print("\n" + "=" * 80)
print("R² SCORE COMPARISON - OPTIMAL TIMESPAN DATASET")
print("=" * 80 + "\n")

max_bar_length = 50
max_r2_opt = results_opt_df['R2'].max()

for model_name in results_opt_df.index:
    r2 = results_opt_df.loc[model_name, 'R2']
    if r2 > 0:
        bar_length = int((r2 / max_r2_opt) * max_bar_length)
        bar = '█' * bar_length
    else:
        bar_length = 0
        bar = ''
    print(f"{model_name:25s} | {bar} {r2:.4f}")



R² SCORE COMPARISON - OPTIMAL TIMESPAN DATASET

Gradient Boosting         | ██████████████████████████████████████████████████ 0.9766
Random Forest             | █████████████████████████████████████████████████ 0.9745
LightGBM                  | █████████████████████████████████████████████████ 0.9741
Ridge Regression          | █████████████████████████████████████████████████ 0.9731
Linear Regression         | █████████████████████████████████████████████████ 0.9717
XGBoost                   | █████████████████████████████████████████████████ 0.9715
Lasso Regression          | █████████████████████████████████████████████████ 0.9699
K-Nearest Neighbors       | ████████████████████████████████████████████████ 0.9546
SVR                       | ████████████████████████████████████████████████ 0.9497
Decision Tree             | ████████████████████████████████████████████████ 0.9448
AdaBoost                  | ████████████████████████████████████████████████ 0.9397
ElasticNet         

In [371]:
# Compare results: Full dataset vs Optimal timespan dataset
print("\n" + "=" * 80)
print("COMPARISON: FULL DATASET vs OPTIMAL TIMESPAN DATASET")
print("=" * 80)

comparison_data = []
for model_name in results_df.index:
    if model_name in results_opt_df.index:
        r2_full = results_df.loc[model_name, 'R2']
        r2_opt = results_opt_df.loc[model_name, 'R2']
        diff = r2_opt - r2_full
        comparison_data.append({
            'Model': model_name,
            'R² (Full Data)': r2_full,
            'R² (Optimal)': r2_opt,
            'Difference': diff,
            'Change': f"{'+' if diff > 0 else ''}{diff:.4f}"
        })

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('R² (Optimal)', ascending=False)

print(f"\nFull dataset: Train {len(X_train_main)} samples, Test {len(X_test_main)} samples")
print(f"Optimal timespan: Train {len(X_train_opt)} samples, Test {len(X_test_opt)} samples")
print()
print(comparison_df.to_string(index=False))

# Summary
improved = (comparison_df['Difference'] > 0).sum()
total = len(comparison_df)
print(f"\n{improved}/{total} models improved with optimal timespan dataset")
print(f"Average R² change: {comparison_df['Difference'].mean():+.4f}")



COMPARISON: FULL DATASET vs OPTIMAL TIMESPAN DATASET

Full dataset: Train 3196 samples, Test 364 samples
Optimal timespan: Train 1096 samples, Test 364 samples

              Model  R² (Full Data)  R² (Optimal)  Difference  Change
  Gradient Boosting        0.972644      0.976557    0.003913 +0.0039
      Random Forest        0.965822      0.974471    0.008649 +0.0086
           LightGBM        0.968550      0.974133    0.005583 +0.0056
   Ridge Regression        0.954136      0.973134    0.018998 +0.0190
  Linear Regression        0.952931      0.971697    0.018766 +0.0188
            XGBoost        0.953200      0.971487    0.018286 +0.0183
   Lasso Regression        0.951556      0.969858    0.018302 +0.0183
K-Nearest Neighbors        0.949234      0.954577    0.005343 +0.0053
                SVR        0.946600      0.949678    0.003078 +0.0031
      Decision Tree        0.839291      0.944778    0.105487 +0.1055
           AdaBoost        0.936491      0.939651    0.003160 +0.003

In [372]:
# Final conclusion
print("\n" + "=" * 80)
print("FINAL CONCLUSION")
print("=" * 80)

print(f"""
Dataset Comparison:
  • Full dataset: {len(X_scaled)} samples (~{total_train_years + 1:.1f} years)
  • Optimal dataset: {len(X_opt)} samples ({optimal_years_overall + 1} years)

Best Performing Models:
  • Full dataset: {best_model} (R² = {best_r2:.4f})
  • Optimal dataset: {best_model_opt} (R² = {best_r2_opt:.4f})

════════════════════════════════════════════════════════════════════════════════
RECOMMENDATION:
  Using {optimal_years_overall} years of training data + 1 year test data achieves
  comparable or better results with significantly less data.
  
  Best model for prediction: {best_model_opt}
  Expected R² score: {best_r2_opt:.4f}
════════════════════════════════════════════════════════════════════════════════
""")



FINAL CONCLUSION

Dataset Comparison:
  • Full dataset: 3560 samples (~9.7 years)
  • Optimal dataset: 1460 samples (4 years)

Best Performing Models:
  • Full dataset: Gradient Boosting (R² = 0.9726)
  • Optimal dataset: Gradient Boosting (R² = 0.9766)

════════════════════════════════════════════════════════════════════════════════
RECOMMENDATION:
  Using 3 years of training data + 1 year test data achieves
  comparable or better results with significantly less data.
  
  Best model for prediction: Gradient Boosting
  Expected R² score: 0.9766
════════════════════════════════════════════════════════════════════════════════

