In [124]:
import re
import itertools
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import seaborn as sns
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from pycaret.regression import *
from sklearn.linear_model import LinearRegression

In [125]:
# Show up to 200 rows and 200 columns when a DataFrame is displayed,
# so that wide or long frames are not truncated in the notebook output.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

### Time features

First, we made 3 sets of time features so that the models capture different important aspects of the data.

In [126]:
# Time features nr 1
def add_time_features(df, time_column):
    """Add integer calendar features derived from a timestamp column.

    The frame is modified in place: ``time_column`` is converted with
    ``pd.to_datetime`` and the columns ``hour``, ``day_of_week``, ``month``,
    ``day_of_year``, ``week_of_year`` (ISO week) and ``year`` are added,
    in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``time_column``.
    time_column : str
        Name of the column holding the timestamps.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    df[time_column] = pd.to_datetime(df[time_column])
    stamps = df[time_column].dt

    # Features that map one-to-one onto a datetime accessor attribute.
    direct_features = (
        ('hour', 'hour'),
        ('day_of_week', 'dayofweek'),
        ('month', 'month'),
        ('day_of_year', 'dayofyear'),
    )
    for feature_name, accessor_attr in direct_features:
        df[feature_name] = getattr(stamps, accessor_attr)

    # The ISO week number comes from isocalendar(), not a plain attribute.
    df['week_of_year'] = stamps.isocalendar().week
    df['year'] = stamps.year

    return df

In [127]:
# Time features nr 2
def add_time_features_cat(df, time_column):
    """Add sine-encoded hour and month features to ``df`` in place.

    ``time_column`` is converted with ``pd.to_datetime``; then
    ``sin_hour = sin(pi * hour / 23)`` and ``sin_month = sin(pi * month / 12)``
    are added as new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``time_column``.
    time_column : str
        Name of the column holding the timestamps.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``sin_hour`` and ``sin_month`` added.
    """
    df[time_column] = pd.to_datetime(df[time_column])
    clock = df[time_column].dt

    # Half-period sine: the argument runs from 0 to pi over the value range.
    hour_angle = np.pi * clock.hour/23.
    month_angle = np.pi * clock.month/12.

    df['sin_hour'] = np.sin(hour_angle)
    df['sin_month'] = np.sin(month_angle)

    return df

In [128]:
def is_est(observed, estimated, test):
    """Tag weather frames as observed/estimated and add forecast-timing columns.

    All three frames are modified in place. Each receives the columns
    ``time_dummy``, ``time_delta`` and ``is_estimated`` (in that order):

    * ``time_dummy``  - hours since midnight of ``date_forecast`` for
      ``estimated`` and ``test``; constant 0 for ``observed``.
    * ``time_delta``  - ``date_calc - date_forecast`` in hours for
      ``estimated`` and ``test``; constant 0 for ``observed``.
    * ``is_estimated`` - 1 for ``estimated`` and ``test``, 0 for ``observed``.

    Parameters
    ----------
    observed : pandas.DataFrame
        Needs a ``date_forecast`` column.
    estimated, test : pandas.DataFrame
        Need datetime ``date_forecast`` and ``date_calc`` columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``observed`` and ``estimated`` stacked and sorted by
        ``date_forecast``, followed by the (mutated) ``test`` frame.
    """
    def _in_hours(delta):
        # Convert a timedelta Series to a float number of hours.
        return delta.dt.total_seconds() / 3600

    def _since_midnight(frame):
        stamps = frame['date_forecast']
        return stamps - stamps.dt.normalize()

    def _calc_lead(frame):
        return frame['date_calc'] - frame['date_forecast']

    # 1. Hour-of-day of the forecast timestamp (0 for observed data).
    estimated['time_dummy'] = _in_hours(_since_midnight(estimated))
    observed['time_dummy'] = 0
    test['time_dummy'] = _in_hours(_since_midnight(test))

    # 2. Offset between forecast calculation time and forecast target time
    #    (0 for observed data, which is not forecast ahead).
    estimated['time_delta'] = _in_hours(_calc_lead(estimated))
    observed['time_delta'] = 0
    test['time_delta'] = _in_hours(_calc_lead(test))

    # 3. Indicator for estimated (forecast) rows.
    estimated['is_estimated'] = 1
    observed['is_estimated'] = 0
    test['is_estimated'] = 1

    # Stack the two training sources chronologically.
    stacked = pd.concat([observed, estimated], axis=0)
    return stacked.sort_values(by='date_forecast'), test

In [129]:
def delete_stationary(df, max_constant_run=2):
    """Remove stretches where ``pv_measurement`` does not change.

    A row is flagged "constant" when its ``pv_measurement`` equals the
    previous row's value (the first row is always flagged, because its
    missing difference is filled with 0). Consecutive rows with the same
    flag form a block; every block that holds more than
    ``max_constant_run`` flagged rows is dropped. The first row of a
    repeated-value stretch is not flagged (its difference to the previous
    row is non-zero), so it is kept.

    Unlike the earlier version, the input frame is not modified: the
    intermediate series are kept as locals instead of being written to
    ``df`` as ``diff`` / ``constant`` / ``block`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``time`` and ``pv_measurement``.
    max_constant_run : int, default 2
        Largest number of consecutive flagged rows that is still kept.

    Returns
    -------
    filtered_data : pandas.DataFrame
        Rows of ``df`` outside the removed blocks, original index kept.
    targets_ny : pandas.DataFrame
        The ``time`` and ``pv_measurement`` columns of ``filtered_data``.
    """
    measurement = df['pv_measurement']

    # 1 where the value equals the previous row's value, else 0.
    is_constant = (measurement.diff().fillna(0) == 0).astype(int)

    # A new block starts wherever the constant-flag changes; the cumulative
    # sum of those change points gives every row a block id.
    block_id = (is_constant.diff() != 0).astype(int).cumsum().rename('block')

    # Number of flagged rows per block (0 for blocks of changing values).
    block_sizes = is_constant.groupby(block_id).sum()
    constant_blocks = block_sizes[block_sizes > max_constant_run].index

    filtered_data = df[~block_id.isin(constant_blocks)]
    targets_ny = filtered_data[['time', 'pv_measurement']]

    return filtered_data, targets_ny

In [130]:
# Preprocessing for LightGBM + CatBoost
def preprocessing(targets, observed, estimated, test):
    """Build hourly training and test feature frames for one location.

    Steps, in order:

    1. Parse ``targets['time']`` and the ``date_forecast`` columns of the
       three weather frames to datetimes (written back into the passed
       frames, so the caller's objects are modified).
    2. Resample each weather frame to 1-hour bins using the mean, dropping
       bins where every column is NaN. ``date_calc`` is re-attached to the
       estimated and test frames as the first value within each hour.
    3. Keep the test set's hourly ``is_day:idx`` aside, then drop
       ``is_day:idx``, ``snow_density:kgm3`` and ``elevation:m`` everywhere.
    4. ``is_est`` adds ``time_dummy``, ``time_delta`` and ``is_estimated``
       and stacks observed + estimated.
    5. Inner-merge the targets with the weather data on
       ``time == date_forecast``.
    6. ``add_time_features`` adds calendar features to both sets.
    7. ``delete_stationary`` removes stretches of constant ``pv_measurement``.

    Parameters
    ----------
    targets : pandas.DataFrame
        Needs ``time`` and ``pv_measurement`` columns.
    observed, estimated, test : pandas.DataFrame
        Weather frames with a ``date_forecast`` column; ``estimated`` and
        ``test`` also need ``date_calc``.

    Returns
    -------
    filtered_data : pandas.DataFrame
        Training features. ``date_forecast`` is still present here; the
        callers in this notebook drop it themselves.
    test_resampled : pandas.DataFrame
        Test features without ``date_forecast`` / ``date_calc``.
    is_day_feature : pandas.DataFrame
        ``date_forecast`` and hourly-mean ``is_day:idx`` of the test set.
    targets_ny : pandas.DataFrame
        ``time`` and ``pv_measurement`` for the rows kept in ``filtered_data``.
    """
    
    # Ensure the datetime columns are in datetime format
    targets['time'] = pd.to_datetime(targets['time'])
    observed['date_forecast'] = pd.to_datetime(observed['date_forecast'])
    estimated['date_forecast'] = pd.to_datetime(estimated['date_forecast'])
    test['date_forecast'] = pd.to_datetime(test['date_forecast'])

    # Hourly `date_calc` (first value in each hour), re-attached after the mean-resampling below.
    # Note: despite the `_ob` suffix, this frame is built from `estimated`, not `observed`.
    date_calc_resampled_ob = estimated.set_index('date_forecast')['date_calc'].resample('1H').first().to_frame()
    date_calc_resampled_te = test.set_index('date_forecast')['date_calc'].resample('1H').first().to_frame()
    
    # Aggregate to hourly means; hours with no data at all are dropped
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    
    # Attach the hourly `date_calc` by matching `date_forecast` against the resampled index
    estimated_resampled = estimated_resampled.merge(date_calc_resampled_ob, left_on='date_forecast', right_index=True)
    test_resampled = test_resampled.merge(date_calc_resampled_te, left_on='date_forecast', right_index=True)
    
    # Save the is_day feature as this says a lot about when the power output is zero or not
    is_day_feature = test_resampled[['date_forecast', 'is_day:idx']]
    
    # Drop some features that are treated as noise
    test_resampled = test_resampled.drop(columns =['is_day:idx', 'snow_density:kgm3','elevation:m'])
    observed_resampled = observed_resampled.drop(columns =[ 'is_day:idx', 'snow_density:kgm3','elevation:m'])
    estimated_resampled = estimated_resampled.drop(columns =[ 'is_day:idx', 'snow_density:kgm3','elevation:m'])

    
    # Add time_dummy / time_delta / is_estimated and stack observed + estimated
    weather_data, test_resampled = is_est(observed_resampled, estimated_resampled, test_resampled)
    
    # Merge with target values (only timestamps present in both are kept)
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    # Add the time-based features
    merged_data = add_time_features(merged_data, 'time')  
    test_resampled = add_time_features(test_resampled, 'date_forecast') 
    
    # Remove stretches where the measured output does not change
    filtered_data, targets_ny = delete_stationary(merged_data)
    
    # Drop timestamp and target columns (`date_forecast` is intentionally left in the training frame)
    filtered_data = filtered_data.drop(columns=['time', 'pv_measurement','date_calc'])
    test_resampled = test_resampled.drop(columns=[ 'date_forecast', 'date_calc'])
    
    return filtered_data, test_resampled, is_day_feature, targets_ny

### Preprocessing for catboost models

In [131]:
def is_est_cat(observed, estimated, test):
    """Add an ``is_estimated`` indicator and stack observed + estimated data.

    ``is_estimated`` is 0 for ``observed`` rows and 1 for ``estimated`` and
    ``test`` rows.

    The inputs are not modified: the indicator is added with
    ``DataFrame.assign``, which returns new frames. ``preprocessing_cat``
    passes boolean-mask slices here, and writing a column into such a slice
    raises pandas' SettingWithCopyWarning.

    Parameters
    ----------
    observed, estimated : pandas.DataFrame
        Training weather frames; both need a ``date_forecast`` column.
    test : pandas.DataFrame
        Test weather frame.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``observed`` and ``estimated`` stacked and sorted by
        ``date_forecast``, and a copy of ``test`` with ``is_estimated`` added.
    """
    # Add indicator variable for estimated data (on copies, not on the inputs)
    estimated = estimated.assign(is_estimated=1)
    observed = observed.assign(is_estimated=0)
    test = test.assign(is_estimated=1)

    # Concatenate the two training sources in chronological order
    df = pd.concat([observed, estimated], axis=0).sort_values(by='date_forecast')

    return df, test

In [132]:
# Preprocessing nr 2 for catboost models
def preprocessing_cat(targets, observed, estimated, test):
    """Build hourly, summer-only training features and test features for CatBoost.

    Same resampling and column handling as ``preprocessing``, with these
    differences visible in the code below:

    * training rows (observed, estimated and targets) are restricted to the
      months April-August; the test frame is not filtered;
    * ``is_est_cat`` only adds ``is_estimated`` (no ``time_dummy`` /
      ``time_delta``);
    * ``add_time_features_cat`` adds ``sin_hour`` / ``sin_month`` instead of
      the integer calendar features;
    * ``date_forecast`` is dropped from the returned training frame too.

    The datetime conversion in the first four statements writes back into
    the passed frames, so the caller's objects are modified.

    Parameters
    ----------
    targets : pandas.DataFrame
        Needs ``time`` and ``pv_measurement`` columns.
    observed, estimated, test : pandas.DataFrame
        Weather frames with a ``date_forecast`` column; ``estimated`` and
        ``test`` also need ``date_calc``.

    Returns
    -------
    filtered_data : pandas.DataFrame
        Training features (no timestamp or target columns).
    test_resampled : pandas.DataFrame
        Test features without ``date_forecast`` / ``date_calc``.
    is_day_feature : pandas.DataFrame
        ``date_forecast`` and hourly-mean ``is_day:idx`` of the test set.
    targets_ny : pandas.DataFrame
        ``time`` and ``pv_measurement`` for the rows kept in ``filtered_data``.
    """
    
    # Ensure the datetime columns are in datetime format
    targets['time'] = pd.to_datetime(targets['time'])
    observed['date_forecast'] = pd.to_datetime(observed['date_forecast'])
    estimated['date_forecast'] = pd.to_datetime(estimated['date_forecast'])
    test['date_forecast'] = pd.to_datetime(test['date_forecast'])

    # Start the resampling to 1-hour bins.
    # `date_calc` is taken as the first value in each hour and re-attached after the mean-resampling.
    # Note: despite the `_ob` suffix, this frame is built from `estimated`, not `observed`.
    date_calc_resampled_ob = estimated.set_index('date_forecast')['date_calc'].resample('1H').first().to_frame()
    date_calc_resampled_te = test.set_index('date_forecast')['date_calc'].resample('1H').first().to_frame()
    
    # Aggregate to hourly means; hours with no data at all are dropped
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    
    # Attach the hourly `date_calc` by matching `date_forecast` against the resampled index
    estimated_resampled = estimated_resampled.merge(date_calc_resampled_ob, left_on='date_forecast', right_index=True)
    test_resampled = test_resampled.merge(date_calc_resampled_te, left_on='date_forecast', right_index=True)
    
    # Save the is_day feature as this says a lot about when the power output is zero or not
    is_day_feature = test_resampled[['date_forecast', 'is_day:idx']]
    
    # Drop some features that are treated as noise
    test_resampled = test_resampled.drop(columns =['is_day:idx', 'snow_density:kgm3','elevation:m'])
    observed_resampled = observed_resampled.drop(columns =[ 'is_day:idx', 'snow_density:kgm3','elevation:m'])
    estimated_resampled = estimated_resampled.drop(columns =[ 'is_day:idx', 'snow_density:kgm3','elevation:m'])

    # Filter observed and estimated data (and the targets) for April to August
    observed_resampled = observed_resampled[observed_resampled['date_forecast'].dt.month.isin([4, 5, 6, 7, 8])]
    estimated_resampled = estimated_resampled[estimated_resampled['date_forecast'].dt.month.isin([4, 5, 6, 7, 8])]
    targets = targets[targets['time'].dt.month.isin([4, 5, 6, 7, 8])]

    # Add the is_estimated indicator and stack the observed and estimated data
    weather_data, test_resampled = is_est_cat(observed_resampled, estimated_resampled, test_resampled)

    # Merge with target values (only timestamps present in both are kept), then add sine time features
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')
    merged_data = add_time_features_cat(merged_data, 'time')  
    test_resampled = add_time_features_cat(test_resampled, 'date_forecast')

    # Remove stretches where the measured output does not change
    filtered_data, targets_ny = delete_stationary(merged_data)
    
    # Drop timestamp and target columns
    filtered_data = filtered_data.drop(columns=['time', 'date_forecast', 'pv_measurement','date_calc'])
    test_resampled = test_resampled.drop(columns=[ 'date_forecast','date_calc'])
    
    return filtered_data, test_resampled, is_day_feature, targets_ny

### LightGBM with some extra features

In [133]:
# LightGBM with some extra features
def process_location_ex(X, y, location_name,seeds):
    """Train, tune, bag and save a PyCaret LightGBM model with two extra features.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; must contain the radiation, cloud-cover, humidity
        and air-density columns used below. It is copied, not modified.
    y : pandas.DataFrame
        Must contain ``pv_measurement``; it is assigned to the copy of ``X``
        as ``target``.
    location_name : str
        Used in the experiment name and in the saved model's file name.
    seeds : int
        Passed to PyCaret ``setup`` as ``session_id``.

    Returns
    -------
    The finalized bagged LightGBM pipeline returned by ``finalize_model``.
    """
    # Work on a copy so the caller's feature frame is left untouched.
    training_frame = X.copy()
    training_frame['target'] = y['pv_measurement']

    # The extra features are added here (rather than in preprocessing) so the
    # same preprocessing function can be reused by the other models.

    # Feature combination 1: direct and diffuse radiation weighted by cloud cover.
    cloud_fraction = training_frame['total_cloud_cover:p']/100
    direct_part = training_frame['direct_rad:W'] * (1 - cloud_fraction)
    diffuse_part = training_frame['diffuse_rad:W'] * cloud_fraction
    training_frame['weighted_rad'] = direct_part + diffuse_part

    # Feature combination 2: clear-sky radiation adjusted by humidity and air density.
    humidity_factor = np.exp(-0.0001 * training_frame['absolute_humidity_2m:gm3'])
    density_factor = 1 - 0.1 * (training_frame['air_density_2m:kgm3'] - 1.225)
    training_frame['adjusted_clear_sky_rad'] = (
        training_frame['clear_sky_rad:W'] * humidity_factor * density_factor
    )

    # Set up the PyCaret experiment. remove_outliers is deliberately not
    # enabled (results got worse with it).
    setup_options = dict(
        data=training_frame,
        target='target',
        session_id=seeds,
        imputation_type="iterative",
        categorical_iterative_imputer="lightgbm",
        numeric_iterative_imputer="lightgbm",
        iterative_imputation_iters=5,
        categorical_features=['dew_or_rime:idx', 'is_in_shadow:idx','is_estimated'],
        html=False,
        experiment_name=f'exp_{location_name}',
    )
    setup(**setup_options)

    # Base model -> tuned -> bagged ensemble -> refit on the whole dataset.
    base_model = create_model('lightgbm')
    tuned_model = tune_model(base_model)
    bagged_model = ensemble_model(tuned_model, method='Bagging')
    final_model = finalize_model(bagged_model)

    # Save the model for future use.
    # NOTE(review): process_location_cat_2 saves under the same file name, so
    # whichever model is trained last for a location overwrites the other.
    save_model(final_model, f'final_model_for_location_{location_name}')

    return final_model

### Catboost model nr 2

In [134]:
# Catboost model nr 2
def process_location_cat_2(X, y, location_name,seeds):
    """Train, tune, bag and save a PyCaret CatBoost model on a reduced feature set.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; must contain every column listed in
        ``dropped_columns`` below. It is not modified.
    y : pandas.DataFrame
        Must contain ``pv_measurement``; it is assigned to the reduced
        feature frame as ``target``.
    location_name : str
        Used in the experiment name and in the saved model's file name.
    seeds : int
        Passed to PyCaret ``setup`` as ``session_id``.

    Returns
    -------
    The finalized bagged CatBoost pipeline returned by ``finalize_model``.
    """
    # Snow-, rime- and a few water/wind-related columns are left out for this model.
    dropped_columns = [
        'dew_or_rime:idx',
        'fresh_snow_3h:cm', 'fresh_snow_1h:cm', 'snow_drift:idx',
        'snow_depth:cm', 'wind_speed_w_1000hPa:ms', 'prob_rime:p',
        'fresh_snow_6h:cm', 'snow_melt_10min:mm',
        'fresh_snow_12h:cm', 'rain_water:kgm2',
        'super_cooled_liquid_water:kgm2',
    ]

    training_frame = X.drop(columns=dropped_columns).copy()
    training_frame['target'] = y['pv_measurement']

    # Set up the PyCaret experiment. No explicit categorical features are
    # declared here, and remove_outliers is deliberately not enabled
    # (results got worse with it).
    setup_options = dict(
        data=training_frame,
        target='target',
        session_id=seeds,
        imputation_type="iterative",
        categorical_iterative_imputer="lightgbm",
        numeric_iterative_imputer="lightgbm",
        iterative_imputation_iters=5,
        html=False,
        experiment_name=f'exp_{location_name}',
    )
    setup(**setup_options)

    # Base model -> tuned -> bagged ensemble -> refit on the whole dataset.
    base_model = create_model('catboost')
    tuned_model = tune_model(base_model)
    bagged_model = ensemble_model(tuned_model, method='Bagging')
    final_model = finalize_model(bagged_model)

    # Save the model for future use.
    # NOTE(review): process_location_ex saves under the same file name, so
    # whichever model is trained last for a location overwrites the other.
    save_model(final_model, f'final_model_for_location_{location_name}')

    return final_model

#### Initializing lists and dataframes for storing predictions

In [135]:
# Locations to train on and predict for. Each name is also the directory
# that the training cells read that location's parquet files from.
locations = ['A']


# Training and predictions

### LightGBM

In [136]:
# LightGBM training and predictions.
# For every location: load the data, preprocess it, train the PyCaret
# LightGBM model once per seed, average the per-seed predictions and
# post-process them (night masking + clipping at zero).
all_predictions_lGBM_e = []
for loc in locations:

    # Load the data for this location; missing target values are replaced with 0
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')
    
    # Calling preprocessing
    X_train_1, X_test_1, is_day_feature_1, targets_1 = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)
    
    # preprocessing() leaves date_forecast in the training frame; it is not a model feature
    X_train_1 = X_train_1.drop(columns=['date_forecast'])
    # Adding the extra features to the test set as well; the formulas must
    # stay identical to the ones process_location_ex applies to the training data
    X_test_1['weighted_rad'] = ((X_test_1['direct_rad:W'] * (1 - X_test_1['total_cloud_cover:p']/100)) +
                        (X_test_1['diffuse_rad:W'] * (X_test_1['total_cloud_cover:p']/100)))

    X_test_1['adjusted_clear_sky_rad'] = (X_test_1['clear_sky_rad:W'] *
                                  np.exp(-0.0001 * X_test_1['absolute_humidity_2m:gm3']) *
                                  (1 - 0.1 * (X_test_1['air_density_2m:kgm3'] - 1.225)))
    
    # Training and prediction for different seeds; predictions are summed
    # here and divided by the number of seeds below
    total_predictions_light = None
    seeds = [42]
    for seed in seeds: 
        final_model_lGBM_e = process_location_ex(X_train_1, targets_1, loc, seed)
        predictions_lGBM_e = predict_model(final_model_lGBM_e, data=X_test_1)
        final_predictions_lGBM_e = predictions_lGBM_e['prediction_label']
        # The accumulator is created lazily because its shape is only known after the first prediction
        if total_predictions_light is None:
            total_predictions_light = np.zeros_like(final_predictions_lGBM_e)
        total_predictions_light += final_predictions_lGBM_e

    mean_pred_light = total_predictions_light/len(seeds)

    # Multiplying the predictions with the hourly-averaged is_day value, so predictions at night become zero
    adjusted_final_predictions_lGBM_e = mean_pred_light * is_day_feature_1['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_lGBM_e = np.clip(adjusted_final_predictions_lGBM_e, 0, None)

    # Appending predictions for each location to the final list
    # (wrapped in a one-element list; the combining cell flattens the array later)
    all_predictions_lGBM_e.append([adjusted_final_predictions_lGBM_e])

# Changing final list to array
all_predictions_lGBM_e = np.array(all_predictions_lGBM_e)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8668
[LightGBM] [Info] Number of data points in the train set: 12714, number of used features: 50
[LightGBM] [Info] Start training from score 1799.064605
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001541 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8631
[LightGBM] [Info] Number of data points in the train set: 10988, number of used features: 50
[LightGBM] [Info] Start training from score 3135.630038
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002440 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8668
[LightGBM] [Info] Number of data points in the tra



### Catboost model nr 1

In [None]:
all_predictions_cat = []
# Catboost model nr 1: a plain CatBoostRegressor (MAE loss) trained on the
# April-August data produced by preprocessing_cat.
for loc in locations:
    
    # Load the data for this location; missing target values are replaced with 0
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    # Calling preprocessing
    X_train_cat, X_test_cat, is_day_feature1, targets_cat = preprocessing_cat(train, X_train_observed, X_train_estimated, X_test_estimated)

    # Making categorical features: the columns are cast to int before being
    # passed to CatBoost as cat_features.
    # NOTE(review): these columns are hourly means, so fractional values are
    # truncated by the cast, and astype(int) would fail on NaN - confirm the
    # data has no missing values in these two columns.
    cat_features = ['dew_or_rime:idx' ,'is_in_shadow:idx']
    X_train_cat['dew_or_rime:idx'] = X_train_cat['dew_or_rime:idx'].astype(int)
    X_train_cat['is_in_shadow:idx'] = X_train_cat['is_in_shadow:idx'].astype(int)
    X_test_cat['dew_or_rime:idx'] = X_test_cat['dew_or_rime:idx'].astype(int)
    X_test_cat['is_in_shadow:idx'] = X_test_cat['is_in_shadow:idx'].astype(int)

    # Model definition
    model_cat = CatBoostRegressor(
        loss_function='MAE', 
        learning_rate=0.1, 
        verbose=200,
        cat_features=cat_features,
        random_state=42) 
        # Disabled options: n_estimators=20000, early_stopping_rounds=50

    # A train/validation split (test_size=0.2, random_state=42) with an
    # eval_set was tried here and is currently disabled.
    
    # Training on the full training set
    model_cat.fit(X_train_cat,targets_cat['pv_measurement'])

    # Prediction
    predictions_cat = model_cat.predict(X_test_cat)
    # Feature importances are computed for inspection only; nothing below uses them
    feature_importances = model_cat.get_feature_importance()
    # Multiplying the predictions with the hourly-averaged is_day value, so predictions at night become zero
    adjusted_final_predictions_cat = predictions_cat * is_day_feature1['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_cat = np.clip(adjusted_final_predictions_cat, 0, None)

    # Appending predictions for each location to the final list
    all_predictions_cat.append(adjusted_final_predictions_cat)

# Changing final list to array
all_predictions_cat = np.array(all_predictions_cat)

The dataset starts from 2019-06-02 22:00:00 and ends at 2023-04-30 23:00:00
0:	learn: 1098.0450846	total: 170ms	remaining: 2m 50s
200:	learn: 364.8966719	total: 10s	remaining: 39.9s
400:	learn: 328.6032161	total: 18.8s	remaining: 28.1s
600:	learn: 306.8253447	total: 27.7s	remaining: 18.4s
800:	learn: 291.9455358	total: 36.4s	remaining: 9.04s
999:	learn: 280.1029739	total: 45.7s	remaining: 0us
The dataset starts from 2018-12-31 23:00:00 and ends at 2023-04-30 23:00:00
0:	learn: 218.7008398	total: 38.8ms	remaining: 38.8s
200:	learn: 62.6422414	total: 8.56s	remaining: 34s
400:	learn: 55.2251326	total: 17s	remaining: 25.4s
600:	learn: 50.7912516	total: 25.3s	remaining: 16.8s
800:	learn: 47.4006980	total: 33.4s	remaining: 8.31s
999:	learn: 44.7965259	total: 41.6s	remaining: 0us
The dataset starts from 2018-12-31 23:00:00 and ends at 2023-04-30 23:00:00
0:	learn: 180.1584679	total: 32.3ms	remaining: 32.2s
200:	learn: 52.7492691	total: 8.3s	remaining: 33s
400:	learn: 45.5206549	total: 16.6s	r

### Catboost model nr 2

In [None]:
all_predictions_cat_2 = []
# Catboost model nr 2: PyCaret CatBoost (tuned + bagged) on a reduced
# feature set, averaged over one or more seeds.
for loc in locations:
    
    # Load the data for this location; missing target values are replaced with 0
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    # Calling preprocessing
    X_train, X_test, is_day_feature, targets = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)

    # All months are used. A filter that kept only March-October was tried
    # here and is currently disabled:
    #   X_train[X_train['date_forecast'].dt.month.isin([3, 4, 5, 6, 7, 8, 9, 10])]
    X_train_cat_2 = X_train

    # Dropping the date feature (preprocessing() leaves it in the training frame)
    X_train_cat_2 = X_train_cat_2.drop(columns=['date_forecast'])

    # Training and prediction for different seeds; predictions are summed
    # here and divided by the number of seeds below
    seeds = [42]
    total_predictions_cat_2 = None
    for seed in seeds:
        final_model_cat_2 = process_location_cat_2(X_train_cat_2, targets, loc, seed)
        predictions_cat_2 = predict_model(final_model_cat_2, X_test)
        final_predictions_cat_2 = predictions_cat_2['prediction_label']
        # The accumulator is created lazily because its shape is only known after the first prediction
        if total_predictions_cat_2 is None:
            total_predictions_cat_2 = np.zeros_like(final_predictions_cat_2)
        # Accumulate for EVERY seed. This line used to sit inside the `if`
        # above, so only the first seed was summed while the total was still
        # divided by len(seeds).
        total_predictions_cat_2 += final_predictions_cat_2

    mean_pred_cat_2 = total_predictions_cat_2/len(seeds)

    # Multiplying with the hourly-averaged is_day value sets night predictions to zero;
    # negative predictions are then clipped to zero
    adjusted_final_predictions_cat_2 = mean_pred_cat_2 * is_day_feature['is_day:idx']
    adjusted_final_predictions_cat_2 = np.clip(adjusted_final_predictions_cat_2, 0, None)

    # Wrapped in a one-element list; the combining cell flattens the array later
    all_predictions_cat_2.append([adjusted_final_predictions_cat_2])

# Changing final list to array
all_predictions_cat_2 = np.array(all_predictions_cat_2)

The dataset starts from 2019-06-02 22:00:00 and ends at 2023-04-30 23:00:00
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002576 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7013
[LightGBM] [Info] Number of data points in the train set: 11699, number of used features: 38
[LightGBM] [Info] Start training from score 1772.496152
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6993
[LightGBM] [Info] Number of data points in the train set: 10159, number of used features: 38
[LightGBM] [Info] Start training from score 3107.896390
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001929 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7013
[LightGBM] [Info] Number of data point

                                                         

           MAE          MSE      RMSE      R2   RMSLE    MAPE
Fold                                                         
0     331.5347  280088.3061  529.2337  0.8515  0.9573  4.4094
1     329.8633  282572.0687  531.5751  0.8523  0.9035  2.8594
2     330.1709  281579.4527  530.6406  0.8581  0.9094  3.5294
3     340.2220  302901.1310  550.3645  0.8468  0.9635  4.2181
4     311.7154  258253.4291  508.1864  0.8589  0.9887  4.1276
5     332.0868  275836.8078  525.2017  0.8565  0.9528  3.9372
6     327.8728  288634.4182  537.2471  0.8338  0.9640  3.7622
7     343.9738  296294.5591  544.3295  0.8475  0.9642  4.1479
8     297.9808  225688.0367  475.0663  0.8745  0.9624  4.6959
9     317.6170  273433.8230  522.9090  0.8555  0.9261  3.9950
Mean  326.3038  276528.2032  525.4754  0.8535  0.9492  3.9682
Std    12.9897   20582.1803   20.0952  0.0099  0.0258  0.4807


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                          

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
           MAE          MSE      RMSE      R2   RMSLE    MAPE
Fold                                                         
0     373.7750  347800.1610  589.7458  0.8155  0.9998  5.4426
1     376.3020  357862.4528  598.2161  0.8129  0.9240  3.4955
2     372.8907  353602.6585  594.6450  0.8219  0.9237  2.8008
3     369.0966  350865.7319  592.3392  0.8226  0.9733  4.1112
4     350.0916  314306.3659  560.6303  0.8283  1.0124  4.6743
5     383.1707  355459.8671  596.2046  0.8151  0.9840  4.2884
6     374.6368  349193.2898  590.9258  0.7989  1.0071  5.1602
7     394.0195  374185.4020  611.7070  0.8074  0.9794  3.5532
8     327.8279  282676.6929  531.6735  0.8428  0.9712  4.4658
9     362.1430  330657.0625  575.0279  0.8252  0.9519  4.1905
Mean  368.3954  341660.9684  584.1115  0.8191  0.9727  4.2183
Std    17.4524   24861.6143   21.7879  0.0115

                                                         

           MAE          MSE      RMSE      R2   RMSLE    MAPE
Fold                                                         
0     330.8393  280379.3867  529.5086  0.8513  0.9278  3.9378
1     331.4055  288341.1455  536.9741  0.8492  0.8880  2.8825
2     329.2066  290338.0350  538.8302  0.8537  0.8742  2.9036
3     335.5599  298457.0409  546.3122  0.8491  0.8987  3.7164
4     309.5971  261297.1835  511.1724  0.8572  0.9686  3.5521
5     334.3443  281169.8822  530.2545  0.8537  0.8949  3.3587
6     330.4999  288589.0153  537.2048  0.8338  0.9092  3.3626
7     344.4032  303321.0872  550.7459  0.8439  0.9245  3.4580
8     295.8456  228783.1977  478.3129  0.8728  0.9317  3.9928
9     314.2570  266088.7083  515.8379  0.8594  0.9046  3.1702
Mean  325.5958  278676.4682  527.5154  0.8524  0.9122  3.4335
Std    13.7749   20700.8013   20.1002  0.0096  0.0256  0.3642
Transformation Pipeline and Model Successfully Saved
The dataset starts from 2018-12-31 23:00:00 and ends at 2023-04-30 23:00:00
[Li

                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     55.2925  8195.9723  90.5316  0.8887  0.7858  2.6975
1     53.4155  7351.7509  85.7424  0.8916  0.7345  1.9512
2     53.7811  7897.9747  88.8706  0.8881  0.7448  2.6786
3     52.5334  7244.4916  85.1146  0.8917  0.7109  1.2701
4     52.3604  7499.6344  86.6004  0.8886  0.7334  1.6543
5     57.6694  9346.7952  96.6788  0.8704  0.7365  1.5813
6     52.9883  7892.9986  88.8425  0.8766  0.7724  1.7505
7     47.2678  6279.9268  79.2460  0.9005  0.7849  2.0325
8     50.8885  7776.8520  88.1865  0.8783  0.6928  1.7984
9     52.4412  8269.1442  90.9348  0.8814  0.7160  1.6146
Mean  52.8638  7775.5541  88.0748  0.8856  0.7412  1.9029
Std    2.5673   754.4953   4.2872  0.0084  0.0299  0.4399


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                         

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
          MAE         MSE      RMSE      R2   RMSLE    MAPE
Fold                                                       
0     64.6567  10419.5884  102.0764  0.8585  0.8909  2.5365
1     64.4230   9838.8150   99.1908  0.8549  0.8234  2.2928
2     62.6430   9648.3255   98.2259  0.8633  0.8418  3.1937
3     61.1695   8997.7021   94.8562  0.8655  0.7800  1.6399
4     60.0173   8831.9214   93.9783  0.8688  0.7910  2.0048
5     65.3801  10933.2821  104.5623  0.8484  0.8238  1.8896
6     62.7953  10104.8497  100.5229  0.8420  0.8259  2.2908
7     55.8646   7852.5380   88.6145  0.8755  0.8770  2.6198
8     59.0839   9425.5016   97.0850  0.8525  0.7849  1.5691
9     61.0257  10251.3966  101.2492  0.8530  0.7976  1.8593
Mean  61.7059   9630.3921   98.0362  0.8582  0.8236  2.1896
Std    2.7614    849.4392    4.3936  0.0096  0.0358  0.4749


                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     55.3481  8157.0904  90.3166  0.8892  0.7818  2.3528
1     53.0148  7226.8425  85.0108  0.8934  0.7221  2.2099
2     54.5020  8022.0094  89.5657  0.8863  0.7349  2.3513
3     52.6041  7394.3988  85.9907  0.8895  0.6614  1.1496
4     52.5246  7497.5845  86.5886  0.8886  0.7051  1.4672
5     57.5614  9372.7464  96.8129  0.8700  0.7073  1.4039
6     53.4591  8020.5427  89.5575  0.8746  0.7271  1.5350
7     47.2117  6083.6467  77.9977  0.9036  0.7497  1.8689
8     50.2438  7662.7244  87.5370  0.8801  0.6814  1.5604
9     52.8751  8459.5152  91.9756  0.8787  0.6919  1.4282
Mean  52.9345  7789.7101  88.1353  0.8854  0.7163  1.7327
Std    2.6504   817.4389   4.6772  0.0093  0.0331  0.4115
Transformation Pipeline and Model Successfully Saved
The dataset starts from 2018-12-31 23:00:00 and ends at 2023-04-30 23:00:00
[LightGBM] [Info] Auto-choosing col-wise multi-threading, t

                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     44.8659  5268.6844  72.5857  0.8859  0.5239  0.5684
1     46.1461  5759.6166  75.8921  0.8595  0.4926  0.5369
2     45.3551  5537.7865  74.4163  0.8883  0.5426  0.5980
3     44.9488  5252.4417  72.4737  0.8953  0.5338  0.5999
4     49.0496  6176.6203  78.5915  0.8697  0.5570  0.7175
5     47.1803  5853.1389  76.5058  0.8781  0.5294  0.5987
6     45.4758  5118.4743  71.5435  0.8872  0.5676  0.7396
7     43.7447  5454.5422  73.8549  0.8800  0.5243  0.5232
8     48.5103  6544.9285  80.9007  0.8691  0.5110  0.5280
9     43.9500  5093.3190  71.3675  0.8828  0.5181  0.5822
Mean  45.9227  5605.9552  74.8132  0.8796  0.5300  0.5992
Std    1.7125   453.7368   2.9906  0.0102  0.0207  0.0705


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                          

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     53.3827  7200.2031  84.8540  0.8440  0.6493  0.9292
1     54.3933  7370.2562  85.8502  0.8202  0.6951  0.9240
2     54.2086  7450.0224  86.3135  0.8498  0.6261  0.7692
3     52.3703  6536.3136  80.8475  0.8697  0.6301  0.7945
4     55.3366  7307.3833  85.4832  0.8459  0.6681  0.9720
5     54.6105  7152.4735  84.5723  0.8511  0.6218  0.6945
6     51.7842  6267.6417  79.1684  0.8619  0.6677  1.0852
7     53.2772  7268.7824  85.2572  0.8401  0.6341  0.7491
8     55.3948  8029.9917  89.6102  0.8395  0.6178  0.7265
9     52.6034  6550.2556  80.9336  0.8493  0.6561  0.8468
Mean  53.7362  7113.3324  84.2890  0.8471  0.6466  0.8491
Std    1.1843   495.5534   2.9485  0.0126  0.0238  0.1188


                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     45.2991  5414.5559  73.5837  0.8827  0.5139  0.5651
1     46.4801  6160.3509  78.4879  0.8497  0.4966  0.5301
2     45.0682  5489.9397  74.0941  0.8893  0.5237  0.5914
3     44.9686  5418.9988  73.6138  0.8919  0.5259  0.5868
4     49.0807  6354.1695  79.7130  0.8660  0.5666  0.7223
5     46.2135  5647.0769  75.1470  0.8824  0.5231  0.5988
6     44.8437  5064.7652  71.1672  0.8884  0.5606  0.7628
7     44.7537  5785.2334  76.0607  0.8727  0.4802  0.5115
8     47.4073  6294.5041  79.3379  0.8742  0.4822  0.5201
9     43.9148  5090.3176  71.3465  0.8829  0.5235  0.5990
Mean  45.8030  5671.9912  75.2552  0.8780  0.5196  0.5988
Std    1.4498   444.4455   2.9407  0.0122  0.0274  0.0788
Transformation Pipeline and Model Successfully Saved


### Catboost model nr 3

In [None]:
# Catboost model nr 3: a deeper CatBoostRegressor with early stopping on a
# random 20 % hold-out, plus sine/cosine encoding of the sun azimuth.

# The result list must exist before the loop appends to it; without this
# line the cell raises NameError on a fresh kernel.
all_predictions_cat_3 = []
for loc in locations:
    
    # Load the data for this location; missing target values are replaced with 0
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    # Calling preprocessing
    X_train_3, X_test_3, is_day_feature_3, targets_3 = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)

    # Dropping the date feature (preprocessing() leaves it in the training frame)
    X_train_3 = X_train_3.drop(columns=['date_forecast'])
    
    # Model definition: up to 20000 trees, stopped early when the validation
    # metric has not improved for 100 rounds
    model_cat_3 = CatBoostRegressor(
    verbose=200, 
    learning_rate=0.03,
    depth=10,
    l2_leaf_reg=5,
    random_state=42, 
    n_estimators=20000, 
    loss_function='MAE', 
    early_stopping_rounds=100,)

    # Create 'sin_sun_azimuth' and 'cos_sun_azimuth' from 'sun_azimuth:d',
    # which is converted from degrees to radians first
    X_train_3['sin_sun_azimuth'] = np.sin(np.radians(X_train_3['sun_azimuth:d']))
    X_train_3['cos_sun_azimuth'] = np.cos(np.radians(X_train_3['sun_azimuth:d']))
    X_test_3['sin_sun_azimuth'] = np.sin(np.radians(X_test_3['sun_azimuth:d']))
    X_test_3['cos_sun_azimuth'] = np.cos(np.radians(X_test_3['sun_azimuth:d']))

    # Now drop the original 'sun_azimuth' feature
    X_train_3.drop('sun_azimuth:d', axis=1, inplace=True)
    X_test_3.drop('sun_azimuth:d', axis=1, inplace=True)

    # Split the training data into training and validation sets.
    # Note: X_test_cat_3 / y_test_cat_3 are the VALIDATION split used for
    # early stopping; the real test features are X_test_3.
    X_train_cat_3, X_test_cat_3, y_train_cat_3, y_test_cat_3 = train_test_split(X_train_3, targets_3, test_size=0.2, random_state=42)

    # Train model
    model_cat_3.fit(X_train_cat_3, y_train_cat_3['pv_measurement'],eval_set=(X_test_cat_3, y_test_cat_3['pv_measurement']),)  
    
    # Predict on the real test set
    pred_cat_3 = model_cat_3.predict(X_test_3)

    # Multiplying the predictions with the hourly-averaged is_day value, so predictions at night become zero
    adjusted_final_predictions_cat_3 = pred_cat_3 * is_day_feature_3['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_cat_3 = np.clip(adjusted_final_predictions_cat_3, 0, None)

    # Appending predictions for each location to the final list
    all_predictions_cat_3.append(adjusted_final_predictions_cat_3) 

# Changing final list to array   
all_predictions_cat_3 = np.array(all_predictions_cat_3)

The dataset starts from 2019-06-02 22:00:00 and ends at 2023-04-30 23:00:00
0:	learn: 981.0541229	test: 986.5219433	best: 986.5219433 (0)	total: 59.3ms	remaining: 19m 46s
200:	learn: 320.9450192	test: 341.6960508	best: 341.6960508 (200)	total: 16.3s	remaining: 26m 47s
400:	learn: 281.8109323	test: 323.5911619	best: 323.5911619 (400)	total: 31.2s	remaining: 25m 23s
600:	learn: 246.4965531	test: 313.0847849	best: 313.0847849 (600)	total: 45.9s	remaining: 24m 42s
800:	learn: 217.5035448	test: 306.7147854	best: 306.7147854 (800)	total: 1m	remaining: 24m 12s
1000:	learn: 190.6737885	test: 302.6330363	best: 302.6323358 (999)	total: 1m 15s	remaining: 23m 50s
1200:	learn: 174.3532699	test: 299.9268452	best: 299.9268452 (1200)	total: 1m 30s	remaining: 23m 29s
1400:	learn: 160.3087673	test: 297.9194447	best: 297.9095416 (1398)	total: 1m 44s	remaining: 23m 13s
1600:	learn: 150.9765890	test: 296.3387070	best: 296.3278847 (1598)	total: 1m 59s	remaining: 22m 57s
1800:	learn: 142.7118904	test: 295.07

# Combining Model Predictions

In [None]:
# Flatten every model's per-location predictions into one 1-D vector.
# (Random-forest and stacked predictions were also tried and are disabled.)
all_predictions_lGBM_e = np.array(all_predictions_lGBM_e).flatten()
#all_predictions_rf = np.array(all_predictions_rf).flatten()
all_predictions_cat = np.array(all_predictions_cat).flatten()
all_predictions_cat_2 = np.array(all_predictions_cat_2).flatten()
all_predictions_cat_3 = np.array(all_predictions_cat_3).flatten()
#all_pred_stacked = np.array(all_pred_stacked).flatten()
# Equal-weight (0.25 each) average of the four models
all_pred = 0.25*all_predictions_cat+0.25 * all_predictions_lGBM_e+ 0.25* all_predictions_cat_2+0.25*all_predictions_cat_3
# Blended predictions below 6 are set to exactly zero
all_pred[all_pred < 6] = 0
print(all_pred.shape)

(2160,)


#### Saving the final predictions to CSV

In [None]:
# The string literal below is disabled code (an earlier submission routine
# based on sample_submission.csv); as a bare string it has no effect.
'''sample_submission = pd.read_csv('sample_submission.csv')
sample_submission
sample_submission = sample_submission[['id']].merge(final_df[['id', 'prediction']], on='id', how='left')
sample_submission.to_csv('my_first_submission.csv', index=False)'''

final_predictions = all_pred

# Save the final_predictions to CSV with two columns: `id` (the 0-based row
# position in the blended prediction vector) and `prediction`
df = pd.DataFrame(final_predictions, columns=['prediction'])
df['id'] = df.index
df = df[['id', 'prediction']]
df.to_csv('final_predictions.csv', index=False)

In [None]:
# Sanity check: plot the difference between the first 720 rows of a stored
# prediction file and the predictions just written.
# NOTE(review): 'final_predictions_ok.csv' is not created anywhere in this
# notebook - presumably a previously accepted submission; confirm its origin.
df1 = pd.read_csv('final_predictions_ok.csv')
# The subtraction is element-wise on both columns (`id` and `prediction`), aligned on the index
df_diff = df1[0:720] - df
df_diff.plot()