In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from pycaret.regression import *



pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [None]:
# ADD TIME FEATURES
def add_time_features(df, time_column, mode = 'lgbm'):
    '''
        This function will add some time feature based on the param 'time_columns'
        
        Params:
            df-> Dataframe with the column contained in 'time_column'
            time_column -> the column that is a datetime object
        
        Returns:
            A dataframe with time features
    '''
    
    df[time_column] = pd.to_datetime(df[time_column])  # Make sure the time column is in datetime format
    if mode == 'lgbm':
        df['hour'] = df[time_column].dt.hour
        df['day_of_week'] = df[time_column].dt.dayofweek
        df['month'] = df[time_column].dt.month
        df['day_of_year'] = df[time_column].dt.dayofyear
        df['week_of_year'] = df[time_column].dt.isocalendar().week
        df['year'] = df[time_column].dt.year
    elif mode == 'cat_boost':
        df['day_of_week'] = df[time_column].dt.dayofweek
        df['sin_hour'] = np.sin(2*np.pi * df[time_column].dt.hour/23.)
        df['sin_month'] = np.sin(2*np.pi * df[time_column].dt.month/12.)
        df['cos_hour'] = np.cos(2*np.pi * df[time_column].dt.hour/23.)
        df['cos_month'] = np.cos(2*np.pi * df[time_column].dt.month/12.)
    elif mode == 'cat':
        df['sin_hour'] = np.sin(2*np.pi * df[time_column].dt.hour/23.)
        df['sin_month'] = np.sin(2*np.pi * df[time_column].dt.month/12.)   
    

    return df

In [None]:
def to_datetime(df,column):
    '''
        Make the column in datetime format
    '''
    return pd.to_datetime(df[column])

In [None]:
def resampling(df,column):
    '''
        Resample df to 1 hour using mean() as aggregator and drop rows where all columns are NaN
        
        Params :
            df -> the dataframe to be resampled
            column -> the time column
    '''
    return df.set_index(keys = column).resample('1H').mean().dropna(how='all').reset_index()

In [None]:
def filter_df(df,columnlist):
    return df.drop(columns = columnlist)

In [None]:
def extract_data_calc(df):
    '''
    This function create a dataframe with 'date_forecast' as index and the column 'date_calc' resampled by '1H'.
    If there's no data in a specific bin, the resulting value for that bin would be NaN (not a number).
    Params:
        df -> dataframe with 'date_forecast' and 'date_calc' columns.
            'date_calc' is expected to contain data that the user wants to resample or analyze.
    Returns:
        A dataframe with 'date_calc' resampled.
    '''
    return df.set_index('date_forecast')['date_calc'].resample('1H').first().to_frame()



In [None]:
def is_estimated_feature(df):
    '''
        This function will create some time feature and estimated information. It's need to let the model understand is
        estimated value.
        Params:
            df -> It MUST be an estimated dataframe, that contains 'data_forecast' as datetime type
        Returns:
            A dataframe with 'time_dummy', 'time_delta' and 'is_estimated'     
    '''
    df['time_delta'] = (df['date_calc'] - df['date_forecast']).dt.total_seconds() / 3600
    df['is_estimated'] = 1
    return df

In [None]:
def delete_stationarity(df):
    '''
    Removes constant stretches of data within a DataFrame where the 'pv_measurement' column does not change.    
    The function identifies blocks of data where the 'pv_measurement' stays constant for more than two consecutive
    points and removes these blocks to address data stationarity.

    params:
        df -> DataFrame
              A pandas DataFrame with a 'pv_measurement' column which contains the data from which to remove stationarity.
        
    return:
        The DataFrame with constant stretches of data removed from the 'pv_measurement' column.
    '''
    
    #Calculate the difference, this need for check the constant
    df['diff'] = df['pv_measurement'].diff().fillna(0)

    # Create an indicator for constant stretches
    df['constant'] = (df['diff'] == 0).astype(int)

    # Use the indicator to mark stretches. The diff() function here identifies change-points.
    df['block'] = (df['constant'].diff() != 0).astype(int).cumsum()

    # Get the size of each constant block
    block_sizes = df.groupby('block')['constant'].sum()

    # Identify blocks that are constant for more than N consecutive time points (in this case 2)
    constant_blocks = block_sizes[block_sizes > 2].index
    
    # Remove the constant
    filtered_df = df[~df['block'].isin(constant_blocks)]
        
    return filtered_df.drop(columns=['diff', 'constant', 'block'])


In [None]:
def preprocessing(targets, observed, estimated, test, mode: str = 'lgbm'):
    '''
        This function makes all the preprocessing needed for the correct run of the model, it will perform:
            - Resampling
            - Filtering
            - Imputation
            - Outliers removal
            - Categorical Encoding
        
        Params:
            targets -> dataframe of the target parquet
            observed -> dataframe of observed train data
            estimated -> dataframe of estimated train data
            test -> dataframe of test data
        Returns:
            train_data -> dataframe of all data ready to train
            test_data -> dataframe of all data ready to test
            is_day -> dataframe of is_day categorical feature for post processing
    
    '''    
    targets['time'] = to_datetime(targets,'time')
    estimated['date_forecast'] = to_datetime(estimated,'date_forecast')
    observed['date_forecast'] = to_datetime(observed,'date_forecast')
    test['date_forecast'] = to_datetime(test,'date_forecast')

    observed_resampled = resampling(observed,'date_forecast')
    estimated_resampled = resampling(estimated,'date_forecast')
    test_resampled = resampling(test,'date_forecast')
    
    date_calc_resampled_observed = extract_data_calc(estimated)
    date_calc_resampled_test = extract_data_calc(test)
    
    estimated_resampled = estimated_resampled.merge(date_calc_resampled_observed, left_on='date_forecast', right_index=True)
    test_resampled = test_resampled.merge(date_calc_resampled_test, left_on='date_forecast', right_index=True)
    
    is_day = test_resampled[['date_forecast', 'is_day:idx']]   
    test_resampled = filter_df(test_resampled,['is_day:idx', 'snow_density:kgm3','elevation:m'])
    observed_resampled = filter_df(observed_resampled,['is_day:idx', 'snow_density:kgm3','elevation:m']) 
    estimated_resampled = filter_df(estimated_resampled,[ 'is_day:idx', 'snow_density:kgm3','elevation:m'])
    
    #This MUST be zero because is not estimated.
    observed_resampled['is_estimated'] = 0
    observed_resampled['time_delta'] = 0
    
    estimated_resampled = is_estimated_feature(estimated_resampled)
    test_resampled = is_estimated_feature(test_resampled)
    
    X = pd.concat([observed_resampled,estimated_resampled],axis = 0)
    train_data = pd.merge(targets, X, how='inner', left_on='time', right_on='date_forecast')
    if mode == 'lgbm':
        train_data = add_time_features(train_data, 'time')
        test_data = add_time_features(test_resampled, 'date_forecast')
    elif mode == 'cat':
        train_data = add_time_features(train_data, 'time', mode = 'cat')
        test_data = add_time_features(test_resampled, 'date_forecast', mode = 'cat')
        train_data = train_data[train_data['date_forecast'].dt.month.isin([4,5,6,7,8])]
        test_data = test_data[test_data['date_forecast'].dt.month.isin([4,5,6,7,8])]
    elif mode == 'cat_boost':
        train_data = add_time_features(train_data, 'time', mode = 'cat_boost')
        test_data = add_time_features(test_resampled, 'date_forecast', mode = 'cat_boost')

    
    train_data = delete_stationarity(train_data)
    
    train_data = filter_df(train_data, ['time','date_calc'])
    test_data = filter_df(test_resampled, ['date_calc'])

    return train_data, test_data, is_day

In [None]:
# LightGBM with some extra features
def process_location(X, y, location_name,seeds):
    # Combine feature data and target into a single DataFrame
    data = X.copy()
    data['target'] = y['pv_measurement']
    
    # Setup the environment in PyCaret
    exp_reg = setup(data=data, target='target', session_id=seeds,
                    categorical_features=['dew_or_rime:idx', 'is_in_shadow:idx','is_estimated'],
                    imputation_type="iterative", categorical_iterative_imputer="lightgbm", numeric_iterative_imputer="lightgbm", iterative_imputation_iters = 5,
                    html=False, 
                    experiment_name=f'exp_{location_name}')

    # Create a LightGBM model
    lightgbm = create_model('lightgbm')
    
    # Tune the model
    tuned_lightgbm = tune_model(lightgbm ,optimize='MAE',n_iter=100,early_stopping=True,early_stopping_max_iters=10)

    # Create a bagged version of the tuned model
    bagged_lightgbm = ensemble_model(tuned_lightgbm, method='Bagging')

    # Finalize the model by training on whole dataset
    final_model = finalize_model(bagged_lightgbm)

    # Save the model for future use
    save_model(final_model, f'final_model_for_location_{location_name}')
        
    return final_model

In [None]:
WEATHER_FEATURES = [
    "direct_rad:W",
    "clear_sky_rad:W",
    "diffuse_rad:W",
    "direct_rad_1h:J",
    "is_in_shadow:idx",
    "clear_sky_energy_1h:J",
    "effective_cloud_cover:p",
    "visibility:m",
    "total_cloud_cover:p",
]

In [None]:
def feature_engineering(data):
    # Added some extra features to this one model, did it here so we could reuse the same preprocesssing function on diffrent models
    # Feature Combination 1: Solar Radiation and Cloud Cover Combination
    data['radcloud'] = ((data['direct_rad:W'] * (1 - data['total_cloud_cover:p']/100)) +
                        (data['diffuse_rad:W'] * (data['total_cloud_cover:p']/100)))

    # Feature Combination 2: Atmospheric Conditions Combination
    data['adjusted_clear_sky_rad'] = (data['clear_sky_rad:W'] * np.exp(-0.0001 * data['absolute_humidity_2m:gm3']) *
                                  (1 - 0.1 * (data['air_density_2m:kgm3'] - 1.225)))  # Adjusted based on humidity and air density
    data['solar_incidence_factor'] = np.cos(np.radians(90 - data['sun_elevation:d'])) * np.cos(np.radians(data['sun_azimuth:d']))
    data['seasonal_conversion_efficiency'] = data['radcloud'] * (1 - data['relative_humidity_1000hPa:p']/100) * (data['msl_pressure:hPa'] / 1013.25)
    
    return data

In [None]:
import re
def regex(df):
    '''
        This function let lgbm work, this because it cannot accept ':'
    '''
    return df.rename(columns=lambda x: re.sub('[^A-Za-z0-9_-]+', '', x))

In [None]:
# Some global lists to save predictions in
locations = ['A', 'B', 'C']

In [None]:
# LightGBM training and predictions
all_predictions_lGBM_e = []
for loc in locations:

    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')
    
    # Calling preprocessing
    train, test, is_day_feature = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)
   
    targets = pd.DataFrame( {'pv_measurement': train['pv_measurement']})
    X_train = train.drop(columns=['date_forecast','pv_measurement'])
    X_train = feature_engineering(X_train)
    X_test = test.drop(columns=['date_forecast'])
    X_test = feature_engineering(X_test)
    
    # Training and prediction for diffrent seeds
    total_predictions_light = None
    seeds = [42]
    for seed in seeds: 
        final_model_lGBM_e = process_location(X_train, targets, loc, seed)
        predictions_lGBM_e = predict_model(final_model_lGBM_e, data=X_test)
        final_predictions_lGBM_e = predictions_lGBM_e['prediction_label']
        if total_predictions_light is None:
            total_predictions_light = np.zeros_like(final_predictions_lGBM_e)
        total_predictions_light += final_predictions_lGBM_e

    mean_pred_light = total_predictions_light/len(seeds)

    # Multiplying the predictions with is_day, so setting predictions at night to zero
    adjusted_final_predictions_lGBM_e = mean_pred_light * is_day_feature['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_lGBM_e = np.clip(adjusted_final_predictions_lGBM_e, 0, None)

    # Appening predictions for each location to final list
    all_predictions_lGBM_e.append([adjusted_final_predictions_lGBM_e])

# Changing final list to array
all_predictions_lGBM_e = np.array(all_predictions_lGBM_e)

In [None]:
cat_features = ['dew_or_rime:idx' ,'is_in_shadow:idx']
cat_params = { 'A': {
                        "iterations": 5000,
                        "learning_rate": 0.034867396508006264,
                        "depth": 8,
                        "l2_leaf_reg": 1,
                        "loss_function": "MAE",
                        "border_count": 92,
                        "verbose": 500,
                        "subsample": 0.7641850606486046,
                        'early_stopping_rounds': 100,
                        'cat_features': cat_features,
                        'random_state': 42, 
                    },
              'B': {
                        "iterations": 5000,
                        "learning_rate": 0.037511244177544326,
                        "depth": 6,
                        "l2_leaf_reg": 5,
                        "loss_function": "MAE",
                        "border_count": 128,
                        "verbose": 500,
                        "subsample": 0.8012204629505595,
                        'early_stopping_rounds': 100,
                        'cat_features': cat_features,
                        'random_state': 42, 
                    },
              'C': {"iterations": 5000, 
                    "learning_rate": 0.03425599789981457,
                    "depth": 8,
                    "l2_leaf_reg": 4,
                    "loss_function": "MAE", 
                    "border_count": 218, 
                    "verbose": 500, 
                    "subsample": 0.6848272280307022, 
                    'early_stopping_rounds': 100,
                    'cat_features': cat_features,
                    'random_state': 42, }
}
              
cat_params_no_feature =  { 'A': {
                        "iterations": 5000,
                        "learning_rate": 0.034867396508006264,
                        "depth": 8,
                        "l2_leaf_reg": 1,
                        "loss_function": "MAE",
                        "border_count": 92,
                        "verbose": 500,
                        "subsample": 0.7641850606486046,
                        'early_stopping_rounds': 100,
                        'random_state': 42, 
                    },
              'B': {
                        "iterations": 5000,
                        "learning_rate": 0.037511244177544326,
                        "depth": 6,
                        "l2_leaf_reg": 5,
                        "loss_function": "MAE",
                        "border_count": 128,
                        "verbose": 500,
                        "subsample": 0.8012204629505595,
                        'early_stopping_rounds': 100,
                        'random_state': 42, 
                    },
              'C': {"iterations": 5000, 
                    "learning_rate": 0.03425599789981457,
                    "depth": 8,
                    "l2_leaf_reg": 4,
                    "loss_function": "MAE", 
                    "border_count": 218, 
                    "verbose": 500, 
                    "subsample": 0.6848272280307022, 
                    'early_stopping_rounds': 100,
                    'random_state': 42, }
}

In [None]:
def CATegorical(df):
    df['dew_or_rime:idx'] = df['dew_or_rime:idx'].astype(int)
    df['is_in_shadow:idx'] = df['is_in_shadow:idx'].astype(int)
    return df

In [None]:
all_predictions_cat = []

from sklearn.impute import IterativeImputer
# Cat_1 training and predictions
for loc in locations:
    
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    # Calling preprocessing
    X_train_cat, X_test_cat, is_day_feature1 = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated,mode = 'cat')
    X_train_cat.drop(columns=['date_forecast'], inplace=True)
    print(f'Doing... {loc}')

    imputer = IterativeImputer(max_iter=5, random_state=42)
    for col in X_train_cat.columns:
        X_train_cat[col] = imputer.fit_transform(np.array(X_train_cat[col]).reshape(-1,1))
    for col in X_test_cat.columns:
        X_test_cat[col] = imputer.fit_transform(np.array(X_test_cat[col]).reshape(-1,1))
    
    targets_cat = pd.DataFrame( {'pv_measurement': X_train_cat['pv_measurement']})
    X_train_cat = X_train_cat.drop(columns=['pv_measurement'])
    X_train_cat = CATegorical(X_train_cat)
    X_test_cat = CATegorical(X_test_cat)

    model_cat = CatBoostRegressor(**cat_params[loc])

    X_train_cat1, X_val_cat1, y_train_cat1, y_val_cat1 = train_test_split(X_train_cat, targets_cat, test_size=0.2, random_state=42)
    
    # Training
    model_cat.fit(X_train_cat1, y_train_cat1['pv_measurement'],eval_set=(X_val_cat1, y_val_cat1['pv_measurement']),)

    # Prediction
    predictions_cat = model_cat.predict(X_test_cat[model_cat.feature_names_])
    
    # Multiplying the predictions with is_day, so setting predictions at night to zero
    adjusted_final_predictions_cat = predictions_cat * is_day_feature1['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_cat = np.clip(adjusted_final_predictions_cat, 0, None)

    # Appening predictions for each location to final list
    all_predictions_cat.append(adjusted_final_predictions_cat)

# Changing final list to array
all_predictions_cat = np.array(all_predictions_cat)

In [None]:
all_predictions_cat_2 = []

# Catboost nr 3
for loc in locations:
    
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    X_train_3, X_test_3, is_day_feature_3 = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)
    X_train_3.drop(columns=['date_forecast'], inplace=True)
    imputer = IterativeImputer(max_iter=5, random_state=42)
    for col in X_train_3.columns:
        X_train_3[col] = imputer.fit_transform(np.array(X_train_3[col]).reshape(-1,1))
    for col in X_test_3.columns:
        X_test_3[col] = imputer.fit_transform(np.array(X_test_3[col]).reshape(-1,1))
    targets_3 = pd.DataFrame( {'pv_measurement': X_train_3['pv_measurement']})
    X_train_3 = X_train_3.drop(columns=['pv_measurement'])
    
    model_cat_3 = CatBoostRegressor(**cat_params_no_feature[loc])
    X_train_3 = feature_engineering(X_train_3)
    X_test_3 = feature_engineering(X_test_3)

    # Create 'sin_sun_azimuth' and 'cos_sun_azimuth' from 'sun_azimuth' in radians
    X_train_3['sin_sun_azimuth'] = np.sin(np.radians(X_train_3['sun_azimuth:d']))
    X_train_3['cos_sun_azimuth'] = np.cos(np.radians(X_train_3['sun_azimuth:d']))
    X_test_3['sin_sun_azimuth'] = np.sin(np.radians(X_test_3['sun_azimuth:d']))
    X_test_3['cos_sun_azimuth'] = np.cos(np.radians(X_test_3['sun_azimuth:d']))

    # Now drop the original 'sun_azimuth' feature
    X_train_3.drop('sun_azimuth:d', axis=1, inplace=True)
    X_test_3.drop('sun_azimuth:d', axis=1, inplace=True)

    # Split the training data into training and validation sets
    X_train_cat_3, X_test_cat_3, y_train_cat_3, y_test_cat_3 = train_test_split(X_train_3, targets_3, test_size=0.2, random_state=42)

    # Train model
    model_cat_3.fit(X_train_cat_3, y_train_cat_3['pv_measurement'],eval_set=(X_test_cat_3, y_test_cat_3['pv_measurement']),)  
    
    # Pred
    pred_cat_2 = model_cat_3.predict(X_test_3[model_cat_3.feature_names_])

    # Multiplying the predictions with is_day, so setting predictions at night to zero
    adjusted_final_predictions_cat_2 = pred_cat_2 * is_day_feature_3['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_cat_2 = np.clip(adjusted_final_predictions_cat_2, 0, None)

    # Appening predictions for each location to final list
    all_predictions_cat_2.append(adjusted_final_predictions_cat_2) 

# Changing final list to array   
all_predictions_cat_2 = np.array(all_predictions_cat_2)

In [None]:
all_predictions_cat_3 = []
# Catboost nr 3
for loc in locations:
    
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    X_train_3, X_test_3, is_day_feature_3 = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated,mode = 'catboost')
    X_train_3.drop(columns=['date_forecast'], inplace=True)
    imputer = IterativeImputer(max_iter=5, random_state=42)
    for col in X_train_3.columns:
        X_train_3[col] = imputer.fit_transform(np.array(X_train_3[col]).reshape(-1,1))
    for col in X_test_3.columns:
        X_test_3[col] = imputer.fit_transform(np.array(X_test_3[col]).reshape(-1,1))
    targets_3 = pd.DataFrame( {'pv_measurement': X_train_3['pv_measurement']})
    X_train_3 = X_train_3.drop(columns=['pv_measurement'])
    
    model_cat_3 = CatBoostRegressor(**cat_params_no_feature[loc])
   # X_train_3 = feature_engineering(X_train_3)
   # X_test_3 = feature_engineering(X_test_3)

    # Create 'sin_sun_azimuth' and 'cos_sun_azimuth' from 'sun_azimuth' in radians
    X_train_3['sin_sun_azimuth'] = np.sin(np.radians(X_train_3['sun_azimuth:d']))
    X_train_3['cos_sun_azimuth'] = np.cos(np.radians(X_train_3['sun_azimuth:d']))
    X_test_3['sin_sun_azimuth'] = np.sin(np.radians(X_test_3['sun_azimuth:d']))
    X_test_3['cos_sun_azimuth'] = np.cos(np.radians(X_test_3['sun_azimuth:d']))

    # Now drop the original 'sun_azimuth' feature
    X_train_3.drop('sun_azimuth:d', axis=1, inplace=True)
    X_test_3.drop('sun_azimuth:d', axis=1, inplace=True)

    # Split the training data into training and validation sets
    X_train_cat_3, X_test_cat_3, y_train_cat_3, y_test_cat_3 = train_test_split(X_train_3, targets_3, test_size=0.2, random_state=42)

    # Train model
    model_cat_3.fit(X_train_cat_3, y_train_cat_3['pv_measurement'],eval_set=(X_test_cat_3, y_test_cat_3['pv_measurement']),)  
    
    # Pred
    pred_cat_3 = model_cat_3.predict(X_test_3[model_cat_3.feature_names_])

    # Multiplying the predictions with is_day, so setting predictions at night to zero
    adjusted_final_predictions_cat_3 = pred_cat_3 * is_day_feature_3['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_cat_3 = np.clip(adjusted_final_predictions_cat_3, 0, None)

    # Appening predictions for each location to final list
    all_predictions_cat_3.append(adjusted_final_predictions_cat_3) 

# Changing final list to array   
all_predictions_cat_3 = np.array(all_predictions_cat_3)

In [None]:
# LightGBM with some extra features
def process_location_xgb(X, y, location_name,seeds):
    # Combine feature data and target into a single DataFrame
    data = X.copy()
    data['target'] = y['pv_measurement']
    
    # Setup the environment in PyCaret
    exp_reg = setup(data=data, target='target', session_id=seeds,
                    categorical_features=['dew_or_rime:idx', 'is_in_shadow:idx','is_estimated'],
                    imputation_type="iterative", categorical_iterative_imputer="lightgbm", numeric_iterative_imputer="lightgbm", iterative_imputation_iters = 5,
                    html=False, 
                    experiment_name=f'exp_{location_name}')

    # Create a LightGBM model
    lightgbm = create_model('xgboost')
    
    # Tune the model
    tuned_lightgbm = tune_model(lightgbm, optimize='MAE',early_stopping=True,early_stopping_max_iters=100)

    # Create a bagged version of the tuned model
    bagged_lightgbm = ensemble_model(tuned_lightgbm, method='Bagging')

    # Finalize the model by training on whole dataset
    final_model = finalize_model(bagged_lightgbm)

    # Save the model for future use
    save_model(final_model, f'final_model_for_location_{location_name}')
        
    return final_model

In [30]:
# LightGBM training and predictions
all_predictions_xb = []
all_predictions_xb_e =[]

for loc in locations:

    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')
    
    # Calling preprocessing
    train, test, is_day_feature = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)
   
    targets = pd.DataFrame( {'pv_measurement': train['pv_measurement']})
    X_train = train.drop(columns=['date_forecast','pv_measurement'])
    X_train = feature_engineering(X_train)
    X_test = test.drop(columns=['date_forecast'])
    X_test = feature_engineering(X_test)
    
    # Training and prediction for diffrent seeds
    total_predictions_light = None
    seeds = [42]
    for seed in seeds: 
        final_model_lGBM_e = process_location_xgb(X_train, targets, loc, seed)
        predictions_lGBM_e = predict_model(final_model_lGBM_e, data=X_test)
        final_predictions_lGBM_e = predictions_lGBM_e['prediction_label']
        if total_predictions_light is None:
            total_predictions_light = np.zeros_like(final_predictions_lGBM_e)
        total_predictions_light += final_predictions_lGBM_e

    mean_pred_light = total_predictions_light/len(seeds)

    # Multiplying the predictions with is_day, so setting predictions at night to zero
    all_predictions_xb = mean_pred_light * is_day_feature['is_day:idx']

    # Setting negative predictions to zero
    all_predictions_xb = np.clip(all_predictions_xb, 0, None)

    # Appening predictions for each location to final list
    all_predictions_xb_e.append([all_predictions_xb])

# Changing final list to array
all_predictions_xb_e = np.array(all_predictions_xb_e)

                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     41.3532  5063.6689  71.1595  0.8991  0.7087  0.5641
1     44.4771  5887.4707  76.7299  0.8792  0.6708  0.6331
2     44.0571  5441.9141  73.7693  0.8788  0.7011  0.4971
3     37.6023  3837.0566  61.9440  0.9193  0.7106  0.5198
4     43.9118  5673.2422  75.3209  0.8809  0.6758  0.5680
5     41.9584  5037.9727  70.9787  0.8993  0.6519  0.5430
6     43.9647  5516.7354  74.2747  0.8823  0.7940  0.6082
7     45.0832  6017.8247  77.5746  0.8631  0.7907  0.8307
8     44.8390  5954.8374  77.1676  0.8744  0.7082  0.6193
9     41.4639  5491.3062  74.1033  0.8680  0.6624  0.5832
Mean  42.8711  5392.2029  73.3023  0.8844  0.7074  0.5967
Std    2.1888   609.2559   4.3569  0.0160  0.0468  0.0880
Transformation Pipeline and Model Successfully Saved
                        Description        Value
0                        Session id          123
1                            Target  

                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     45.2625  5623.4331  74.9896  0.8875  0.7610  0.6306
1     41.4932  5022.3350  70.8684  0.8924  0.7472  0.6514
2     43.4137  6220.1274  78.8678  0.8504  0.7258  0.6338
3     41.7869  5195.7363  72.0815  0.8860  0.7950  0.7583
4     47.6616  6776.6821  82.3206  0.8512  0.7446  0.5330
5     45.1653  6171.8315  78.5610  0.8727  0.6967  0.5457
6     42.8435  5246.7451  72.4344  0.8777  0.7740  0.6210
7     46.8483  5839.6104  76.4173  0.8667  0.7823  0.6272
8     40.1953  4660.3511  68.2668  0.8978  0.7559  0.6395
9     48.1545  6512.3594  80.6992  0.8660  0.6914  0.5267
Mean  44.2825  5726.9211  75.5507  0.8748  0.7474  0.6167
Std    2.6129   658.0469   4.3611  0.0156  0.0326  0.0652


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                          

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     44.3402  5385.5034  73.3860  0.8922  0.7274  0.5496
1     39.4099  4482.0000  66.9477  0.9040  0.6871  0.6184
2     43.6661  5939.7202  77.0696  0.8571  0.7301  0.6621
3     39.1597  4675.4556  68.3773  0.8974  0.7233  0.6315
4     45.9394  6408.5015  80.0531  0.8593  0.6615  0.5366
5     45.2852  5876.8018  76.6603  0.8788  0.7507  0.5685
6     41.9569  4995.2900  70.6774  0.8835  0.7594  0.6181
7     45.6832  5625.2056  75.0014  0.8716  0.7521  0.5755
8     40.6453  4804.5625  69.3149  0.8946  0.7304  0.6872
9     47.9766  6483.3159  80.5190  0.8666  0.6420  0.5049
Mean  43.4063  5467.6356  73.8007  0.8805  0.7164  0.5952
Std    2.8406   678.2693   4.5930  0.0157  0.0378  0.0549


                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     42.9893  5137.0322  71.6731  0.8972  0.6793  0.5666
1     39.6269  4606.0547  67.8679  0.9013  0.6428  0.5673
2     42.1229  5548.8276  74.4905  0.8665  0.6709  0.6071
3     40.0424  4889.0815  69.9220  0.8927  0.6926  0.7584
4     45.0504  6048.3354  77.7710  0.8672  0.6112  0.5108
5     43.9826  5680.3647  75.3682  0.8828  0.6765  0.6499
6     41.1172  4932.3711  70.2308  0.8850  0.7050  0.6045
7     42.4068  5038.0610  70.9793  0.8850  0.6994  0.5835
8     39.2616  4416.5195  66.4569  0.9031  0.6671  0.6324
9     46.4418  6021.0352  77.5953  0.8761  0.6038  0.4813
Mean  42.3042  5231.7683  72.2355  0.8857  0.6649  0.5962
Std    2.2570   538.6286   3.7149  0.0124  0.0333  0.0728
Transformation Pipeline and Model Successfully Saved


In [31]:
all_predictions_lGBM_e = np.array(all_predictions_lGBM_e).flatten()
all_predictions_cat = np.array(all_predictions_cat).flatten()
all_predictions_cat_2 = np.array(all_predictions_cat_2).flatten()
all_predictions_cat_3 = np.array(all_predictions_cat_3).flatten()
all_predictions_xb_e = np.array(all_predictions_xb_e).flatten()
all_pred = 0.2*all_predictions_cat+0.2 * all_predictions_lGBM_e+0.2*all_predictions_cat_2 + 0.2*all_predictions_cat_3 + 0.2*all_predictions_xb_e
print(all_pred.shape)

ValueError: operands could not be broadcast together with shapes (2160,) (720,) 

In [None]:
final_predictions = all_pred

# Save the final_predictions to CSV
df = pd.DataFrame(final_predictions, columns=['prediction'])
df['id'] = df.index
df = df[['id', 'prediction']]
df.to_csv('sub.csv', index=False)

In [None]:
df1 = pd.read_csv('best_score1.csv')
df2 = pd.read_csv('best_score2.csv')


import matplotlib.pyplot as plt

# plot delle differenze tra df e df1
plt.plot(df1['prediction'] - df['prediction'])
plt.title('Differenze tra df1 e df')
plt.xlabel('Indice')
plt.ylabel('Differenza')
plt.show()

# plot delle differenze tra df e df2
plt.plot(df2['prediction'] - df['prediction'])
plt.title('Differenze tra df2 e df')
plt.xlabel('Indice')
plt.ylabel('Differenza')
plt.show()
