In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import seaborn as sns
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from pycaret.regression import *
from sklearn.linear_model import LinearRegression


pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [2]:
# ADD TIME FEATURES
def add_time_features(df, time_column, mode = 'lgbm'):
    '''
        This function will add some time feature based on the param 'time_columns'
        
        Params:
            df-> Dataframe with the column contained in 'time_column'
            time_column -> the column that is a datetime object
        
        Returns:
            A dataframe with time features
    '''
    
    df[time_column] = pd.to_datetime(df[time_column])  # Make sure the time column is in datetime format
    if mode == 'lgbm':
        df['hour'] = df[time_column].dt.hour
        df['day_of_week'] = df[time_column].dt.dayofweek
        df['month'] = df[time_column].dt.month
        df['day_of_year'] = df[time_column].dt.dayofyear
        df['week_of_year'] = df[time_column].dt.isocalendar().week
        df['year'] = df[time_column].dt.year
    elif mode == 'cat_boost':
        df['day_of_week'] = df[time_column].dt.dayofweek
        df['sin_hour'] = np.sin(2*np.pi * df[time_column].dt.hour/23.)
        df['sin_month'] = np.sin(2*np.pi * df[time_column].dt.month/12.)
        df['cos_hour'] = np.cos(2*np.pi * df[time_column].dt.hour/23.)
        df['cos_month'] = np.cos(2*np.pi * df[time_column].dt.month/12.)
    elif mode == 'cat':
        df['sin_hour'] = np.sin(2*np.pi * df[time_column].dt.hour/23.)
        df['sin_month'] = np.sin(2*np.pi * df[time_column].dt.month/12.)   
    

    return df

In [3]:
def to_datetime(df,column):
    '''
        Make the column in datetime format
    '''
    return pd.to_datetime(df[column])

In [4]:
def resampling(df,column):
    '''
        Resample df to 1 hour using mean() as aggregator and drop rows where all columns are NaN
        
        Params :
            df -> the dataframe to be resampled
            column -> the time column
    '''
    return df.set_index(keys = column).resample('1H').mean().dropna(how='all').reset_index()

In [5]:
def filter_df(df,columnlist):
    return df.drop(columns = columnlist)

In [6]:
def extract_data_calc(df):
    '''
    This function create a dataframe with 'date_forecast' as index and the column 'date_calc' resampled by '1H'.
    If there's no data in a specific bin, the resulting value for that bin would be NaN (not a number).
    Params:
        df -> dataframe with 'date_forecast' and 'date_calc' columns.
            'date_calc' is expected to contain data that the user wants to resample or analyze.
    Returns:
        A dataframe with 'date_calc' resampled.
    '''
    return df.set_index('date_forecast')['date_calc'].resample('1H').first().to_frame()



In [7]:
def is_estimated_feature(df):
    '''
        This function will create some time feature and estimated information. It's need to let the model understand is
        estimated value.
        Params:
            df -> It MUST be an estimated dataframe, that contains 'data_forecast' as datetime type
        Returns:
            A dataframe with 'time_dummy', 'time_delta' and 'is_estimated'     
    '''
    df['time_delta'] = (df['date_calc'] - df['date_forecast']).dt.total_seconds() / 3600
    df['is_estimated'] = 1
    return df

In [8]:
def delete_stationarity(df):
    '''
    Removes constant stretches of data within a DataFrame where the 'pv_measurement' column does not change.    
    The function identifies blocks of data where the 'pv_measurement' stays constant for more than two consecutive
    points and removes these blocks to address data stationarity.

    params:
        df -> DataFrame
              A pandas DataFrame with a 'pv_measurement' column which contains the data from which to remove stationarity.
        
    return:
        The DataFrame with constant stretches of data removed from the 'pv_measurement' column.
    '''
    
    #Calculate the difference, this need for check the constant
    df['diff'] = df['pv_measurement'].diff().fillna(0)

    # Create an indicator for constant stretches
    df['constant'] = (df['diff'] == 0).astype(int)

    # Use the indicator to mark stretches. The diff() function here identifies change-points.
    df['block'] = (df['constant'].diff() != 0).astype(int).cumsum()

    # Get the size of each constant block
    block_sizes = df.groupby('block')['constant'].sum()

    # Identify blocks that are constant for more than N consecutive time points (in this case 2)
    constant_blocks = block_sizes[block_sizes > 2].index
    
    # Remove the constant
    filtered_df = df[~df['block'].isin(constant_blocks)]
        
    return filtered_df.drop(columns=['diff', 'constant', 'block'])


In [9]:
def preprocessing(targets, observed, estimated, test, mode: str = 'lgbm'):
    '''
        This function makes all the preprocessing needed for the correct run of the model, it will perform:
            - Resampling
            - Filtering
            - Imputation
            - Outliers removal
            - Categorical Encoding
        
        Params:
            targets -> dataframe of the target parquet
            observed -> dataframe of observed train data
            estimated -> dataframe of estimated train data
            test -> dataframe of test data
        Returns:
            train_data -> dataframe of all data ready to train
            test_data -> dataframe of all data ready to test
            is_day -> dataframe of is_day categorical feature for post processing
    
    '''    
    targets['time'] = to_datetime(targets,'time')
    estimated['date_forecast'] = to_datetime(estimated,'date_forecast')
    observed['date_forecast'] = to_datetime(observed,'date_forecast')
    test['date_forecast'] = to_datetime(test,'date_forecast')

    observed_resampled = resampling(observed,'date_forecast')
    estimated_resampled = resampling(estimated,'date_forecast')
    test_resampled = resampling(test,'date_forecast')
    
    date_calc_resampled_observed = extract_data_calc(estimated)
    date_calc_resampled_test = extract_data_calc(test)
    
    estimated_resampled = estimated_resampled.merge(date_calc_resampled_observed, left_on='date_forecast', right_index=True)
    test_resampled = test_resampled.merge(date_calc_resampled_test, left_on='date_forecast', right_index=True)
    
    is_day = test_resampled[['date_forecast', 'is_day:idx']]   
    test_resampled = filter_df(test_resampled,['is_day:idx', 'snow_density:kgm3','elevation:m'])
    observed_resampled = filter_df(observed_resampled,['is_day:idx', 'snow_density:kgm3','elevation:m']) 
    estimated_resampled = filter_df(estimated_resampled,[ 'is_day:idx', 'snow_density:kgm3','elevation:m'])
    
    #This MUST be zero because is not estimated.
    observed_resampled['is_estimated'] = 0
    observed_resampled['time_delta'] = 0
    
    estimated_resampled = is_estimated_feature(estimated_resampled)
    test_resampled = is_estimated_feature(test_resampled)
    
    X = pd.concat([observed_resampled,estimated_resampled],axis = 0)
    train_data = pd.merge(targets, X, how='inner', left_on='time', right_on='date_forecast')
    if mode == 'lgbm':
        train_data = add_time_features(train_data, 'time')
        test_data = add_time_features(test_resampled, 'date_forecast')
    elif mode == 'cat':
        train_data = add_time_features(train_data, 'time', mode = 'cat')
        test_data = add_time_features(test_resampled, 'date_forecast', mode = 'cat')
        train_data = train_data[train_data['date_forecast'].dt.month.isin([4,5,6,7,8])]
        test_data = test_data[test_data['date_forecast'].dt.month.isin([4,5,6,7,8])]
    elif mode == 'cat_boost':
        train_data = add_time_features(train_data, 'time', mode = 'cat_boost')
        test_data = add_time_features(test_resampled, 'date_forecast', mode = 'cat_boost')

    
    train_data = delete_stationarity(train_data)
    
    train_data = filter_df(train_data, ['time','date_calc'])
    test_data = filter_df(test_resampled, ['date_calc'])

    return train_data, test_data, is_day

In [10]:
# LightGBM with some extra features
def process_location(X, y, location_name,seeds):
    # Combine feature data and target into a single DataFrame
    data = X.copy()
    data['target'] = y['pv_measurement']
    
    # Setup the environment in PyCaret
    exp_reg = setup(data=data, target='target', session_id=seeds,
                    categorical_features=['dew_or_rime:idx', 'is_in_shadow:idx','is_estimated'],
                    imputation_type="iterative", categorical_iterative_imputer="lightgbm", numeric_iterative_imputer="lightgbm", iterative_imputation_iters = 5,
                    html=False, 
                    experiment_name=f'exp_{location_name}')

    # Create a LightGBM model
    lightgbm = create_model('lightgbm')
    
    # Tune the model
    tuned_lightgbm = tune_model(lightgbm, optimize='MAE')

    # Create a bagged version of the tuned model
    bagged_lightgbm = ensemble_model(tuned_lightgbm, method='Bagging')

    # Finalize the model by training on whole dataset
    final_model = finalize_model(bagged_lightgbm)

    # Save the model for future use
    save_model(final_model, f'final_model_for_location_{location_name}')
        
    return final_model

In [11]:
def feature_engineering(data):
    # Added some extra features to this one model, did it here so we could reuse the same preprocesssing function on diffrent models
    # Feature Combination 1: Solar Radiation and Cloud Cover Combination
    data['weighted_rad'] = ((data['direct_rad:W'] * (1 - data['total_cloud_cover:p']/100)) +
                        (data['diffuse_rad:W'] * (data['total_cloud_cover:p']/100)))

    # Feature Combination 2: Atmospheric Conditions Combination
    data['adjusted_clear_sky_rad'] = (data['clear_sky_rad:W'] *
                                  np.exp(-0.0001 * data['absolute_humidity_2m:gm3']) *
                                  (1 - 0.1 * (data['air_density_2m:kgm3'] - 1.225)))  # Adjusted based on humidity and air density
    data['solar_incidence_factor'] = np.cos(np.radians(90 - data['sun_elevation:d'])) * np.cos(np.radians(data['sun_azimuth:d']))
    data['seasonal_conversion_efficiency'] = data['weighted_rad'] * (1 - data['relative_humidity_1000hPa:p']/100) * (data['msl_pressure:hPa'] / 1013.25)
    

    return data

In [12]:
# Some global lists to save predictions in
locations = ['A', 'B', 'C']
all_predictions_lGBM = []
all_predictions_lGBM_e = []
all_predictions_rf = []
all_predictions_lasso = []
all_predictions_cat = []
all_predictions_cat_2 = []
final_df_list = [] 
all_pred_stacked =[]
all_predictions_cat_3=[]

all_X_train_cat = pd.DataFrame()
all_X_test_cat = pd.DataFrame()
all_is_day_feature1 = pd.Series(dtype='float64')
all_targets_cat = pd.DataFrame()


In [13]:
# LightGBM training and predictions
for loc in locations:

    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')
    
    # Calling preprocessing
    train, test, is_day_feature = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)
   
    targets = pd.DataFrame( {'pv_measurement': train['pv_measurement']})
    X_train = train.drop(columns=['date_forecast','pv_measurement'])
    X_train = feature_engineering(X_train)
    X_test = test.drop(columns=['date_forecast'])
    X_test = feature_engineering(X_test)
    
    # Training and prediction for diffrent seeds
    total_predictions_light = None
    seeds = [42,123]
    for seed in seeds: 
        final_model_lGBM_e = process_location(X_train, targets, loc, seed)
        predictions_lGBM_e = predict_model(final_model_lGBM_e, data=X_test)
        final_predictions_lGBM_e = predictions_lGBM_e['prediction_label']
        if total_predictions_light is None:
            total_predictions_light = np.zeros_like(final_predictions_lGBM_e)
        total_predictions_light += final_predictions_lGBM_e

    mean_pred_light = total_predictions_light/len(seeds)

    # Multiplying the predictions with is_day, so setting predictions at night to zero
    adjusted_final_predictions_lGBM_e = mean_pred_light * is_day_feature['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_lGBM_e = np.clip(adjusted_final_predictions_lGBM_e, 0, None)

    # Appening predictions for each location to final list
    all_predictions_lGBM_e.append([adjusted_final_predictions_lGBM_e])

# Changing final list to array
all_predictions_lGBM_e = np.array(all_predictions_lGBM_e)

                        Description        Value
0                        Session id           42
1                            Target       target
2                       Target type   Regression
3               Original data shape  (19622, 55)
4            Transformed data shape  (19622, 67)
5       Transformed train set shape  (13735, 67)
6        Transformed test set shape   (5887, 67)
7                  Ordinal features            1
8                  Numeric features           51
9              Categorical features            3
10         Rows with missing values        20.2%
11                       Preprocess         True
12                  Imputation type    iterative
13  Iterative imputation iterations            5
14        Numeric iterative imputer     lightgbm
15    Categorical iterative imputer     lightgbm
16         Maximum one-hot encoding           25
17                  Encoding method         None
18                   Fold Generator        KFold
19                  

                                                         

           MAE          MSE      RMSE      R2   RMSLE    MAPE
Fold                                                         
0     310.2230  273202.6471  522.6879  0.8613  1.0853  1.9864
1     299.0549  254415.8433  504.3965  0.8585  1.1656  2.4947
2     307.1856  275831.0135  525.1962  0.8553  1.0775  2.4394
3     322.0855  293524.1154  541.7787  0.8505  1.0877  1.7441
4     306.2584  245169.7561  495.1462  0.8701  1.0745  2.7818
5     304.6759  276448.5465  525.7837  0.8538  1.1579  2.4854
6     308.0701  269758.9885  519.3833  0.8391  1.1275  3.3045
7     306.5295  265946.7614  515.7003  0.8601  1.0957  3.4485
8     315.4189  274251.8715  523.6906  0.8525  1.0566  1.9057
9     311.2788  286220.6045  534.9959  0.8430  1.1044  2.1519
Mean  309.0780  271477.0148  520.8759  0.8544  1.1033  2.4742
Std     5.9535   13320.9981   12.8564  0.0085  0.0343  0.5416


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                          

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
           MAE          MSE      RMSE      R2   RMSLE    MAPE
Fold                                                         
0     313.2298  278308.8581  527.5499  0.8587  1.1585  1.9501
1     309.3468  269268.1434  518.9105  0.8502  1.2519  3.0296
2     307.9427  280229.4251  529.3670  0.8530  1.1749  3.2426
3     326.1188  297647.8075  545.5711  0.8484  1.1125  1.9030
4     307.4927  253172.8048  503.1628  0.8658  1.1996  3.1029
5     303.1808  272931.2645  522.4282  0.8557  1.2111  2.6367
6     312.5930  281852.1829  530.8975  0.8319  1.1967  3.6350
7     302.0644  257202.1888  507.1511  0.8647  1.2025  3.4493
8     316.6752  277554.1709  526.8341  0.8507  1.1982  2.7345
9     314.7728  294089.0496  542.2998  0.8387  1.1413  2.9538
Mean  311.3417  276225.5895  525.4172  0.8518  1.1847  2.8638
Std     6.6838   13358.0998   12.7419  0.0101

                                                         

           MAE          MSE      RMSE      R2   RMSLE    MAPE
Fold                                                         
0     314.6244  280969.4244  530.0655  0.8573  1.0286  1.5416
1     302.0399  260054.6417  509.9555  0.8554  1.0883  2.2941
2     311.4288  280928.3584  530.0268  0.8526  1.0985  2.8713
3     326.9863  304725.6033  552.0196  0.8448  1.0166  1.6375
4     305.1731  244459.7811  494.4287  0.8705  1.0451  2.6943
5     302.6081  275160.5440  524.5575  0.8545  1.0947  2.3392
6     310.1370  270238.0240  519.8442  0.8389  1.0763  3.1113
7     304.1020  265488.3864  515.2557  0.8604  1.0723  2.9576
8     312.4422  274824.6165  524.2372  0.8522  1.0176  1.9147
9     308.2166  286711.1876  535.4542  0.8427  1.0364  1.9941
Mean  309.7759  274356.0568  523.5845  0.8529  1.0574  2.3356
Std     7.0478   15351.4452   14.6748  0.0087  0.0305  0.5308
Transformation Pipeline and Model Successfully Saved
                        Description        Value
0                        Sessi

                                                         

           MAE          MSE      RMSE      R2   RMSLE    MAPE
Fold                                                         
0     299.9363  261785.1309  511.6494  0.8544  1.0844  2.4444
1     321.8247  292466.3033  540.8015  0.8421  1.1234  1.8595
2     307.5598  267317.6653  517.0277  0.8484  1.0888  3.1074
3     295.2908  239820.2811  489.7145  0.8685  1.0341  1.9681
4     322.5253  290795.2864  539.2544  0.8426  1.0496  2.4292
5     334.5336  311891.9066  558.4728  0.8269  1.0622  2.0647
6     312.5375  270096.0050  519.7076  0.8560  1.1231  2.2184
7     315.6672  280945.0016  530.0425  0.8578  1.0690  2.5388
8     297.5678  267930.6848  517.6202  0.8535  1.0539  2.8401
9     288.5257  243804.5021  493.7656  0.8717  1.1364  2.4439
Mean  309.5969  272685.2767  521.8056  0.8522  1.0825  2.3914
Std    13.6925   21013.7725   20.1038  0.0124  0.0333  0.3662


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                          

           MAE          MSE      RMSE      R2   RMSLE    MAPE
Fold                                                         
0     302.1471  267787.1762  517.4816  0.8510  1.1280  2.7228
1     315.5948  289681.7695  538.2209  0.8436  1.1403  1.9823
2     306.0767  261043.1810  510.9239  0.8520  1.1544  3.4059
3     293.9771  237044.6747  486.8723  0.8701  1.1234  2.0639
4     324.0204  294660.5859  542.8265  0.8405  1.1584  2.9377
5     336.6670  317151.5209  563.1621  0.8240  1.1248  2.2087
6     317.7921  286581.0082  535.3326  0.8472  1.1736  2.1218
7     311.9533  281629.1095  530.6874  0.8575  1.1215  2.4755
8     293.9081  259405.2836  509.3185  0.8582  1.1972  2.7669
9     280.7995  234733.3418  484.4929  0.8765  1.1604  2.5495
Mean  308.2936  272971.7651  521.9319  0.8520  1.1482  2.5235
Std    15.5558   24632.8936   23.6411  0.0142  0.0238  0.4262


                                                          

           MAE          MSE      RMSE      R2   RMSLE    MAPE
Fold                                                         
0     298.0121  259377.5615  509.2912  0.8557  1.0268  2.2656
1     315.6434  287461.7531  536.1546  0.8448  1.0431  1.7083
2     299.1230  256437.2997  506.3964  0.8546  1.0376  2.6144
3     285.0867  227737.6085  477.2186  0.8752  1.0358  1.9707
4     314.8975  284693.1218  533.5664  0.8459  1.0353  2.1832
5     326.5729  300890.6659  548.5350  0.8331  1.0183  1.8943
6     309.4515  271803.2660  521.3475  0.8551  1.0420  1.7914
7     311.1823  281940.3131  530.9805  0.8573  0.9962  2.0429
8     291.4587  259145.0570  509.0629  0.8583  1.0324  1.8348
9     280.7790  232887.9170  482.5846  0.8774  1.0781  2.1695
Mean  303.2207  266237.4564  515.5138  0.8557  1.0346  2.0475
Std    13.9748   22488.4830   21.9770  0.0126  0.0196  0.2564
Transformation Pipeline and Model Successfully Saved
                        Description        Value
0                        Sessi

                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     48.9960  7070.7580  84.0878  0.8969  0.8381  1.5630
1     47.0249  6755.6431  82.1927  0.8991  0.8417  1.5689
2     53.6609  9792.4154  98.9566  0.8503  0.8095  1.3646
3     48.8259  7354.8449  85.7604  0.8858  0.8278  2.0641
4     48.3632  7236.0509  85.0650  0.8868  0.8556  1.1172
5     47.9635  6543.8838  80.8943  0.8961  0.8734  1.5897
6     55.0159  8212.8208  90.6246  0.8792  0.8978  1.6338
7     51.3328  7319.8684  85.5562  0.8923  0.7836  1.0596
8     48.5277  6821.5999  82.5930  0.8959  0.8569  1.6791
9     47.1046  6148.0507  78.4095  0.9072  0.8095  1.1904
Mean  49.6815  7325.5936  85.4140  0.8890  0.8394  1.4830
Std    2.6065   975.8463   5.4809  0.0149  0.0319  0.2893


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                          

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     48.9512  7026.3158  83.8231  0.8976  0.8860  2.0005
1     47.5178  6973.7975  83.5093  0.8959  0.8577  1.5936
2     54.4331  9780.7643  98.8977  0.8505  0.8287  1.3616
3     49.9346  8042.2220  89.6784  0.8751  0.8384  2.2953
4     49.7536  7895.4536  88.8564  0.8765  0.8727  1.1204
5     46.9338  6259.3467  79.1160  0.9006  0.9367  1.4997
6     55.0627  8157.4278  90.3185  0.8800  0.9136  1.6371
7     50.5617  7229.4396  85.0261  0.8936  0.7575  1.0403
8     50.3756  7097.1206  84.2444  0.8917  0.9075  1.8890
9     47.4060  6196.0557  78.7150  0.9064  0.8393  1.3855
Mean  50.0930  7465.7944  86.2185  0.8868  0.8638  1.5823
Std    2.6289  1002.4499   5.6714  0.0157  0.0491  0.3717


                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     50.1183  7447.5901  86.2994  0.8914  0.8232  1.5792
1     47.4349  7112.7802  84.3373  0.8938  0.8155  1.5315
2     53.0926  9375.0240  96.8247  0.8567  0.7911  1.4493
3     48.9194  7541.6300  86.8426  0.8829  0.8094  2.2005
4     49.0471  7486.8330  86.5265  0.8829  0.8382  1.1288
5     47.7065  6474.0982  80.4618  0.8972  0.8623  1.5331
6     54.0398  8030.6692  89.6140  0.8819  0.8662  1.6009
7     51.1579  7233.6385  85.0508  0.8935  0.7579  1.0350
8     48.0263  6722.5813  81.9913  0.8974  0.8189  1.6310
9     46.6042  6020.1941  77.5899  0.9091  0.7733  1.1954
Mean  49.6147  7344.5039  85.5538  0.8887  0.8156  1.4885
Std    2.3473   873.3702   5.0046  0.0133  0.0333  0.3122
Transformation Pipeline and Model Successfully Saved
                        Description        Value
0                        Session id          123
1                            Target  

                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     46.9314  6414.0499  80.0878  0.9030  0.7991  1.2843
1     55.1828  8485.5039  92.1168  0.8757  0.7952  1.2794
2     52.6451  7852.1197  88.6122  0.8758  0.8268  1.4871
3     48.8623  7536.6173  86.8137  0.8880  0.7819  1.2645
4     47.1979  6771.7838  82.2908  0.8943  0.8479  7.7770
5     48.6593  7308.8530  85.4918  0.8917  0.8886  2.0210
6     47.8624  6778.0835  82.3291  0.8977  0.9132  2.1079
7     49.2003  8003.4263  89.4619  0.8820  0.8052  1.1821
8     49.4943  7157.1215  84.5998  0.8852  0.8667  1.7908
9     54.0456  9027.5123  95.0132  0.8628  0.8112  1.9480
Mean  50.0081  7533.5071  86.6817  0.8856  0.8336  2.2142
Std    2.7577   778.2612   4.4484  0.0114  0.0417  1.8836


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                          

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
          MAE        MSE     RMSE      R2   RMSLE     MAPE
Fold                                                      
0     48.3161  6805.0940  82.4930  0.8971  0.8099   1.1715
1     56.8564  9181.8562  95.8220  0.8655  0.7734   1.0873
2     55.3279  8698.6879  93.2668  0.8624  0.8094   1.3494
3     51.8586  8330.7958  91.2732  0.8761  0.8006   1.2597
4     46.6508  6845.9487  82.7402  0.8932  0.8351  39.7010
5     48.1757  7341.4332  85.6822  0.8912  0.8551   1.4381
6     48.4873  6742.2308  82.1111  0.8982  0.8669   1.8017
7     50.2644  8260.5900  90.8878  0.8782  0.8293   1.2037
8     50.9860  7725.9745  87.8975  0.8760  0.8307   2.1490
9     53.5530  8655.0903  93.0327  0.8684  0.7997   1.7794
Mean  51.0476  7858.7702  88.5206  0.8806  0.8210   5.2941
Std    3.1814   845.1263   4.7817  0.0127  0.0265  11.4735


                                                         

          MAE        MSE     RMSE      R2   RMSLE     MAPE
Fold                                                      
0     46.3860  6216.0817  78.8421  0.9060  0.7694   1.3339
1     55.8818  8511.9863  92.2604  0.8753  0.7812   1.3415
2     52.4151  7840.7186  88.5478  0.8759  0.8001   1.4268
3     48.9449  7389.9997  85.9651  0.8901  0.7816   1.2914
4     46.8720  6752.9755  82.1765  0.8946  0.8308  52.4978
5     48.3957  7219.8472  84.9697  0.8930  0.8516   1.9202
6     47.1982  6405.1377  80.0321  0.9033  0.8843   2.0416
7     49.4075  7856.1398  88.6349  0.8842  0.7976   1.2346
8     49.5065  6978.5519  83.5377  0.8880  0.8149   1.7863
9     53.0825  8611.1941  92.7965  0.8691  0.7949   1.4540
Mean  49.8090  7378.2633  85.7763  0.8880  0.8107   6.6328
Std    2.9076   782.3876   4.5488  0.0115  0.0339  15.2907
Transformation Pipeline and Model Successfully Saved
                        Description        Value
0                        Session id           42
1                      

                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     43.0832  5233.8293  72.3452  0.8957  0.7793  0.6213
1     45.5000  6163.6976  78.5092  0.8735  0.7160  0.5981
2     44.4576  5531.9945  74.3774  0.8768  0.7529  0.5200
3     38.9623  4000.1756  63.2469  0.9159  0.7456  0.5119
4     43.6505  5312.5825  72.8875  0.8885  0.7125  0.5440
5     42.9988  5229.9332  72.3183  0.8955  0.7085  0.5643
6     42.6852  5006.0038  70.7531  0.8932  0.8047  0.5967
7     46.4997  6207.9372  78.7905  0.8588  0.8251  0.8038
8     44.9686  5807.2804  76.2055  0.8775  0.7788  0.6274
9     42.9722  5763.0818  75.9150  0.8614  0.7524  0.6118
Mean  43.5778  5425.6516  73.5349  0.8837  0.7576  0.5999
Std    1.9492   610.2019   4.2751  0.0166  0.0375  0.0784


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                          

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     42.8463  5382.9565  73.3686  0.8928  0.7779  0.5963
1     45.7138  6069.1475  77.9047  0.8754  0.7291  0.6246
2     45.3100  5505.8173  74.2012  0.8774  0.7892  0.6147
3     39.3652  4016.8643  63.3787  0.9155  0.7730  0.5440
4     44.8579  5711.1229  75.5720  0.8801  0.7317  0.5816
5     43.1582  5125.5638  71.5930  0.8976  0.7124  0.5426
6     43.7395  5264.2547  72.5552  0.8876  0.8456  0.5852
7     45.2053  6074.2170  77.9373  0.8618  0.8409  0.7463
8     44.6330  5503.4901  74.1855  0.8839  0.8042  0.6420
9     43.0042  5554.3783  74.5277  0.8664  0.7415  0.6422
Mean  43.7834  5420.7812  73.5224  0.8839  0.7746  0.6119
Std    1.7707   552.0546   3.9036  0.0148  0.0440  0.0560


                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     42.2376  5131.9256  71.6375  0.8978  0.7675  0.6185
1     44.8651  5916.5076  76.9188  0.8786  0.7108  0.6254
2     44.7663  5459.3031  73.8871  0.8784  0.7372  0.5227
3     37.8876  3907.1564  62.5073  0.9178  0.7444  0.5498
4     43.9335  5472.6896  73.9776  0.8851  0.7094  0.5852
5     42.5302  4995.8445  70.6813  0.9002  0.6896  0.5543
6     43.9774  5374.6465  73.3120  0.8853  0.8220  0.6810
7     45.7340  6127.3515  78.2774  0.8606  0.8328  0.8719
8     45.3512  5838.9803  76.4132  0.8769  0.7628  0.6784
9     43.0194  5647.0810  75.1471  0.8642  0.7308  0.6394
Mean  43.4302  5387.1486  73.2759  0.8845  0.7507  0.6327
Std    2.1567   593.3930   4.2176  0.0163  0.0446  0.0945
Transformation Pipeline and Model Successfully Saved
                        Description        Value
0                        Session id          123
1                            Target  

                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     45.3362  5389.0193  73.4099  0.8922  0.7562  0.5600
1     40.7585  4525.8876  67.2747  0.9030  0.7286  0.6090
2     43.1949  5570.3385  74.6347  0.8660  0.7373  0.5792
3     41.2055  4921.5067  70.1535  0.8920  0.7672  0.7804
4     45.6015  5997.1158  77.4410  0.8683  0.7089  0.5424
5     44.0639  5596.5630  74.8102  0.8845  0.7216  0.5879
6     43.2800  5130.6942  71.6289  0.8804  0.7981  0.6444
7     43.0817  5214.1554  72.2091  0.8810  0.7576  0.5662
8     39.6486  4465.5613  66.8249  0.9020  0.7325  0.6519
9     46.9510  6047.1012  77.7631  0.8755  0.7115  0.5046
Mean  43.3122  5285.7943  72.6150  0.8845  0.7420  0.6026
Std    2.1816   518.2565   3.5856  0.0122  0.0264  0.0728


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                          

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     46.2881  5787.1802  76.0735  0.8842  0.7403  0.5913
1     43.0764  5342.3431  73.0913  0.8855  0.7323  0.6094
2     44.6561  5677.5530  75.3495  0.8634  0.7375  0.6668
3     41.6807  5041.4277  71.0030  0.8894  0.7235  0.7204
4     47.6342  6582.2679  81.1312  0.8555  0.6918  0.5339
5     46.2027  6325.2109  79.5312  0.8695  0.7146  0.6109
6     44.7822  5713.0811  75.5849  0.8668  0.7623  0.5954
7     46.2069  5930.6986  77.0110  0.8646  0.7654  0.6202
8     41.8287  4953.9443  70.3843  0.8913  0.7081  0.6382
9     48.3897  6429.6127  80.1849  0.8677  0.6815  0.4832
Mean  45.0746  5778.3319  75.9345  0.8738  0.7257  0.6070
Std    2.1881   531.0361   3.5051  0.0119  0.0262  0.0622


                                                         

          MAE        MSE     RMSE      R2   RMSLE    MAPE
Fold                                                     
0     44.9221  5301.7843  72.8134  0.8939  0.7528  0.5970
1     40.0711  4503.2584  67.1063  0.9035  0.7200  0.6229
2     42.4739  5516.8753  74.2757  0.8673  0.7358  0.6013
3     41.0778  4883.7824  69.8841  0.8928  0.7536  0.7714
4     46.5148  6291.5225  79.3191  0.8619  0.6813  0.5326
5     44.2256  5677.7184  75.3506  0.8829  0.7144  0.5807
6     42.6061  5068.9049  71.1962  0.8818  0.7875  0.6730
7     43.7084  5191.2231  72.0501  0.8815  0.7590  0.6105
8     40.2895  4495.9639  67.0519  0.9014  0.7343  0.6919
9     47.4692  6096.7885  78.0819  0.8745  0.6726  0.5022
Mean  43.3358  5302.7822  72.7129  0.8841  0.7311  0.6184
Std    2.3864   576.8333   3.9510  0.0132  0.0336  0.0741
Transformation Pipeline and Model Successfully Saved


In [24]:
cat_features = ['dew_or_rime:idx' ,'is_in_shadow:idx']
cat_params = { 'A': {
                        "iterations": 5000,
                        "learning_rate": 0.034867396508006264,
                        "depth": 8,
                        "l2_leaf_reg": 1,
                        "loss_function": "MAE",
                        "border_count": 92,
                        "verbose": 500,
                        "subsample": 0.7641850606486046,
                        'early_stopping_rounds': 100,
                        'cat_features': cat_features,
                        'random_state': 42, 
                    },
              'B': {
                        "iterations": 5000,
                        "learning_rate": 0.037511244177544326,
                        "depth": 6,
                        "l2_leaf_reg": 5,
                        "loss_function": "MAE",
                        "border_count": 128,
                        "verbose": 500,
                        "subsample": 0.8012204629505595,
                        'early_stopping_rounds': 100,
                        'cat_features': cat_features,
                        'random_state': 42, 
                    },
              'C': {"iterations": 5000, 
                    "learning_rate": 0.03425599789981457,
                    "depth": 8,
                    "l2_leaf_reg": 4,
                    "loss_function": "MAE", 
                    "border_count": 218, 
                    "verbose": 500, 
                    "subsample": 0.6848272280307022, 
                    'early_stopping_rounds': 100,
                    'cat_features': cat_features,
                    'random_state': 42, }
}
              
cat_params_no_feature =  { 'A': {
                        "iterations": 5000,
                        "learning_rate": 0.034867396508006264,
                        "depth": 8,
                        "l2_leaf_reg": 1,
                        "loss_function": "MAE",
                        "border_count": 92,
                        "verbose": 500,
                        "subsample": 0.7641850606486046,
                        'early_stopping_rounds': 100,
                        'random_state': 42, 
                    },
              'B': {
                        "iterations": 5000,
                        "learning_rate": 0.037511244177544326,
                        "depth": 6,
                        "l2_leaf_reg": 5,
                        "loss_function": "MAE",
                        "border_count": 128,
                        "verbose": 500,
                        "subsample": 0.8012204629505595,
                        'early_stopping_rounds': 100,
                        'random_state': 42, 
                    },
              'C': {"iterations": 5000, 
                    "learning_rate": 0.03425599789981457,
                    "depth": 8,
                    "l2_leaf_reg": 4,
                    "loss_function": "MAE", 
                    "border_count": 218, 
                    "verbose": 500, 
                    "subsample": 0.6848272280307022, 
                    'early_stopping_rounds': 100,
                    'random_state': 42, }
}

In [15]:
def CATegorical(df):
    df['dew_or_rime:idx'] = df['dew_or_rime:idx'].astype(int)
    df['is_in_shadow:idx'] = df['is_in_shadow:idx'].astype(int)
    return df

In [27]:
from sklearn.impute import IterativeImputer
all_predictions_cat = []
# Cat_1 training and predictions
for loc in locations:
    
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    # Calling preprocessing
    X_train_cat, X_test_cat, is_day_feature1 = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated,mode = 'cat')
    X_train_cat.drop(columns=['date_forecast'], inplace=True)
    print(f'Doing... {loc}')

    imputer = IterativeImputer(max_iter=5, random_state=42)
    for col in X_train_cat.columns:
        X_train_cat[col] = imputer.fit_transform(np.array(X_train_cat[col]).reshape(-1,1))
    for col in X_test_cat.columns:
        X_test_cat[col] = imputer.fit_transform(np.array(X_test_cat[col]).reshape(-1,1))
    
    targets_cat = pd.DataFrame( {'pv_measurement': X_train_cat['pv_measurement']})
    X_train_cat = X_train_cat.drop(columns=['pv_measurement'])
    X_train_cat = CATegorical(X_train_cat)
    X_test_cat = CATegorical(X_test_cat)

    # Catboooooooozt fun
    model_cat = CatBoostRegressor(**cat_params[loc])

    X_train_cat1, X_val_cat1, y_train_cat1, y_val_cat1 = train_test_split(X_train_cat, targets_cat, test_size=0.2, random_state=42)
    
    # Training
    model_cat.fit(X_train_cat1, y_train_cat1['pv_measurement'],eval_set=(X_val_cat1, y_val_cat1['pv_measurement']),)

    # Prediction
    predictions_cat = model_cat.predict(X_test_cat[model_cat.feature_names_])
    
    # Multiplying the predictions with is_day, so setting predictions at night to zero
    adjusted_final_predictions_cat = predictions_cat * is_day_feature1['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_cat = np.clip(adjusted_final_predictions_cat, 0, None)

    # Appening predictions for each location to final list
    all_predictions_cat.append(adjusted_final_predictions_cat)

# Changing final list to array
all_predictions_cat = np.array(all_predictions_cat)

Doing... A
0:	learn: 1136.5478294	test: 1126.7779256	best: 1126.7779256 (0)	total: 132ms	remaining: 10m 59s
500:	learn: 299.1589179	test: 339.1212844	best: 339.1091885 (498)	total: 47s	remaining: 7m 1s
1000:	learn: 246.7139268	test: 328.1777351	best: 328.1777351 (1000)	total: 1m 25s	remaining: 5m 42s
1500:	learn: 215.3908163	test: 322.7291723	best: 322.7291723 (1500)	total: 2m 6s	remaining: 4m 54s
2000:	learn: 194.1990833	test: 320.5482988	best: 320.5151028 (1999)	total: 2m 56s	remaining: 4m 23s
2500:	learn: 177.6811755	test: 319.1596178	best: 319.1576476 (2499)	total: 3m 47s	remaining: 3m 47s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 318.7575341
bestIteration = 2575

Shrink model to first 2576 iterations.
Doing... B
0:	learn: 228.6978029	test: 226.7140601	best: 226.7140601 (0)	total: 58.1ms	remaining: 4m 50s
500:	learn: 58.1835306	test: 71.3786610	best: 71.3786610 (500)	total: 36.3s	remaining: 5m 26s
1000:	learn: 50.8143242	test: 69.1515526	best: 69.1496800 (9

In [29]:
# Catboost nr 3
all_predictions_cat_2 = []
for loc in locations:
    
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    X_train_3, X_test_3, is_day_feature_3 = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated)
    X_train_3.drop(columns=['date_forecast'], inplace=True)
    imputer = IterativeImputer(max_iter=5, random_state=42)
    for col in X_train_3.columns:
        X_train_3[col] = imputer.fit_transform(np.array(X_train_3[col]).reshape(-1,1))
    for col in X_test_3.columns:
        X_test_3[col] = imputer.fit_transform(np.array(X_test_3[col]).reshape(-1,1))
    targets_3 = pd.DataFrame( {'pv_measurement': X_train_3['pv_measurement']})
    X_train_3 = X_train_3.drop(columns=['pv_measurement'])
    
    model_cat_3 = CatBoostRegressor(**cat_params_no_feature[loc])
    X_train_3 = feature_engineering(X_train_3)
    X_test_3 = feature_engineering(X_test_3)

    # Create 'sin_sun_azimuth' and 'cos_sun_azimuth' from 'sun_azimuth' in radians
    X_train_3['sin_sun_azimuth'] = np.sin(np.radians(X_train_3['sun_azimuth:d']))
    X_train_3['cos_sun_azimuth'] = np.cos(np.radians(X_train_3['sun_azimuth:d']))
    X_test_3['sin_sun_azimuth'] = np.sin(np.radians(X_test_3['sun_azimuth:d']))
    X_test_3['cos_sun_azimuth'] = np.cos(np.radians(X_test_3['sun_azimuth:d']))

    # Now drop the original 'sun_azimuth' feature
    X_train_3.drop('sun_azimuth:d', axis=1, inplace=True)
    X_test_3.drop('sun_azimuth:d', axis=1, inplace=True)

    # Split the training data into training and validation sets
    X_train_cat_3, X_test_cat_3, y_train_cat_3, y_test_cat_3 = train_test_split(X_train_3, targets_3, test_size=0.2, random_state=42)

    # Train model
    model_cat_3.fit(X_train_cat_3, y_train_cat_3['pv_measurement'],eval_set=(X_test_cat_3, y_test_cat_3['pv_measurement']),)  
    
    # Pred
    pred_cat_3 = model_cat_3.predict(X_test_3[model_cat_3.feature_names_])

    # Multiplying the predictions with is_day, so setting predictions at night to zero
    adjusted_final_predictions_cat_3 = pred_cat_3 * is_day_feature_3['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_cat_3 = np.clip(adjusted_final_predictions_cat_3, 0, None)

    # Appening predictions for each location to final list
    all_predictions_cat_2.append(adjusted_final_predictions_cat_3) 

# Changing final list to array   
all_predictions_cat_2 = np.array(all_predictions_cat_2)

0:	learn: 945.7421297	test: 925.3651966	best: 925.3651966 (0)	total: 27.5ms	remaining: 2m 17s
500:	learn: 277.4531173	test: 301.6730739	best: 301.6730739 (500)	total: 12.1s	remaining: 1m 48s
1000:	learn: 237.6336661	test: 288.8311718	best: 288.8311718 (1000)	total: 23.2s	remaining: 1m 32s


KeyboardInterrupt: 

In [28]:
all_predictions_cat_3 = []
# Catboost nr 3
for loc in locations:
    
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

    X_train_3, X_test_3, is_day_feature_3 = preprocessing(train, X_train_observed, X_train_estimated, X_test_estimated,mode = 'catboost')
    X_train_3.drop(columns=['date_forecast'], inplace=True)
    imputer = IterativeImputer(max_iter=5, random_state=42)
    for col in X_train_3.columns:
        X_train_3[col] = imputer.fit_transform(np.array(X_train_3[col]).reshape(-1,1))
    for col in X_test_3.columns:
        X_test_3[col] = imputer.fit_transform(np.array(X_test_3[col]).reshape(-1,1))
    targets_3 = pd.DataFrame( {'pv_measurement': X_train_3['pv_measurement']})
    X_train_3 = X_train_3.drop(columns=['pv_measurement'])
    
    model_cat_3 = CatBoostRegressor(**cat_params_no_feature[loc])
   # X_train_3 = feature_engineering(X_train_3)
   # X_test_3 = feature_engineering(X_test_3)

    # Create 'sin_sun_azimuth' and 'cos_sun_azimuth' from 'sun_azimuth' in radians
    X_train_3['sin_sun_azimuth'] = np.sin(np.radians(X_train_3['sun_azimuth:d']))
    X_train_3['cos_sun_azimuth'] = np.cos(np.radians(X_train_3['sun_azimuth:d']))
    X_test_3['sin_sun_azimuth'] = np.sin(np.radians(X_test_3['sun_azimuth:d']))
    X_test_3['cos_sun_azimuth'] = np.cos(np.radians(X_test_3['sun_azimuth:d']))

    # Now drop the original 'sun_azimuth' feature
    X_train_3.drop('sun_azimuth:d', axis=1, inplace=True)
    X_test_3.drop('sun_azimuth:d', axis=1, inplace=True)

    # Split the training data into training and validation sets
    X_train_cat_3, X_test_cat_3, y_train_cat_3, y_test_cat_3 = train_test_split(X_train_3, targets_3, test_size=0.2, random_state=42)

    # Train model
    model_cat_3.fit(X_train_cat_3, y_train_cat_3['pv_measurement'],eval_set=(X_test_cat_3, y_test_cat_3['pv_measurement']),)  
    
    # Pred
    pred_cat_3 = model_cat_3.predict(X_test_3[model_cat_3.feature_names_])

    # Multiplying the predictions with is_day, so setting predictions at night to zero
    adjusted_final_predictions_cat_3 = pred_cat_3 * is_day_feature_3['is_day:idx']

    # Setting negative predictions to zero
    adjusted_final_predictions_cat_3 = np.clip(adjusted_final_predictions_cat_3, 0, None)

    # Appening predictions for each location to final list
    all_predictions_cat_3.append(adjusted_final_predictions_cat_3) 

# Changing final list to array   
all_predictions_cat_3 = np.array(all_predictions_cat_3)

0:	learn: 944.5035972	test: 923.1513579	best: 923.1513579 (0)	total: 18.8ms	remaining: 1m 33s
500:	learn: 281.9614880	test: 305.9260216	best: 305.9260216 (500)	total: 7.8s	remaining: 1m 10s
1000:	learn: 246.4577566	test: 294.1713330	best: 294.1713330 (1000)	total: 15.4s	remaining: 1m 1s
1500:	learn: 224.1178152	test: 289.0675935	best: 289.0675935 (1500)	total: 23.7s	remaining: 55.3s
2000:	learn: 209.1595791	test: 286.2417621	best: 286.2407106 (1999)	total: 32.8s	remaining: 49.2s
2500:	learn: 196.4720114	test: 283.7446587	best: 283.7254983 (2498)	total: 41.1s	remaining: 41.1s
3000:	learn: 185.7360923	test: 282.4002288	best: 282.3837594 (2984)	total: 49.3s	remaining: 32.9s
3500:	learn: 177.7502660	test: 281.3517613	best: 281.3517613 (3500)	total: 1m	remaining: 25.8s
4000:	learn: 171.3214326	test: 280.0675958	best: 280.0675958 (4000)	total: 1m 10s	remaining: 17.5s
4500:	learn: 165.6843097	test: 279.3612567	best: 279.3253236 (4483)	total: 1m 20s	remaining: 8.94s
4999:	learn: 160.8614180	te

In [20]:
all_predictions_cat_2.shape

(3, 720)

In [None]:
all_predictions_lGBM_e = np.array(all_predictions_lGBM_e).flatten()
all_predictions_cat = np.array(all_predictions_cat).flatten()
all_predictions_cat_2 = np.array(all_predictions_cat_2).flatten()
all_predictions_cat_3 = np.array(all_predictions_cat_3).flatten()
all_pred = 0.25*all_predictions_cat+0.25 * all_predictions_lGBM_e+0.25*all_predictions_cat_2 + 0.25*all_predictions_cat_3
print(all_pred.shape)

(2160,)


In [None]:


final_predictions = all_pred

# Save the final_predictions to CSV
df = pd.DataFrame(final_predictions, columns=['prediction'])
df['id'] = df.index
df = df[['id', 'prediction']]
df.to_csv('submit1.csv', index=False)

In [None]:
df1 = read_csv('submit.csv')
df1 = pd.read_csv('best_score1.csv')
df2 = pd.read_csv('best_score2.csv')


import matplotlib.pyplot as plt

# plot delle differenze tra df e df1
plt.plot(df1['prediction'] - df['prediction'])
plt.title('Differenze tra df1 e df')
plt.xlabel('Indice')
plt.ylabel('Differenza')
plt.show()
