# Data Exploration

In [4]:
# import necessary libraries

import pandas as pd
import xgboost as xgb
import torch
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import enefit
import time
import json
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt



In [5]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(device)

cuda


In [6]:
!nvidia-smi

Wed Jan 17 11:36:47 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0              27W / 250W |      2MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [7]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/predict-energy-behavior-of-prosumers/client.csv
/kaggle/input/predict-energy-behavior-of-prosumers/gas_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/electricity_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/weather_station_to_county_mapping.csv
/kaggle/input/predict-energy-behavior-of-prosumers/public_timeseries_testing_util.py
/kaggle/input/predict-energy-behavior-of-prosumers/historical_weather.csv
/kaggle/input/predict-energy-behavior-of-prosumers/county_id_to_name_map.json
/kaggle/input/predict-energy-behavior-of-prosumers/train.csv
/kaggle/input/predict-energy-behavior-of-prosumers/forecast_weather.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/sample_submission.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/client.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/gas_prices.csv
/kaggle/input/predict-energy-behavior-of-prosumers/example_test_files/electricity

<h2>Dataset Details</h2>
<p><strong>train.csv</strong></p>
<ul>
<li><code>county</code> - An ID code for the county.</li>
<li><code>is_business</code> - Boolean for whether or not the prosumer is a business.</li>
<li><code>product_type</code> - ID code with the following mapping of codes to contract types: <code>{0: "Combined", 1: "Fixed", 2: "General service", 3: "Spot"}</code>.</li>
<li><code>target</code> - The consumption or production amount for the relevant segment for the hour. The segments are defined by the <code>county</code>, <code>is_business</code>, and <code>product_type</code>.</li>
<li><code>is_consumption</code> - Boolean for whether or not this row's target is consumption or production.</li>
<li><code>datetime</code> - The Estonian time in EET (UTC+2) / EEST (UTC+3). It describes the start of the 1-hour period on which target is given.</li>
<li><code>data_block_id</code> - All rows sharing the same <code>data_block_id</code> will be available at the same forecast time. This is a function of what information is available when forecasts are actually made, at 11 AM each morning. For example, if the forecast weather <code>data_block_id</code> for predictins made on October 31st is 100 then the historic weather <code>data_block_id</code> for October 31st will be 101 as the historic weather data is only actually available the next day.</li>
<li><code>row_id</code> - A unique identifier for the row.</li>
<li><code>prediction_unit_id</code> - A unique identifier for the <code>county</code>, <code>is_business</code>, and <code>product_type</code> combination. <em>New prediction units can appear or disappear in the test set.</em></li>
</ul>
<p><strong>gas_prices.csv</strong></p>
<ul>
<li><code>origin_date</code> - The date when the day-ahead prices became available.</li>
<li><code>forecast_date</code> - The date when the forecast prices should be relevant.</li>
<li><code>[lowest/highest]_price_per_mwh</code> - The lowest/highest price of natural gas that on the day ahead market that trading day, in Euros per megawatt hour equivalent.</li>
<li><code>data_block_id</code></li>
</ul>
<p><strong>client.csv</strong></p>
<ul>
<li><code>product_type</code></li>
<li><code>county</code> - An ID code for the county. See <code>county_id_to_name_map.json</code> for the mapping of ID codes to county names.</li>
<li><code>eic_count</code> - The aggregated number of consumption points (EICs - European Identifier Code).</li>
<li><code>installed_capacity</code> -  Installed photovoltaic solar panel capacity in kilowatts.</li>
<li><code>is_business</code> - Boolean for whether or not the prosumer is a business.</li>
<li><code>date</code></li>
<li><code>data_block_id</code></li>
</ul>
<p><strong>electricity_prices.csv</strong></p>
<ul>
<li><code>origin_date</code></li>
<li><code>forecast_date</code> - Represents the start of the 1-hour period when the price is valid</li>
<li><code>euros_per_mwh</code> - The price of electricity on the day ahead markets in euros per megawatt hour.</li>
<li><code>data_block_id</code></li>
</ul>
<p><strong>forecast_weather.csv</strong> Weather forecasts that would have been available at prediction time. Sourced from the <a rel="noreferrer nofollow" target="_blank" href="https://codes.ecmwf.int/grib/param-db/?filter=grib2">European Centre for Medium-Range Weather Forecasts</a>.</p>
<ul>
<li><code>[latitude/longitude]</code> - The coordinates of the weather forecast.</li>
<li><code>origin_datetime</code> - The timestamp of when the forecast was generated.</li>
<li><code>hours_ahead</code> - The number of hours between the forecast generation and the forecast weather. Each forecast covers 48 hours in total.</li>
<li><code>temperature</code> - The air temperature at 2 meters above ground in degrees Celsius. Estimated for the end of the 1-hour period.</li>
<li><code>dewpoint</code> - The dew point temperature at 2 meters above ground in degrees Celsius. Estimated for the end of the 1-hour period.</li>
<li><code>cloudcover_[low/mid/high/total]</code> - The percentage of the sky covered by clouds in the following altitude bands: 0-2 km, 2-6, 6+, and total. Estimated for the end of the 1-hour period.</li>
<li><code>10_metre_[u/v]_wind_component</code> - The [eastward/northward] component of wind speed measured 10 meters above surface in meters per second. Estimated for the end of the 1-hour period.</li>
<li><code>data_block_id</code></li>
<li><code>forecast_datetime</code> - The timestamp of the predicted weather. Generated from <code>origin_datetime</code> plus <code>hours_ahead</code>. This represents the start of the 1-hour period for which weather data are forecasted.</li>
<li><code>direct_solar_radiation</code> - The direct solar radiation reaching the surface on a plane perpendicular to the direction of the Sun accumulated during the hour, in watt-hours per square meter.</li>
<li><code>surface_solar_radiation_downwards</code> - The solar radiation, both direct and diffuse, that reaches a horizontal plane at the surface of the Earth, accumulated during the hour, in watt-hours per square meter.</li>
<li><code>snowfall</code> - Snowfall over hour in units of meters of water equivalent.</li>
<li><code>total_precipitation</code> - The accumulated liquid, comprising rain and snow that falls on Earth's surface over the described hour, in units of meters.</li>
</ul>
<p><strong>historical_weather.csv</strong> <a rel="noreferrer nofollow" target="_blank" href="https://open-meteo.com/en/docs">Historic weather data</a>.</p>
<ul>
<li><code>datetime</code> - This represents the start of the 1-hour period for which weather data are measured.</li>
<li><code>temperature</code> - Measured at the end of the 1-hour period.</li>
<li><code>dewpoint</code> - Measured at the end of the 1-hour period.</li>
<li><code>rain</code> - Different from the forecast conventions. The rain from large scale weather systems of the hour in millimeters.</li>
<li><code>snowfall</code> - Different from the forecast conventions. Snowfall over the hour in centimeters.</li>
<li><code>surface_pressure</code> - The air pressure at surface in hectopascals.</li>
<li><code>cloudcover_[low/mid/high/total]</code> - Different from the forecast conventions. Cloud cover at 0-3 km, 3-8, 8+, and total.</li>
<li><code>windspeed_10m</code> - Different from the forecast conventions. The wind speed at 10 meters above ground in meters per second.</li>
<li><code>winddirection_10m</code> - Different from the forecast conventions. The wind direction at 10 meters above ground in degrees.</li>
<li><code>shortwave_radiation</code> - Different from the forecast conventions. The global horizontal irradiation in watt-hours per square meter.</li>
<li><code>direct_solar_radiation</code></li>
<li><code>diffuse_radiation</code> - Different from the forecast conventions. The diffuse solar irradiation in watt-hours per square meter.</li>
<li><code>[latitude/longitude]</code> - The coordinates of the weather station.</li>
<li><code>data_block_id</code></li>
</ul>
</ul>
<p><strong>client.csv</strong></p>
<ul>
<li><code>product_type</code></li>
<li><code>county</code> - An ID code for the county. See <code>county_id_to_name_map.json</code> for the mapping of ID codes to county names.</li>
<li><code>longitude</code> - The longitude of a location.</li>
<li><code>latitude</code> - The latitude of a location.</li>
</ul>

In [8]:
input_root_folder = "data"
train = pd.read_csv(f"{input_root_folder}/train.csv")
client = pd.read_csv(f'{input_root_folder}/client.csv')
elec_price = pd.read_csv(f"{input_root_folder}/electricity_prices.csv")
gas_price =  pd.read_csv(f"{input_root_folder}/gas_prices.csv")
forecast_weather =  pd.read_csv(f"{input_root_folder}/forecast_weather.csv")
historical_weather =  pd.read_csv(f"{input_root_folder}/historical_weather.csv")
weather_station =  pd.read_csv(f"{input_root_folder}/weather_station_to_county_mapping.csv")
locations= pd.read_csv(f'{input_root_folder}/county_lon_lats.csv')
locations.drop("Unnamed: 0", axis=1, inplace=True)

In [9]:
train["datetime"].min(), train["datetime"].max()

('2021-09-01 00:00:00', '2023-05-31 23:00:00')

#### Generate county details

In [10]:
### County population and area size
### http://www.statoids.com/uee.html

with open(f"{input_root_folder}/county_id_to_name_map.json") as f:
    data = json.load(f)
data = {int(key): val for key, val in data.items()}
county_area = {
    0: 4333,
    1: 1023,
    2: 3364,
    3: 2460,
    4: 2604,
    5: 3628,
    6: 2383,
    7: 4807,
    8: 2165,
    9: 2980,
    10: 2922,
    11: 2993,
    12: 2895, # estimate
    13: 2044,
    14: 3422,
    15: 2305
}

county_pop = {
    0: 523588,
    1: 10385,
    2: 177471,
    3: 38514,
    4: 38060,
    5: 67364,
    6: 28394,
    7: 90507,
    8: 32308,
    9: 37319,
    10: 35746,
    11: 149160,
    12: 90749,# estimate
    13: 35479,
    14: 57482,
    15: 39465
}
county_details = {
    "county": [],
    "population": [],
    "area": []
}
for county in county_area:
    county_details["county"].append(county)
    county_details["population"].append(county_pop[county])
    county_details["area"].append(county_area[county])

county_details = pd.DataFrame(county_details)
county_details["popPerArea"] = county_details["population"] / county_details["area"]
county_details

Unnamed: 0,county,population,area,popPerArea
0,0,523588,4333,120.837295
1,1,10385,1023,10.151515
2,2,177471,3364,52.755945
3,3,38514,2460,15.656098
4,4,38060,2604,14.615975
5,5,67364,3628,18.567806
6,6,28394,2383,11.915233
7,7,90507,4807,18.828167
8,8,32308,2165,14.922864
9,9,37319,2980,12.523154


#### Generate forecasted weather

In [11]:
forecast_weather[["latitude",'longitude']] = forecast_weather[["latitude",'longitude']].astype(float).round(decimals=1)

forecast_weather= forecast_weather.merge(locations, how='left',
                                             on=['longitude','latitude'])
forecast_weather.dropna(axis= 0, inplace= True)
forecast_weather['county'] = forecast_weather['county'].astype(int)
forecast_weather.drop(['origin_datetime', 'latitude','longitude', 'hours_ahead', 'data_block_id'], axis=1, inplace= True)
forecast_weather.rename(columns={'forecast_datetime': 'datetime'}, inplace= True)
    
#Converting (datetime) column to datetime
forecast_weather['datetime']= pd.to_datetime(forecast_weather['datetime'], utc= True)

### Grouping all forecast_weather columns mean values by hour, So each hour
### will have the mean values of the forecast_weather columns
forecast_weather_datetime= forecast_weather.groupby([forecast_weather['datetime'].dt.to_period('h')])[list(forecast_weather.drop(['county','datetime'], axis= 1).columns)].mean().reset_index()

#After converting the (datetime) column to hour period for the groupby we convert it back to datetime
forecast_weather_datetime['datetime']= pd.to_datetime(forecast_weather_datetime['datetime'].dt.to_timestamp(), utc=True)

## Grouping all forecast_weather columns mean values by hour and county, So each hour and county
## will have the mean values of the forecast_weather columns for each county
forecast_weather_datetime_county= forecast_weather.groupby(['county',forecast_weather['datetime'].dt.to_period('h')])[list(forecast_weather.drop(['county','datetime'], axis= 1).columns)].mean().reset_index()

#After converting the (datetime) column to hour period for the groupby we convert it back to datetime
forecast_weather_datetime_county['datetime']= pd.to_datetime(forecast_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)


  forecast_weather_datetime= forecast_weather.groupby([forecast_weather['datetime'].dt.to_period('h')])[list(forecast_weather.drop(['county','datetime'], axis= 1).columns)].mean().reset_index()
  forecast_weather_datetime_county= forecast_weather.groupby(['county',forecast_weather['datetime'].dt.to_period('h')])[list(forecast_weather.drop(['county','datetime'], axis= 1).columns)].mean().reset_index()


#### Generate historical weather

In [12]:
#Rounding the (latitude) and (longitude) for 1 decimal fraction           
historical_weather[['latitude', 'longitude']] = historical_weather[['latitude', 'longitude']].astype(float).round(1)

#Merging counties in locations data with the coordinations in the historical_weather data
historical_weather= historical_weather.merge(locations, how='left', on=['longitude','latitude'])    

#Dropping nan values
historical_weather.dropna(axis= 0, inplace= True)

#Dropping the columns we won't need
historical_weather.drop(['latitude', 'longitude'], axis=1, inplace= True)

#Converting (county) to integer
historical_weather['county'] = historical_weather['county'].astype('int64')

#Converting (datetime) column to datetime
historical_weather['datetime']= pd.to_datetime(historical_weather['datetime'], utc= True)

### Grouping all historical_weather columns mean values by hour, So each hour
### will have the mean values of the historical_weather columns"""     
hist_weather_datetime= historical_weather.groupby([historical_weather['datetime'].dt.to_period('h')])[list(historical_weather.drop(['county','datetime','data_block_id'], axis= 1).columns)].mean().reset_index()    

#After converting the (datetime) column to hour period for the groupby we convert it back to datetime
hist_weather_datetime['datetime']= pd.to_datetime(hist_weather_datetime['datetime'].dt.to_timestamp(), utc=True)

# Merging (data_block_id) back after dropping it in the last step | (data_block_id will be used to merge with train data)
hist_weather_datetime= hist_weather_datetime.merge(
    historical_weather[['datetime', 'data_block_id']], how='left', on='datetime')

### Grouping all historical_weather columns mean values by hour and county, So each hour
### will have the mean values of the historical_weather columns for each county
hist_weather_datetime_county= historical_weather.groupby(['county',historical_weather['datetime'].dt.to_period('h')])[list(historical_weather.drop(['county','datetime', 'data_block_id'], axis= 1).columns)].mean().reset_index() 

# After converting the (datetime) column to hour period for the groupby we convert it back to datetime
hist_weather_datetime_county['datetime']= pd.to_datetime(
    hist_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)

# Merging (data_block_id) back after dropping it in the last step
hist_weather_datetime_county= hist_weather_datetime_county.merge(historical_weather[['datetime', 'data_block_id']], how='left', on='datetime')

hist_weather_datetime['hour']= hist_weather_datetime['datetime'].dt.hour
hist_weather_datetime_county['hour']= hist_weather_datetime_county['datetime'].dt.hour

#Dropping duplicates and (datetime) column
hist_weather_datetime.drop_duplicates(inplace=True)
hist_weather_datetime_county.drop_duplicates(inplace=True)
hist_weather_datetime.drop('datetime', axis= 1, inplace= True)
hist_weather_datetime_county.drop('datetime', axis= 1, inplace= True)

  hist_weather_datetime= historical_weather.groupby([historical_weather['datetime'].dt.to_period('h')])[list(historical_weather.drop(['county','datetime','data_block_id'], axis= 1).columns)].mean().reset_index()
  hist_weather_datetime_county= historical_weather.groupby(['county',historical_weather['datetime'].dt.to_period('h')])[list(historical_weather.drop(['county','datetime', 'data_block_id'], axis= 1).columns)].mean().reset_index()


# Feature Engineering

#### Date time transformations

In [13]:
# Time features from the train set
train["datetime"] = pd.to_datetime(train["datetime"], utc=True)
train['hour'] = train['datetime'].dt.hour
train['day_of_week'] = train['datetime'].dt.dayofweek
train['day'] = train['datetime'].dt.day
train['month'] = train['datetime'].dt.month
train['year'] = train['datetime'].dt.year
train['quarter'] = train['datetime'].dt.quarter
train['dayofyear'] = train['datetime'].dt.dayofyear
train['sin_hour']= (np.pi * np.sin(train['hour']) / 12)
train['cos_hour']= (np.pi * np.cos(train['hour']) / 12)
train['sin_dayofyear']= (np.pi * np.sin(train['dayofyear']) / 183)
train['cos_dayofyear']= (np.pi * np.cos(train['dayofyear']) / 183)

Electricity prices transformations

In [14]:
elec_price["forecast_date"] = pd.to_datetime(elec_price["forecast_date"], utc=True)
elec_price["hour"] = elec_price["forecast_date"].dt.hour
elec_price

Unnamed: 0,forecast_date,euros_per_mwh,origin_date,data_block_id,hour
0,2021-09-01 00:00:00+00:00,92.51,2021-08-31 00:00:00,1,0
1,2021-09-01 01:00:00+00:00,88.90,2021-08-31 01:00:00,1,1
2,2021-09-01 02:00:00+00:00,87.35,2021-08-31 02:00:00,1,2
3,2021-09-01 03:00:00+00:00,86.88,2021-08-31 03:00:00,1,3
4,2021-09-01 04:00:00+00:00,88.43,2021-08-31 04:00:00,1,4
...,...,...,...,...,...
15281,2023-05-30 19:00:00+00:00,82.10,2023-05-29 19:00:00,637,19
15282,2023-05-30 20:00:00+00:00,150.85,2023-05-29 20:00:00,637,20
15283,2023-05-30 21:00:00+00:00,82.10,2023-05-29 21:00:00,637,21
15284,2023-05-30 22:00:00+00:00,82.09,2023-05-29 22:00:00,637,22


In [15]:
def create_revealed_targets_train(data, lags):
    '''Creating lagged target values | A target value depends on what was a target value n days ago'''
    
    #Storing datetime in a variable
    original_datetime = data['datetime']
    
    #Taking a subset of the following columns from the train data and storing it in a variable
    revealed_targets = data[['datetime', 'prediction_unit_id', 'is_consumption', 'target']].copy()
    
    #Create revealed targets for n days lags
    for day_lag in range(2, lags+1):
        revealed_targets['datetime'] = original_datetime + pd.DateOffset(day_lag)
        data = data.merge(revealed_targets, 
                          how='left', 
                          on = ['datetime', 'prediction_unit_id', 'is_consumption'],
                          suffixes = ('', f'_{day_lag}_days_ago')
                         )
    return data

#### Mege the datasets and impute missing

In [16]:
def add_features(df):
    
    df = create_revealed_targets_train(df, lags=num_lag)
    num_lag = 7
    ## Add mean,std,var of the lagged target variables
    df['target_mean']= df[[f'target_{i}_days_ago' for i in range(2, num_lag+1)]].mean(1)
    df['target_std']= df[[f'target_{i}_days_ago' for i in range(2, num_lag+1)]].std(1)
    df['target_var']= df[[f'target_{i}_days_ago' for i in range(2, num_lag+1)]].var(1)

    ## Merge all datasets
    df = (
        df
        .merge(gas_price[["data_block_id","lowest_price_per_mwh","highest_price_per_mwh"]], on=["data_block_id"], how="left")
        .merge(elec_price[["hour","data_block_id", "euros_per_mwh"]], on=["hour","data_block_id"], how="left")
        .merge(client.drop("date",axis=1), on=['data_block_id', 'county', 'is_business', 'product_type'], how="left")
        .merge(forecast_weather_datetime, on=['datetime'], how='left')
        .merge(forecast_weather_datetime_county, on=['datetime', 'county'], suffixes= ('_fcast_mean','_fcast_mean_by_county'), how='left')
        .merge(hist_weather_datetime, on=['data_block_id', 'hour'], how='left')
        .merge(hist_weather_datetime_county, on=['data_block_id', 'county', 'hour'], how='left', suffixes= ('_hist_mean','_hist_mean_by_county'))
        .merge(county_details, on=["county"], how="left")
    )
    df= df.groupby(['year', 'day', 'hour'], as_index=False).apply(lambda x: x.ffill().bfill()).reset_index()
    
    return df

train = add_features(train)

#### Apply log transformation

In [17]:
log_this_columns = [
        'target_2_days_ago', 'target_3_days_ago','target_4_days_ago', 'target_5_days_ago', 'target_6_days_ago', 'target_7_days_ago',  
        'target_mean', 'target_std', 'population', 'area','popPerArea'  
        'installed_capacity', 'euros_per_mwh', 'temperature_fcast_mean', 'dewpoint_fcast_mean',
        'cloudcover_high_fcast_mean', 'cloudcover_low_fcast_mean', 'cloudcover_mid_fcast_mean', 'cloudcover_total_fcast_mean',
        '10_metre_u_wind_component_fcast_mean', '10_metre_v_wind_component_fcast_mean', 'direct_solar_radiation_fcast_mean',
        'snowfall_fcast_mean', 'total_precipitation_fcast_mean', 'temperature_fcast_mean_by_county', 'dewpoint_fcast_mean_by_county',
        'cloudcover_high_fcast_mean_by_county', 'cloudcover_low_fcast_mean_by_county', 'cloudcover_mid_fcast_mean_by_county',
        'cloudcover_total_fcast_mean_by_county', '10_metre_u_wind_component_fcast_mean_by_county', '10_metre_v_wind_component_fcast_mean_by_county',
        'surface_solar_radiation_downwards_fcast_mean_by_county', 'snowfall_fcast_mean_by_county', 'total_precipitation_fcast_mean_by_county',
        'rain_hist_mean', 'snowfall_hist_mean', 'windspeed_10m_hist_mean_by_county',         
        ]

for col in log_this_columns:
    train[f"{col}_log"] = np.where((train[col])!= 0, np.log(train[col]),0)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

In [None]:
all_features_raw  = [
    'county', 'is_business', 'product_type','is_consumption',
    'hour', 'day_of_week', 'day', 'month', 'year',
    'quarter', 'dayofyear', 'lowest_price_per_mwh', 'highest_price_per_mwh',
    'euros_per_mwh', 'eic_count', 'installed_capacity',
    'temperature_fcast_mean', 'dewpoint_fcast_mean',
    'cloudcover_high_fcast_mean', 'cloudcover_low_fcast_mean',
    'cloudcover_mid_fcast_mean', 'cloudcover_total_fcast_mean',
    '10_metre_u_wind_component_fcast_mean',
    '10_metre_v_wind_component_fcast_mean',
    'direct_solar_radiation_fcast_mean',
    'surface_solar_radiation_downwards_fcast_mean', 'snowfall_fcast_mean',
    'total_precipitation_fcast_mean', 'temperature_fcast_mean_by_county',
    'dewpoint_fcast_mean_by_county', 'cloudcover_high_fcast_mean_by_county',
    'cloudcover_low_fcast_mean_by_county',
    'cloudcover_mid_fcast_mean_by_county',
    'cloudcover_total_fcast_mean_by_county',
    '10_metre_u_wind_component_fcast_mean_by_county',
    '10_metre_v_wind_component_fcast_mean_by_county',
    'direct_solar_radiation_fcast_mean_by_county',
    'surface_solar_radiation_downwards_fcast_mean_by_county',
    'snowfall_fcast_mean_by_county',
    'total_precipitation_fcast_mean_by_county', 'temperature_hist_mean',
    'dewpoint_hist_mean', 'rain_hist_mean', 'snowfall_hist_mean',
    'surface_pressure_hist_mean', 'cloudcover_total_hist_mean',
    'cloudcover_low_hist_mean', 'cloudcover_mid_hist_mean',
    'cloudcover_high_hist_mean', 'windspeed_10m_hist_mean',
    'winddirection_10m_hist_mean', 'shortwave_radiation_hist_mean',
    'direct_solar_radiation_hist_mean', 'diffuse_radiation_hist_mean',
    'temperature_hist_mean_by_county', 'dewpoint_hist_mean_by_county',
    'rain_hist_mean_by_county', 'snowfall_hist_mean_by_county',
    'surface_pressure_hist_mean_by_county',
    'cloudcover_total_hist_mean_by_county',
    'cloudcover_low_hist_mean_by_county',
    'cloudcover_mid_hist_mean_by_county',
    'cloudcover_high_hist_mean_by_county',
    'windspeed_10m_hist_mean_by_county',
    'winddirection_10m_hist_mean_by_county',
    'shortwave_radiation_hist_mean_by_county',
    'direct_solar_radiation_hist_mean_by_county',
    'diffuse_radiation_hist_mean_by_county', 'population', 'area',
]


In [1]:
all_features_transformed = [
    'hour','dayofyear',
    'sin_hour', 'cos_hour', 'sin_dayofyear', 'cos_dayofyear',
    
    'day_of_week', 'month','quarter','year', 'county', 'is_business', 'product_type',
    'euros_per_mwh', 'lowest_price_per_mwh', 'highest_price_per_mwh', 'eic_count', 'installed_capacity',
    'temperature_fcast_mean', 'dewpoint_fcast_mean',
    'cloudcover_high_fcast_mean', 'cloudcover_low_fcast_mean',
    'cloudcover_mid_fcast_mean', 'cloudcover_total_fcast_mean',
    '10_metre_u_wind_component_fcast_mean',
    '10_metre_v_wind_component_fcast_mean',
    'direct_solar_radiation_fcast_mean',
    'surface_solar_radiation_downwards_fcast_mean', 'snowfall_fcast_mean',
    'total_precipitation_fcast_mean', 'temperature_fcast_mean_by_county',
    'dewpoint_fcast_mean_by_county', 'cloudcover_high_fcast_mean_by_county',
    'cloudcover_low_fcast_mean_by_county',
    'cloudcover_mid_fcast_mean_by_county',
    'cloudcover_total_fcast_mean_by_county',
    '10_metre_u_wind_component_fcast_mean_by_county',
    '10_metre_v_wind_component_fcast_mean_by_county',
    'direct_solar_radiation_fcast_mean_by_county',
    'surface_solar_radiation_downwards_fcast_mean_by_county',
    'snowfall_fcast_mean_by_county',
    'total_precipitation_fcast_mean_by_county', 'temperature_hist_mean',
    'dewpoint_hist_mean', 'rain_hist_mean', 'snowfall_hist_mean',
    'surface_pressure_hist_mean', 'cloudcover_total_hist_mean',
    'cloudcover_low_hist_mean', 'cloudcover_mid_hist_mean',
    'cloudcover_high_hist_mean', 'windspeed_10m_hist_mean',
    'winddirection_10m_hist_mean', 'shortwave_radiation_hist_mean',
    'direct_solar_radiation_hist_mean', 'diffuse_radiation_hist_mean',
    'temperature_hist_mean_by_county', 'dewpoint_hist_mean_by_county',
    'rain_hist_mean_by_county', 'snowfall_hist_mean_by_county',
    'surface_pressure_hist_mean_by_county',
    'cloudcover_total_hist_mean_by_county',
    'cloudcover_low_hist_mean_by_county',
    'cloudcover_mid_hist_mean_by_county',
    'cloudcover_high_hist_mean_by_county',
    'windspeed_10m_hist_mean_by_county',
    'winddirection_10m_hist_mean_by_county',
    'shortwave_radiation_hist_mean_by_county',
    'direct_solar_radiation_hist_mean_by_county',
    'diffuse_radiation_hist_mean_by_county',
    
    'installed_capacity_log',
    'euros_per_mwh_log',
    'temperature_fcast_mean_log',
    'dewpoint_fcast_mean_log',
    'cloudcover_high_fcast_mean_log',
    'cloudcover_low_fcast_mean_log',
    'cloudcover_mid_fcast_mean_log',
    'cloudcover_total_fcast_mean_log',
    '10_metre_u_wind_component_fcast_mean_log',
    '10_metre_v_wind_component_fcast_mean_log',
    'direct_solar_radiation_fcast_mean_log',
    'snowfall_fcast_mean_log',
    'total_precipitation_fcast_mean_log',
    'temperature_fcast_mean_by_county_log',
    'dewpoint_fcast_mean_by_county_log',
    'cloudcover_high_fcast_mean_by_county_log',
    'cloudcover_low_fcast_mean_by_county_log',
    'cloudcover_mid_fcast_mean_by_county_log',
    'cloudcover_total_fcast_mean_by_county_log',
    '10_metre_u_wind_component_fcast_mean_by_county_log',
    '10_metre_v_wind_component_fcast_mean_by_county_log',
    'surface_solar_radiation_downwards_fcast_mean_by_county_log',
    'snowfall_fcast_mean_by_county_log',
    'total_precipitation_fcast_mean_by_county_log',
    'rain_hist_mean_log',
    'snowfall_hist_mean_log',
    'windspeed_10m_hist_mean_by_county_log',
        
    'target_2_days_ago',
    'target_3_days_ago',
    'target_4_days_ago',
    'target_5_days_ago',
    'target_6_days_ago',
    'target_7_days_ago',
    'target_2_days_ago_log',
    'target_3_days_ago_log',
    'target_4_days_ago_log',
    'target_5_days_ago_log',
    'target_6_days_ago_log',
    'target_7_days_ago_log',
    
    'target_mean', 'target_std', 'target_var',
    'target_mean_log',
    'target_std_log',
]

In [15]:
selected_features_v1 = [
    'day_of_week',
    'county',
    'eic_count',
    'installed_capacity',
    'surface_solar_radiation_downwards_fcast_mean',
    'snowfall_fcast_mean',
    'cloudcover_low_fcast_mean_by_county',
    'direct_solar_radiation_fcast_mean_by_county',
    'surface_solar_radiation_downwards_fcast_mean_by_county',
    'snowfall_fcast_mean_by_county',
    'direct_solar_radiation_fcast_mean_log',
    'surface_solar_radiation_downwards_fcast_mean_by_county_log',
    'target_2_days_ago',
    'target_6_days_ago',
    'target_7_days_ago',
    'target_2_days_ago_log',
    'target_6_days_ago_log',
    'target_7_days_ago_log',
    'target_mean',
    'target_std',
    'target_var',
    'target_mean_log',
    'target_std_log'
 ]


selected_features_v2 = [
    'day_of_week',
    'county',
    'eic_count',
    'installed_capacity',
    'surface_solar_radiation_downwards_fcast_mean',
    'snowfall_fcast_mean',
    'direct_solar_radiation_fcast_mean_by_county',
    'surface_solar_radiation_downwards_fcast_mean_by_county',
    'snowfall_fcast_mean_by_county',
    'direct_solar_radiation_fcast_mean_log',
    'surface_solar_radiation_downwards_fcast_mean_by_county_log',
    'target_2_days_ago',
    'target_6_days_ago',
    'target_7_days_ago',
    'target_2_days_ago_log',
    'target_6_days_ago_log',
    'target_7_days_ago_log',
    'target_mean',
    'target_std',
    'target_var',
    'target_mean_log',
    'target_std_log'
 ]



selected_features_v3 = [
    'dayofyear', 'target_2_days_ago', 'target_6_days_ago',
       'target_7_days_ago', 'target_mean', 'target_std', 'target_var',
       'day_of_week', 'county', 'is_business', 'eic_count',
       'installed_capacity',
       'surface_solar_radiation_downwards_fcast_mean',
       'cloudcover_low_fcast_mean_by_county',
       'direct_solar_radiation_fcast_mean_by_county',
       'surface_solar_radiation_downwards_fcast_mean_by_county',
       'snowfall_fcast_mean_by_county', 'installed_capacity_log',
       'direct_solar_radiation_fcast_mean_log',
       'surface_solar_radiation_downwards_fcast_mean_by_county_log',
       'target_2_days_ago_log', 'target_6_days_ago_log',
       'target_7_days_ago_log', 'target_mean_log', 'target_std_log'
]

In [21]:
# choose which feature to use for training
# options (all_features_raw, all_features_transformed, selected_features_v1, selected_features_v2, selected_features_v3)
features = selected_features_v3

In [None]:
# Prepare the data for training
X = train[features].reset_index(drop=True)
y = train['target'].reset_index(drop=True)

In [25]:
# Define the cross-validator
tscv = TimeSeriesSplit(n_splits=5)

# Placeholder for the MAE of each fold
mae_scores = []

# Perform Time Series Cross-Validation
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Initialize XGBoost regressor
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=1000,
        learning_rate=0.01,
        min_child_weight=5,
        max_depth=8,
        subsample=1,
        colsample_bytree=0.7,
        tree_method='gpu_hist',
        eval_metric="mae"
    )

    # Fit the model
    model.fit(X_train, y_train, eval_set=[(X_train, y_train),(X_test, y_test)], early_stopping_rounds=50, verbose=False)

    # Make predictions
    predictions = model.predict(X_test)

    # Calculate and print the MAE
    fold_mae = mean_absolute_error(y_test, predictions)
    mae_scores.append(fold_mae)
    print(f'Fold MAE: {fold_mae}')



Fold MAE: 92.61205503774889




Fold MAE: 45.35265752937994




Fold MAE: 41.56985146662197




Fold MAE: 58.445178938754644




Fold MAE: 59.45528648822021


In [27]:
# Print the average MAE across all folds
average_mae = sum(mae_scores) / len(mae_scores)
print(f'Average MAE across all folds: {average_mae}')

Average MAE across all folds: 59.487005892145135


# Hyperparameter Tuning

- run when features = selected_features_v3

In [20]:
# # Initialize parameters
# params = {
#     'objective': 'reg:squarederror',
#     'eval_metric': 'mae',
#     'tree_method': 'gpu_hist',
#     'n_estimators': 1000,
#     'learning_rate': 0.05
# }

In [21]:
# gridsearch_params = [
#     (max_depth, min_child_weight, subsample, colsample, eta)
#     for max_depth in range(8, 11, 2)
#     for min_child_weight in range(5, 9, 2)
#     for subsample in [i / 10. for i in range(9, 11)]
#     for colsample in [i / 10. for i in range(7, 9)]
#     for eta in [0.3, 0.01, 0.005]
# ]

# gridsearch_params

[(8, 5, 0.9, 0.7, 0.3),
 (8, 5, 0.9, 0.7, 0.01),
 (8, 5, 0.9, 0.7, 0.005),
 (8, 5, 0.9, 0.8, 0.3),
 (8, 5, 0.9, 0.8, 0.01),
 (8, 5, 0.9, 0.8, 0.005),
 (8, 5, 1.0, 0.7, 0.3),
 (8, 5, 1.0, 0.7, 0.01),
 (8, 5, 1.0, 0.7, 0.005),
 (8, 5, 1.0, 0.8, 0.3),
 (8, 5, 1.0, 0.8, 0.01),
 (8, 5, 1.0, 0.8, 0.005),
 (8, 7, 0.9, 0.7, 0.3),
 (8, 7, 0.9, 0.7, 0.01),
 (8, 7, 0.9, 0.7, 0.005),
 (8, 7, 0.9, 0.8, 0.3),
 (8, 7, 0.9, 0.8, 0.01),
 (8, 7, 0.9, 0.8, 0.005),
 (8, 7, 1.0, 0.7, 0.3),
 (8, 7, 1.0, 0.7, 0.01),
 (8, 7, 1.0, 0.7, 0.005),
 (8, 7, 1.0, 0.8, 0.3),
 (8, 7, 1.0, 0.8, 0.01),
 (8, 7, 1.0, 0.8, 0.005),
 (10, 5, 0.9, 0.7, 0.3),
 (10, 5, 0.9, 0.7, 0.01),
 (10, 5, 0.9, 0.7, 0.005),
 (10, 5, 0.9, 0.8, 0.3),
 (10, 5, 0.9, 0.8, 0.01),
 (10, 5, 0.9, 0.8, 0.005),
 (10, 5, 1.0, 0.7, 0.3),
 (10, 5, 1.0, 0.7, 0.01),
 (10, 5, 1.0, 0.7, 0.005),
 (10, 5, 1.0, 0.8, 0.3),
 (10, 5, 1.0, 0.8, 0.01),
 (10, 5, 1.0, 0.8, 0.005),
 (10, 7, 0.9, 0.7, 0.3),
 (10, 7, 0.9, 0.7, 0.01),
 (10, 7, 0.9, 0.7, 0.005),
 (10, 7, 0

In [22]:
# # Define the cross-validator
# tscv = TimeSeriesSplit(n_splits=5)

# # Initialize minimum MAE, best parameters and DataFrame for results
# min_mae = float("Inf")
# best_params = None
# results_df = pd.DataFrame(columns=['max_depth', 'min_child_weight', 'subsample', 'colsample', 'eta', 'mae', 'time'])

# # Iterate over the combined grid
# for max_depth, min_child_weight, subsample, colsample, eta in tqdm(gridsearch_params, desc="Hyperparameter Tuning Progress"):
#     print(f"Testing params: max_depth={max_depth}, min_child_weight={min_child_weight}, subsample={subsample}, colsample={colsample}, eta={eta}")
    
#     mae_scores = []
#     start_time = time.perf_counter()

#     for train_index, test_index in tscv.split(X):
#         X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#         y_train, y_test = y[train_index], y[test_index]

#         model = xgb.XGBRegressor(
#             objective=params['objective'],
#             n_estimators=params['n_estimators'],
#             learning_rate=eta,
#             max_depth=max_depth,
#             min_child_weight=min_child_weight,
#             subsample=subsample,
#             colsample_bytree=colsample,
#             tree_method=params['tree_method'],
#             early_stopping_rounds=50,
#         )

#         model.fit(X_train, y_train, eval_set=[(X_test, y_test)],  verbose=False)
#         predictions = model.predict(X_test)
#         fold_mae = mean_absolute_error(y_test, predictions)
#         mae_scores.append(fold_mae)

#     end_time = time.perf_counter()
#     elapsed_time = end_time - start_time

#     mean_mae = sum(mae_scores) / len(mae_scores)
#     print(f"\tAverage MAE: {mean_mae} | Time: {elapsed_time:.2f} seconds")

#     new_row = {
#     'max_depth': max_depth,
#     'min_child_weight': min_child_weight,
#     'subsample': subsample,
#     'colsample': colsample,
#     'eta': eta,
#     'mae': mean_mae,
#     'time': elapsed_time
#     }

#     results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)


#     if mean_mae < min_mae:
#         min_mae = mean_mae
#         best_params = (max_depth, min_child_weight, subsample, colsample, eta)

# # Update params with the best found values
# params['max_depth'] = best_params[0]
# params['min_child_weight'] = best_params[1]
# params['subsample'] = best_params[2]
# params['colsample_bytree'] = best_params[3]
# params['eta'] = best_params[4]

# # Print final params
# print(params)

Hyperparameter Tuning Progress:   0%|          | 0/48 [00:00<?, ?it/s]

Testing params: max_depth=8, min_child_weight=5, subsample=0.9, colsample=0.7, eta=0.3


Hyperparameter Tuning Progress:   2%|▏         | 1/48 [00:25<20:21, 25.99s/it]

	Average MAE: 64.87043952073236 | Time: 25.99 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=0.9, colsample=0.7, eta=0.01


Hyperparameter Tuning Progress:   4%|▍         | 2/48 [02:18<58:46, 76.66s/it]

	Average MAE: 61.723623580711845 | Time: 112.13 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=0.9, colsample=0.7, eta=0.005


Hyperparameter Tuning Progress:   6%|▋         | 3/48 [04:25<1:14:42, 99.61s/it]

	Average MAE: 62.37359690800146 | Time: 126.92 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=0.9, colsample=0.8, eta=0.3


Hyperparameter Tuning Progress:   8%|▊         | 4/48 [04:49<51:10, 69.78s/it]  

	Average MAE: 63.63910221909457 | Time: 24.03 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=0.9, colsample=0.8, eta=0.01


Hyperparameter Tuning Progress:  10%|█         | 5/48 [06:39<1:00:34, 84.52s/it]

	Average MAE: 62.50038874810159 | Time: 110.65 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=0.9, colsample=0.8, eta=0.005


Hyperparameter Tuning Progress:  12%|█▎        | 6/48 [08:46<1:09:19, 99.04s/it]

	Average MAE: 63.2856101815157 | Time: 127.24 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=1.0, colsample=0.7, eta=0.3


Hyperparameter Tuning Progress:  15%|█▍        | 7/48 [09:10<50:54, 74.50s/it]  

	Average MAE: 65.46181615937112 | Time: 23.95 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=1.0, colsample=0.7, eta=0.01


Hyperparameter Tuning Progress:  17%|█▋        | 8/48 [10:54<55:46, 83.66s/it]

	Average MAE: 62.21693745063732 | Time: 103.29 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=1.0, colsample=0.7, eta=0.005


Hyperparameter Tuning Progress:  19%|█▉        | 9/48 [12:57<1:02:20, 95.90s/it]

	Average MAE: 62.44546096679958 | Time: 122.81 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=1.0, colsample=0.8, eta=0.3


Hyperparameter Tuning Progress:  21%|██        | 10/48 [13:24<47:24, 74.85s/it] 

	Average MAE: 64.49836392420002 | Time: 27.70 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=1.0, colsample=0.8, eta=0.01


Hyperparameter Tuning Progress:  23%|██▎       | 11/48 [15:06<51:18, 83.22s/it]

	Average MAE: 62.791502403735976 | Time: 102.19 seconds
Testing params: max_depth=8, min_child_weight=5, subsample=1.0, colsample=0.8, eta=0.005


Hyperparameter Tuning Progress:  25%|██▌       | 12/48 [17:10<57:19, 95.53s/it]

	Average MAE: 63.34053036725195 | Time: 123.70 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=0.9, colsample=0.7, eta=0.3


Hyperparameter Tuning Progress:  27%|██▋       | 13/48 [17:34<43:05, 73.89s/it]

	Average MAE: 65.05460862801448 | Time: 24.07 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=0.9, colsample=0.7, eta=0.01


Hyperparameter Tuning Progress:  29%|██▉       | 14/48 [19:20<47:20, 83.56s/it]

	Average MAE: 61.80932245925438 | Time: 105.90 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=0.9, colsample=0.7, eta=0.005


Hyperparameter Tuning Progress:  31%|███▏      | 15/48 [21:26<53:02, 96.44s/it]

	Average MAE: 62.44029142875578 | Time: 126.28 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=0.9, colsample=0.8, eta=0.3


Hyperparameter Tuning Progress:  33%|███▎      | 16/48 [21:51<39:56, 74.90s/it]

	Average MAE: 64.37944519864672 | Time: 24.87 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=0.9, colsample=0.8, eta=0.01


Hyperparameter Tuning Progress:  35%|███▌      | 17/48 [23:42<44:13, 85.59s/it]

	Average MAE: 62.37240709598875 | Time: 110.46 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=0.9, colsample=0.8, eta=0.005


Hyperparameter Tuning Progress:  38%|███▊      | 18/48 [25:50<49:09, 98.33s/it]

	Average MAE: 63.39533959698173 | Time: 127.99 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=1.0, colsample=0.7, eta=0.3


Hyperparameter Tuning Progress:  40%|███▉      | 19/48 [26:17<37:11, 76.95s/it]

	Average MAE: 64.33258951800359 | Time: 27.15 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=1.0, colsample=0.7, eta=0.01


Hyperparameter Tuning Progress:  42%|████▏     | 20/48 [28:06<40:29, 86.76s/it]

	Average MAE: 61.86675109736965 | Time: 109.62 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=1.0, colsample=0.7, eta=0.005


Hyperparameter Tuning Progress:  44%|████▍     | 21/48 [30:10<44:02, 97.88s/it]

	Average MAE: 62.47110313092634 | Time: 123.80 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=1.0, colsample=0.8, eta=0.3


Hyperparameter Tuning Progress:  46%|████▌     | 22/48 [30:35<32:57, 76.06s/it]

	Average MAE: 63.58630189167691 | Time: 25.16 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=1.0, colsample=0.8, eta=0.01


Hyperparameter Tuning Progress:  48%|████▊     | 23/48 [32:24<35:42, 85.71s/it]

	Average MAE: 62.56700112460423 | Time: 108.23 seconds
Testing params: max_depth=8, min_child_weight=7, subsample=1.0, colsample=0.8, eta=0.005


Hyperparameter Tuning Progress:  50%|█████     | 24/48 [34:27<38:45, 96.90s/it]

	Average MAE: 63.39434040377354 | Time: 122.99 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=0.9, colsample=0.7, eta=0.3


Hyperparameter Tuning Progress:  52%|█████▏    | 25/48 [34:52<28:57, 75.55s/it]

	Average MAE: 62.94245378564209 | Time: 25.75 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=0.9, colsample=0.7, eta=0.01


Hyperparameter Tuning Progress:  54%|█████▍    | 26/48 [37:24<36:06, 98.45s/it]

	Average MAE: 59.45401744643307 | Time: 151.89 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=0.9, colsample=0.7, eta=0.005


Hyperparameter Tuning Progress:  56%|█████▋    | 27/48 [40:32<43:52, 125.36s/it]

	Average MAE: 60.02024392142365 | Time: 188.12 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=0.9, colsample=0.8, eta=0.3


Hyperparameter Tuning Progress:  58%|█████▊    | 28/48 [40:59<31:55, 95.75s/it] 

	Average MAE: 62.29057991285364 | Time: 26.67 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=0.9, colsample=0.8, eta=0.01


Hyperparameter Tuning Progress:  60%|██████    | 29/48 [43:24<35:02, 110.64s/it]

	Average MAE: 60.26963157341292 | Time: 145.36 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=0.9, colsample=0.8, eta=0.005


Hyperparameter Tuning Progress:  62%|██████▎   | 30/48 [46:32<40:04, 133.61s/it]

	Average MAE: 60.7109565534159 | Time: 187.20 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=1.0, colsample=0.7, eta=0.3


Hyperparameter Tuning Progress:  65%|██████▍   | 31/48 [46:57<28:39, 101.17s/it]

	Average MAE: 62.25681674148548 | Time: 25.50 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=1.0, colsample=0.7, eta=0.01


Hyperparameter Tuning Progress:  67%|██████▋   | 32/48 [49:18<30:09, 113.12s/it]

	Average MAE: 60.31045601345446 | Time: 140.99 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=1.0, colsample=0.7, eta=0.005


Hyperparameter Tuning Progress:  69%|██████▉   | 33/48 [52:21<33:32, 134.15s/it]

	Average MAE: 60.39464492491233 | Time: 183.21 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=1.0, colsample=0.8, eta=0.3


Hyperparameter Tuning Progress:  71%|███████   | 34/48 [52:48<23:44, 101.74s/it]

	Average MAE: 63.17615329365951 | Time: 26.10 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=1.0, colsample=0.8, eta=0.01


Hyperparameter Tuning Progress:  73%|███████▎  | 35/48 [55:10<24:41, 113.96s/it]

	Average MAE: 61.3152434899351 | Time: 142.48 seconds
Testing params: max_depth=10, min_child_weight=5, subsample=1.0, colsample=0.8, eta=0.005


Hyperparameter Tuning Progress:  75%|███████▌  | 36/48 [58:14<26:58, 134.91s/it]

	Average MAE: 61.57744748883092 | Time: 183.78 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=0.9, colsample=0.7, eta=0.3


Hyperparameter Tuning Progress:  77%|███████▋  | 37/48 [58:40<18:45, 102.33s/it]

	Average MAE: 62.877149930947084 | Time: 26.32 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=0.9, colsample=0.7, eta=0.01


Hyperparameter Tuning Progress:  79%|███████▉  | 38/48 [1:00:59<18:53, 113.37s/it]

	Average MAE: 59.65550081666 | Time: 139.13 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=0.9, colsample=0.7, eta=0.005


Hyperparameter Tuning Progress:  81%|████████▏ | 39/48 [1:04:04<20:13, 134.79s/it]

	Average MAE: 59.93705710106584 | Time: 184.76 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=0.9, colsample=0.8, eta=0.3


Hyperparameter Tuning Progress:  83%|████████▎ | 40/48 [1:04:31<13:39, 102.48s/it]

	Average MAE: 62.362856233993725 | Time: 27.06 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=0.9, colsample=0.8, eta=0.01


Hyperparameter Tuning Progress:  85%|████████▌ | 41/48 [1:06:52<13:17, 113.90s/it]

	Average MAE: 60.53316047852288 | Time: 140.55 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=0.9, colsample=0.8, eta=0.005


Hyperparameter Tuning Progress:  88%|████████▊ | 42/48 [1:09:56<13:31, 135.18s/it]

	Average MAE: 60.78409440960543 | Time: 184.84 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=1.0, colsample=0.7, eta=0.3


Hyperparameter Tuning Progress:  90%|████████▉ | 43/48 [1:10:25<08:35, 103.14s/it]

	Average MAE: 62.94574943470887 | Time: 28.37 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=1.0, colsample=0.7, eta=0.01


Hyperparameter Tuning Progress:  92%|█████████▏| 44/48 [1:12:46<07:38, 114.60s/it]

	Average MAE: 60.172431740970424 | Time: 141.33 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=1.0, colsample=0.7, eta=0.005


Hyperparameter Tuning Progress:  94%|█████████▍| 45/48 [1:15:49<06:44, 134.92s/it]

	Average MAE: 60.23988566739049 | Time: 182.32 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=1.0, colsample=0.8, eta=0.3


Hyperparameter Tuning Progress:  96%|█████████▌| 46/48 [1:16:14<03:24, 102.03s/it]

	Average MAE: 63.08747110760834 | Time: 25.30 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=1.0, colsample=0.8, eta=0.01


Hyperparameter Tuning Progress:  98%|█████████▊| 47/48 [1:18:25<01:50, 110.63s/it]

	Average MAE: 61.22076102784293 | Time: 130.68 seconds
Testing params: max_depth=10, min_child_weight=7, subsample=1.0, colsample=0.8, eta=0.005


Hyperparameter Tuning Progress: 100%|██████████| 48/48 [1:21:27<00:00, 101.81s/it]

	Average MAE: 61.58321111300624 | Time: 182.03 seconds
{'objective': 'reg:squarederror', 'eval_metric': 'mae', 'tree_method': 'gpu_hist', 'n_estimators': 1000, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 5, 'subsample': 0.9, 'colsample_bytree': 0.7, 'eta': 0.01}





In [23]:
# # saving the dataframe
# results_df.to_csv('hyperparam_tuning_df.csv')

# results_df

Unnamed: 0,max_depth,min_child_weight,subsample,colsample,eta,mae,time
0,8,5,0.9,0.7,0.3,64.87044,25.986456
1,8,5,0.9,0.7,0.01,61.723624,112.125522
2,8,5,0.9,0.7,0.005,62.373597,126.924967
3,8,5,0.9,0.8,0.3,63.639102,24.029246
4,8,5,0.9,0.8,0.01,62.500389,110.649843
5,8,5,0.9,0.8,0.005,63.28561,127.236056
6,8,5,1.0,0.7,0.3,65.461816,23.951245
7,8,5,1.0,0.7,0.01,62.216937,103.285509
8,8,5,1.0,0.7,0.005,62.445461,122.807689
9,8,5,1.0,0.8,0.3,64.498364,27.696551


# Model

- change hyperparameters according to the model version
    - default hyperparameters: {learning_rate=0.01, max_depth=8, min_child_weight = 5, subsample = 1, colsample_bytree=0.7} 
    - best hyperparameters: {learning_rate=0.01, max_depth=10, min_child_weight = 7, subsample = 0.9, colsample_bytree=0.7}

In [None]:
# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

# Fit the model on the entire dataset
final_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=10000,
    learning_rate=0.01,
    max_depth=8,
    min_child_weight=5,
    subsample=1,
    colsample_bytree=0.7,
    tree_method='gpu_hist',
    eval_metric='mae',
    early_stopping_rounds=100
)

# Fit the model
final_model.fit(
    X_train, y_train, 
    eval_set=[(X_train, y_train), (X_val, y_val)], 
    verbose=True
)

# Capture the training and validation metrics
evals_result = final_model.evals_result()

# Create a DataFrame from the metrics
results_df_2 = pd.DataFrame({
    'train': evals_result['validation_0']['mae'],
    'validation': evals_result['validation_1']['mae']
})

# saving the dataframe
results_df_2.to_csv('model_train_val_df.csv')

[0]	validation_0-mae:271.63570	validation_1-mae:272.68724
[1]	validation_0-mae:268.93735	validation_1-mae:269.97888
[2]	validation_0-mae:266.27617	validation_1-mae:267.31215
[3]	validation_0-mae:263.63942	validation_1-mae:264.66590
[4]	validation_0-mae:261.03333	validation_1-mae:262.05207
[5]	validation_0-mae:258.45791	validation_1-mae:259.47010
[6]	validation_0-mae:255.91309	validation_1-mae:256.92046
[7]	validation_0-mae:253.39886	validation_1-mae:254.39887
[8]	validation_0-mae:250.91268	validation_1-mae:251.90574
[9]	validation_0-mae:248.45454	validation_1-mae:249.44087
[10]	validation_0-mae:246.02776	validation_1-mae:247.00754
[11]	validation_0-mae:243.62952	validation_1-mae:244.60505
[12]	validation_0-mae:241.25783	validation_1-mae:242.22739
[13]	validation_0-mae:238.91503	validation_1-mae:239.88280
[14]	validation_0-mae:236.60017	validation_1-mae:237.56517
[15]	validation_0-mae:234.31122	validation_1-mae:235.27131
[16]	validation_0-mae:232.04989	validation_1-mae:233.00488
[17]	va

# Visualization

## Plotting for All Feature Versions with Default Hyperparameters

In [None]:
all_results = pd.read_csv("/kaggle/input/feature-eng-res/trainvalid_iteration_all.csv").drop("Unnamed: 0",axis=1)
all_results.rename(columns={"train": "train_all", "validation": "validation_all"}, inplace=True)
iter_1 = pd.read_csv("/kaggle/input/feature-eng-res/trainvalid_iteration_1.csv").drop("Unnamed: 0",axis=1)
iter_1.rename(columns={"train": "train_1", "validation": "validation_1"}, inplace=True)
iter_2 = pd.read_csv("/kaggle/input/feature-eng-res/trainvalid_iteration_2.csv").drop("Unnamed: 0",axis=1)
iter_2.rename(columns={"train": "train_2", "validation": "validation_2"}, inplace=True)
iter_3 = pd.read_csv("/kaggle/input/feature-eng-res/trainvalid_iteration_3.csv").drop("Unnamed: 0",axis=1)
iter_3.rename(columns={"train": "train_3", "validation": "validation_3"}, inplace=True)

In [None]:
results_df = (
    all_results.merge(iter_1, left_index=True, right_index=True, how="outer")
    .merge(iter_2, left_index=True, right_index=True, how="outer")
    .merge(iter_3, left_index=True, right_index=True, how="outer")
)
results_df

In [None]:
# Retrieve performance metrics
plt.rc('font', family='serif', size=8)
estimators = results_df.shape[0]
x_axis = range(0, estimators)

# Plot
fig, ax = plt.subplots(figsize=[8,6])
ax.plot(x_axis, results_df["train_all"], label='all features train',linestyle='dotted', color='g', linewidth=.75)
ax.plot(x_axis, results_df["validation_all"], label='all features validation',linestyle='-', color='g', linewidth=.75)
ax.plot(x_axis, results_df["train_1"], label='feature set 1 train',linestyle='dotted', color='brown', linewidth=.75)
ax.plot(x_axis, results_df["validation_1"], label='feature set 1 validation',linestyle='-', color='brown', linewidth=.75)
ax.plot(x_axis, results_df["train_2"], label='feature set 2 train',linestyle='dotted', color='r', linewidth=.75)
ax.plot(x_axis, results_df["validation_2"], label='feature set 2 validation',linestyle='-', color='r', linewidth=.75)
ax.plot(x_axis, results_df["train_3"], label='feature set 3 train',linestyle='dotted', color='b', linewidth=.75)
ax.plot(x_axis, results_df["validation_3"], label='feature set 3 validation',linestyle='-', color='b', linewidth=.75)
ax.legend()
plt.ylabel('MAE')
plt.xlabel('n estimators')
plt.title('Train and Validation MAE')
plt.show()


## Plotting for Selected Feature V3 with Default Hyperparamaters vs Selected Features V3 with Best Hyperparameters

In [None]:
def_hyperparams = pd.read_csv("/kaggle/input/feature-eng-res/trainvalid_iteration_3_def.csv").drop("Unnamed: 0",axis=1)
best_hyperparams = pd.read_csv("/kaggle/input/feature-eng-res/trainvalid_iteration_3_best.csv").drop("Unnamed: 0",axis=1)

In [None]:
results_df = (
    def_hyperparams.merge(best_hyperparams, left_index=True, right_index=True, how="outer")
)
results_df

In [None]:
# Retrieve performance metrics
plt.rc('font', family='serif', size=8)
estimators = results_df.shape[0]
x_axis = range(0, estimators)

# Plot
fig, ax = plt.subplots(figsize=[8,6])
ax.plot(x_axis, results_df["train_x"], label='best features default hyperparams train',linestyle='dotted', color='g', linewidth=.75)
ax.plot(x_axis, results_df["validation_x"], label='best features default hyperparams validation',linestyle='-', color='g', linewidth=.75)
ax.plot(x_axis, results_df["train_y"], label='best features best hyperparams train',linestyle='dotted', color='brown', linewidth=.75)
ax.plot(x_axis, results_df["validation_y"], label='best features best hyperparams validation',linestyle='-', color='brown', linewidth=.75)
ax.legend()
plt.ylabel('MAE')
plt.xlabel('n estimators')
plt.title('Train and Validation MAE')
# plt.show()
plt.savefig('Train and Validation MAE.png', dpi=300)


## Feature Importance

In [None]:
xgb.plot_importance(final_model, importance_type="weight")

# Submission

- uses the python time-series API provided by Enefit for the Kaggle competition

In [None]:
def create_revealed_targets_test(data, previous_revealed_targets, max_lags):
    '''🎯 Create new test data based on previous_revealed_targets and N_day_lags 🎯 ''' 
    for count, revealed_targets in enumerate(previous_revealed_targets) :
        day_lag = count + 2
        
        # Get hour
        revealed_targets['hour'] = pd.to_datetime(revealed_targets['datetime'], utc= True).dt.hour
        
        # Select columns and rename target
        revealed_targets = revealed_targets[['hour', 'prediction_unit_id', 'is_consumption', 'target']]
        revealed_targets = revealed_targets.rename(columns = {"target" : f"target_{day_lag}_days_ago"})
        
        
        # Add past revealed targets
        data = pd.merge(data, revealed_targets, how='left', on=['hour', 'prediction_unit_id', 'is_consumption'])
        
    # If revealed_target_columns not available, replace by nan
    all_revealed_columns = [f"target_{day_lag}_days_ago" for day_lag in range(2, max_lags+1)]
    missing_columns = list(set(all_revealed_columns) - set(data.columns))
    data[missing_columns] = np.nan 
    
    return data

In [None]:
def generate_test_set(test, client, historical_weather,forecast_weather, electricity_prices, gas_prices):
    
    forecast_weather[["latitude",'longitude']] = forecast_weather[["latitude",'longitude']].astype(float).round(decimals=1)
    forecast_weather[["latitude",'longitude']]

    forecast_weather= forecast_weather.merge(locations, how='left', on=['longitude','latitude'])
    forecast_weather.dropna(axis= 0, inplace= True)
    forecast_weather['county'] = forecast_weather['county'].astype(int)
    forecast_weather.drop(['origin_datetime', 'latitude',
                               'longitude', 'hours_ahead'], axis=1, inplace= True)
    forecast_weather.rename(columns={'forecast_datetime': 'datetime'}, inplace= True)

    #Converting (datetime) column to datetime
    forecast_weather['datetime']= pd.to_datetime(forecast_weather['datetime'], utc= True)

    """Grouping all forecast_weather columns mean values by hour, So each hour will have the mean values of the forecast_weather columns""" 
    forecast_weather_datetime= forecast_weather.groupby([forecast_weather['datetime'].dt.to_period('h')])[list(forecast_weather.drop(['county','datetime'], axis= 1).columns)].mean().reset_index()
    #After converting the (datetime) column to hour period for the groupby we convert it back to datetime
    forecast_weather_datetime['datetime']= pd.to_datetime(forecast_weather_datetime['datetime'].dt.to_timestamp(), utc=True)

    """Grouping all forecast_weather columns mean values by hour and county, So each hour and county will have the mean values of the forecast_weather columns for each county""" 
    forecast_weather_datetime_county= forecast_weather.groupby(['county',forecast_weather['datetime'].
                              dt.to_period('h')])[list(forecast_weather.drop(['county',
                                                                              'datetime'], axis= 1)
                                                       .columns)].mean().reset_index()

    #After converting the (datetime) column to hour period for the groupby we convert it back to datetime
    forecast_weather_datetime_county['datetime']= pd.to_datetime(
        forecast_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)



    #Rounding the (latitude) and (longitude) for 1 decimal fraction           
    historical_weather[['latitude', 'longitude']] = historical_weather[['latitude', 'longitude']].astype(float).round(1)

    #Merging counties in locations data with the coordinations in the historical_weather data
    historical_weather= historical_weather.merge(locations, how='left', on=['longitude','latitude'])    

    #Dropping nan values to remov
    historical_weather.dropna(axis= 0, inplace= True)

    #Dropping the columns we won't need
    historical_weather.drop(['latitude', 'longitude'], axis=1, inplace= True)

    #Converting (county) to integer
    historical_weather['county'] = historical_weather['county'].astype('int64')

    #Converting (datetime) column to datetime
    historical_weather['datetime']= pd.to_datetime(historical_weather['datetime'], utc= True)

    """Grouping all historical_weather columns mean values by hour, So each hour will have the mean values of the historical_weather columns"""     
    hist_weather_datetime= historical_weather.groupby([historical_weather['datetime'].dt.to_period('h')])[list(historical_weather.drop(['county','datetime'], axis= 1).columns)].mean().reset_index()    

    #After converting the (datetime) column to hour period for the groupby we convert it back to datetime
    hist_weather_datetime['datetime']= pd.to_datetime(
        hist_weather_datetime['datetime'].dt.to_timestamp(), utc=True)

    #Merging (data_block_id) back after dropping it in the last step | (data_block_id will be used to merge with train data)
    hist_weather_datetime= hist_weather_datetime.merge(historical_weather[['datetime']], how='left', on='datetime')

    """Grouping all historical_weather columns mean values by hour and county, So each hour will have the mean values of the historical_weather columns for each county"""   
    hist_weather_datetime_county= historical_weather.groupby(['county',historical_weather['datetime'].dt.to_period('h')])[list(historical_weather.drop(['county','datetime'], axis= 1).columns)].mean().reset_index() 

    #After converting the (datetime) column to hour period for the groupby we convert it back to datetime
    hist_weather_datetime_county['datetime']= pd.to_datetime(hist_weather_datetime_county['datetime'].dt.to_timestamp(), utc=True)

    #Merging (data_block_id) back after dropping it in the last step
    hist_weather_datetime_county= hist_weather_datetime_county.merge(historical_weather[['datetime']], how='left', on='datetime')

    hist_weather_datetime['hour']= hist_weather_datetime['datetime'].dt.hour
    hist_weather_datetime_county['hour']= hist_weather_datetime_county['datetime'].dt.hour

    #Dropping duplicates and (datetime) column
    hist_weather_datetime.drop_duplicates(inplace=True)
    hist_weather_datetime_county.drop_duplicates(inplace=True)
    hist_weather_datetime.drop('datetime', axis= 1, inplace= True)
    hist_weather_datetime_county.drop('datetime', axis= 1, inplace= True)

    electricity_prices["forecast_date"] = pd.to_datetime(electricity_prices["forecast_date"], utc=True)
    electricity_prices["hour"] = electricity_prices["forecast_date"].dt.hour
    
    # Remove timezone information from both columns
    test['datetime'] = test['datetime'].dt.tz_localize(None)
    forecast_weather_datetime['datetime'] = forecast_weather_datetime['datetime'].dt.tz_localize(None)
    forecast_weather_datetime_county['datetime'] = forecast_weather_datetime_county['datetime'].dt.tz_localize(None)

    # Merge other necessary dataframes

    # Electricity Prices
    test = test.merge(electricity_prices[["hour", "euros_per_mwh"]], on=["hour"], how="left")
    
    # Client
    test = test.merge(client.drop("date",axis=1), on=['county', 'is_business', 'product_type'], how="left")

    # Forecast Weather Datetime
    test = test.merge(forecast_weather_datetime, how='left', on=['datetime'])

    # Forecast Weather Datetime County
    test = test.merge(forecast_weather_datetime_county, how='left', on=['datetime', 'county'],suffixes= ('_fcast_mean','_fcast_mean_by_county'))

    # Historical Weather Datetime
    test = test.merge(hist_weather_datetime, how='left', on=['hour'])

    # Historical Weather Datetime County
    test = test.merge(hist_weather_datetime_county, how='left', on= ['county', 'hour'],suffixes= ('_hist_mean','_hist_mean_by_county'))
    test = test.merge(county_details, on=["county"], how="left")

    
    test['date'] = test['datetime'].dt.normalize()
    
    # Gas Prices
    gas_prices['date'] = pd.to_datetime(gas_prices['forecast_date'])
    
    test = test.merge(gas_prices[["date", "lowest_price_per_mwh", "highest_price_per_mwh"]], left_on="date", right_on="date", how="left")

    test.drop('date', axis=1, inplace=True)

    test[['county','product_type','is_consumption']] = test[['county','product_type','is_consumption']].astype(object)
    
    
    test['county'] = test['county'].astype(int)
    test['product_type'] = test['product_type'].astype(int)
    
    return test

In [None]:
# Reload enefit environment (only in debug mode, otherwise the submission will fail)
debug = False # set this to False if submitting
env = enefit.make_env()
if debug :
    enefit.make_env.__called__ = False
    type(env)._state = type(type(env)._state).__dict__['INIT']
    iter_test = env.iter_test()
else:

    iter_test = env.iter_test()

In [None]:
previous_revealed_targets = []

for (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:
    
    test = test.rename(columns={'prediction_datetime':'datetime'})
    test["datetime"] = pd.to_datetime(test["datetime"], utc=True)
    test['hour'] = test['datetime'].dt.hour
    test['day_of_week'] = test['datetime'].dt.dayofweek
    test['day'] = test['datetime'].dt.day
    test['month'] = test['datetime'].dt.month
    test['year'] = test['datetime'].dt.year
    test['quarter'] = test['datetime'].dt.quarter
    test['dayofyear'] = test['datetime'].dt.dayofyear
    test['sin_hour']= (np.pi * np.sin(test['hour']) / 12)
    test['cos_hour']= (np.pi * np.cos(test['hour']) / 12)
    test['sin_dayofyear']= (np.pi * np.sin(test['dayofyear']) / 183)
    test['cos_dayofyear']= (np.pi * np.cos(test['dayofyear']) / 183)

    num_lag = 7
    previous_revealed_targets.insert(0, revealed_targets)
    if len(previous_revealed_targets) == num_lag:
        previous_revealed_targets.pop()
        
    X_test = create_revealed_targets_test(data=test.copy(), previous_revealed_targets = previous_revealed_targets.copy(), max_lags = num_lag)
    X_test['target_mean']= X_test[[f'target_{i}_days_ago' for i in range(2, num_lag+1)]].mean(1)
    X_test['target_std']= X_test[[f'target_{i}_days_ago' for i in range(2, num_lag+1)]].std(1)
    X_test['target_var']= X_test[[f'target_{i}_days_ago' for i in range(2, num_lag+1)]].var(1)
    for lag in range(2,num_lag+1):
        X_test[f"target_diff_{lag}_days_ago"] = X_test[f"target_{lag}_days_ago"]
        X_test[f"target_delta_{lag}_days_ago"] = X_test[f"target_diff_{lag}_days_ago"] / lag

    X_test = generate_test_set(X_test, client, historical_weather,forecast_weather, electricity_prices, gas_prices)
    for col in log_this_columns:
        X_test[f"{col}_log"] = np.where((X_test[col])!= 0, np.log(X_test[col]),0)
    X_test = X_test[features]
    
    sample_prediction['target'] = final_model.predict(X_test) 
    env.predict(sample_prediction)