# <b><span style='color:#2563eb'>00 | </span>Modele predykcji</b>

## <span style='color:#2563eb'>🔷 | <b></span>Import bibliotek</b>

In [1]:
# Set auto reload after making changes
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import pandas as pd
from datetime import timedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# Modeling
import xgboost as xgb
import torch

from sklearn.model_selection import GridSearchCV

# Wrote myself
from source.CustomPlot import CustomPlot
from source.Utils import SplitDateColumn, AddPrefixToColumns, DescribeData
from sklearn.ensemble import RandomForestRegressor

# GPU or CPU use for model
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## <span style='color:#2563eb'>🔷 | <b></span>Zbiory</b>

In [2]:
PATH = 'data/'

gas_prices = pd.read_csv(os.path.join(PATH, 'gas_prices.csv'),
                   dtype={'lowest_price_per_mwh': 'float64',
                          'highest_price_per_mwh': 'float64',
                          'data_block_id': 'int64'},
                   parse_dates=['forecast_date', 'origin_date'])

electricity_prices = pd.read_csv(os.path.join(PATH, 'electricity_prices.csv'),
                   dtype={'euros_per_mwh': 'float64',
                          'data_block_id': 'int64'},
                   parse_dates=['forecast_date', 'origin_date'])

historical_weather = pd.read_csv(os.path.join(PATH, 'historical_weather.csv'),
                dtype={'temperature': 'float64',
                        'dewpoint': 'float64',
                        'rain': 'float64',
                        'snowfall': 'float64',
                        'surface_pressure': 'float64',
                        'cloudcover_total': 'int16',
                        'cloudcover_low': 'int16',
                        'cloudcover_mid': 'int16',
                        'cloudcover_high': 'int16',
                        'winddirection_10m': 'int16',
                        'shortwave_radiation': 'float64',
                        'direct_solar_radiation' : 'float64',
                        'diffuse_radiation': 'float64',
                        'latitude': 'float64',
                        'longitude' : 'float64',
                        'data_block_id' : 'int64'},

                parse_dates=['datetime'])

forecast_weather = pd.read_csv(os.path.join(PATH, 'forecast_weather.csv'),
                dtype={'temperature': 'float64',
                        'dewpoint': 'float64',
                        'total_precipitation': 'float64',
                        'snowfall': 'float64',
                        'cloudcover_total': 'float64',
                        'cloudcover_low': 'float64',
                        'cloudcover_mid': 'float64',
                        'cloudcover_high': 'float64',
                        '10_metre_u_wind_component': 'float64',
                        '10_metre_v_wind_component': 'float64',
                        'direct_solar_radiation' : 'float64',
                        'surface_solar_radiation_downwards': 'float64',
                        'latitude': 'float64',
                        'longitude' : 'float64',
                        'data_block_id' : 'int64',
                        'hours_ahead': 'int16'},

                parse_dates=['origin_datetime', 'forecast_datetime'])

train = pd.read_csv(os.path.join(PATH, 'train.csv'),
                dtype={ 'county': 'int16',
                        'is_business': 'boolean',
                        'product_type': 'int8',
                        'target': 'float64',
                        'is_consumption': 'boolean',
                        'data_block_id' : 'int64',
                        'row_id' : 'int16',
                        'prediction_unit_id' : 'int16' },

                parse_dates=['datetime'])

client = pd.read_csv(os.path.join(PATH, 'client.csv'),
                dtype={ 'county': 'int16',
                        'is_business': 'boolean',
                        'product_type': 'int8',
                        'eic_count': 'float64',
                        'installed_capacity': 'float64',
                        'data_block_id' : 'int64'},

                parse_dates=['date'])

In [3]:
weather_station = pd.read_csv(os.path.join(PATH, 'weather_station_to_county_mapping.csv'),
                   dtype={'county_name': 'str',
                          'longitude': 'float64',
                          'latitude': 'float64',
                          'county': 'float64'})

weather_station.dropna(subset='county', inplace=True)
weather_station.drop(columns=['county_name'], inplace=True)
weather_station['county'] = weather_station['county'].astype('int')
weather_station[['latitude', 'longitude']] = weather_station[['latitude', 'longitude']].astype(float).round(1)

## <span style='color:#2563eb'>🔷 | <b></span>Scelenie zbiorów</b>

In [4]:
from source.Utils import SplitDateColumn, AddPrefixToColumns

class FeaturesProcessing():
    def __init__(self,
            train: pd.DataFrame,
            client: pd.DataFrame,
            gas_prices: pd.DataFrame,
            forecast_weather: pd.DataFrame,
            electricity_prices: pd.DataFrame,
            weather_station: pd.DataFrame):
        
        self.train = train
        self.weather_station = weather_station

        self.client = self.__prepareClient(client)
        self.gas_prices = self.__prepareGasPrices(gas_prices)
        self.forecast_weather = self.__prepareForecastData(forecast_weather)
        self.electricity_prices = self.__prepareEnergyPrices(electricity_prices)

        self.data = train.copy()
        self.__merge('gas prices', self.gas_prices, on=['data_block_id'])
        self.__merge('electricity prices', self.electricity_prices, on=['datetime', 'data_block_id'])
        self.__merge('client', self.client, on=['county', 'is_business', 'product_type', 'data_block_id'])
        self.__merge('forecast weather', self.forecast_weather, on=['datetime', 'county', 'data_block_id'])
        
        self.__AddCustomFeatures()

        SplitDateColumn(self.data, 'datetime')

    def getData(self, dropNa = False):
        if dropNa:
            return self.data.dropna()
        
        return self.data

    def __merge(self, datasetName: str, data: pd.DataFrame, on=[], how='left') -> pd.DataFrame:
        print(f'MERGING: <- {datasetName}')
        print(f'- Before: <- {len(self.data)} rows')

        self.data = self.data.merge(data, how=how, on=on)
        print(f'- After: {len(self.data)} rows')
        print()

    def __prepareClient(self, client: pd.DataFrame) -> pd.DataFrame:
        client = client.drop(columns=['date'])

        return client

    def __prepareForecastData(self, forecast_weather: pd.DataFrame) -> pd.DataFrame:
        forecast_weather = forecast_weather.rename(columns = {'forecast_datetime': 'datetime'})
        forecast_weather.drop(columns = 'origin_datetime', inplace=True)
        forecast_weather['datetime'] = forecast_weather['datetime'].dt.tz_convert('Europe/Brussels').dt.tz_localize(None)

        # Map to weather locations
        forecast_weather[['latitude', 'longitude']] = forecast_weather[['latitude', 'longitude']].astype(float).round(1)
        forecast_weather = forecast_weather.merge(self.weather_station, how='left', on=['latitude', 'longitude'])

        # Some weather locations are outside any county
        forecast_weather.dropna(subset='county', inplace=True)

        forecast_weather['county'] = forecast_weather['county'].astype(int)

        # Some county have many weather locations
        forecast_weather = forecast_weather.groupby(by=['datetime', 'county', 'data_block_id']).mean().reset_index()
        
        return forecast_weather

    def __prepareEnergyPrices(self, electricity_prices: pd.DataFrame) -> pd.DataFrame:
        columns = ['euros_per_mwh', 'data_block_id']

        ep = electricity_prices[columns].copy()
        ep['datetime'] = electricity_prices['forecast_date'] + timedelta(days=1)

        AddPrefixToColumns(ep, ['euros_per_mwh'], 'elec_price_')

        return ep
    
    def __prepareGasPrices(self, gas_prices: pd.DataFrame) -> pd.DataFrame:
        columns = ['highest_price_per_mwh', 'lowest_price_per_mwh', 'data_block_id']

        gp = gas_prices[columns].copy()

        AddPrefixToColumns(gp, ['highest_price_per_mwh', 'lowest_price_per_mwh'], 'gas_')
        
        return gp
    
    def __AddCustomFeatures(self):
        merge = ['county', 'is_business', 'product_type', 'data_block_id', 'is_consumption', 'datetime']

        feature = 'target_week_ago'
        trainMinus7 = self.train.copy()
        trainMinus7['datetime'] = trainMinus7['datetime'] + timedelta(days=7)
        trainMinus7['data_block_id'] = trainMinus7['data_block_id'] + 7
        trainMinus7.rename(columns={'target' : feature}, inplace=True)

        self.data = self.data.merge(trainMinus7[merge + [feature]], how='left', on=merge)

        feature = 'target_3_days_ago'
        trainMinus3 = self.train.copy()
        trainMinus3['datetime'] = trainMinus3['datetime'] + timedelta(days=3)
        trainMinus3['data_block_id'] = trainMinus3['data_block_id'] + 3
        trainMinus3.rename(columns={'target' : feature}, inplace=True)

        self.data = self.data.merge(trainMinus3[merge + [feature]], how='left', on=merge)

In [5]:
fp = FeaturesProcessing(
    train=train,
    client= client,
    gas_prices=gas_prices,
    forecast_weather=forecast_weather,
    electricity_prices=electricity_prices,
    weather_station=weather_station
    )

MERGING: <- gas prices
- Before: <- 2018352 rows
- After: 2018352 rows

MERGING: <- electricity prices
- Before: <- 2018352 rows
- After: 2018352 rows

MERGING: <- client
- Before: <- 2018352 rows
- After: 2018352 rows

MERGING: <- forecast weather
- Before: <- 2018352 rows
- After: 2018352 rows



In [6]:
DescribeData(fp.getData())

Size: 2018352 x 39



Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,gas_highest_price_per_mwh,...,target_week_ago,target_3_days_ago,datetime_minute,datetime_hour,datetime_day,datetime_month,datetime_year,datetime_datetime,datetime_time,datetime_date
Number of Nans,0,0,0,528,0,0,0,0,0,2928,...,29518,13344,0,0,0,0,0,0,0,0


Unnamed: 0,county,product_type,target,datetime,data_block_id,row_id,prediction_unit_id,gas_highest_price_per_mwh,gas_lowest_price_per_mwh,elec_price_euros_per_mwh,...,snowfall,total_precipitation,target_week_ago,target_3_days_ago,datetime_minute,datetime_hour,datetime_day,datetime_month,datetime_year,datetime_datetime
count,2018352.0,2018352.0,2017824.0,2018352,2018352.0,2018352.0,2018352.0,2015424.0,2015424.0,2015156.0,...,1984586.0,1984586.0,1988834.0,2005008.0,2018352.0,2018352.0,2018352.0,2018352.0,2018352.0,2018352
mean,7.297034,1.898927,274.8556,2022-07-20 08:29:25.326166016,321.8746,43.0835,33.04538,108.3038,95.4621,157.4218,...,2.674922e-05,7.882031e-05,274.5047,274.7042,0.0,11.5,15.70156,6.430617,2022.057,2022-07-20 08:29:25.326166016
min,0.0,0.0,0.0,2021-09-01 00:00:00,0.0,-32768.0,0.0,34.0,28.1,-10.06,...,-1.907349e-06,-1.43373e-05,0.0,0.0,0.0,0.0,1.0,1.0,2021.0,2021-09-01 00:00:00
25%,3.0,1.0,0.378,2022-02-14 04:00:00,166.0,-16491.0,16.0,67.67,60.0,85.29,...,0.0,0.0,0.359,0.37,0.0,5.75,8.0,3.0,2022.0,2022-02-14 04:00:00
50%,7.0,2.0,31.133,2022-07-21 09:30:00,323.0,213.0,33.0,94.0,85.9,128.66,...,0.0,1.589457e-06,31.118,31.138,0.0,11.5,16.0,6.0,2022.0,2022-07-21 09:30:00
75%,11.0,3.0,180.2062,2022-12-24 14:00:00,479.0,16490.0,50.0,133.0,109.74,199.96,...,4.291534e-07,3.59863e-05,179.826,180.0323,0.0,17.25,23.0,10.0,2022.0,2022-12-24 14:00:00
max,15.0,3.0,15480.27,2023-05-31 23:00:00,637.0,32767.0,68.0,305.0,250.0,4000.0,...,0.003272216,0.01346588,15480.27,15480.27,0.0,23.0,31.0,12.0,2023.0,2023-05-31 23:00:00
std,4.78099,1.081766,909.5024,,182.6343,18970.48,19.59059,54.75918,47.57902,121.3306,...,0.0001197891,0.000248235,909.2564,909.3756,0.0,6.922188,8.78617,3.664891,0.6452346,


Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,gas_highest_price_per_mwh,...,target_week_ago,target_3_days_ago,datetime_minute,datetime_hour,datetime_day,datetime_month,datetime_year,datetime_datetime,datetime_time,datetime_date
0,0,False,1,0.713,False,2021-09-01,0,0,0,,...,,,0,0,1,9,2021,2021-09-01,00:00:00,2021-09-01
1,0,False,1,96.59,True,2021-09-01,0,1,0,,...,,,0,0,1,9,2021,2021-09-01,00:00:00,2021-09-01
2,0,False,2,0.0,False,2021-09-01,0,2,1,,...,,,0,0,1,9,2021,2021-09-01,00:00:00,2021-09-01


## <span style='color:#2563eb'>🔷 | <b></span>Podział na zbiory do modelów</b>

In [7]:
features = [
    'is_business',
    'product_type',
    'is_consumption',
    'county',
    'temperature',
    'dewpoint',
    'cloudcover_high',
    'cloudcover_low',
    'cloudcover_mid',
    'cloudcover_total',
    '10_metre_u_wind_component',
    '10_metre_v_wind_component',
    'direct_solar_radiation',
    'surface_solar_radiation_downwards',
    'snowfall',
    'total_precipitation',
    'installed_capacity',
    'elec_price_euros_per_mwh',
    'datetime_hour',
    'gas_highest_price_per_mwh',
    'gas_lowest_price_per_mwh',
    'target_week_ago',
    'target_3_days_ago'
    ]
target_columns = ['target']

mergedData = fp.getData(dropNa=True)

X = mergedData[features]
y = mergedData[target_columns]

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## <span style='color:#2563eb'>🔷 | <b></span>XGBoost</b>

In [13]:
clf = xgb.XGBRegressor (
    # Device ordinal, available options are cpu, cuda, and gpu.
    device = device, 
    enable_categorical=True,
    # Number of gradient boosted trees
    n_estimators = 1000,
    # Step size shrinkage used in update to prevents overfitting
    eta=0.1,
    # Activates early stopping. Validation metric needs to improve at least once in every early_stopping_rounds round(s) to continue training
    early_stopping_rounds=100,
    # L2 regularization term on weights. Increasing this value will make model more conservative
    reg_lambda = 1,
    # L1 regularization term on weights. Increasing this value will make model more conservative
    reg_alpha = 0,
    # Minimum loss reduction required to make a further partition on a leaf node of the tree
    gamma = 0,
    # Specify the learning task and the corresponding learning objective
    objective = 'reg:absoluteerror'
 )

### ✨ <b>Uczenie</b>

In [14]:
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

[0]	validation_0-mae:261.09490	validation_1-mae:261.22487
[1]	validation_0-mae:250.96929	validation_1-mae:251.12923
[2]	validation_0-mae:241.12695	validation_1-mae:241.32876
[3]	validation_0-mae:231.12913	validation_1-mae:231.36326
[4]	validation_0-mae:220.91444	validation_1-mae:221.18291
[5]	validation_0-mae:211.24934	validation_1-mae:211.56550
[6]	validation_0-mae:201.89983	validation_1-mae:202.27637
[7]	validation_0-mae:193.29480	validation_1-mae:193.74247
[8]	validation_0-mae:185.38928	validation_1-mae:185.90382
[9]	validation_0-mae:177.08599	validation_1-mae:177.65390
[10]	validation_0-mae:168.88708	validation_1-mae:169.49765
[11]	validation_0-mae:162.70588	validation_1-mae:163.36159
[12]	validation_0-mae:157.39492	validation_1-mae:158.07591
[13]	validation_0-mae:150.92051	validation_1-mae:151.65189
[14]	validation_0-mae:144.96048	validation_1-mae:145.73565
[15]	validation_0-mae:139.71451	validation_1-mae:140.51125
[16]	validation_0-mae:135.07473	validation_1-mae:135.88613
[17]	va

In [15]:
clf.predict(X_test)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




array([ 3.6685989e+01,  1.6477438e+01,  8.9630249e+02, ...,
       -1.7730713e-02,  4.4963703e+00,  1.0254416e+02], dtype=float32)

### ✨ <b>Wyniki</b>

In [10]:
pd.DataFrame({'name': clf.feature_names_in_, 'importance': clf.feature_importances_}).sort_values(by='importance', ascending=False)

Unnamed: 0,name,importance
22,target_3_days_ago,0.688004
1,product_type,0.081872
16,installed_capacity,0.051997
13,surface_solar_radiation_downwards,0.051941
3,county,0.048778
21,target_week_ago,0.034434
12,direct_solar_radiation,0.006964
2,is_consumption,0.006408
0,is_business,0.005183
15,total_precipitation,0.004234


## <span style='color:#2563eb'>🔷 | <b></span>Las losowy</b>

In [14]:
clf = RandomForestRegressor(max_depth=10, n_estimators=5)

### ✨ <b>Uczenie</b>

In [15]:
clf.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


### ✨ <b>Wyniki</b>

In [16]:
y_pred = clf.predict(X_test)

np.sqrt(MSE(y_test, y_pred)) 

163.84865528770513

## <span style='color:#2563eb'>🔷 | <b></span>Szukanie hiperparametrów</b>

In [17]:
model = xgb.XGBRegressor (
    # Device ordinal, available options are cpu, cuda, and gpu.
    device = device, 
    enable_categorical=True,
    # Number of gradient boosted trees
    n_estimators = 1000,
    # Step size shrinkage used in update to prevents overfitting
    eta=0.1,
    # Activates early stopping. Validation metric needs to improve at least once in every early_stopping_rounds round(s) to continue training
    early_stopping_rounds=100,
    # L2 regularization term on weights. Increasing this value will make model more conservative
    reg_lambda = 1,
    # L1 regularization term on weights. Increasing this value will make model more conservative
    reg_alpha = 0,
    # Minimum loss reduction required to make a further partition on a leaf node of the tree
    gamma = 0,
 )

# A parameter grid for XGBoost
params = {
        'n_estimators': [1000],
        'eta': [0.1, 0.3, 0.5],
        'reg_lambda': [0.5, 1, 2],
        }

In [12]:
#clf = GridSearchCV(model, params, verbose=2)

#clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)

## <span style='color:#2563eb'>🔷 | <b></span>Załączanie odpowiedzi</b>

In [11]:
import enefit
env = enefit.make_env()
iter_test = env.iter_test()

for (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:

      data = FeaturesProcessing(
              train = test,
              client = client,
              forecast_weather = forecast_weather,
              gas_prices = gas_prices,
              electricity_prices = electricity_prices,
              weather_station = weather_station).getData()

      sample_prediction['target'] = clf.predict(data)
      env.predict(sample_prediction)

ModuleNotFoundError: No module named 'enefit'