# <b><span style='color:#2563eb'>00 | </span>Modele predykcji</b>

## <span style='color:#2563eb'>🔷 | </span><b>Import bibliotek</b>

In [None]:
# Set auto reload after making changes
%load_ext autoreload
%autoreload 2

import os
import json
import numpy as np
import pandas as pd
from datetime import timedelta
from datetime import date

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# Modeling
import xgboost as xgb
import torch

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Wrote myself
from source.CustomPlot import CustomPlot
from source.Utils import SplitDateColumn, AddPrefixToColumns, DescribeData
from source.EvaluateModel import EvaluteModel

# Run the models on the GPU when CUDA is usable, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## <span style='color:#2563eb'>🔷 | </span><b>Zbiory</b>

In [None]:
# Directory holding every input file read by this notebook
PATH = 'data/'

# Gas price table: both date columns are parsed into timestamps
gas_prices = pd.read_csv(
    os.path.join(PATH, 'gas_prices.csv'),
    parse_dates=['forecast_date', 'origin_date'],
    dtype=dict(
        lowest_price_per_mwh='float64',
        highest_price_per_mwh='float64',
        data_block_id='int64',
    ),
)

# Electricity price table, same date handling as the gas prices
electricity_prices = pd.read_csv(
    os.path.join(PATH, 'electricity_prices.csv'),
    parse_dates=['forecast_date', 'origin_date'],
    dtype=dict(euros_per_mwh='float64', data_block_id='int64'),
)

# Historical weather: measurements as float64, cloud cover and wind direction as int16
historical_weather = pd.read_csv(
    os.path.join(PATH, 'historical_weather.csv'),
    parse_dates=['datetime'],
    dtype={
        **dict.fromkeys(
            ['temperature', 'dewpoint', 'rain', 'snowfall', 'surface_pressure',
             'shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation',
             'latitude', 'longitude'],
            'float64'),
        **dict.fromkeys(
            ['cloudcover_total', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_high',
             'winddirection_10m'],
            'int16'),
        'data_block_id': 'int64',
    },
)

# Forecast weather: every measurement is float64; hours_ahead is a small integer
forecast_weather = pd.read_csv(
    os.path.join(PATH, 'forecast_weather.csv'),
    parse_dates=['origin_datetime', 'forecast_datetime'],
    dtype={
        **dict.fromkeys(
            ['temperature', 'dewpoint', 'total_precipitation', 'snowfall',
             'cloudcover_total', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_high',
             '10_metre_u_wind_component', '10_metre_v_wind_component',
             'direct_solar_radiation', 'surface_solar_radiation_downwards',
             'latitude', 'longitude'],
            'float64'),
        'data_block_id': 'int64',
        'hours_ahead': 'int16',
    },
)

# Training table with the target column.
# row_id is read as int64: int16 tops out at 32767, so a row identifier in a
# table with more rows than that cannot be represented in int16.
train = pd.read_csv(os.path.join(PATH, 'train.csv'),
                dtype={ 'county': 'int16',
                        'is_business': 'boolean',
                        'product_type': 'int8',
                        'target': 'float64',
                        'is_consumption': 'boolean',
                        'data_block_id' : 'int64',
                        'row_id' : 'int64',
                        'prediction_unit_id' : 'int16' },

                parse_dates=['datetime'])

# Client table: segment keys plus eic_count and installed_capacity
client = pd.read_csv(
    os.path.join(PATH, 'client.csv'),
    parse_dates=['date'],
    dtype=dict(
        county='int16',
        is_business='boolean',
        product_type='int8',
        eic_count='float64',
        installed_capacity='float64',
        data_block_id='int64',
    ),
)

In [None]:
# Station -> county lookup used to attach a county to each weather grid point.
# Rows without a county are discarded, the name column is dropped, county becomes an
# integer id and the coordinates are rounded to one decimal so they can serve as join keys.
weather_station = (
    pd.read_csv(
        os.path.join(PATH, 'weather_station_to_county_mapping.csv'),
        dtype={
            'county_name': 'str',
            'longitude': 'float64',
            'latitude': 'float64',
            'county': 'float64',
        },
    )
    .dropna(subset='county')
    .drop(columns=['county_name'])
    .astype({'county': 'int'})
    .round({'latitude': 1, 'longitude': 1})
)

## <span style='color:#2563eb'>🔷 | </span><b>Scalenie zbiorów</b>

In [None]:
from source.Utils import SplitDateColumn, AddPrefixToColumns, IsEstionianHoliday, IsWeekend

class FeaturesProcessing():
    """Build the modelling table by left-joining the auxiliary tables onto ``train``.

    The joined frame is kept in ``self.data`` and read through ``getData``.
    Unless ``submission`` is true, lagged-target, holiday, weekend and cyclical
    hour features are appended in the constructor as well.
    """

    def __init__(self,
            train: pd.DataFrame,
            client: pd.DataFrame,
            gas_prices: pd.DataFrame,
            forecast_weather: pd.DataFrame,
            electricity_prices: pd.DataFrame,
            weather_station: pd.DataFrame,
            submission=False):
        """Prepare every auxiliary table and merge it into a copy of ``train``.

        Args:
            train: Base table; must contain the merge keys and ``target``.
            client: Client table; its ``date`` column is dropped.
            gas_prices: Gas price table keyed by ``data_block_id``.
            forecast_weather: Forecast weather with latitude/longitude per row.
            electricity_prices: Electricity price table with ``forecast_date``.
            weather_station: Latitude/longitude -> county mapping.
            submission: When true, the custom features are not added here;
                call ``AddRevealtedTargets`` afterwards instead.
        """
        self.train = train
        # Needed by __prepareForecastData below, so it has to be stored first
        self.weather_station = weather_station

        self.client = self.__prepareClient(client)
        self.gas_prices = self.__prepareGasPrices(gas_prices)
        self.forecast_weather = self.__prepareForecastData(forecast_weather)
        self.electricity_prices = self.__prepareEnergyPrices(electricity_prices)

        # Work on a copy so the caller's train frame is left untouched
        self.data = train.copy()
        self.__merge('gas prices', self.gas_prices, on=['data_block_id'])
        self.__merge('electricity prices', self.electricity_prices, on=['datetime', 'data_block_id'])
        self.__merge('client', self.client, on=['county', 'is_business', 'product_type', 'data_block_id'])
        self.__merge('forecast weather', self.forecast_weather, on=['datetime', 'county', 'data_block_id'])

        # Project helper; the code below reads datetime_date / datetime_hour from
        # self.data, so it presumably adds those columns in place (confirm in source.Utils)
        SplitDateColumn(self.data, 'datetime')

        # Using submission API requires AddRevealtedTargets method instead
        if not submission:
            self.__AddCustomFeatures(self.train)


    def getData(self, dropNa = False):
        """Return the merged table, optionally without rows that contain any NaN."""
        if dropNa:
            return self.data.dropna()

        return self.data

    def __merge(self, datasetName: str, data: pd.DataFrame, on=None, how='left') -> None:
        """Merge ``data`` into ``self.data`` and print the row count before and after.

        Args:
            datasetName: Label used only in the printed log.
            data: Frame to join onto ``self.data``.
            on: Join key column(s). ``None`` lets pandas use the shared columns;
                a ``None`` default avoids sharing one mutable list between calls.
            how: Join type passed to ``DataFrame.merge``.
        """
        print(f'MERGING: <- {datasetName}')
        print(f'- Before: <- {len(self.data)} rows')

        self.data = self.data.merge(data, how=how, on=on)
        print(f'- After: {len(self.data)} rows')
        print()

    def __prepareClient(self, client: pd.DataFrame) -> pd.DataFrame:
        """Return the client table without its ``date`` column."""
        client = client.drop(columns=['date'])

        return client

    def __prepareForecastData(self, forecast_weather: pd.DataFrame) -> pd.DataFrame:
        """Aggregate forecast weather to one row per datetime, county and data block."""
        # rename returns a new frame, so the in-place drop below does not touch the caller's data
        forecast_weather = forecast_weather.rename(columns = {'forecast_datetime': 'datetime'})
        forecast_weather.drop(columns = 'origin_datetime', inplace=True)
        # Strip any timezone so the key is a naive timestamp
        forecast_weather['datetime'] = forecast_weather['datetime'].dt.tz_localize(None)

        # Map to weather locations
        forecast_weather[['latitude', 'longitude']] = forecast_weather[['latitude', 'longitude']].astype(float).round(1)
        forecast_weather = forecast_weather.merge(self.weather_station, how='left', on=['latitude', 'longitude'])

        # Some weather locations are outside any county
        forecast_weather.dropna(subset='county', inplace=True)

        forecast_weather['county'] = forecast_weather['county'].astype(int)

        # Some county have many weather locations
        forecast_weather = forecast_weather.groupby(by=['datetime', 'county', 'data_block_id']).mean().reset_index()

        return forecast_weather

    def __prepareEnergyPrices(self, electricity_prices: pd.DataFrame) -> pd.DataFrame:
        """Return electricity prices keyed by ``datetime`` and ``data_block_id``."""
        columns = ['euros_per_mwh', 'data_block_id']

        ep = electricity_prices[columns].copy()
        # Shift the timestamps forward one day to form the 'datetime' merge key
        # (presumably because prices are known a day ahead; confirm against the data description)
        ep['datetime'] = electricity_prices['forecast_date'] + timedelta(days=1)

        # Project helper; downstream code expects the column 'elec_price_euros_per_mwh'
        AddPrefixToColumns(ep, ['euros_per_mwh'], 'elec_price_')

        return ep

    def __prepareGasPrices(self, gas_prices: pd.DataFrame) -> pd.DataFrame:
        """Return the gas price columns plus ``data_block_id``, prefixed via the project helper."""
        columns = ['highest_price_per_mwh', 'lowest_price_per_mwh', 'data_block_id']

        gp = gas_prices[columns].copy()

        # Project helper; downstream code expects 'gas_highest_price_per_mwh' / 'gas_lowest_price_per_mwh'
        AddPrefixToColumns(gp, ['highest_price_per_mwh', 'lowest_price_per_mwh'], 'gas_')

        return gp

    def __AddLagFeature(self, source: pd.DataFrame, lag: int):
        """Add ``target_<lag>_days_ago`` to ``self.data`` from the targets in ``source``.

        The source timestamps are moved ``lag`` days forward, so a left join on
        the segment keys and ``datetime`` pairs each row with the target
        observed ``lag`` days earlier; rows without a match get NaN.
        """
        keys = ['county', 'is_business', 'product_type', 'is_consumption', 'datetime']

        feature = f'target_{lag}_days_ago'
        lag_data = source[keys + ['target']].copy()
        lag_data['datetime'] += timedelta(days=lag)
        lag_data.rename(columns={'target' : feature}, inplace=True)

        self.data = self.data.merge(lag_data, how='left', on=keys)
        print(f'New feature: {feature}')


    def __AddCustomFeatures(self, source: pd.DataFrame):
        """Append lag (2..14 days), holiday, weekend and cyclical hour features."""

        for lag in range(2, 15):
            self.__AddLagFeature(source, lag)

        self.data['is_holiday'] = self.data['datetime_date'].apply(IsEstionianHoliday)
        print('New feature: is_holiday')
        self.data['is_weekend'] = self.data['datetime_date'].apply(IsWeekend)
        print('New feature: is_weekend')

        # Encode the hour on the unit circle so that 23:00 and 00:00 end up close together
        self.data['datetime_hour_sin'] = np.sin(2 * np.pi * self.data['datetime_hour']/24.0)
        print('New feature: datetime_hour_sin')
        self.data['datetime_hour_cos'] = np.cos(2 * np.pi * self.data['datetime_hour']/24.0)
        print('New feature: datetime_hour_cos')

    def AddRevealtedTargets(self, targets: pd.DataFrame):
        """Add the custom features using ``targets`` as the source of lagged values."""
        self.__AddCustomFeatures(targets)


In [None]:
# Build the merged feature table from the frames loaded above. submission keeps its
# default (False), so the lag/holiday/weekend/cyclical features are added in the constructor.
fp = FeaturesProcessing(
    train=train,
    client= client,
    gas_prices=gas_prices,
    forecast_weather=forecast_weather,
    electricity_prices=electricity_prices,
    weather_station=weather_station
    )

In [None]:
# Summarise the merged table (NaN rows included) with the project helper DescribeData
DescribeData(fp.getData())

In [None]:
# Show every column of the merged table; the feature lists below pick from these names
fp.getData().columns



## <span style='color:#2563eb'>🔷 | </span><b>Podział na zbiory do modelów</b>

In [None]:
def prepareDataset(features, data, data_block_id_splitter = 630, data_splitter = None):
    """Split ``data`` chronologically into train and test parts.

    Rows at or below the threshold form the training part, the rest the test
    part. The threshold is ``data_splitter`` (compared against ``datetime``)
    when given, otherwise ``data_block_id_splitter`` (compared against
    ``data_block_id``).

    Returns:
        ``[X_train, X_test, y_train, y_test, X_validation]`` where the ``y``
        frames hold only ``target`` and ``X_validation`` is the test part with
        ``datetime_date`` and ``datetime`` appended to ``features``.
    """
    if data_splitter is None:
        in_train = data.data_block_id <= data_block_id_splitter
        in_test = data.data_block_id > data_block_id_splitter
    else:
        in_train = data.datetime <= data_splitter
        in_test = data.datetime > data_splitter

    train_part, test_part = data[in_train], data[in_test]
    label = ['target']

    # The extra date columns are only there for plotting
    plot_columns = features + ['datetime_date', 'datetime']

    return [
        train_part[features],
        test_part[features],
        train_part[label],
        test_part[label],
        test_part[plot_columns],
    ]

# County id -> county name lookup handed to EvaluteModel below.
# The file is opened in a with-block so the handle is closed, and read as UTF-8 so
# non-ASCII names do not depend on the platform's default encoding.
# JSON object keys are always strings, hence the cast back to int.
with open(os.path.join(PATH, 'county_id_to_name_map.json'), encoding='utf-8') as county_file:
    county_name = {int(k): str(v) for k, v in json.load(county_file).items()}

# <b><span style='color:#2563eb'>01 | </span>Badania modeli</b>

## <span style='color:#2563eb'>🔷 | </span><b>Test - XGBoost (Zmienne endogeniczne)</b>

### ✨ <b>Zmienne</b>

In [None]:
# Inputs for this run: segment descriptors, forecast weather, installed capacity,
# energy prices and the hour of day (no lagged targets yet)
features = [
    'is_business', 'product_type', 'is_consumption', 'county',
    'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'direct_solar_radiation', 'surface_solar_radiation_downwards',
    'snowfall', 'total_precipitation',
    'installed_capacity', 'elec_price_euros_per_mwh', 'datetime_hour',
    'gas_highest_price_per_mwh', 'gas_lowest_price_per_mwh',
]
target_columns = ['target']

# Only complete rows take part in training and evaluation
mergedData = fp.getData(dropNa=True)

# Chronological split (default data_block_id threshold) and an evaluator bound to the test part
X_train, X_test, y_train, y_test, X_validation = prepareDataset(features=features, data=mergedData)
modelEvaluator = EvaluteModel(X_test, y_test, X_validation, county_name)

### ✨ <b>Uczenie</b>

In [None]:
clf = xgb.XGBRegressor(
    # Learning task and objective: absolute-error regression
    objective='reg:absoluteerror',
    # Training device chosen in the import cell ('cuda' or 'cpu')
    device=device,
    enable_categorical=True,
    # Boosting schedule: at most 1000 trees with shrinkage 0.1; training stops once the
    # validation metric has gone 100 rounds without improving
    n_estimators=1000,
    eta=0.1,
    early_stopping_rounds=100,
    # Regularisation: L2 and L1 penalties on the weights, and the minimum loss
    # reduction required to split a leaf; larger values make the model more conservative
    reg_lambda=1,
    reg_alpha=0,
    gamma=0,
)

In [None]:
# Train the regressor; both splits are passed as eval_set so their metrics are printed each round.
# NOTE(review): XGBoost uses the last eval_set entry for early stopping, so the test split picks
# the stopping point and the reported test score is optimistic. Consider a separate validation split.
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

### ✨ <b>Wyniki</b>

In [None]:
# Evaluate the fitted model with the project's EvaluteModel helper.
# random_day is presumably the single day that gets plotted (confirm in source/EvaluateModel).
modelEvaluator.test(clf, random_day=date(2023, 5, 27))

## <span style='color:#2563eb'>🔷 | </span><b>Test - XGBoost (Zmienne egzogeniczne i endogeniczne)</b>

### ✨ <b>Zmienne</b>

In [None]:
# Same inputs as the previous run plus the target observed 2..14 days earlier
features = [
    'is_business', 'product_type', 'is_consumption', 'county',
    'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'direct_solar_radiation', 'surface_solar_radiation_downwards',
    'snowfall', 'total_precipitation',
    'installed_capacity', 'elec_price_euros_per_mwh', 'datetime_hour',
    'gas_highest_price_per_mwh', 'gas_lowest_price_per_mwh',
    # Lag columns, named exactly as FeaturesProcessing creates them
    *(f'target_{lag}_days_ago' for lag in range(2, 15)),
]
target_columns = ['target']

# Only complete rows take part in training and evaluation
mergedData = fp.getData(dropNa=True)

# Chronological split (default data_block_id threshold) and an evaluator bound to the test part
X_train, X_test, y_train, y_test, X_validation = prepareDataset(features=features, data=mergedData)
modelEvaluator = EvaluteModel(X_test, y_test, X_validation, county_name)

### ✨ <b>Uczenie</b>

In [None]:
clf = xgb.XGBRegressor(
    # Learning task and objective: absolute-error regression
    objective='reg:absoluteerror',
    # Training device chosen in the import cell ('cuda' or 'cpu')
    device=device,
    enable_categorical=True,
    # Boosting schedule: at most 1000 trees with shrinkage 0.1; training stops once the
    # validation metric has gone 100 rounds without improving
    n_estimators=1000,
    eta=0.1,
    early_stopping_rounds=100,
    # Regularisation: L2 and L1 penalties on the weights, and the minimum loss
    # reduction required to split a leaf; larger values make the model more conservative
    reg_lambda=1,
    reg_alpha=0,
    gamma=0,
)

In [None]:
# Train the regressor; both splits are passed as eval_set so their metrics are printed each round.
# NOTE(review): XGBoost uses the last eval_set entry for early stopping, so the test split picks
# the stopping point and the reported test score is optimistic. Consider a separate validation split.
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

### ✨ <b>Wyniki</b>

In [None]:
# Evaluate the fitted model with the project's EvaluteModel helper.
# random_day is presumably the single day that gets plotted (confirm in source/EvaluateModel).
modelEvaluator.test(clf, random_day=date(2023, 5, 27))

* Model nie radzi sobie z predykcją dni wolnych

## <span style='color:#2563eb'>🔷 | </span><b>Test 2 - XGBoost (Zmienne egzogeniczne i endogeniczne + dni wolne)</b>

### ✨ <b>Zmienne</b>

In [None]:
from source.Utils import IsEstionianHoliday

# Previous inputs plus the public-holiday flag
features = [
    'is_business', 'product_type', 'is_consumption', 'county',
    'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'direct_solar_radiation', 'surface_solar_radiation_downwards',
    'snowfall', 'total_precipitation',
    'installed_capacity', 'elec_price_euros_per_mwh', 'datetime_hour',
    'gas_highest_price_per_mwh', 'gas_lowest_price_per_mwh',
    # Lag columns, named exactly as FeaturesProcessing creates them
    *(f'target_{lag}_days_ago' for lag in range(2, 15)),
    'is_holiday',
]
target_columns = ['target']

mergedData = pd.DataFrame(fp.getData(dropNa=True))
# NOTE(review): fp was built with submission=False, so FeaturesProcessing has already
# added is_holiday; this recomputation looks redundant.
mergedData['is_holiday'] = mergedData['datetime_date'].apply(IsEstionianHoliday)

# Chronological split (default data_block_id threshold) and an evaluator bound to the test part
X_train, X_test, y_train, y_test, X_validation = prepareDataset(features=features, data=mergedData)
modelEvaluator = EvaluteModel(X_test, y_test, X_validation, county_name)

### ✨ <b>Uczenie</b>

In [None]:
clf = xgb.XGBRegressor(
    # Learning task and objective: absolute-error regression
    objective='reg:absoluteerror',
    # Training device chosen in the import cell ('cuda' or 'cpu')
    device=device,
    enable_categorical=True,
    # Boosting schedule: at most 1000 trees with shrinkage 0.1; training stops once the
    # validation metric has gone 100 rounds without improving
    n_estimators=1000,
    eta=0.1,
    early_stopping_rounds=100,
    # Regularisation: L2 and L1 penalties on the weights, and the minimum loss
    # reduction required to split a leaf; larger values make the model more conservative
    reg_lambda=1,
    reg_alpha=0,
    gamma=0,
)

In [None]:
# Train the regressor; both splits are passed as eval_set so their metrics are printed each round.
# NOTE(review): XGBoost uses the last eval_set entry for early stopping, so the test split picks
# the stopping point and the reported test score is optimistic. Consider a separate validation split.
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

### ✨ <b>Wyniki</b>

In [None]:
# Evaluate the fitted model with the project's EvaluteModel helper.
# random_day is presumably the single day that gets plotted (confirm in source/EvaluateModel).
modelEvaluator.test(clf, random_day=date(2023, 5, 27))

## <span style='color:#2563eb'>🔷 | </span><b>Test - XGBoost (Zmienne egzogeniczne/endogeniczne + dni wolne + cykliczność)</b>

### ✨ <b>Zmienne</b>

In [None]:
from source.Utils import IsEstionianHoliday

# The raw hour of day is replaced by its sine/cosine encoding in this run
features = [
    'is_business', 'product_type', 'is_consumption', 'county',
    'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'direct_solar_radiation', 'surface_solar_radiation_downwards',
    'snowfall', 'total_precipitation',
    'installed_capacity', 'elec_price_euros_per_mwh',
    'gas_highest_price_per_mwh', 'gas_lowest_price_per_mwh',
    # Lag columns, named exactly as FeaturesProcessing creates them
    *(f'target_{lag}_days_ago' for lag in range(2, 15)),
    'is_holiday',
    'datetime_hour_sin', 'datetime_hour_cos',
]
target_columns = ['target']

# Only complete rows take part in training and evaluation
mergedData = pd.DataFrame(fp.getData(dropNa=True))

# Chronological split (default data_block_id threshold) and an evaluator bound to the test part
X_train, X_test, y_train, y_test, X_validation = prepareDataset(features=features, data=mergedData)
modelEvaluator = EvaluteModel(X_test, y_test, X_validation, county_name)

### ✨ <b>Uczenie</b>

In [None]:
clf = xgb.XGBRegressor(
    # Learning task and objective: absolute-error regression
    objective='reg:absoluteerror',
    # Training device chosen in the import cell ('cuda' or 'cpu')
    device=device,
    enable_categorical=True,
    # Boosting schedule: at most 750 trees with shrinkage 0.05; training stops once the
    # validation metric has gone 100 rounds without improving
    n_estimators=750,
    eta=0.05,
    early_stopping_rounds=100,
    # Tree size: deeper trees are more expressive but overfit more easily
    max_depth=12,
    # Regularisation: L2 and L1 penalties on the weights, and the minimum loss
    # reduction required to split a leaf; larger values make the model more conservative
    reg_lambda=2,
    reg_alpha=0,
    gamma=0,
)

In [None]:
# Train the regressor; both splits are passed as eval_set so their metrics are printed each round.
# NOTE(review): XGBoost uses the last eval_set entry for early stopping, so the test split picks
# the stopping point and the reported test score is optimistic. Consider a separate validation split.
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

### ✨ <b>Wyniki</b>

In [None]:
# Evaluate the fitted model with the project's EvaluteModel helper.
# random_day is presumably the single day that gets plotted (confirm in source/EvaluateModel).
modelEvaluator.test(clf, random_day=date(2023, 5, 27))

## <span style='color:#2563eb'>🔷 | </span><b>Test - XGBoost (Zmienne egzogeniczne/endogeniczne + dni wolne + cykliczność + weekendy)</b>

### ✨ <b>Zmienne</b>

In [None]:
from source.Utils import IsEstionianHoliday, IsWeekend

# Full set: raw hour, lags, holiday and weekend flags, and the cyclical hour encoding
features = [
    'is_business', 'product_type', 'is_consumption', 'county',
    'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'direct_solar_radiation', 'surface_solar_radiation_downwards',
    'snowfall', 'total_precipitation',
    'installed_capacity', 'elec_price_euros_per_mwh', 'datetime_hour',
    'gas_highest_price_per_mwh', 'gas_lowest_price_per_mwh',
    # Lag columns, named exactly as FeaturesProcessing creates them
    *(f'target_{lag}_days_ago' for lag in range(2, 15)),
    'is_holiday', 'is_weekend',
    'datetime_hour_sin', 'datetime_hour_cos',
]
target_columns = ['target']

# Only complete rows take part in training and evaluation
mergedData = pd.DataFrame(fp.getData(dropNa=True))

# Chronological split (default data_block_id threshold) and an evaluator bound to the test part
X_train, X_test, y_train, y_test, X_validation = prepareDataset(features=features, data=mergedData)
modelEvaluator = EvaluteModel(X_test, y_test, X_validation, county_name)

### ✨ <b>Uczenie</b>

In [None]:
clf = xgb.XGBRegressor(
    # Learning task and objective: absolute-error regression
    objective='reg:absoluteerror',
    # Training device chosen in the import cell ('cuda' or 'cpu')
    device=device,
    enable_categorical=True,
    # Boosting schedule: at most 750 trees with shrinkage 0.05; training stops once the
    # validation metric has gone 100 rounds without improving
    n_estimators=750,
    eta=0.05,
    early_stopping_rounds=100,
    # Tree size: deeper trees are more expressive but overfit more easily
    max_depth=12,
    # Regularisation: L2 and L1 penalties on the weights, and the minimum loss
    # reduction required to split a leaf; larger values make the model more conservative
    reg_lambda=2,
    reg_alpha=0,
    gamma=0,
)

In [None]:
# Train the regressor; both splits are passed as eval_set so their metrics are printed each round.
# NOTE(review): XGBoost uses the last eval_set entry for early stopping, so the test split picks
# the stopping point and the reported test score is optimistic. Consider a separate validation split.
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

### ✨ <b>Wyniki</b>

In [None]:
# Evaluate the fitted model with the project's EvaluteModel helper.
# random_day is presumably the single day that gets plotted (confirm in source/EvaluateModel).
modelEvaluator.test(clf, random_day=date(2023, 5, 27))

## <span style='color:#2563eb'>🔷 | </span><b>Test - XGBoost (Zmienne egzogeniczne/endogeniczne + dni wolne + cykliczność + weekendy + dodatkowe atrybuty)</b>

### ✨ <b>Zmienne</b>

In [None]:
from source.Utils import IsEstionianHoliday, IsWeekend

# Full set from the previous run plus the calendar day, month and year
features = [
    'is_business', 'product_type', 'is_consumption', 'county',
    'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'direct_solar_radiation', 'surface_solar_radiation_downwards',
    'snowfall', 'total_precipitation',
    'installed_capacity', 'elec_price_euros_per_mwh', 'datetime_hour',
    'gas_highest_price_per_mwh', 'gas_lowest_price_per_mwh',
    # Lag columns, named exactly as FeaturesProcessing creates them
    *(f'target_{lag}_days_ago' for lag in range(2, 15)),
    'is_holiday', 'is_weekend',
    'datetime_hour_sin', 'datetime_hour_cos',
    'datetime_day', 'datetime_month', 'datetime_year',
]
target_columns = ['target']

# Only complete rows take part in training and evaluation
mergedData = pd.DataFrame(fp.getData(dropNa=True))

# Chronological split (default data_block_id threshold) and an evaluator bound to the test part
X_train, X_test, y_train, y_test, X_validation = prepareDataset(features=features, data=mergedData)
modelEvaluator = EvaluteModel(X_test, y_test, X_validation, county_name)

### ✨ <b>Uczenie</b>

In [None]:
clf = xgb.XGBRegressor(
    # Learning task and objective: absolute-error regression
    objective='reg:absoluteerror',
    # Training device chosen in the import cell ('cuda' or 'cpu')
    device=device,
    enable_categorical=True,
    # Boosting schedule: at most 750 trees with shrinkage 0.05; training stops once the
    # validation metric has gone 100 rounds without improving
    n_estimators=750,
    eta=0.05,
    early_stopping_rounds=100,
    # Tree size: deeper trees are more expressive but overfit more easily
    max_depth=12,
    # Regularisation: L2 and L1 penalties on the weights, and the minimum loss
    # reduction required to split a leaf; larger values make the model more conservative
    reg_lambda=2,
    reg_alpha=0,
    gamma=0,
)

In [None]:
# Train the regressor; both splits are passed as eval_set so their metrics are printed each round.
# NOTE(review): XGBoost uses the last eval_set entry for early stopping, so the test split picks
# the stopping point and the reported test score is optimistic. Consider a separate validation split.
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

### ✨ <b>Wyniki</b>

In [None]:
# Evaluate the fitted model with the project's EvaluteModel helper.
# random_day is presumably the single day that gets plotted (confirm in source/EvaluateModel).
modelEvaluator.test(clf, random_day=date(2023, 5, 27))

## <span style='color:#2563eb'>🔷 | </span><b>Test - XGBoost (Zmienne egzogeniczne/endogeniczne + dni wolne + cykliczność + weekendy + dodatkowe atrybuty) - problem trendu</b>

### ✨ <b>Zmienne</b>

In [None]:
# Reduced lag set (2 and 3 days only) with the calendar flags and cyclical hour encoding
features = [
    'is_business', 'product_type', 'is_consumption', 'county',
    'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'direct_solar_radiation', 'surface_solar_radiation_downwards',
    'snowfall', 'total_precipitation',
    'installed_capacity', 'elec_price_euros_per_mwh', 'datetime_hour',
    'gas_highest_price_per_mwh', 'gas_lowest_price_per_mwh',
    'target_2_days_ago', 'target_3_days_ago',
    'is_holiday', 'is_weekend',
    'datetime_hour_sin', 'datetime_hour_cos',
]
target_columns = ['target']

mergedData = pd.DataFrame(fp.getData(dropNa=True))

# Express the target per unit of installed capacity. This happens before the split,
# so y_train and y_test are both normalised; the lag columns are left as they are.
mergedData['target'] = mergedData['target'].div(mergedData['installed_capacity'])

# Chronological split (default data_block_id threshold) and an evaluator bound to the test part
X_train, X_test, y_train, y_test, X_validation = prepareDataset(features=features, data=mergedData)

modelEvaluator = EvaluteModel(X_test, y_test, X_validation, county_name)


### ✨ <b>Uczenie</b>

In [None]:
clf = xgb.XGBRegressor(
    # Learning task and objective: absolute-error regression
    objective='reg:absoluteerror',
    # Training device chosen in the import cell ('cuda' or 'cpu')
    device=device,
    enable_categorical=True,
    # Boosting schedule: at most 750 trees with shrinkage 0.05; training stops once the
    # validation metric has gone 100 rounds without improving
    n_estimators=750,
    eta=0.05,
    early_stopping_rounds=100,
    # Tree size: deeper trees are more expressive but overfit more easily
    max_depth=12,
    # Regularisation: L2 and L1 penalties on the weights, and the minimum loss
    # reduction required to split a leaf; larger values make the model more conservative
    reg_lambda=2,
    reg_alpha=0,
    gamma=0,
)

In [None]:
# Train the regressor; both splits are passed as eval_set so their metrics are printed each round.
# NOTE(review): XGBoost uses the last eval_set entry for early stopping, so the test split picks
# the stopping point and the reported test score is optimistic. Consider a separate validation split.
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

### ✨ <b>Wyniki</b>

In [None]:
# Evaluate the model trained on the capacity-normalised target. The installed capacity of the
# test rows is passed as `normalization`, presumably so EvaluteModel can rescale the predictions
# (confirm in source/EvaluateModel, and check that it matches y_test, which is normalised here).
modelEvaluator.test(clf, random_day=date(2023, 5, 27), normalization=X_test['installed_capacity'])

## <span style='color:#2563eb'>🔷 | </span><b>Najlepszy model + bagging</b>

### ✨ <b>Zmienne</b>

In [None]:
# Full feature set: raw hour, lags 2..14, holiday and weekend flags, cyclical hour encoding
features = [
    'is_business', 'product_type', 'is_consumption', 'county',
    'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'direct_solar_radiation', 'surface_solar_radiation_downwards',
    'snowfall', 'total_precipitation',
    'installed_capacity', 'elec_price_euros_per_mwh', 'datetime_hour',
    'gas_highest_price_per_mwh', 'gas_lowest_price_per_mwh',
    # Lag columns, named exactly as FeaturesProcessing creates them
    *(f'target_{lag}_days_ago' for lag in range(2, 15)),
    'is_holiday', 'is_weekend',
    'datetime_hour_sin', 'datetime_hour_cos',
]
target_columns = ['target']

mergedData = pd.DataFrame(fp.getData(dropNa=True))

# Splitting into training and testing sets
X_train, X_test, y_train, y_test, X_validation = prepareDataset(features, mergedData)

# Normalization: training target per unit of installed capacity.
# assign() builds a new frame instead of writing into a frame derived by slicing,
# which avoids pandas' SettingWithCopyWarning; the resulting values are the same.
# NOTE(review): only y_train is normalised here, whereas the "problem trendu" experiment
# normalises before the split (so y_test too). Check which one EvaluteModel expects.
y_train = y_train.assign(target=y_train['target'] / X_train['installed_capacity'])

modelEvaluator = EvaluteModel(X_test, y_test, X_validation, county_name)

### ✨ <b>Uczenie</b>

In [None]:
from sklearn.ensemble import VotingRegressor

# Hyper-parameters shared by every ensemble member (no early stopping here)
params = dict(
    device=device,
    enable_categorical=True,
    n_estimators=750,
    eta=0.05,
    reg_lambda=2,
    reg_alpha=0,
    gamma=0,
    objective='reg:absoluteerror',
    max_depth=12,
)

# Average of ten XGBoost regressors that differ only in their seed.
# NOTE(review): no subsample/colsample_* is set, so the seed probably has little or no
# effect and the ten members may be (near-)identical. Confirm before relying on this.
model = VotingRegressor(
    estimators=[
        (f'XGBoost_{seed}', xgb.XGBRegressor(**params, seed=seed))
        for seed in range(10)
    ],
    verbose=True,
)

In [None]:
#model.fit(X_train, y_train.values.ravel())

### ✨ <b>Wyniki</b>

In [None]:
#modelEvaluator.test(model, random_day=date(2023, 5, 27), normalization=X_test['installed_capacity'])

## <span style='color:#2563eb'>🔷 | </span><b>Najlepszy model + bagging + podział na konsumpcję/produkcję</b>

### ✨ <b>Zmienne</b>

In [None]:
# Input columns used by both the consumption and the production model. The
# pieces are concatenated so the column order equals that of a flat list.
features = (
    [
        'is_business', 'product_type', 'is_consumption', 'county',
        # Weather-related columns
        'temperature', 'dewpoint',
        'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
        '10_metre_u_wind_component', '10_metre_v_wind_component',
        'direct_solar_radiation', 'surface_solar_radiation_downwards',
        'snowfall', 'total_precipitation',
        # Capacity, prices and hour of day
        'installed_capacity', 'elec_price_euros_per_mwh', 'datetime_hour',
        'gas_highest_price_per_mwh', 'gas_lowest_price_per_mwh',
    ]
    # Lagged target columns: target_2_days_ago ... target_14_days_ago
    + [f'target_{lag}_days_ago' for lag in range(2, 15)]
    + ['is_holiday', 'is_weekend', 'datetime_hour_sin', 'datetime_hour_cos']
)
target_columns = ['target']

mergedData = pd.DataFrame(fp.getData(dropNa=True))

# Divide the target by installed capacity for the whole frame, i.e. before
# the split, so the test targets are scaled as well.
# NOTE(review): the evaluators below therefore receive a scaled y_test while
# .test() is still called with normalization=...; confirm EvaluteModel
# expects that combination.
mergedData['target'] = mergedData['target'].div(mergedData['installed_capacity'])

# One frame per direction of energy flow
consumptionData = mergedData.loc[mergedData['is_consumption'] == True]
productionData = mergedData.loc[mergedData['is_consumption'] == False]

# Train / test / validation split, done separately for each frame
(X_train_c, X_test_c,
 y_train_c, y_test_c,
 X_validation_c) = prepareDataset(features, consumptionData)
(X_train_p, X_test_p,
 y_train_p, y_test_p,
 X_validation_p) = prepareDataset(features, productionData)


modelEvaluatorConsumption = EvaluteModel(X_test_c, y_test_c, X_validation_c, county_name)
modelEvaluatorProduction = EvaluteModel(X_test_p, y_test_p, X_validation_p, county_name)

### ✨ <b>Uczenie</b>


In [None]:
# Hyperparameters shared by the consumption and the production regressor.
_split_xgb_kwargs = dict(
    # Where to run: cpu, cuda or gpu
    device=device,
    enable_categorical=True,
    # Number of gradient boosted trees
    n_estimators=750,
    # Step size shrinkage applied to each update (guards against overfitting)
    eta=0.05,
    # Stop when the validation metric has not improved for this many rounds
    early_stopping_rounds=100,
    # L2 regularization on weights; larger values make the model more conservative
    reg_lambda=2,
    # L1 regularization on weights; larger values make the model more conservative
    reg_alpha=0,
    # Minimum loss reduction needed to split a leaf further
    gamma=0,
    # Learning objective
    objective='reg:absoluteerror',
    # Maximum tree depth; deeper trees are more complex and overfit more easily
    max_depth=12,
)

# Two independent estimators with identical settings, one per data subset.
modelConsumption = xgb.XGBRegressor(**_split_xgb_kwargs)
modelProduction = xgb.XGBRegressor(**_split_xgb_kwargs)

In [None]:
# Fit each regressor on its own subset. Both the training and the test split
# are passed as evaluation sets; per the XGBoost docs the last entry of
# eval_set is the one early stopping monitors.
# NOTE(review): that makes the test split both the early-stopping signal and
# the data later scored by the evaluator.
modelConsumption.fit(
    X_train_c,
    y_train_c,
    eval_set=[(X_train_c, y_train_c), (X_test_c, y_test_c)],
    verbose=True,
)
modelProduction.fit(
    X_train_p,
    y_train_p,
    eval_set=[(X_train_p, y_train_p), (X_test_p, y_test_p)],
    verbose=True,
)

### ✨ <b>Wyniki</b>


In [None]:
# Evaluate each model with its own evaluator, passing the matching
# installed-capacity column as the normalization argument.
modelEvaluatorConsumption.test(
    modelConsumption,
    random_day=date(2023, 5, 27),
    normalization=X_test_c['installed_capacity'],
)
modelEvaluatorProduction.test(
    modelProduction,
    random_day=date(2023, 5, 27),
    normalization=X_test_p['installed_capacity'],
)

## <span style='color:#2563eb'>🔷 | </span><b>Test - RandomForest</b>

### ✨ <b>Zmienne</b>

In [None]:
# Input columns for the random-forest experiment. Only the 2- and 3-day target
# lags are included here.
features = [
    'is_business', 'product_type', 'is_consumption', 'county',
    # Weather-related columns
    'temperature', 'dewpoint',
    'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total',
    '10_metre_u_wind_component', '10_metre_v_wind_component',
    'direct_solar_radiation', 'surface_solar_radiation_downwards',
    'snowfall', 'total_precipitation',
    # Capacity, prices and hour of day
    'installed_capacity', 'elec_price_euros_per_mwh', 'datetime_hour',
    'gas_highest_price_per_mwh', 'gas_lowest_price_per_mwh',
    # Lagged target
    'target_2_days_ago', 'target_3_days_ago',
    # Calendar flags and cyclic hour encoding
    'is_holiday', 'is_weekend', 'datetime_hour_sin', 'datetime_hour_cos',
]
target_columns = ['target']

mergedData = pd.DataFrame(fp.getData(dropNa=True))

# Divide the target by installed capacity for the whole frame, i.e. before
# the split, so y_test is scaled too.
# NOTE(review): the evaluator below receives a scaled y_test while .test() is
# still called with normalization=...; confirm EvaluteModel expects that.
mergedData['target'] = mergedData['target'].div(mergedData['installed_capacity'])

# Train / test / validation split
(X_train, X_test,
 y_train, y_test,
 X_validation) = prepareDataset(features, mergedData)

modelEvaluator = EvaluteModel(X_test, y_test, X_validation, county_name)

### ✨ <b>Uczenie</b>

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest baseline with a fixed seed for repeatable runs.
clf = RandomForestRegressor(
    n_estimators=80,            # number of trees in the forest
    criterion='squared_error',  # function measuring the quality of a split
    random_state=0,             # seeds the bootstrap sampling and the feature sampling at splits
    verbose=2,                  # print progress while fitting / predicting
)

In [None]:
# Flatten the target frame's values to a 1-D array before handing them to fit().
clf.fit(X_train, np.ravel(y_train.values))

### ✨ <b>Wyniki</b>

In [None]:
# Evaluate the random forest, passing the test rows' installed capacity as
# the normalization argument.
modelEvaluator.test(
    clf,
    random_day=date(2023, 5, 27),
    normalization=X_test['installed_capacity'],
)

## <span style='color:#2563eb'>🔷 | </span><b>Szukanie hiperparametrów</b>

In [None]:
# A parameter grid for XGBoost
# First search grid for XGBoost: ensemble size, tree depth and learning rate.
params = dict(
    n_estimators=[750, 1000, 1500],
    max_depth=[6, 8, 10],
    eta=[0.1, 0.3, 0.5],
)

In [None]:
#clf = GridSearchCV(clf, params, verbose=2)

#clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)

In [None]:
# clf.best_params_
#{'eta': 0.1, 'max_depth': 10, 'n_estimators': 750}

* eta -> Wybrana wartość 0.1 - minimalna z przedziału - zmniejszenie zakresu
* n_estimators -> Wybrana wartość 750 - minimalna z przedziału - zmniejszenie zakresu
* max_depth -> Wybrana wartość 10 - maksymalna z przedziału - zwiększenie zakresu

In [None]:
# Second search grid: fewer trees, deeper trees and smaller learning rates.
# NOTE(review): eta=0 is the lower bound of XGBoost's documented [0, 1] range;
# with a zero step size added trees presumably cannot change the prediction,
# so that grid point is unlikely to be informative - confirm.
params = dict(
    n_estimators=[500, 750],
    max_depth=[10, 12, 14],
    eta=[0, 0.05, 0.1],
)

In [None]:
# clf = GridSearchCV(clf, params, verbose=2)

# clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)

In [None]:
# clf.best_params_
# {'eta': 0.05, 'max_depth': 12, 'n_estimators': 750}

In [None]:
# Third search grid: L2 / L1 regularization strength and learning rate.
# NOTE(review): eta=0 is the lower bound of XGBoost's documented [0, 1] range;
# with a zero step size added trees presumably cannot change the prediction,
# so that grid point is unlikely to be informative - confirm.
params = dict(
    reg_lambda=[0, 1, 2],
    reg_alpha=[0, 1, 2],
    eta=[0, 0.05, 0.1],
)

In [None]:
# clf = GridSearchCV(clf, params, verbose=2)
# clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)

In [None]:
# clf.best_params_
# {'eta': 0.05, 'reg_alpha': 0, 'reg_lambda': 2}

## <span style='color:#2563eb'>🔷 | </span><b>Załączanie odpowiedzi</b>

In [None]:
#import enefit
#enefit.make_env.func_dict['__called__'] = False
#
#env = enefit.make_env()
#iter_test = env.iter_test()
#columns_to_store = ['county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'target', 'data_block_id']
#
#revealed = None
#
#for (test_sub, revealed_targets_sub, client_sub, historical_weather_sub,
#        forecast_weather_sub, electricity_prices_sub, gas_prices_sub, sample_prediction_sub) in iter_test:
#
#    # Rename test set to make consistent with train
#    test_sub = test_sub.rename(columns = {'prediction_datetime': 'datetime'})
#    
#    display(test_sub.head(1))
#    
#    test_sub['data_block_id'] = 0
#    client_sub['data_block_id'] = 0
#    electricity_prices_sub['data_block_id'] = 0
#    gas_prices_sub['data_block_id'] = 0
#    forecast_weather_sub['data_block_id'] = 0
#
#    fp_sub = FeaturesProcessing(
#        train = test_sub,
#        client = client_sub,
#        forecast_weather = forecast_weather_sub,
#        gas_prices = gas_prices_sub,
#        electricity_prices = electricity_prices_sub,
#        weather_station = weather_station,
#        submission = True)
#    
#    
#    # Store revealed data
#    if revealed is None:
#        revealed = revealed_targets_sub
#    else:
#        revealed = pd.concat([revealed, revealed_targets_sub], ignore_index=True)
#    
#    # Add revealed data to train
#    fp_sub.AddRevealtedTargets(revealed)
#    
#    # Prediction
#    data  = fp_sub.getData()[features]
#    display(data.head(1))
#    sample_prediction_sub['target'] = clf.predict(data)
#    
#    env.predict(sample_prediction_sub)