# <b><span style='color:#2563eb'>00 | </span>Modele predykcji</b>

## <span style='color:#2563eb'>🔷 | </span><b>Import bibliotek</b>

In [4]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# Modeling
import xgboost as xgb
import torch

# Wrote myself
from source.CustomPlot import CustomPlot
from source.Utils import SplitDateColumn, DescribeData
from sklearn.ensemble import RandomForestRegressor

# Pick the compute device for model training: 'cuda' when torch reports an
# available CUDA device, otherwise fall back to 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## <span style='color:#2563eb'>🔷 | </span><b>Zbiory</b>

In [11]:
# Directory that holds the CSV inputs, relative to the notebook's working directory.
PATH = 'data/'

# Gas price table.
# data_block_id is read as int16: int8 tops out at 127, and every other table
# loaded in this notebook stores the same key as int16, so a narrower type here
# would both risk overflow and give the key a mismatched dtype.
gas_prices = pd.read_csv(os.path.join(PATH, 'gas_prices.csv'),
                   dtype={'lowest_price_per_mwh': 'float64',
                          'highest_price_per_mwh': 'float64',
                          'data_block_id': 'int16'},
                   parse_dates=['forecast_date', 'origin_date'])

# Electricity price table (same int16 key dtype as above, for the same reason).
electricity_prices = pd.read_csv(os.path.join(PATH, 'electricity_prices.csv'),
                   dtype={'euros_per_mwh': 'float64',
                          'data_block_id': 'int16'},
                   parse_dates=['forecast_date', 'origin_date'])

# Column dtypes for the historical weather table, grouped by storage type.
# Columns not listed here are left to pandas' default inference.
historical_weather_dtypes = {
    **dict.fromkeys(
        [
            'temperature',
            'dewpoint',
            'rain',
            'snowfall',
            'surface_pressure',
            'shortwave_radiation',
            'direct_solar_radiation',
            'diffuse_radiation',
            'latitude',
            'longitude',
        ],
        'float64',
    ),
    **dict.fromkeys(
        [
            'cloudcover_total',
            'cloudcover_low',
            'cloudcover_mid',
            'cloudcover_high',
            'winddirection_10m',
            'data_block_id',
        ],
        'int16',
    ),
}

# Historical weather table; 'datetime' is parsed into a datetime column.
historical_weather = pd.read_csv(
    os.path.join(PATH, 'historical_weather.csv'),
    dtype=historical_weather_dtypes,
    parse_dates=['datetime'],
)

# Column dtypes for the forecast weather table, grouped by storage type.
forecast_weather_dtypes = {
    **dict.fromkeys(
        [
            'temperature',
            'dewpoint',
            'total_precipitation',
            'snowfall',
            'cloudcover_total',
            'cloudcover_low',
            'cloudcover_mid',
            'cloudcover_high',
            '10_metre_u_wind_component',
            '10_metre_v_wind_component',
            'direct_solar_radiation',
            'surface_solar_radiation_downwards',
            'latitude',
            'longitude',
        ],
        'float64',
    ),
    **dict.fromkeys(['data_block_id', 'hours_ahead'], 'int16'),
}

# Forecast weather table; both timestamp columns are parsed into datetimes.
forecast_weather = pd.read_csv(
    os.path.join(PATH, 'forecast_weather.csv'),
    dtype=forecast_weather_dtypes,
    parse_dates=['origin_datetime', 'forecast_datetime'],
)

# Training table with the regression target; 'datetime' is parsed into a datetime column.
# row_id is read as int32 rather than int16: it is presumably a per-row identifier,
# and int16 cannot represent values above 32,767, so a file with more rows than
# that would not load faithfully.
train = pd.read_csv(os.path.join(PATH, 'train.csv'),
                dtype={ 'county': 'int16',
                        'is_business': 'boolean',
                        'product_type': 'int8',
                        'target': 'float64',
                        'is_consumption': 'boolean',
                        'data_block_id' : 'int16',
                        'row_id' : 'int32',
                        'prediction_unit_id' : 'int16' },

                parse_dates=['datetime'])

# Column dtypes for the client table.
client_dtypes = {
    'county': 'int16',
    'is_business': 'boolean',
    'product_type': 'int8',
    'eic_count': 'float64',
    'installed_capacity': 'float64',
    'data_block_id': 'int16',
}

# Client table; 'date' is parsed into a datetime column.
client = pd.read_csv(
    os.path.join(PATH, 'client.csv'),
    dtype=client_dtypes,
    parse_dates=['date'],
)

In [12]:
# Mapping from weather-station coordinates to county.
# 'county' is read as float64 so that missing values survive parsing; rows without
# a county are dropped before the column is cast to int.
station_dtypes = {
    'county_name': 'str',
    'longitude': 'float64',
    'latitude': 'float64',
    'county': 'float64',
}

weather_station = (
    pd.read_csv(
        os.path.join(PATH, 'weather_station_to_county_mapping.csv'),
        dtype=station_dtypes,
    )
    .dropna(subset='county')
    .drop(columns=['county_name'])
    .astype({'county': 'int'})
    # Round coordinates to one decimal so they can serve as join keys.
    .assign(
        latitude=lambda stations: stations['latitude'].astype(float).round(1),
        longitude=lambda stations: stations['longitude'].astype(float).round(1),
    )
)

In [13]:
# Reduce the forecast table to one averaged row per (datetime, county, data_block_id).
#
# NOTE(review): forecast timestamps are converted to 'Europe/Brussels' wall-clock time
# and then made timezone-naive. This table is joined to `train` on 'datetime' further
# down, so confirm that this zone matches the zone of train['datetime'].
forecast_weather = (
    forecast_weather
    .rename(columns={'forecast_datetime': 'datetime'})
    .drop(columns='origin_datetime')
    .assign(
        datetime=lambda frame: (
            frame['datetime'].dt.tz_convert('Europe/Brussels').dt.tz_localize(None)
        ),
        # Round coordinates to one decimal so they line up with the station mapping keys
        latitude=lambda frame: frame['latitude'].astype(float).round(1),
        longitude=lambda frame: frame['longitude'].astype(float).round(1),
    )
    # Attach the county of each weather location
    .merge(weather_station, how='left', on=['latitude', 'longitude'])
    # Weather locations that fall outside every county have no match; discard them
    .dropna(subset='county')
    .astype({'county': int})
    # A county can contain several weather locations; average them
    .groupby(by=['datetime', 'county', 'data_block_id'])
    .mean()
    .reset_index()
)

# Project helper applied to the 'datetime' column (implementation lives in source.Utils)
forecast_weather = SplitDateColumn(forecast_weather, column='datetime')

## <span style='color:#2563eb'>🔷 | </span><b>Scelenie zbiorów</b>

In [14]:
# Build the modelling table with two left joins onto `train`, then drop every row
# that still has a missing value in any column.
client_join_keys = ['county', 'is_business', 'product_type', 'data_block_id']
weather_join_keys = ['datetime', 'county', 'data_block_id']

merged_data = (
    train
    # train <- client (the client 'date' column is not carried over)
    .merge(client.drop(columns=['date']), how='left', on=client_join_keys)
    # train <- forecast weather
    .merge(forecast_weather, how='left', on=weather_join_keys)
    .dropna()
)

## <span style='color:#2563eb'>🔷 | </span><b>Podział na zbiory do modelów</b>

In [15]:
# Model inputs, assembled from three groups. The concatenation order defines the
# column order of X.
unit_features = [
    'is_business',
    'product_type',
    'is_consumption',
    'county',
]
weather_features = [
    'temperature',
    'dewpoint',
    'cloudcover_high',
    'cloudcover_low',
    'cloudcover_mid',
    'cloudcover_total',
    '10_metre_u_wind_component',
    '10_metre_v_wind_component',
    'direct_solar_radiation',
    'surface_solar_radiation_downwards',
    'snowfall',
    'total_precipitation',
]
client_features = ['installed_capacity']

features = unit_features + weather_features + client_features
target_columns = ['target']

X = merged_data[features]
y = merged_data[target_columns]  # one-column DataFrame, not a Series

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
# NOTE(review): this is a random row-wise split of rows that carry a datetime —
# confirm that a chronological split is not required for this evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## <span style='color:#2563eb'>🔷 | </span><b>XGBoost</b>

In [16]:
# Hyper-parameters of the gradient-boosted regressor, gathered in one mapping.
xgb_params = {
    # Device ordinal; available options are cpu, cuda, and gpu
    'device': device,
    'enable_categorical': True,
    # Number of gradient boosted trees
    'n_estimators': 1000,
    # Step size shrinkage used in updates to prevent overfitting
    'eta': 0.1,
    # Early stopping: the validation metric must improve at least once in every
    # `early_stopping_rounds` round(s) for training to continue
    'early_stopping_rounds': 100,
    # L2 regularization term on weights; larger values make the model more conservative
    'reg_lambda': 1,
    # L1 regularization term on weights; larger values make the model more conservative
    'reg_alpha': 0,
}

clf = xgb.XGBRegressor(**xgb_params)

### ✨ <b>Uczenie</b>

In [17]:
# Train the regressor while reporting the metric on both the training and test splits.
# NOTE(review): the estimator is configured with early stopping, and the test split is
# the last evaluation set here — presumably it therefore drives the stopping decision;
# confirm this is intended, since it makes the test score optimistic.
evaluation_sets = [
    (X_train, y_train),
    (X_test, y_test),
]
clf.fit(X_train, y_train, eval_set=evaluation_sets)

[0]	validation_0-rmse:834.40052	validation_1-rmse:836.37210
[1]	validation_0-rmse:761.30617	validation_1-rmse:763.34040
[2]	validation_0-rmse:696.35021	validation_1-rmse:698.41427
[3]	validation_0-rmse:638.46167	validation_1-rmse:640.74600
[4]	validation_0-rmse:587.09801	validation_1-rmse:589.57466
[5]	validation_0-rmse:541.65613	validation_1-rmse:544.18167
[6]	validation_0-rmse:501.69690	validation_1-rmse:504.40031
[7]	validation_0-rmse:466.58422	validation_1-rmse:469.35699
[8]	validation_0-rmse:435.44981	validation_1-rmse:438.48945
[9]	validation_0-rmse:408.28493	validation_1-rmse:411.58930
[10]	validation_0-rmse:384.61510	validation_1-rmse:388.09350
[11]	validation_0-rmse:364.14230	validation_1-rmse:367.99516
[12]	validation_0-rmse:346.21622	validation_1-rmse:350.20502
[13]	validation_0-rmse:330.75866	validation_1-rmse:334.95828
[14]	validation_0-rmse:317.60542	validation_1-rmse:321.98713
[15]	validation_0-rmse:305.89268	validation_1-rmse:310.51921
[16]	validation_0-rmse:296.08515	v

### ✨ <b>Wyniki</b>

In [None]:
# Pair each input feature name with the importance reported by the fitted model.
importance_table = pd.DataFrame(
    {
        'name': clf.feature_names_in_,
        'importance': clf.feature_importances_,
    }
)
importance_table

## <span style='color:#2563eb'>🔷 | </span><b>Las losowy</b>

In [None]:
# Small random forest baseline: 5 trees, each limited to depth 10.
# random_state is fixed (same seed as the train/test split in this notebook) so that
# bootstrap sampling and feature selection, and therefore the reported score, are
# reproducible across kernel restarts.
clf = RandomForestRegressor(max_depth=10, n_estimators=5, random_state=42)

### ✨ <b>Uczenie</b>

In [None]:
# y_train is a one-column DataFrame; flatten it to a 1-D array so scikit-learn gets the
# target shape it expects and does not emit a DataConversionWarning.
clf.fit(X_train, y_train.to_numpy().ravel())

### ✨ <b>Wyniki</b>

In [None]:
# Score the fitted model on the held-out split: root of the mean squared error.
y_pred = clf.predict(X_test)

rmse = np.sqrt(MSE(y_test, y_pred))
rmse