In [22]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import os

# import split data
os.chdir('../scripts')
from functions import split_data, sMAPE
os.chdir('../notebooks')

In [12]:
# Load the cleaned datasets; the first CSV column becomes the index and
# pandas is asked to parse it as dates.
csv_options = {'index_col': 0, 'parse_dates': True}
df_lag = pd.read_csv('../data/clean/df_clean_lag.csv', **csv_options)
df = pd.read_csv('../data/clean/df_clean.csv', **csv_options)

In [None]:
def get_metrics(model, X, y, model_index, results_df, verbose=True, append_result=True):
    """Unfinished single-split metrics helper (placeholder).

    The original cell contained only the signature, which is a syntax error
    and stops the notebook from running top to bottom. The signature is kept
    so the name still resolves, but calling it fails loudly; use
    `compute_metrics` for scoring models.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError(
        "get_metrics was never implemented; use compute_metrics instead."
    )

In [208]:
def _split_metrics(model, X, y):
    """Return ``(sMAPE, r2)`` for one data split, or ``(nan, nan)`` if it is absent.

    A split counts as absent when ``X`` is not a DataFrame or ``y`` is None.
    """
    if not isinstance(X, pd.DataFrame) or y is None:
        return np.nan, np.nan
    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same number of rows "
            "(got {} and {}).".format(len(X), len(y))
        )

    preds = model.predict(X)
    if len(preds.shape) > 1:
        # Multi-output model: score the row-wise sum of the predicted columns
        # against the row-wise sum of the target columns.
        preds = preds.sum(axis=1)
        y = y.sum(axis=1)

    # NOTE: "r2" here is the squared Pearson correlation between targets and
    # predictions, which is not the same quantity as sklearn's r2_score.
    return sMAPE(y, preds), np.corrcoef(y, preds)[0][1] ** 2


def compute_metrics(
    model,
    model_index = None,
    X_train = None,
    y_train = None,
    X_val = None,
    y_val = None,
    results_df = None,
    verbose = False,
    append_result = True):
    """Score a fitted model on the train and/or validation split.

    For each split that is supplied, computes sMAPE and the squared Pearson
    correlation between targets and predictions (reported as ``r2``). A split
    that is not supplied gets NaN for both metrics.

    Parameters:
        model: fitted estimator exposing ``predict``.
        model_index: row label to write in ``results_df``; defaults to
            ``len(results_df)``.
        X_train, y_train: training features (DataFrame) and targets, optional.
        X_val, y_val: validation features (DataFrame) and targets, optional.
        results_df: DataFrame with columns ``sMAPE_train, sMAPE_val, r2_train,
            r2_val``; a new empty one is created when omitted.
        verbose: print the four metrics when True.
        append_result: write the metrics into ``results_df`` when True.

    Returns:
        The updated ``results_df`` when ``append_result`` is True, else None.

    Raises:
        ValueError: if a supplied split has a different number of rows in X and y.
    """
    sMAPE_train, r2_train = _split_metrics(model, X_train, y_train)
    sMAPE_val, r2_val = _split_metrics(model, X_val, y_val)

    if verbose:
        print('sMAPE_train:', sMAPE_train)
        print('sMAPE_val:', sMAPE_val)
        print('r2_train:', r2_train)
        print('r2_val:', r2_val)

    if not append_result:
        return None

    if results_df is None:
        results_df = pd.DataFrame(
            columns=['sMAPE_train', 'sMAPE_val', 'r2_train', 'r2_val'])
    if model_index is None:
        model_index = len(results_df)

    results_df.loc[model_index] = [sMAPE_train, sMAPE_val, r2_train, r2_val]
    return results_df

## Categoricals

In [14]:
# Get categorical (object-dtype) columns
categorical = df_lag.select_dtypes(include='object')

# Instantiate a LabelEncoder for the wind-direction columns. It is fitted on
# the stacked values of *all* wind columns (the same approach used for the
# condition columns below) so that a direction missing from a single column
# cannot make `transform` fail on another column.
wind_cols = categorical.filter(regex='wind')
wind_dir_coder = LabelEncoder()
wind_dir_coder.fit(wind_cols.stack())
for col in wind_cols.columns:
    df_lag[col] = wind_dir_coder.transform(df_lag[col])


# Stack condition columns into a single column
condition_cols = categorical.filter(regex='condition')
stacked_conditions = condition_cols.stack()

# Instantiate a LabelEncoder, fit on the stacked values and transform each condition column
condition_coder = LabelEncoder()
condition_coder.fit(stacked_conditions)
for col in condition_cols.columns:
    df_lag[col] = condition_coder.transform(df_lag[col])

## Split Data

In [25]:
# Drop every price column except the target and the day-ahead price
price_cols = df_lag.filter(regex='price').columns.tolist()
for kept_col in ('price_actual', 'price_day_ahead'):
    price_cols.remove(kept_col)
data = df_lag.drop(price_cols, axis='columns')

# presumably splits into train/validation around the year 2020 -- see scripts/functions.py
X_train, y_train, X_val, y_val = split_data(data, 2020, 'price_actual')

### Results DataFrame

In [119]:
# Row 0 is the benchmark: score the 2020 `price_day_ahead` column against y_val.
day_ahead_2020 = df_lag.loc['2020'].price_day_ahead
baseline_smape = round(sMAPE(y_val, day_ahead_2020), 3)
baseline_r2 = round(np.corrcoef(y_val, day_ahead_2020)[1][0]**2, 3)

results = pd.DataFrame(columns=['sMAPE_train', 'sMAPE_val', 'r2_train', 'r2_val'])
results.loc[0] = [None, baseline_smape, None, baseline_r2]

In [120]:
results

Unnamed: 0,sMAPE_train,sMAPE_val,r2_train,r2_val
0,,16.922,,0.971


### Predict 2020

In [None]:
# Call template for `compute_metrics`. It is kept as a comment because `model`
# is not defined at this point, so executing the call would raise NameError
# and stop a top-to-bottom run; the all-None arguments give nothing to score.
#
# compute_metrics(
#     model,
#     model_index = None,
#     X_train = None,
#     y_train = None,
#     X_val = None,
#     y_val = None,
#     results_df = None,
#     verbose = False,
#     append_result = True)

In [20]:
from sklearn.metrics import r2_score

In [121]:
# XGBoost regressor with default hyperparameters (seeded for reproducibility)
xg1 = XGBRegressor(random_state=17).fit(X_train, y_train)

results = compute_metrics(
    model=xg1,
    model_index=1,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    results_df=results,
    verbose=True,
)

sMAPE_train: 1.2548110086018698
sMAPE_val: 6.3908475450170785
r2_train: 0.996006557632563
r2_val: 0.969592511476741


In [122]:
results

Unnamed: 0,sMAPE_train,sMAPE_val,r2_train,r2_val
0,,16.922,,0.971
1,1.254811,6.390848,0.996007,0.969593


In [133]:
# Same model with trees limited to depth 3
xg2 = XGBRegressor(random_state=17, max_depth=3).fit(X_train, y_train)
results = compute_metrics(
    model=xg2,
    model_index=2,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    results_df=results,
    verbose=True,
)

sMAPE_train: 2.0402964121687757
sMAPE_val: 5.589833838280247
r2_train: 0.9888105863544137
r2_val: 0.9703313378877375


In [135]:
results

Unnamed: 0,sMAPE_train,sMAPE_val,r2_train,r2_val
0,,16.922,,0.971
1,1.254811,6.390848,0.996007,0.969593
2,2.040296,5.589834,0.988811,0.970331


The model looks good as is: it beats the sMAPE of the `price_day_ahead` baseline. Next, look at the feature importances.

In [142]:
# Compare the estimator's own score and sklearn's r2_score with the squared
# Pearson correlation used in compute_metrics.
xg2_val_preds = xg2.predict(X_val)
print(xg2.score(X_val, y_val))
print(r2_score(y_val, xg2_val_preds))
np.corrcoef(y_val, xg2_val_preds)[0, 1] ** 2

0.9505527893222502
0.9505527893222502


0.9703313378877375

In [144]:
# Squared correlation matrix between validation predictions and true values
preds_vs_true = pd.DataFrame({'preds': xg2.predict(X_val), 'true': y_val})
preds_vs_true.corr() ** 2

Unnamed: 0,preds,true
preds,1.0,0.970331
true,0.970331,1.0


In [155]:
# Feature importances of xg2, labelled by feature name, largest first
xg2_importances = pd.DataFrame(
    xg2.feature_importances_,
    index=X_val.columns,
    columns=['importance'],
)
xg2_importances.sort_values('importance', ascending=False)

Unnamed: 0,importance
price_day_ahead,0.805932
renewable_lag,0.023548
coal_lag,0.021408
load_forecast,0.021065
biomass_lag,0.019277
generation_forecast,0.017536
waste_lag,0.013615
consumption_lag,0.006654
transmission_fs_lag,0.005484
reservoir_lag,0.004301


### Model without using `price_day_ahead`

In [156]:
# Drop every price column except the target, so `price_day_ahead` is no longer a feature
price_cols = df_lag.filter(regex='price').columns.tolist()
price_cols.remove('price_actual')
data = df_lag.drop(price_cols, axis='columns')

X_train, y_train, X_val, y_val = split_data(data, 2020, 'price_actual')

In [160]:
# Default XGBoost regressor on the feature set without `price_day_ahead`
xg3 = XGBRegressor(random_state=17).fit(X_train, y_train)
results = compute_metrics(
    model=xg3,
    model_index=3,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    results_df=results,
    verbose=True,
)

sMAPE_train: 4.746215017576991
sMAPE_val: 26.96572986382738
r2_train: 0.9441202444233144
r2_val: 0.3808492178460363


In [168]:
# Same feature set with trees limited to depth 4
xg4 = XGBRegressor(random_state=17, max_depth=4).fit(X_train, y_train)
results = compute_metrics(
    model=xg4,
    model_index=4,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    results_df=results,
    verbose=True,
)

sMAPE_train: 6.949286522767955
sMAPE_val: 27.132804942442867
r2_train: 0.8826375165587237
r2_val: 0.45055026751444033


Unnamed: 0,sMAPE_train,sMAPE_val,r2_train,r2_val
0,,16.922,,0.971
1,1.254811,6.390848,0.996007,0.969593
2,2.040296,5.589834,0.988811,0.970331
3,4.746215,26.96573,0.94412,0.380849
4,6.949287,27.132805,0.882638,0.45055


In [169]:
# Feature importances of xg4, labelled by feature name, largest first
xg4_importances = pd.DataFrame(
    xg4.feature_importances_,
    index=X_val.columns,
    columns=['importance'],
)
xg4_importances.sort_values('importance', ascending=False)

Unnamed: 0,importance
consumption_forecast,0.271633
coal_lag,0.108124
coal/lignite_lag,0.05872
reservoir_lag,0.038176
load_forecast,0.032323
generation_forecast,0.029478
renewable_lag,0.027689
temp_valencia_lag,0.02651
gas_lag,0.023208
load_actual,0.023203


### Multivariate - Predict All Price Components

In [187]:
# Split data: every price column other than `price_actual` becomes a target
price_cols = df_lag.filter(regex='price').columns.tolist()
price_cols.remove('price_actual')
multi_target_data = df_lag.drop(columns='price_actual')
X_train, y_train, X_val, y_val = split_data(multi_target_data, 2020, price_cols)

In [171]:
from sklearn.multioutput import MultiOutputRegressor

In [190]:
# Wrap a seeded XGBRegressor in a MultiOutputRegressor and fit it on all targets
xg_multi = XGBRegressor(random_state=17)
xg5 = MultiOutputRegressor(xg_multi).fit(X_train, y_train)

# Score the model and store the metrics as row 5.
# compute_metrics sums multi-column predictions and targets row-wise before scoring.
results = compute_metrics(
    model=xg5,
    model_index=5,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    results_df=results,
)

MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=17, reg_alpha=None,
                                        

In [210]:
results

Unnamed: 0,sMAPE_train,sMAPE_val,r2_train,r2_val
0,,16.922,,0.971
1,1.254811,6.390848,0.996007,0.969593
2,2.040296,5.589834,0.988811,0.970331
3,4.746215,26.96573,0.94412,0.380849
4,6.949287,27.132805,0.882638,0.45055
5,4.751722,28.977634,0.94363,0.473974


### Multivariate - Predict Price Components Except `price_day_ahead`

In [211]:
# Split data: targets are the price columns other than `price_actual` and `price_day_ahead`
price_cols = df_lag.filter(regex='price').columns.tolist()
for excluded_col in ('price_actual', 'price_day_ahead'):
    price_cols.remove(excluded_col)
multi_target_data = df_lag.drop(columns='price_actual')
X_train, y_train, X_val, y_val = split_data(multi_target_data, 2020, price_cols)

In [214]:
# Wrap a seeded XGBRegressor in a MultiOutputRegressor and fit it on the reduced target set
xg_multi = XGBRegressor(random_state=17)
xg6 = MultiOutputRegressor(xg_multi).fit(X_train, y_train)

# Score the model and store the metrics as row 6.
# compute_metrics scores the row-wise sum of the predicted columns against the
# row-wise sum of these target columns, which here exclude `price_day_ahead`.
# NOTE(review): that sum may not be on the same scale as `price_actual`, so
# this row may not be directly comparable with rows 0-4 -- verify.
results = compute_metrics(
    model=xg6,
    model_index=6,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    results_df=results,
)

In [215]:
results

Unnamed: 0,sMAPE_train,sMAPE_val,r2_train,r2_val
0,,16.922,,0.971
1,1.254811,6.390848,0.996007,0.969593
2,2.040296,5.589834,0.988811,0.970331
3,4.746215,26.96573,0.94412,0.380849
4,6.949287,27.132805,0.882638,0.45055
5,4.751722,28.977634,0.94363,0.473974
6,7.755889,39.625241,0.948131,0.250351


In [221]:
# Row-wise sum of the predicted price components on the validation set
xg6_val_preds = xg6.predict(X_val)
xg6_val_preds.sum(axis=1)

array([6.558312 , 6.6806703, 6.624584 , ..., 6.5187917, 5.8888335,
       4.9574933], dtype=float32)

In [223]:
# Row-wise sum of the true price components on the validation set
y_val.sum(axis='columns')

2020-01-01 00:00:00    3.55
2020-01-01 01:00:00    4.05
2020-01-01 02:00:00    5.34
2020-01-01 03:00:00    6.40
2020-01-01 04:00:00    7.20
                       ... 
2020-12-31 19:00:00    8.63
2020-12-31 20:00:00    8.29
2020-12-31 21:00:00    5.69
2020-12-31 22:00:00    4.21
2020-12-31 23:00:00    3.92
Length: 8784, dtype: float64