# Prediction with skforecast

In this Jupyter notebook, another approach is used to predict future values and improve the results. The notebook uses the skforecast Python module combined with an XGBoost regressor. The skforecast module lets the user make simple predictions from time series.

In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from skforecast.ForecasterAutoreg import ForecasterAutoreg
%matplotlib inline

pd.set_option('display.max_columns', None)
import xgboost as xgb

## Renaming the columns and converting the dates to pandas format

In this section the data is loaded, the dates are converted to the pandas datetime format, and the rows are sorted by date.

In [None]:
# Load the dataset from PJME_hourly.csv (path relative to the working directory).
original_data = pd.read_csv("PJME_hourly.csv")

In [None]:
# Parse the timestamps, switch to short column names and order the rows
# chronologically with a fresh 0..n-1 index.
original_data = (
    original_data
    .assign(Datetime=pd.to_datetime(original_data["Datetime"]))
    .rename(columns={"Datetime": "date", "PJME_MW": "out"})
    .sort_values("date", ascending=True, ignore_index=True)
)

Removing the duplicated data by keeping only the first occurrence. Why keep the first one? Because it is fast, easy, and unlikely to significantly impact the training, as there are only a few duplicated rows.

In [None]:
# Index by timestamp so duplicates are detected on the dates themselves.
indexed_data = original_data.set_index('date')

# Rows sharing their timestamp with at least one other row; display
# `duplicated_rows` to inspect them before they are dropped.
duplicated_rows = indexed_data[indexed_data.index.duplicated(keep=False)]

# Keep the first row of every timestamp and turn 'date' back into a column.
original_data = indexed_data[~indexed_data.index.duplicated(keep='first')].reset_index()

In [None]:
# Date-indexed working frame; set_index returns a new object, so
# original_data itself is left untouched.
data_features = original_data.set_index('date')

In [None]:
def get_features(df):
    """Return a copy of ``df`` extended with calendar columns read from its index.

    The index must expose the datetime accessors used below (``hour``,
    ``day``, ..., ``isocalendar()``). The input frame is not modified.

    Added columns, in order: hour, day, month, year, quarter, dayofyear,
    dayofmonth (same value as day) and weekofyear.
    """
    out = df.copy()
    stamps = out.index

    # (new column, index attribute) pairs, in the order the columns are added.
    calendar_fields = (
        ("hour", "hour"),
        ("day", "day"),
        ("month", "month"),
        ("year", "year"),
        ("quarter", "quarter"),
        ("dayofyear", "dayofyear"),
        ("dayofmonth", "day"),
    )
    for column, attribute in calendar_fields:
        out[column] = getattr(stamps, attribute)

    # ISO week number, stored as int64.
    out["weekofyear"] = stamps.isocalendar().week.astype(np.int64)

    return out

# Add the calendar columns computed from the date index.
data_features = get_features(data_features)

## First approach

### Choosing the frequency for skforecast

skforecast needs a constant frequency to work properly. As the dataset has some missing data, a frequency of 1h cannot be used directly as it should be; therefore, a frequency of 3h is used here (2h does not work either).

In [None]:
# Re-index the features on a fixed 3-hour frequency.
data = data_features.asfreq('3h')

# Boolean mask of missing cells; the last expression shows the mask rows
# whose 'out' entry is flagged as missing.
na = data.isna()
na[na['out']]

### Partitioning

In [None]:
from sklearn.model_selection import train_test_split

# Split the date-ordered data with shuffling disabled.
ordered_data = data.sort_values('date')
train_feats, test_feats = train_test_split(ordered_data, shuffle=False)

# Discard training rows that contain missing values.
train_feats = train_feats.dropna()

### Training the skforecast model

In [None]:
# Hyper-parameters forwarded to the XGBoost regressor.
args = {
    "n_estimators": 600,
    "base_score": 0.5,
    "max_depth": 6,
    "learning_rate": 0.01,
}

# Autoregressive forecaster wrapping the XGBoost regressor, configured with 200 lags.
regressor = xgb.XGBRegressor(tree_method="gpu_hist", **args)
forecaster = ForecasterAutoreg(regressor=regressor, lags=200)

# Fit on the 'out' series of the training partition.
forecaster.fit(y=train_feats['out'])

### Evaluating the forecasting model

Performance visualization.

In [None]:
# Ask the fitted forecaster for 50 steps of predictions.
preds = forecaster.predict(steps=50)

In [None]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_title('Testing Data/predicted value')

# Prediction curve first, then the test values on the same timestamps, so the
# legend entries below match the drawing order.
ax.plot(preds.index, preds, alpha=0.7, color="blue")
actual_values = test_feats.loc[preds.index, 'out']
actual_values.plot(ax=ax, alpha=0.7, color="red")

ax.legend(['Prediction', 'Testing Set'])
plt.show()

## Try to handle the missing data with XGBoost

The idea is to fill in the missing data with XGBoost, so that the dataset has a frequency of 1h and can be used to train a skforecast model with a finer frequency and, hopefully, better results.

### Creating columns to add previous points features

In [None]:
def get_colums_names(column_names, N):
    """Build the column labels of a frame holding N lagged copies plus the current one.

    For every lag from ``N`` down to 1, each base name is suffixed with the
    lag number; the unsuffixed base names come last.
    """
    base_names = list(column_names)
    lagged_names = [
        name + str(lag)
        for lag in range(N, 0, -1)
        for name in base_names
    ]
    return lagged_names + base_names

# Temporarily turn 'date' back into a column so it is carried along with the
# other features.
data_features.reset_index(inplace=True)
all_available_features = list(data_features.columns)

N = 2  # Number of previous points attached to every row

# Append, side by side, the feature frame shifted by 1, 2, ..., N rows. Each
# step drops the last row of the accumulated frame so both sides have the
# same length.
data_multiple = data_features.copy()
for shift in range(1, N + 1):
    earlier_points = data_multiple.iloc[:-1].reset_index(drop=True)
    later_points = data_features.iloc[shift:].reset_index(drop=True)
    data_multiple = pd.concat([earlier_points, later_points], axis=1)

# Left-most column groups get the suffixes N ... 1; the right-most group keeps
# the plain feature names, and its 'date' column becomes the index.
data_multiple.columns = get_colums_names(all_available_features, N)
data_multiple.set_index("date", inplace=True)

# Restore the date index on the source frame.
data_features.set_index("date", inplace=True)

Defining the training features.

In [None]:
all_features = data_multiple.columns

# Full list of candidate base features, kept for reference only: it is
# overridden by the shorter list assigned right below.
training_features_list = ['hour', 'day', 'month', 'year', 'quarter', 'dayofyear',
       'dayofmonth', 'weekofyear', 'out']

# Base features actually used for training.
training_features_list = ['hour', 'month', 'out']

def is_training_feature(feature, training_features):
    """Tell whether column ``feature`` derives from one of ``training_features``.

    A column qualifies when it is a base name, or a base name followed only
    by numeric characters (e.g. ``"hour"`` or ``"hour2"`` for the base
    ``"hour"``). The bare ``"out"`` column never qualifies.
    """
    for base in training_features:
        if feature == "out" or not feature.startswith(base):
            continue
        suffix = feature[len(base):]
        if not suffix or suffix.isnumeric():
            return True
    return False

# Columns of the lagged frame accepted by is_training_feature.
training_features = [
    column
    for column in all_features
    if is_training_feature(column, training_features_list)
]

# Column to predict.
target = "out"

Partitioning with the selected features.

In [None]:
from sklearn.model_selection import train_test_split

# Split the lagged frame into train and test partitions with shuffling disabled.
train, test = train_test_split(data_multiple, shuffle=False)
# train

Training the XGBoost model used to fill the missing data.

In [None]:
from time import time

# Hyper-parameters forwarded to the XGBoost regressor.
args = {
    "n_estimators": 1200,
    "base_score": 0.5,
    "max_depth": 6,
    "learning_rate": 0.01,
}
reg = xgb.XGBRegressor(tree_method="gpu_hist", **args)

# Both partitions are passed as evaluation sets, training partition first.
train_eval = (train[training_features], train[target])
test_eval = (test[training_features], test[target])

# Fit the model and print how long the fit took.
t = time()
reg.fit(
    train[training_features],
    train[target],
    eval_set=[train_eval, test_eval],
    verbose=100,
)
print(time() - t)

Performance evaluation using the regressor's `score` method, which returns the coefficient of determination (R²) rather than the RMSE.

In [None]:
# Model output on both partitions (not read by the score computation below).
preds_train = reg.predict(train[training_features])
preds_test = reg.predict(test[training_features])

# NOTE(review): for scikit-learn style regressors `score` is normally the R²
# coefficient, not an RMSE — confirm against the XGBoost documentation.
print("Training score:", reg.score(train[training_features], train[target]))
print("Testing score:", reg.score(test[training_features], test[target]))

Performance visualization.

In [None]:
period = '2018 05'

# Slice of the test partition selected by `period`, and the model output for it.
test_period = test.loc[period]
preds_period = reg.predict(test_period[training_features])

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title('Testing Data/predicted value')

# Prediction first, measured values second, matching the legend order.
ax.plot(test_period.index, preds_period, alpha=0.7, color="blue")
test_period['out'].plot(ax=ax, alpha=0.7, color="red")

ax.legend(['Prediction', 'Testing Set'])
plt.show()

### Fill the missing data using the trained model

As the model is trained to use the N previous points to guess the next one, it can now be used to fill the missing data.

Making sure there are no duplicated dates:

In [None]:
# The dates live in the index, so the check must run on the index:
# DataFrame.duplicated() compares column values only and never looks at it.
# An empty result means every date is unique.
data_multiple[data_multiple.index.duplicated()]

Creating the index with all the dates that should be in the dataset if there were no missing data. In this dataset the missing data is not written as NaN; the date is simply absent. Therefore, a date range that also includes the dates missing from the dataset is created.

In [None]:
# First and last timestamps of the lagged frame.
start_date = data_multiple.index[0]
end_date = data_multiple.index[-1]

# One entry per hour between the two bounds.
dateRange = pd.date_range(start_date, end_date, freq='1h')

Initialization of the new DataFrame that will contain the filled values.

In [None]:
# Frame indexed by the complete hourly range, with no columns yet.
filled_df = pd.DataFrame(index=dateRange)
# Every hour starts at 0; the fill cell later lists the rows still equal to 0,
# so 0 presumably serves as the "not filled yet" placeholder.
filled_df['out'] = 0
# filled_df

Filling the missing data (the code isn't efficient and could be improved)

In [None]:
import re

# For every lagged column, the column its value must be taken from when the
# window slides forward by one step: lag k reads from lag k-1, and lag 1 reads
# from the unsuffixed column. 'date1' is left out: its source, 'date', is the
# index of the lagged frame rather than a column.
mapping_dict = {
    f"{feature}{lag}": (f"{feature}{lag - 1}" if lag > 1 else feature)
    for feature in all_available_features
    for lag in range(N, 0, -1)
    if feature != "date" or lag != 1
}

In [None]:
h = pd.Timedelta("1h")

# Copy every known measurement in one aligned assignment instead of a chained
# `filled_df.loc[dt]['out'] = ...`, which may write to a temporary object.
# Hours absent from data_features keep the 0 placeholder, and the column is
# float so predictions are stored without dtype coercion.
filled_df['out'] = (
    data_features['out'].reindex(filled_df.index, fill_value=0).astype(float)
)

# Hours of the full range that have no measurement and must be predicted.
missing_dates = filled_df.index.difference(data_features.index)

for dt in missing_dates:
    # Lagged row of the previous hour; the slice is empty when that hour is
    # not available either.
    last_row = data_multiple.loc[dt - h:dt - h]
    if last_row.empty:
        # No history to build the model input from: keep the 0 placeholder so
        # the final expression of this cell reports the hour.
        continue

    # Slide the window one step forward: lag k takes the value of lag k-1 and
    # lag 1 takes the value of the unsuffixed column.
    pred_row = last_row.copy()
    for lagged_name, source_name in mapping_dict.items():
        pred_row[lagged_name] = last_row[source_name]

    # The unsuffixed calendar columns must describe dt itself; without this
    # refresh they would still hold the previous hour's values.
    calendar = get_features(pd.DataFrame(index=pd.DatetimeIndex([dt])))
    for column in calendar.columns:
        pred_row[column] = calendar[column].to_numpy()

    filled_df.loc[dt, 'out'] = float(reg.predict(pred_row[training_features])[0])

# Rows still holding the placeholder, i.e. hours that could not be filled.
filled_df.loc[filled_df['out'] == 0]

### Training the model

Making sure the data has a frequency of 1h and partitioning.

In [None]:
from sklearn.model_selection import train_test_split

# Use a fixed train fraction so every run trains on the same history and the
# results are reproducible. A random fraction could also leave fewer training
# points than the forecaster's 200 lags, or too few test points for the
# 150-step comparison. 0.75 is what train_test_split uses when no size is given.
train_feats, test_feats = train_test_split(filled_df, shuffle=False, train_size=0.75)

# Attach an explicit hourly frequency to both partitions.
train_feats = train_feats.asfreq('1h')
test_feats = test_feats.asfreq('1h')

In [None]:
# Hyper-parameters forwarded to the XGBoost regressor.
args = {
    "n_estimators": 800,
    "base_score": 0.5,
    "max_depth": 6,
    "learning_rate": 0.01,
}

# Autoregressive forecaster wrapping the XGBoost regressor, configured with 200 lags.
regressor = xgb.XGBRegressor(tree_method="gpu_hist", **args)
forecaster = ForecasterAutoreg(regressor=regressor, lags=200)

# Fit on the 'out' series of the hourly training partition.
forecaster.fit(y=train_feats['out'])

### Predicting the future and evaluating the performance

In [None]:
# Ask the fitted forecaster for 150 steps of predictions.
preds = forecaster.predict(steps=150)

In [None]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_title('Testing Data/predicted value')

# Test values first, prediction second, matching the legend order.
actual_values = test_feats.loc[preds.index, 'out']
ax.plot(actual_values, alpha=1, color="blue")
ax.plot(preds.index, preds, alpha=0.7, color="red")

ax.legend(['Testing Set', 'Prediction'])
plt.show()