-------------------UNIVARIATE TIME SERIES FORECASTING------------------

XGBOOST REGRESSION TEMPLATE

For use on datasets with a named 'Date' column and target (Y) Value column as shown below:

Date ¦ Value

In [57]:
''' IMPORTS
'''
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import xgboost as xgb
import datetime as dt
import numpy as np
from numpy import asarray
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error
from dateutil.relativedelta import *
from statsmodels.tsa.arima.model import ARIMA

In [58]:
''' LOAD DATA
Load Data with X 'Date' Column and Y target value column. Y column must be numeric value!
Change file path specific to the location of your time series data. file path for data and loop must match exactly!
-
ARIMAdata is used so user only has to input csv file Once!
'''
                        #Change Input Data
data = pd.read_csv('data_for_inequalities.csv')
                        #Change Input Data
loopData = pd.read_csv('data_for_inequalities.csv')


In [59]:
''' USER DEFINED
 
Length_of_split: Use 12 for month cycles or 30 for daily cycles or 24 for hourly cycles

target_col: specify target Y column name - target_col = 'Name of your target column'

number_of_steps: Specify Number of steps to predict 

length_of_train: Specify number of values to train 
 
Use # to comment out what you dont need and remove # for what you need
'''
length_of_split = 12
#   length_of_split = 30
#   length_of_split = 24
target_col = 'SG_Y'
number_of_steps = 36
length_of_train = 24

#Turns target value Y column into type integer to type float for use in XGB Loop
loopData[target_col] = loopData[target_col].astype(float)


In [60]:
''' PRE PROCESSING 
Sorts data in sliding window technique shifting values one-step
to create new columns of X variables for target 'target_col' Y variable.
'''
for i in range(1, length_of_split+1):
    col_name = 'n-' + str(i)
    data[col_name] = data[target_col].shift(i)

data.set_index('Date', inplace = True)

In [61]:
''' DROPS NAN VALUES
'''
data = data.dropna()

In [62]:
''' TRAIN/TEST SPLIT
Splits processed data into train/test.
'''
X_train = data.head(length_of_train).drop(target_col, axis = 1)
y_train = data.head(length_of_train)[target_col]

X_test = data.tail(length_of_split).drop(target_col, axis = 1)
y_test = data.tail(length_of_split)[target_col]


In [63]:
''' XGBOOST
Creates model then fits model on Training data.
'''
model_fit = xgb.XGBRegressor(objective='reg:squarederror', n_estimators = 1000)
model_fit.fit(X_train, y_train)
model_fit.predict(X_test)

array([104.091156, 103.92977 , 104.415306, 104.32933 , 103.92977 ,
       104.01575 , 104.01575 , 104.01575 , 104.415306, 104.32769 ,
       104.32769 , 104.32769 ], dtype=float32)

In [64]:
''' METRICS
Gives MAE, MSE & RMSE scores for XGBoost
'''
ypred = model_fit.predict(X_test)
mse = mean_squared_error(y_test, ypred)
mae = mean_absolute_error(y_test, ypred)
print("XGBoost MAE: %.2f" % mae)
print("XGBoost MSE: %.2f" % mse)
print("XGBoost RMSE: %.2f" % (mse**(1/2.0)))

XGBoost MAE: 2.69
XGBoost MSE: 9.78
XGBoost RMSE: 3.13


In [65]:
''' FUTURE PREDICTIONS
Loop used for predicting next 'n' values - 'n' = number_of_steps (as defined by user earlier)
Change file path specific to your time series data
'''
                    #Change Input Data 
df = loopData
print("---XGBoost MAE: %.2f" % mae)
print('---XGBoost Predictions:')
for i in range(1, number_of_steps):
    forecast = pd.DataFrame(columns=df.columns)
    forecast['Date'] = pd.date_range(start = pd.to_datetime(max(df['Date'])) + relativedelta(months = 1), periods = 1, freq='M')
    forecast['Date'] = pd.to_datetime(forecast['Date'])
    forecast['Date'] = forecast['Date'].dt.strftime('%Y/%m')
   
    #Adds empty row to dataframe
    data = df.append(forecast)

    #Creates shifted columns n-1, n-2 ... n-12 for previous values
    for j in range(1, length_of_split + 1):
        coll_name = 'n-' + str(j)
        data[coll_name] = data[target_col].shift(j)

    data.set_index('Date', inplace = True)

    #Uses last row of n values as input for predictions
    X_forecast = data.tail(1).drop(target_col, axis = 1)

    #Removes last row as not for future so shouldn't be used in Training
    data = data.dropna()
    X_test_forecast = data.drop(target_col, axis = 1)
    y_test_forecast = data[target_col]

    #Fits XGBoost model
    import xgboost as xgb
    model_fit = xgb.XGBRegressor(objective = 'reg:squarederror', n_estimators = 1000)
    model_fit.fit(X_test_forecast, y_test_forecast)
    predictions = model_fit.predict(X_forecast)

    #Fill the next NaN values
    forecast = forecast.fillna(predictions[0])
    df = df.append(forecast).reset_index(drop = True)

#Rounds predictions to the nearest whole number    
df[target_col] = df[target_col].round()
print(df.tail(number_of_steps))

#Outputs future dates column and prediction column to csv file named: 'xgbPredictions.csv'
df.tail(number_of_steps).to_csv('xgbPredictions.csv', index=False)

---XGBoost MAE: 2.69
---XGBoost Predictions:
       Date   SG_Y
47  2022/11  103.0
48  2022/12  107.0
49  2023/01  107.0
50  2023/02  106.0
51  2023/03  104.0
52  2023/04  100.0
53  2023/05  100.0
54  2023/06  100.0
55  2023/07  102.0
56  2023/08  105.0
57  2023/09  107.0
58  2023/10  106.0
59  2023/11  104.0
60  2023/12  104.0
61  2024/01  101.0
62  2024/02   99.0
63  2024/03  102.0
64  2024/04  105.0
65  2024/05  107.0
66  2024/06  107.0
67  2024/07  106.0
68  2024/08  104.0
69  2024/09  101.0
70  2024/10  100.0
71  2024/11  101.0
72  2024/12  103.0
73  2025/01  107.0
74  2025/02  107.0
75  2025/03  106.0
76  2025/04  105.0
77  2025/05  101.0
78  2025/06  100.0
79  2025/07  100.0
80  2025/08  102.0
81  2025/09  105.0
82  2025/10  106.0
