In [9]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import subprocess
from mlflow.tracking import MlflowClient
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error
from typing import Tuple, Mapping, Sequence
import yfinance as yf

In [10]:
class GasPrices:
    """
    Makes use of the yfinance python module
    to scrape natural gas prices from the TTF market.
    """

    # def __init__(self) -> None:
    #     pass

    def get_data() -> pd.DataFrame:
        symbol = "TTF=F"
        ticker = yf.Ticker(symbol)
        gas_prices = ticker.history(
            interval="1wk",
            start="2005-01-01",
            end=None,
            actions=True,
            auto_adjust=True,
            back_adjust=False,
        )

        gas_prices.index = pd.to_datetime(gas_prices.index)
        gas_prices.index = gas_prices.index.date
        gas_prices = gas_prices[["Close"]]
        gas_prices = gas_prices.rename(columns={"Close": "GAS NATURALE"})

        return gas_prices

In [11]:
class Preprocessing:
    def get_data(path: str) -> pd.DataFrame:
        data = pd.read_csv(path, index_col=0)
        return data

    def preprocessing(data: pd.DataFrame, col: str) -> pd.DataFrame:
        """
        Returns data with index and frequency of index set

        Parameters
        ----------
        data: pd.DataFrame

        col: str
            name of the column that will be kept
        """
        data.index = pd.to_datetime(data.index)
        data = data[col]
        data = data.div(1000)
        data.index.freq = pd.infer_freq(data.index)
        return data

    def train_test_split_series(data: pd.DataFrame, n_test: int) -> pd.DataFrame:
        return data.iloc[:-n_test], data.iloc[-n_test:]

    def train_test_split_df(data: pd.DataFrame, n_test: int) -> pd.DataFrame:
        return data.iloc[:-n_test], data.iloc[-n_test:]

    def series_to_supervised(
        data: pd.Series, n_in: int = 1, dropnan: bool = True
    ) -> np.array:
        """
        Converts a sequence of numbers, i.e. a univariate time series, into a matrix
        with one array (series at time t) plus one more array for each n_in
        (lags at times t-1, t-2, .., t-n_in).

        Parameters
        ----------
        data: pd.Series

        n_in: int
            number of lags to create from the original series.
            For each lag required, one more column will be added,
            at the cost of one row of observations.

        dropnan: bool

        """
        df = pd.DataFrame(data)
        cols = list()
        # input sequence (t-n, ... t-1)
        for i in range(n_in, 0, -1):
            cols.append(df.shift(i))
        cols.append(df)
        # put it all together
        agg = pd.concat(cols, axis=1)
        # drop rows with NaN values (in particular the first and the last rows)
        if dropnan:
            agg.dropna(inplace=True)

        return agg

In [36]:
class XGBForecaster(XGBRegressor):
    """
    XGBoost model class used for univariate or multivariate forecasting.
    """

    def __init__(self) -> None:
        super().__init__()
        #self.xgb = XGBRegressor()

    def fit(
            self, 
            model: XGBRegressor, 
            train_ensamble: pd.DataFrame
            ) -> XGBRegressor:
        """
        Trains an XGBRegressor on a TimeSeries Dataset
        
        Returns
        ---------
        model: XGBRegressor
            a fitted instance of the XGBRegressor
        """
        data = np.asarray(train_ensamble)
        X, y = data[:, :-1], data[:, -1]
        model.fit(X, y)
        return model

    def forecast(self, 
                 model: XGBRegressor, 
                 row_just_before: int, 
                 steps_ahead: int
        ) -> list:
        """
            Rolling prediction with the model_fitted for predicting n=steps_ahead new instances.
            This instances will immediately follow row_just_before, which is the last row of the dataframe available
        """
        row_just_before = np.asarray(row_just_before)[1:]
        current_row = row_just_before.reshape(1, -1)
        forecast = []
        for _ in range(steps_ahead):
            pred = model.predict(current_row)
            forecast.append(pred[0])
            current_row = np.concatenate((current_row[0][1:], pred)).reshape(1, -1)
        return forecast

    def grid_search(
            self, 
            parameters: Mapping, 
            n_folds: int, 
            train_df: pd.DataFrame, 
            test_size: int, 
            n_jobs: int =1, 
            verbose: int =0
        ) -> Tuple[XGBRegressor, list]:
        # model = self
        self.grid = GridSearchCV(
            self, parameters, cv=n_folds, n_jobs=n_jobs, verbose=verbose
        )
        self.grid = self.fit(self, model=self.grid, train_ensamble=train_df)
        predictions = self.forecast(
            self, 
            model=self.grid,
            row_just_before=train_df.iloc[-1, :], 
            steps_ahead=test_size
        )
        return self.grid, predictions

    def prepare_data(
            self,
            data: pd.DataFrame,
            col: str, 
            frac: float
        ) -> None:   
            # ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
            Creates an experiment run for the model to be trained and preprocess the data

        Parameters
        ----------
        data: pd.DataFrame
            data to use for training.
        col: str
            column to be kept in the data
        frac: float
            percentage of data to hold out for testing the model.

        """
        self.data = Preprocessing.preprocessing(data, col)

        self.train, self.test = Preprocessing.train_test_split_df(
            data=self.data, 
            n_test=round(len(data) * frac)
        )

        self.proc_training_data = Preprocessing.series_to_supervised(
            data=self.train, 
            n_in=1, 
            dropnan=True
        )
        self.proc_testing_data = Preprocessing.series_to_supervised(
            data=self.test, 
            n_in=1, 
            dropnan=False
        )
        
    def train_model(
            self,
            experiment_name: str, 
        ) -> None:
        # prepare train and test data
        self.proc_testing_data.fillna(self.proc_training_data.iloc[-1, -1])
        X_train = self.proc_training_data.iloc[:, :-1].values
        X_test = self.proc_testing_data.iloc[:, :-1].values
        y_train = self.proc_training_data.iloc[:, -1].values
        y_test = self.proc_testing_data.iloc[:, -1].values

        frac = round((len(y_test)/len(self.data)), 2)
        
        # n-folds
        effective_df_length = len(self.proc_training_data) - len(self.proc_testing_data)
        max_folds = effective_df_length // len(self.proc_testing_data)
        n_folds = min(max_folds, 10)

        # Create the experiment if it does not exist
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment is None:
            mlflow.create_experiment(experiment_name)
            experiment = mlflow.get_experiment_by_name(experiment_name)
        # enable auto logging
        mlflow.xgboost.autolog()
        # start experiment run
        with mlflow.start_run(experiment_id=experiment.experiment_id):
            mlflow.log_param(key="pct_data_for_training", value=(1 - frac))
            mlflow.log_param(key="pct_data_for_testing", value=(frac))
            # log the script
            # mlflow.log_artifact(__file__)

            # Get current commit hash
            commit_hash = (
                 subprocess.check_output(["git", "rev-parse", "HEAD"])
                 .strip()
                 .decode("utf-8")
            )
            # Log Git commit hash as a parameter
            # mlflow.log_param("commit_hash", commit_hash)

            parameters_xgb = {
                "gamma": [0, 30, 100, 200],
                "eta": [0.3, 0.03, 0.003],
                "max_depth": [6, 12, 30],
            }
            self.xgb_grid, self.predictions_xgb = XGBForecaster.grid_search(
                self,
                parameters=parameters_xgb,
                n_folds=n_folds,
                train_df=self.proc_training_data,
                test_size=len(self.proc_testing_data),
                n_jobs=-1,
                verbose=1,
            )
            mae = mean_absolute_error(y_test, self.predictions_xgb)
            mape = mean_absolute_percentage_error(y_test, self.predictions_xgb)

            # log metrics
            mlflow.log_metrics({"MAE": mae, "MAPE": mape})
    

In [37]:
XGBForecaster()

In [21]:
gas_prices = GasPrices.get_data()

In [30]:
gas_prices.head()

Unnamed: 0,GAS NATURALE
2017-10-23,18.15
2017-10-30,18.309999
2017-11-06,19.82
2017-11-13,18.950001
2017-11-20,20.455


In [38]:
forecaster = XGBForecaster()

experiment_name = "gas_model_nbk"

frac = 0.2

forecaster.prepare_data(
        data=gas_prices,
        col="GAS NATURALE",
        frac=frac
    )

In [39]:
forecaster.proc_training_data.head()

Unnamed: 0,GAS NATURALE,GAS NATURALE.1
2017-10-30,0.01815,0.01831
2017-11-06,0.01831,0.01982
2017-11-13,0.01982,0.01895
2017-11-20,0.01895,0.020455
2017-11-27,0.020455,0.02079


In [40]:
forecaster.train_model(experiment_name=experiment_name)

TypeError: fit() got multiple values for argument 'model'

In [None]:
test_data.tail()

In [None]:
model = forecaster.train_model(
    experiment_name=experiment_name,
    train_data=train_data,
    test_data=test_data
)

In [None]:
model

In [None]:
test_data.tail(1)

In [None]:
test_data.tail(1)

In [None]:
np.asarray(test_data.tail(1))[-1,:]

In [None]:
model.predict(np.asarray(test_data.tail(1))[-1,:])

In [None]:
def forecast(model, last_data_row: np.array, n_steps:int, ) -> list:
    