# Train model

In [1]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import TransformerMixin
import pandas as pd
import os
from joblib import dump, load
import datetime
from pathlib import Path

In [2]:
# --- Locations ---------------------------------------------------------------
DATA_DIR = "data"                    # root data folder
INPUT_DIRNAME = "02_clean"           # sub-folder of DATA_DIR holding the dated input folders
INPUT_FILENAME = "clean_data.csv"    # file read from each dated input folder
OUTPUT_DIRNAME = "artifacts"         # root folder for the dumped setters and models

# --- Run parameters ----------------------------------------------------------
EXECUTION_DATE = pd.Timestamp("2021-03-18")  # inclusive upper bound on input folders
DATE_FORMAT = "%Y-%m-%d"                     # how EXECUTION_DATE is written in the output path
NB_DAYS_TO_TRAIN = 7                         # number of lagged values used as features

# --- Output sub-folder names, one per forecast horizon -------------------------
DAY_PLUS_1, DAY_PLUS_7, DAY_PLUS_30 = "DAY_PLUS_1", "DAY_PLUS_7", "DAY_PLUS_30"

In [3]:
# Artifacts are laid out as <OUTPUT_DIRNAME>/<execution date>/<horizon label>.
run_dir = Path(OUTPUT_DIRNAME) / EXECUTION_DATE.strftime(DATE_FORMAT)

OUTPUT_DIR_1 = run_dir / DAY_PLUS_1
OUTPUT_DIR_7 = run_dir / DAY_PLUS_7
OUTPUT_DIR_30 = run_dir / DAY_PLUS_30

# Create each horizon folder (and any missing parent); existing folders are left alone.
for horizon_dir in (OUTPUT_DIR_1, OUTPUT_DIR_7, OUTPUT_DIR_30):
    horizon_dir.mkdir(parents=True, exist_ok=True)

In [4]:
def load_all_available_data_before_date(path: str, execution_date=None, filename=None) -> pd.DataFrame:
    """Concatenate the input CSV of every dated sub-folder up to the execution date.

    ``path`` is scanned for sub-directories whose name parses as a date with
    ``pd.to_datetime``. Each sub-directory dated on or before ``execution_date``
    contributes the rows of its ``filename`` CSV.

    Parameters
    ----------
    path : str
        Directory containing one sub-directory per date.
    execution_date : datetime-like, optional
        Inclusive upper bound on the sub-directory date. When omitted, the
        notebook-level ``EXECUTION_DATE`` is used.
    filename : str, optional
        CSV file read from each selected sub-directory. When omitted, the
        notebook-level ``INPUT_FILENAME`` is used.

    Returns
    -------
    pd.DataFrame
        Rows of all selected files, oldest sub-directory first, re-indexed
        from 0.

    Raises
    ------
    ValueError
        If no dated sub-directory on or before ``execution_date`` is found.
    """
    # Fall back to the notebook configuration at call time so existing
    # single-argument calls behave as before.
    if execution_date is None:
        execution_date = EXECUTION_DATE
    if filename is None:
        filename = INPUT_FILENAME
    execution_date = pd.to_datetime(execution_date)

    dated_folders = []
    for name in os.listdir(path):
        # Names containing a dot (files with an extension, hidden entries) are ignored.
        if "." in name:
            continue
        # Extension-less files cannot contain the CSV.
        if not os.path.isdir(os.path.join(path, name)):
            continue
        # errors="coerce" yields NaT for names that are not dates, so a stray
        # folder is skipped instead of aborting the whole load.
        folder_date = pd.to_datetime(name, errors="coerce")
        if pd.isna(folder_date) or folder_date > execution_date:
            continue
        dated_folders.append((folder_date, name))

    if not dated_folders:
        raise ValueError(
            "No dated folder on or before %s found in %r" % (execution_date.date(), path)
        )

    # os.listdir order is arbitrary; sort by date for a deterministic row order.
    dated_folders.sort()
    dataframes = [pd.read_csv(os.path.join(path, name, filename)) for _, name in dated_folders]
    return pd.concat(dataframes, ignore_index=True)

In [5]:
# Read every dated input folder found under <DATA_DIR>/<INPUT_DIRNAME>.
input_dir = os.path.join(DATA_DIR, INPUT_DIRNAME)
data = load_all_available_data_before_date(input_dir)

In [6]:
# Index the rows by "Date", expose the "Close" column as "value" and sort by index.
# This cell rebinds `data` from itself, so the set_index step is guarded:
# once "Date" is the index, running the cell again is a no-op instead of a
# KeyError. rename() ignores a missing "Close" column and sort_index() is
# idempotent, so they need no guard.
if "Date" in data.columns:
    data = data.set_index("Date")
data = data.rename(columns={"Close": "value"}).sort_index()
data

Unnamed: 0_level_0,value
Date,Unnamed: 1_level_1
2021-01-01,29374.152344
2021-01-02,32127.267578
2021-01-03,32782.023438
2021-01-04,31971.914062
2021-01-05,33992.429688
...,...
2021-03-14,59844.519531
2021-03-15,55907.199219
2021-03-16,56304.406250
2021-03-17,55904.730469


In [7]:
class PastDataSetter(TransformerMixin):
    """Turn a value column into a (features, target) pair of lagged observations.

    For every row, the features are the values of ``data_column`` found
    ``delay``, ``delay + 1``, ..., ``delay + nb_days_to_set - 1`` rows earlier,
    and the target is the row's own value. Lags are row offsets
    (``Series.shift``), so a lag is a number of days only if the input has one
    row per day in ascending date order; verify against the caller.

    Note: unlike a regular scikit-learn transformer, ``transform`` returns a
    ``(X, y)`` tuple rather than a single array.

    Parameters
    ----------
    data_column : str
        Name of the column holding the series to lag.
    nb_days_to_set : int
        Number of lagged columns to generate (must be >= 1).
    delay : int
        Offset of the most recent lag, i.e. the forecast horizon in rows
        (must be >= 1 so the target never appears among its own features).
    """

    # Prefix of the generated lag columns ("value_minus_<lag>").
    PAST_COLUMN_NAME = "value_minus_"
    # Name given to ``data_column`` once it is used as the label.
    NEW_COL_NAME = "target"

    def __init__(self, data_column="value", nb_days_to_set=30, delay=1):
        # Parameters are stored untouched; they are validated in transform().
        self.data_column = data_column
        self.nb_days_to_set = nb_days_to_set
        self.delay = delay

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Build the lagged feature matrix and the matching target vector.

        Parameters
        ----------
        X : pd.DataFrame
            Frame containing ``data_column``. It is not modified.

        Returns
        -------
        tuple of (numpy.ndarray, numpy.ndarray)
            ``X``: one row per retained observation, holding every column of
            the input except ``data_column`` followed by the lag columns.
            ``y``: the corresponding values of ``data_column``.
            Rows containing any missing value (in particular the first rows,
            whose lags do not exist yet) are dropped.

        Raises
        ------
        ValueError
            If ``delay`` or ``nb_days_to_set`` is smaller than 1.
        """
        # A lag of 0 would copy the target into the features, and a negative
        # lag would pull in later rows: both leak the answer to the model.
        if self.delay < 1:
            raise ValueError("delay must be >= 1, got %r" % (self.delay,))
        if self.nb_days_to_set < 1:
            raise ValueError("nb_days_to_set must be >= 1, got %r" % (self.nb_days_to_set,))

        result = X.copy()
        for day in range(self.delay, self.delay + self.nb_days_to_set):
            result[self.PAST_COLUMN_NAME + str(day)] = X[self.data_column].shift(day)
        # shift() leaves NaN where no earlier row exists; those rows cannot be used.
        result = result.dropna()
        result = result.rename(columns={self.data_column: self.NEW_COL_NAME})
        y = result[self.NEW_COL_NAME].values
        X = result.drop([self.NEW_COL_NAME], axis=1).values
        return X, y

In [8]:
# One feature builder and one (X, y) training set per forecast horizon.
# Every horizon uses the same number of lagged values; only the delay differs.

# delay = 1
setter_1 = PastDataSetter(delay=1, nb_days_to_set=NB_DAYS_TO_TRAIN)
X_plus_1, y_plus_1 = setter_1.fit_transform(data)

# delay = 7
setter_7 = PastDataSetter(delay=7, nb_days_to_set=NB_DAYS_TO_TRAIN)
X_plus_7, y_plus_7 = setter_7.fit_transform(data)

# delay = 30
setter_30 = PastDataSetter(delay=30, nb_days_to_set=NB_DAYS_TO_TRAIN)
X_plus_30, y_plus_30 = setter_30.fit_transform(data)

In [9]:
# Train one gradient-boosting model per forecast horizon.
# A fixed random_state is passed so that re-running the notebook on the same
# data yields the same fitted trees, and therefore the same dumped artifacts.
RANDOM_STATE = 42
grad_boost_plus_1 = GradientBoostingRegressor(random_state=RANDOM_STATE).fit(X_plus_1, y_plus_1)
grad_boost_plus_7 = GradientBoostingRegressor(random_state=RANDOM_STATE).fit(X_plus_7, y_plus_7)
grad_boost_plus_30 = GradientBoostingRegressor(random_state=RANDOM_STATE).fit(X_plus_30, y_plus_30)

In [10]:
# Persist each horizon's feature builder and fitted model in that horizon's folder.
# File names are derived from the class of the dumped object.
# NOTE(review): PastDataSetter is defined in this notebook, so whatever loads
# these setter files presumably needs the same class definition available;
# confirm against the consumer.

# delay = 1
dump(setter_1, os.path.join(OUTPUT_DIR_1, f"{type(setter_1).__name__}_gradient_boosting.joblib"))
dump(grad_boost_plus_1, os.path.join(OUTPUT_DIR_1, f"{type(grad_boost_plus_1).__name__}.joblib"))

# delay = 7
dump(setter_7, os.path.join(OUTPUT_DIR_7, f"{type(setter_7).__name__}_gradient_boosting.joblib"))
dump(grad_boost_plus_7, os.path.join(OUTPUT_DIR_7, f"{type(grad_boost_plus_7).__name__}.joblib"))

# delay = 30 (kept last: its return value is the cell's displayed output)
dump(setter_30, os.path.join(OUTPUT_DIR_30, f"{type(setter_30).__name__}_gradient_boosting.joblib"))
dump(grad_boost_plus_30, os.path.join(OUTPUT_DIR_30, f"{type(grad_boost_plus_30).__name__}.joblib"))

['artifacts/2021-03-18/DAY_PLUS_30/GradientBoostingRegressor.joblib']