# Train model

In [1]:
import datetime
import os
from pathlib import Path
from typing import Optional

import ipdb
import pandas as pd
from joblib import dump, load
from sklearn.base import TransformerMixin
from sklearn.linear_model import LinearRegression

In [2]:
# Input location: DATA_DIR/INPUT_DIRNAME/<sub-directory>/INPUT_FILENAME.
DATA_DIR = "data"
INPUT_DIRNAME = "02_clean"
INPUT_FILENAME = "clean_data.csv"

# Timestamp of this run and the format used to turn it into a folder name.
DATE_FORMAT = "%Y-%m-%d"
EXECUTION_DATE = datetime.datetime.now()

# Artifacts are written to OUTPUT_DIRNAME/<execution date>/<horizon folder>.
OUTPUT_DIRNAME = "artifacts"
DAY_PLUS_1 = "DAY_PLUS_1"
DAY_PLUS_7 = "DAY_PLUS_7"
DAY_PLUS_30 = "DAY_PLUS_30"

# Number of lagged values given to every model as features.
NB_DAYS_TO_TRAIN = 5

In [3]:
# Every artifact of this run lives under OUTPUT_DIRNAME/<execution date>/<horizon>.
run_root = Path(OUTPUT_DIRNAME) / EXECUTION_DATE.strftime(DATE_FORMAT)
OUTPUT_DIR_1 = run_root / DAY_PLUS_1
OUTPUT_DIR_7 = run_root / DAY_PLUS_7
OUTPUT_DIR_30 = run_root / DAY_PLUS_30

# Create the folders up front; exist_ok keeps a same-day re-run from failing.
for horizon_dir in (OUTPUT_DIR_1, OUTPUT_DIR_7, OUTPUT_DIR_30):
    horizon_dir.mkdir(parents=True, exist_ok=True)

In [4]:
def load_all_available_data(path: str, filename: Optional[str] = None) -> pd.DataFrame:
    """Read one CSV from every sub-directory of ``path`` and stack them.

    Parameters
    ----------
    path:
        Folder whose immediate sub-directories each hold one CSV file.
    filename:
        Name of the CSV inside each sub-directory. Defaults to the
        notebook-level ``INPUT_FILENAME`` when omitted.

    Returns
    -------
    pd.DataFrame
        All files concatenated in sorted sub-directory order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If ``path`` contains no eligible sub-directory.
    FileNotFoundError
        If ``path`` does not exist, or an eligible sub-directory lacks the CSV.
    """
    if filename is None:
        # Resolved at call time so the notebook constant stays the default.
        filename = INPUT_FILENAME

    csv_paths = [
        os.path.join(path, name, filename)
        # Sorted: os.listdir order is arbitrary and would make the row order
        # of the result vary between runs/machines.
        for name in sorted(os.listdir(path))
        # Entries containing a dot are skipped as before (hidden folders,
        # files with an extension); the isdir check additionally skips plain
        # files that have no extension.
        if "." not in name and os.path.isdir(os.path.join(path, name))
    ]
    if not csv_paths:
        raise ValueError(f"No data sub-directories found in {path!r}")

    return pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths], ignore_index=True)

In [5]:
# Gather every cleaned CSV found below DATA_DIR/INPUT_DIRNAME into one frame.
input_root = os.path.join(DATA_DIR, INPUT_DIRNAME)
data = load_all_available_data(input_root)

In [6]:
# Index by date, expose the closing price as "value", and order the rows by
# date: the lag features built later rely on row order.
# NOTE: this rebinds `data`, so re-running this cell on its own fails
# ("Date" is no longer a column); re-run the load cell first.
data = (
    data
    .set_index("Date")
    .rename(columns={"Close": "value"})
    .sort_index()
)
data

Unnamed: 0_level_0,value
Date,Unnamed: 1_level_1
2021-02-07,38903.441406
2021-02-08,46196.464844
2021-02-09,46481.105469
2021-02-10,44918.183594
2021-02-11,47909.332031
2021-02-12,47504.851562
2021-02-13,47105.515625
2021-02-14,48717.289062
2021-02-15,47945.058594
2021-02-16,49199.871094


In [7]:
class PastDataSetter(TransformerMixin):
    """Build a lagged-feature regression problem from one column of a frame.

    For each row, the value in ``data_column`` is the target and the values
    found ``delay``, ``delay + 1``, ..., ``delay + nb_days_to_set - 1`` rows
    earlier are the features (rows are days when there is one row per day).
    """

    # Prefix of the generated lag columns; the row shift is appended to it.
    PAST_COLUMN_NAME = "value_minus_"
    # Name given to ``data_column`` once it is used as the target.
    NEW_COL_NAME = "target"

    def __init__(self, data_column="value", nb_days_to_set=30, delay=1):
        """Store the configuration.

        Parameters
        ----------
        data_column : name of the column holding the series.
        nb_days_to_set : how many consecutive lag columns to create.
        delay : shift (in rows) of the most recent lag column.
        """
        self.delay = delay
        self.nb_days_to_set = nb_days_to_set
        self.data_column = data_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``(features, target)`` extracted from ``X``.

        Rows containing a missing value in any column are dropped, which
        removes the leading rows whose lagged values do not exist. Columns of
        ``X`` other than ``data_column`` are kept among the features. ``X``
        itself is not modified.
        """
        series = X[self.data_column]
        lag_columns = {
            f"{self.PAST_COLUMN_NAME}{lag}": series.shift(lag)
            for lag in range(self.delay, self.delay + self.nb_days_to_set)
        }
        # assign() works on a copy and appends the lag columns in order.
        complete_rows = X.assign(**lag_columns).dropna()
        labelled = complete_rows.rename(columns={self.data_column: self.NEW_COL_NAME})
        target = labelled[self.NEW_COL_NAME].values
        features = labelled.drop(columns=[self.NEW_COL_NAME]).values
        return features, target

In [8]:
# One feature builder per horizon (lags starting 1, 7 and 30 rows back);
# each uses NB_DAYS_TO_TRAIN consecutive lagged values.
setter_1, setter_7, setter_30 = (
    PastDataSetter(nb_days_to_set=NB_DAYS_TO_TRAIN, delay=horizon)
    for horizon in (1, 7, 30)
)

# fit_transform returns a (features, target) pair for each horizon.
(X_plus_1, y_plus_1), (X_plus_7, y_plus_7), (X_plus_30, y_plus_30) = (
    setter.fit_transform(data) for setter in (setter_1, setter_7, setter_30)
)

In [9]:
# Fit one independent linear regression per horizon.
lin_reg_plus_1, lin_reg_plus_7, lin_reg_plus_30 = (
    LinearRegression().fit(features, target)
    for features, target in (
        (X_plus_1, y_plus_1),
        (X_plus_7, y_plus_7),
        (X_plus_30, y_plus_30),
    )
)

In [10]:
# Save each horizon's feature builder next to its fitted model; every file is
# named after the class of the object it holds.
# NOTE(review): PastDataSetter is defined in this notebook, so loading its
# .joblib file elsewhere presumably needs the same class definition available
# under the same module name - confirm where these artifacts are consumed.
dump(setter_1, os.path.join(OUTPUT_DIR_1, f"{type(setter_1).__name__}.joblib"))
dump(lin_reg_plus_1, os.path.join(OUTPUT_DIR_1, f"{type(lin_reg_plus_1).__name__}.joblib"))

dump(setter_7, os.path.join(OUTPUT_DIR_7, f"{type(setter_7).__name__}.joblib"))
dump(lin_reg_plus_7, os.path.join(OUTPUT_DIR_7, f"{type(lin_reg_plus_7).__name__}.joblib"))

dump(setter_30, os.path.join(OUTPUT_DIR_30, f"{type(setter_30).__name__}.joblib"))
dump(lin_reg_plus_30, os.path.join(OUTPUT_DIR_30, f"{type(lin_reg_plus_30).__name__}.joblib"))

['artifacts/2021-03-14/DAY_PLUS_30/LinearRegression.joblib']