# Prediction for N+1, N+7, N+30

In [None]:
# Import libraries
import datetime
import os
import pandas as pd
from pathlib import Path
from joblib import load
from sklearn.base import TransformerMixin

In [None]:
# Constants

# Run parameters
EXECUTION_DATE = "2021-02-25"  # reference day; every horizon is an offset from it
NB_DAYS = 7  # number of daily input files loaded / length of the look-back window
DATE_FORMAT = "%Y-%m-%d"  # used to format the dated output directory names

# Prediction horizons (used as sub-directory names for both models and outputs)
DAY_PLUS_1 = "DAY_PLUS_1"
DAY_PLUS_7 = "DAY_PLUS_7"
DAY_PLUS_30 = "DAY_PLUS_30"

# Model artifacts: <MODEL_PATH>/<horizon>/<MODEL_NAME><MODEL_EXTENSION>
MODEL_PATH = "artifacts/2021-03-18/"
MODEL_NAME = "LinearRegression"
MODEL_EXTENSION = ".joblib"

# Data layout: inputs are read from <DATA_DIR>/<INPUT_DIRNAME>/<date>/<INPUT_FILENAME>,
# predictions are written under <DATA_DIR>/<OUTPUT_DIRNAME>/
DATA_DIR = "data"
INPUT_DIRNAME = "02_clean"
INPUT_FILENAME = "clean_data.csv"
OUTPUT_DIRNAME = "03_predictions"
OUTPUT_FILENAME = "predictions_linear_regression.csv"

In [None]:
# System constants
# Target dates: the execution date shifted by 1, 7 and 30 calendar days.
D_PLUS_1, D_PLUS_7, D_PLUS_30 = (
    datetime.datetime.strptime(EXECUTION_DATE, "%Y-%m-%d") + datetime.timedelta(days=offset)
    for offset in (1, 7, 30)
)

# One output directory per horizon, laid out as
# <DATA_DIR>/<OUTPUT_DIRNAME>/<target date>/<horizon>; each is created on the
# spot (including missing parents) and an already existing directory is fine.
OUTPUT_DIR_1 = Path(DATA_DIR, OUTPUT_DIRNAME, D_PLUS_1.strftime(DATE_FORMAT), DAY_PLUS_1)
OUTPUT_DIR_1.mkdir(parents=True, exist_ok=True)

OUTPUT_DIR_7 = Path(DATA_DIR, OUTPUT_DIRNAME, D_PLUS_7.strftime(DATE_FORMAT), DAY_PLUS_7)
OUTPUT_DIR_7.mkdir(parents=True, exist_ok=True)

OUTPUT_DIR_30 = Path(DATA_DIR, OUTPUT_DIRNAME, D_PLUS_30.strftime(DATE_FORMAT), DAY_PLUS_30)
OUTPUT_DIR_30.mkdir(parents=True, exist_ok=True)

In [None]:
def load_training_data(path: str, date: str, nb_days: int) -> pd.DataFrame:
    """Load and stack the daily CSV files for the ``nb_days`` days ending on ``date``.

    For ``date`` and each of the ``nb_days - 1`` preceding calendar days, the
    file ``<path>/<YYYY-MM-DD>/<INPUT_FILENAME>`` is read. Frames are
    concatenated most recent day first and the result gets a fresh integer
    index.

    Args:
        path: Directory containing one sub-directory per day.
        date: Last day to load, formatted as ``YYYY-MM-DD``.
        nb_days: Number of consecutive days to load, counting backwards from ``date``.

    Returns:
        A single DataFrame holding the rows of every daily file.

    Raises:
        ValueError: If ``date`` does not match ``YYYY-MM-DD``.

    Any error raised by ``pd.read_csv`` (e.g. a missing daily file) or by
    ``pd.concat`` (e.g. nothing to concatenate) propagates unchanged.
    """
    last_day = datetime.datetime.strptime(date, "%Y-%m-%d")
    # Walk backwards from the last day: offset 0 is `date` itself.
    day_dirs = (
        (last_day - datetime.timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(nb_days)
    )
    frames = [
        pd.read_csv(os.path.join(path, day_dir, INPUT_FILENAME))
        for day_dir in day_dirs
    ]
    return pd.concat(frames, ignore_index=True)

In [None]:
# Load data
# Read the NB_DAYS daily files ending on EXECUTION_DATE, rename the "Close"
# column to "value", then index the rows by "Date" and sort on that index.
data_df = (
    load_training_data(os.path.join(DATA_DIR, INPUT_DIRNAME), EXECUTION_DATE, NB_DAYS)
    .rename(columns={"Close": "value"})
    .set_index("Date")
    .sort_index()
)

In [None]:
# TODO: move this to 02_transform_data.ipynb
def get_data_for_prediction(df: pd.DataFrame, date: str, nb_days: int) -> pd.DataFrame:
    """Return the rows of ``df`` inside the ``nb_days``-day window ending on ``date``.

    The window covers ``date`` and the ``nb_days - 1`` calendar days before it.
    Both bounds are inclusive and are applied as ``YYYY-MM-DD`` string labels
    through ``df.loc[start:end]``.

    Args:
        df: Frame sliced by label on its index.
            NOTE(review): assumes the index holds sorted ``YYYY-MM-DD`` strings — confirm.
        date: Last day of the window, formatted as ``YYYY-MM-DD``.
        nb_days: Window length in calendar days.

    Returns:
        The label slice of ``df`` between the first and the last day of the window.

    Raises:
        ValueError: If ``date`` does not match ``YYYY-MM-DD``.
    """
    fmt = "%Y-%m-%d"
    last_day = datetime.datetime.strptime(date, fmt)
    first_day = last_day - datetime.timedelta(days=nb_days - 1)
    window = slice(first_day.strftime(fmt), last_day.strftime(fmt))
    return df.loc[window]

In [None]:
# Keep only the rows inside the NB_DAYS-day window ending on EXECUTION_DATE.
df = get_data_for_prediction(df=data_df, date=EXECUTION_DATE, nb_days=NB_DAYS)

In [None]:
# Build the single-row model input: rows ordered by descending index, then
# flattened to shape (1, NB_DAYS). The reshape fails unless the window holds
# exactly NB_DAYS values in total.
x = (
    df.sort_index(ascending=False)
    .values
    .reshape(1, NB_DAYS)
)

In [None]:
# Load models artifacts
# One serialized model per horizon, stored as
# <MODEL_PATH>/<horizon>/<MODEL_NAME><MODEL_EXTENSION>.
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
model_plus_1 = load(
    os.path.join(MODEL_PATH, DAY_PLUS_1, f"{MODEL_NAME}{MODEL_EXTENSION}")
)

model_plus_7 = load(
    os.path.join(MODEL_PATH, DAY_PLUS_7, f"{MODEL_NAME}{MODEL_EXTENSION}")
)

model_plus_30 = load(
    os.path.join(MODEL_PATH, DAY_PLUS_30, f"{MODEL_NAME}{MODEL_EXTENSION}")
)

In [None]:
# Predict for D+1, D+7, D+30
# Each horizon's model is applied to the same input `x`; its output is stored
# in a frame with a single "value" column, labelled with the target date.
predictions_1 = pd.DataFrame(
    data=model_plus_1.predict(x),
    index=[D_PLUS_1.strftime("%Y-%m-%d")],
    columns=["value"],
)
predictions_7 = pd.DataFrame(
    data=model_plus_7.predict(x),
    index=[D_PLUS_7.strftime("%Y-%m-%d")],
    columns=["value"],
)
predictions_30 = pd.DataFrame(
    data=model_plus_30.predict(x),
    index=[D_PLUS_30.strftime("%Y-%m-%d")],
    columns=["value"],
)

In [None]:
# Save predictions
# Each horizon's frame is written as OUTPUT_FILENAME inside its own output directory.
predictions_1.to_csv(OUTPUT_DIR_1 / OUTPUT_FILENAME)
predictions_7.to_csv(OUTPUT_DIR_7 / OUTPUT_FILENAME)
predictions_30.to_csv(OUTPUT_DIR_30 / OUTPUT_FILENAME)