In [1]:
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import yfinance as yf

# Introduction

Forecasting financial time series is not something that is easy to pick up the first time. Even though we might be acquainted with different machine learning classification and regression problems, it poses its own unique challenges in terms of autocorrelation, format of the data, evaluation of predictions.
In this notebook and articles series we will try to frame a time series problem in format that is closer to common machine learning problems so that we can use frameworks such as gradient boosted machines or DNN to tackle our problem.\

As such I will not discuss here methods that are commonly used in Academiy such as VAR, ARIMA, SARIMA and variations thereof that require much deeper knowledge about the theory of the data generating process and that are in my opinion cumbersome for people that have not encountered them before. \

I am going to use yahoo finance data as an example below!

In [2]:
STOCK_TICKER = "SPY"

data = yf.download(  # or pdr.get_data_yahoo(...
    # tickers list or string as well
    tickers="SPY BABA",
    # use "period" instead of start/end
    # valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
    # (optional, default is '1mo')
    period="60d",
    # fetch data by interval (including intraday if period < 60 days)
    # valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
    # (optional, default is '1d')
    interval="15m",
    # group by ticker (to access via data['SPY'])
    # (optional, default is 'column')
    group_by="ticker",
    # adjust all OHLC automatically
    # (optional, default is False)
    auto_adjust=True,
    # download pre/post regular market hours data
    # (optional, default is False)
    prepost=True,
    # use threads for mass downloading? (True/False/Integer)
    # (optional, default is True)
    threads=True,
    # proxy URL scheme use use when downloading?
    # (optional, default is None)
    proxy=None,
)

[*********************100%***********************]  2 of 2 completed


In [3]:
data[STOCK_TICKER].reset_index().dtypes

Datetime    datetime64[ns, America/New_York]
Open                                 float64
High                                 float64
Low                                  float64
Close                                float64
Volume                               float64
dtype: object

Let's formulate the problem in the following way:
- We want to predict the average stock price of the coming week
- Each sample in the training set is going to be a combination of Open Price

In [4]:
train_data_cutoff = pd.Timestamp("2022-01-10", tz="America/New_York")
test_data_beginning = train_data_cutoff + timedelta(days=7)
test_data_end = test_data_beginning + timedelta(days=7)

In [5]:
# convert to pandas specific timestamps
test_data_end

Timestamp('2022-01-24 00:00:00-0500', tz='America/New_York')

In [6]:
df = data[STOCK_TICKER].copy()#[data["MSFT"].index < train_data_cutoff]

## Label

Remember that we are trying to predict the average price of next week for a given stock.
A week can be defined as the 7 day period after the data horizon time. 

In [7]:
# Let's Make some features and produce label

# For a stock we parhaps can use the following stuff: 
# Moving average of price
# 


In [8]:
hour_vector = data[STOCK_TICKER].index.hour
weekday = data[STOCK_TICKER].index.weekday

In [9]:
df["hour"] = hour_vector.astype("category")
df["weekday"] = hour_vector.astype("category")

In [10]:
df.hour.unique()

[4, 5, 6, 7, 8, ..., 14, 15, 16, 17, 18]
Length: 15
Categories (15, int64): [4, 5, 6, 7, ..., 15, 16, 17, 18]

In [11]:
# There is 11 hours in our range and we have data ever 15 mins.
# In tota that is  11*15 rows per day
rows_per_day = 11*15

In [12]:
len(df)/rows_per_day

21.23030303030303

In [13]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,hour,weekday
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-11-17 04:00:00-05:00,469.47,469.49,469.3,469.3,0.0,4,4
2021-11-17 04:15:00-05:00,469.3,469.33,469.22,469.24,0.0,4,4
2021-11-17 04:30:00-05:00,469.22,469.24,468.96,468.99,0.0,4,4
2021-11-17 04:45:00-05:00,469.0,469.08,468.97,468.99,0.0,4,4
2021-11-17 05:00:00-05:00,468.99,469.07,468.96,468.99,0.0,5,5


In [14]:
def reverse_rolling_mean(x, rows):
    return x.iloc[::-1].rolling(rows, min_periods=rows).mean().iloc[::-1]
# We could have a reverse rolling Standard Deviation

In [15]:
df["target"] = reverse_rolling_mean(df.Close, rows_per_day*3) # mean price of the 3 days

In [16]:
df["previous_Close"] = df.Close.shift(1) # we use this to normalize our features
df["normalized_Open"] = (df.Open / df.previous_Close)-1
df["normalized_High"] = (df.Open / df.previous_Close)-1
df["normalized_Low"] = (df.Open / df.previous_Close)-1
df["normalized_Close"] = (df.Close / df.previous_Close)-1
df["target"] = (df.target / df.Open)-1 # Adjusting target to be average next 3 days percentage change compared to today's open.

In [17]:
# Features
# Rolling MA different periods. 
def rolling_features(df, col, periods):
    ma = df[col].rolling(periods, min_periods=periods).mean()
    mstd = df[col].rolling(periods, min_periods=periods).std()
    return ma, mstd

#interesting_lags = [, rows_per_day, rows_per_day*10, rows_per_day*30, rows_per_day*45, rows_per_day*60]
interesting_lags = np.linspace(5,95,10, dtype=int)
for lag in interesting_lags:
    ma, mstd = rolling_features(df, "normalized_Close", lag)
    df[f"ma_{lag}"] = ma
    df[f"mstd_{lag}"] = mstd

In [18]:
df_complete = df.dropna()

In [19]:
df_complete.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,hour,weekday,target,previous_Close,normalized_Open,...,ma_55,mstd_55,ma_65,mstd_65,ma_75,mstd_75,ma_85,mstd_85,ma_95,mstd_95
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-11-18 13:15:00-05:00,469.475006,469.475006,468.970001,469.179993,1025661.0,13,13,-0.010162,469.472687,5e-06,...,4e-05,0.000816,3e-06,0.000813,1e-05,0.000838,6.08733e-06,0.000803,-2.404501e-06,0.000762
2021-11-18 13:30:00-05:00,469.170013,469.350006,469.049988,469.290009,616266.0,13,13,-0.009586,469.179993,-2.1e-05,...,4.1e-05,0.000816,2.9e-05,0.000793,2.4e-05,0.000832,7.089868e-06,0.000803,1.409556e-06,0.000763
2021-11-18 13:45:00-05:00,469.299988,469.470001,469.190002,469.379913,1552479.0,13,13,-0.00993,469.290009,2.1e-05,...,3.5e-05,0.000814,5.5e-05,0.00077,5.2e-05,0.000802,6.584511e-06,0.000803,9.03432e-06,0.000761
2021-11-18 14:00:00-05:00,469.355011,469.48999,468.970001,469.316101,1043316.0,14,14,-0.010116,469.379913,-5.3e-05,...,4.2e-05,0.000811,3.9e-05,0.000763,3e-05,0.000785,7.218869e-07,0.000803,7.603267e-06,0.000761
2021-11-18 14:15:00-05:00,469.309998,469.350006,468.799988,468.980011,1087987.0,14,14,-0.010089,469.316101,-1.3e-05,...,2.3e-05,0.000816,3e-05,0.000768,2.8e-05,0.000787,-5.196278e-06,0.000806,6.508409e-08,0.000765


In [20]:
# Our label is the mean price of the next 3 days
ml_data = df_complete.drop(columns=["Open", "High", "Low", "Close", "previous_Close"])

In [21]:
training_data = ml_data[ml_data.index < train_data_cutoff]
test_data = ml_data[(ml_data.index > test_data_beginning) & (ml_data.index < test_data_end)]

In [22]:
from catboost import CatBoostRegressor, Pool

In [23]:
train_pool = Pool(training_data.drop(columns="target"),
                 label=training_data.target,
                 cat_features=["hour", "weekday"])
test_pool = Pool(test_data.drop(columns="target"),
                 label=test_data.target,
                 cat_features=["hour", "weekday"])

In [24]:
model = CatBoostRegressor(iterations=40,
                          learning_rate=0.01,
                          eval_metric="MAE",
                          loss_function='RMSE',
)

In [25]:
model.fit(train_pool,
         eval_set=test_pool,)

0:	learn: 0.0128157	test: 0.0266367	best: 0.0266367 (0)	total: 60ms	remaining: 2.34s
1:	learn: 0.0127460	test: 0.0266514	best: 0.0266367 (0)	total: 62.1ms	remaining: 1.18s
2:	learn: 0.0126679	test: 0.0266558	best: 0.0266367 (0)	total: 63.9ms	remaining: 788ms
3:	learn: 0.0125928	test: 0.0266337	best: 0.0266337 (3)	total: 65.7ms	remaining: 591ms
4:	learn: 0.0125109	test: 0.0266428	best: 0.0266337 (3)	total: 67.2ms	remaining: 471ms
5:	learn: 0.0124333	test: 0.0266169	best: 0.0266169 (5)	total: 68.9ms	remaining: 391ms
6:	learn: 0.0123524	test: 0.0265786	best: 0.0265786 (6)	total: 70.4ms	remaining: 332ms
7:	learn: 0.0122782	test: 0.0265271	best: 0.0265271 (7)	total: 72ms	remaining: 288ms
8:	learn: 0.0122076	test: 0.0265083	best: 0.0265083 (8)	total: 73.6ms	remaining: 253ms
9:	learn: 0.0121391	test: 0.0264866	best: 0.0264866 (9)	total: 75.1ms	remaining: 225ms
10:	learn: 0.0120658	test: 0.0264579	best: 0.0264579 (10)	total: 76.7ms	remaining: 202ms
11:	learn: 0.0119946	test: 0.0264007	best: 0.

<catboost.core.CatBoostRegressor at 0x110f30af0>

In [26]:
# Issues. This is just one model, we have no guarantee our architecture will work in the future.
predictions = model.predict(test_data.drop(columns="target"))

In [27]:
eval_df = test_data[["target"]].copy()
eval_df["predictions"] = predictions

In [28]:
eval_df

Unnamed: 0_level_0,target,predictions
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-18 04:00:00-05:00,-0.037921,0.003274
2022-01-18 04:15:00-05:00,-0.036481,0.003490
2022-01-18 04:30:00-05:00,-0.034905,0.003296
2022-01-18 04:45:00-05:00,-0.035035,0.003341
2022-01-18 05:00:00-05:00,-0.036178,0.003409
...,...,...
2022-01-21 17:45:00-05:00,0.009724,0.004409
2022-01-21 18:00:00-05:00,0.009954,0.004320
2022-01-21 18:15:00-05:00,0.010042,0.004412
2022-01-21 18:30:00-05:00,0.010293,0.004431


In [30]:
eval_df.max()

target         0.010477
predictions    0.004475
dtype: float64

In [None]:
# Need to package this into a signal for an algo.
# it could be something when prediction is high buy.