In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.utils

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit

from statsmodels.regression.linear_model import OLS

from ml.models.simpleneuralnetwork import SimpleNeuralNetwork
from ml.utils.preprocessing.preprocessor import PreProcessingData

import warnings
warnings.simplefilter("ignore")

In [20]:
def get_ticker_daily_close(ticker = "AAPL"):
    """ returns the daily close price for a maximum period for specified ticker """
    import yfinance
    
    tick = yfinance.Ticker(ticker)
    return tick.history(period = "max", interval = "1d")["Close"]

### Process Data to Features and Targets Tensors

In [26]:
# load data

# file_name = "msft_stock.csv"
# data = pd.read_csv("data/" + file_name)
# returns = data["price"].apply(np.log).diff()

# or through yahoo finance
prices = get_ticker_daily_close("MSFT")
returns = prices.apply(np.log).diff()
print(f"Total observations: {len(returns)}")

################################################################
# targets are just the returns squared
targets = returns ** 2

# features are the targets with a certain lag
df = pd.DataFrame({})
lags = 20
for i in range(1, lags+1):
    df[f"lag_{i}"] = targets.shift(i)

df_har = pd.DataFrame({})
df_har["rv_d"] = targets.shift(1)
df_har["rv_w"] = targets.rolling(5).apply(np.mean)
df_har["rv_m"] = targets.rolling(20).apply(np.mean)
    
# numpy array with features and targets
features_har = df_har.values
features = df.values
targets = targets.values.reshape(-1,1)

# drop nan values
features = features[lags+1:-max(1, int(.3*lags))]
targets = targets[lags+1:-max(1, int(.3*lags))]

print(f"shape features: {features.shape} of type {type(features)}")
print(f"shape targets: {targets.shape} of type {type(targets)}")

Total observations: 9286
shape features: (9259, 20) of type <class 'numpy.ndarray'>
shape targets: (9259, 1) of type <class 'numpy.ndarray'>


### Perform Kfold Once

In [29]:
model_specifications = ((1,),)
for model_specification in model_specifications:

    score_nn = []
    score_har = []
    
    kfold = TimeSeriesSplit(n_splits = 5, max_train_size = 8000, test_size = 500)
    for train_index, test_index in kfold.split(features):
        
        # split data
        features_train, features_test, targets_train, targets_test = \
            features[train_index], features[test_index], targets[train_index], targets[test_index]
        features_train_har, features_test_har = features[train_index], features[test_index]

        # fit normalizer on train features and normalize data
        scaler = StandardScaler()
        features_train = scaler.fit_transform(features_train)
        features_test = scaler.transform(features_test)

        #=========================================================================================
        #===================================ESTIMATE HAR==========================================
        #=========================================================================================
        mod = OLS(targets_train, features_train_har, hasconst = True)
        mod = mod.fit()
        pred = mod.predict(features_test_har)
        
        # plt.plot(pred, label = "pred")
        # plt.plot(targets_test, label = "target")
        # plt.legend()
        # plt.show()
        
        loss = np.var(targets_test - pred)
        score_har += [loss]

        #=========================================================================================
        #====================================ESTIMATE NN==========================================
        #=========================================================================================
        # all to tensor after transform
        features_train_tensor = torch.tensor(features_train, dtype=torch.float32)
        features_test_tensor = torch.tensor(features_test, dtype=torch.float32)
        targets_train_tensor = torch.tensor(targets_train, dtype=torch.float32)
        targets_test_tensor = torch.tensor(targets_test, dtype=torch.float32)

        # to data loader so torch can handle the data efficiently
        trainloader = torch.utils.data.DataLoader( [(feature, target) for feature, target in zip(features_train_tensor, targets_train_tensor)], batch_size = 20)
        # testloader  = torch.utils.data.DataLoader( [(feature, target) for feature, target in zip(features_test, targets_test)], batch_size = 10)
        
        # estimate the model
        model = SimpleNeuralNetwork(
            lags = lags, 
            nodes = model_specification,
            hidden_dim = 10,
            n_layers = 1,
            output_size= 1)
        model.fit(trainloader, lr = .1, epochs = 4)
        
        # perform out of sample
        output = model(features_test_tensor)
        
        # plt.plot(output.detach().numpy(), label = "pred")
        # plt.plot(targets_test_tensor.detach().numpy(), label = "target")
        # plt.legend()
        # plt.show()
        
        loss = model.criterion(output, targets_test_tensor)
        score_nn += [loss.item()]
        #=========================================================================================
        #=========================================================================================

    avg_score_nn = np.average(score_nn)
    avg_score_har = np.average(score_har)
    print(f"model: {model_specification}\navg score: {avg_score_nn}")
    print(f"model: {model_specification}\navg score: {avg_score_har}")

model: (1,)
avg score: 0.0001800380678560032
model: (1,)
avg score: 1.067804701207753e-06
