In [82]:
import numpy as np
import pandas as pd
from preprocessing.wrangling import get_indi_df, get_labels, slide_and_flatten
from preprocessing.extract_features import get_all_ta_features, get_wavelet_coeffs
from evaluation.eval import sliding_window_cv_regression, batch_test_swcv_regression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from numpy.lib.stride_tricks import sliding_window_view
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline
import datetime
import os
import csv

In [2]:
def add_closing_price(y, cls_price):
    """Return ``y`` shifted by ``cls_price``.

    The two arguments are combined with the plain ``+`` operator, so whatever
    broadcasting or index-alignment rules their types define are applied.

    Args:
        y: Value(s) to shift.
        cls_price: Offset(s) added to ``y``.

    Returns:
        The result of ``y + cls_price``.
    """
    shifted = y + cls_price
    return shifted

In [3]:
class PersistanceModel:
    """Naive baseline whose prediction is a single column of the input frame.

    The class exposes ``fit``/``predict`` so it can be used wherever an
    estimator-like object is expected. Nothing is learned: ``predict`` simply
    returns the column named ``persist_colname`` from the frame it is given.
    """

    def __init__(self, persist_colname='Close'):
        """Remember which column of the input frame is echoed by ``predict``."""
        self.persist_colname = persist_colname

    def __repr__(self):
        return "PersistanceModel(persist_colname={})".format(self.persist_colname)

    def fit(self, Xtr, ytr):
        """No-op: there is nothing to train.

        Both arguments are ignored.

        Returns:
            ``self``, so calls can be chained as ``model.fit(X, y).predict(X)``.
        """
        return self

    def predict(self, Xts):
        """Return the ``persist_colname`` column of ``Xts``.

        Args:
            Xts: Object supporting ``.loc[:, column]`` selection (the code
                relies on nothing else).

        Returns:
            ``Xts.loc[:, self.persist_colname]``.
        """
        return Xts.loc[:, self.persist_colname]


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

In [79]:
def sliding_window_cv_torch(swdf, y, model, optimizer, loss_fn, n_tr, n_ts=1, scorers=None, comment="", post_processor=None, epochs=10):
    """Walk-forward (sliding-window) evaluation of a torch regressor.

    For every window start ``s`` the model is trained for ``epochs`` full-batch
    steps on samples ``[s, s + n_tr)`` and then predicts samples
    ``[s + n_tr, s + n_tr + n_ts)``. The window advances one sample at a time.
    The model and optimizer are not reset between windows, so the weights
    carry over from one window to the next.

    Args:
        swdf: Array of samples indexed along its first axis; slices of it are
            fed to ``model`` as float tensors.
        y: Targets, one per sample (pandas Series, numpy array or list).
        model: ``torch.nn.Module`` to train and evaluate (modified in place).
        optimizer: Optimizer already bound to ``model``'s parameters.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor that supports ``backward()``.
        n_tr: Number of samples in each training window.
        n_ts: Number of samples in each test window.
        scorers: Iterable of callables ``scorer(y_true, y_pred)``; each result
            is stored under ``scorer.__name__``. ``None`` means no scorers.
        comment: Free text copied into the result.
        post_processor: Optional ``(function, kwargs)`` pair. The function is
            called as ``function(values, **kwargs)`` on the collected
            predictions and on the collected targets before scoring.
        epochs: Number of full-batch gradient steps per training window.

    Returns:
        dict with keys ``'time'``, ``'model'``, ``'comment'`` followed by one
        entry per scorer. If the data is too short for a single window the
        scorers receive empty arrays.

    Raises:
        AssertionError: If ``swdf`` and ``y`` differ in length.
    """
    assert len(swdf) == len(y), "Length of X ({}) must match that of y ({}).".format(len(swdf), len(y))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device")

    scorers = [] if scorers is None else scorers
    if post_processor is not None:
        post_processor_f, post_processor_args = post_processor[0], post_processor[1]

    def to_device(values):
        # Same conversion for features and targets: float32 on the chosen device.
        return torch.Tensor(values).float().to(device)

    # Positional view of the targets, independent of any pandas index.
    y_values = np.asarray(y)
    model.to(device)

    y_pred = []
    y_target = []
    agg_results = {}

    # The last test window must end exactly at len(swdf), hence the last
    # training start is len(swdf) - (n_tr + n_ts) *inclusive* (thus the +1).
    n_folds = len(swdf) - (n_tr + n_ts) + 1
    for i_tr_start in range(max(n_folds, 0)):
        i_tr_end = i_ts_start = i_tr_start + n_tr
        i_ts_end = i_ts_start + n_ts

        Xtr = to_device(swdf[i_tr_start:i_tr_end])
        Xts = to_device(swdf[i_ts_start:i_ts_end])
        ytr = to_device(y_values[i_tr_start:i_tr_end])
        yts = to_device(y_values[i_ts_start:i_ts_end])

        # eval() is switched on after every window, so re-enable train mode.
        model.train()
        for _ in range(epochs):
            loss = loss_fn(model(Xtr), ytr)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            pred = model(Xts)

        # Flatten to plain Python floats on the CPU; this covers 0-d outputs,
        # single-sample windows and multi-sample windows alike.
        y_pred.extend(pred.detach().cpu().reshape(-1).tolist())
        y_target.extend(yts.detach().cpu().reshape(-1).tolist())

    # Arrays (rather than lists) so that arithmetic post-processors work on
    # both predictions and targets.
    y_pred = np.asarray(y_pred, dtype=float)
    y_target = np.asarray(y_target, dtype=float)

    if post_processor is not None:
        y_pred = post_processor_f(y_pred, **post_processor_args)
        y_target = post_processor_f(y_target, **post_processor_args)

    agg_results['time'] = datetime.datetime.now()
    agg_results['model'] = str(model)
    agg_results['comment'] = comment
    for scorer in scorers:
        agg_results[scorer.__name__] = scorer(y_target, y_pred)

    return agg_results





In [80]:
class LSTMBaseline(nn.Module):
    """Single-layer LSTM followed by a linear layer producing one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.dense = nn.Linear(hidden_size, 1)

    def forward(self, swdf):
        """Map a batch of sequences to one prediction each.

        Args:
            swdf: Tensor passed straight to the LSTM (built with
                ``batch_first=True``).

        Returns:
            The linear layer's output with its trailing size-1 feature axis
            removed, i.e. one value per sequence for batched input.
        """
        _, (hn, _cn) = self.lstm(swdf)
        # Pick the (only) layer's final hidden state by index instead of an
        # argument-less squeeze(), which would also drop the batch axis for a
        # batch of one and the feature axis when hidden_size == 1.
        last_hidden = hn[-1]
        return self.dense(last_hidden).squeeze(-1)
    

In [84]:
# --- Configuration -----------------------------------------------------------
list_dir = 'data_collection/stocks_list'
list_prefix = "ind_nifty"
list_suffix = "list.csv"
save_dir = 'data_collection/ohlcv_data'
save_prefix = "ohlcv_"
save_suffix = ".csv"
resultfile = "results/baseline_lstm.csv"
cap_n_stocks = 10   # maximum number of symbols to evaluate in this run
window_len = 10     # number of consecutive rows fed to the LSTM per sample
results = []

# --- Evaluate the LSTM baseline on up to `cap_n_stocks` symbols ---------------
n_processed = 0
# Sorted so the set of evaluated symbols does not depend on directory order.
for list_name in sorted(os.listdir(list_dir)):
    if n_processed >= cap_n_stocks:
        break
    if not (list_name.startswith(list_prefix) and list_name.endswith(list_suffix)):
        continue

    # The part of the file name between prefix and suffix names the OHLCV file.
    index_tag = list_name[len(list_prefix):-len(list_suffix)]
    savefile = os.path.join(save_dir, save_prefix + index_tag + save_suffix)
    listfile = os.path.join(list_dir, list_name)
    p = pd.read_csv(listfile)
    symbols = (p['Symbol'].astype(str) + '.NS').tolist()

    for symbol in symbols:
        # Check the cap *before* working on a symbol so exactly
        # `cap_n_stocks` symbols are evaluated.
        if n_processed >= cap_n_stocks:
            break
        n_processed += 1

        df = get_indi_df(symbol, ohlcvfile=savefile, start_date="2017-01-01")
        df = df.drop(columns=['Date', 'Adj Close'])
        move_dir_target, cls_target = get_labels(df['Close'])
        df = df.iloc[:-1]

        # Windows spanning every column come back as
        # (n_windows, 1, window_len, n_cols); drop only that singleton axis.
        swdf10 = sliding_window_view(df.to_numpy(), (window_len, df.shape[1])).squeeze(axis=1)
        cls_target10 = cls_target.iloc[(window_len - 1):-1]

        model = LSTMBaseline(swdf10.shape[2], 64)
        optimizer = torch.optim.RMSprop(model.parameters(), lr=0.1)
        loss_fn = nn.MSELoss()
        result = sliding_window_cv_torch(
            swdf10, cls_target10, model, optimizer, loss_fn, n_tr=60, n_ts=1,
            scorers=[mean_squared_error, mean_absolute_percentage_error, r2_score],
            comment="", post_processor=None,
        )
        results.append(result)

        if resultfile is not None:
            result_dir = os.path.dirname(resultfile)
            if result_dir:
                os.makedirs(result_dir, exist_ok=True)
            # A header is needed only when the file is new or still empty.
            needs_header = not os.path.isfile(resultfile) or os.path.getsize(resultfile) == 0

            with open(resultfile, 'a', newline='') as result_fh:
                writer = csv.DictWriter(result_fh, fieldnames=result.keys(), delimiter=',', lineterminator='\n')

                if needs_header:
                    writer.writeheader()

                # Append only the newest row; earlier rows are already on disk.
                writer.writerow(result)

Using cpu device
Using cpu device
Using cpu device
Using cpu device
Using cpu device
Using cpu device
Using cpu device


KeyboardInterrupt: 