# Retrieve Stock Data and Save to DF

In [22]:
from pandas_datareader import data as web
import os
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns

def get_stock(ticker, start_date, end_date, s_window, l_window):
    try:
        #yf.pdr_override()
        df = yf.download(ticker, start=start_date, end=end_date,auto_adjust=False)
        #print("DF: ",df)
# can use this as well        df = web.get_data_yahoo(ticker, start=start_date, end=end_date)
        df['Return'] = df['Adj Close'].pct_change()
        df['Return'].fillna(0, inplace = True)
        df['Date'] = df.index
        df['Date'] = pd.to_datetime(df['Date'])
        df['Month'] = df['Date'].dt.month
        df['Year'] = df['Date'].dt.year 
        df['Day'] = df['Date'].dt.day
        for col in ['Open', 'High', 'Low', 'Close', 'Adj Close']:
            df[col] = df[col].round(2)
        df['Weekday'] = df['Date'].dt.day_name()
        df['Week_Number'] = df['Date'].dt.strftime('%U')
        df['Year_Week'] = df['Date'].dt.strftime('%Y-%U')
        df['Short_MA'] = df['Adj Close'].rolling(window=s_window, min_periods=1).mean()
        df['Long_MA'] = df['Adj Close'].rolling(window=l_window, min_periods=1).mean()        
        col_list = ['Date', 'Year', 'Month', 'Day', 'Weekday', 
                    'Week_Number', 'Year_Week', 'Open', 
                    'High', 'Low', 'Close', 'Volume', 'Adj Close',
                    'Return', 'Short_MA', 'Long_MA']
        num_lines = len(df)
        df = df[col_list]
        print('read ', num_lines, ' lines of data for ticker: ' , ticker)
        return df
    except Exception as error:
        print(error)
        return None

In [None]:
try:
    ticker='SPY'
    input_dir = os.getcwd()
    output_file = os.path.join(input_dir, ticker + '.csv')
    df = get_stock(ticker, start_date='2000-01-01', end_date='2025-07-08', 
               s_window=14, l_window=50)
    df.to_csv(output_file, index=False)
    print('wrote ' + str(len(df)) + ' lines to file: ' + output_file)
except Exception as e:
    print(e)
    print('failed to get Yahoo stock data for ticker: ', ticker)

In [None]:
#psuedocode
'''
1. create ATR column
2. create SD column
3. build LSTM columnn
4. split into train and test
5. train lstm on training data for ATR and test
6. train lstm on training data for SD and test
7. plot results and avg. returns based on decisions (buy if delta_volatility<-alpha, sell if delta_volatility>alpha)
'''

In [None]:
 #.xs returns a cross section; selects only values within the SPY indexes at level=1
df_n= df.xs('SPY',axis=1,level=1)
#cleaned up data

In [None]:
# import math
# #challenge of how to deal with missing values in time series data
# #choose sliding window of length=N (could be 20?) N_lstmtraining>N_atrslidingwindow
# def atr_func(d):
#     running = 0
#     for i in range(len(d)):
#         high = d.loc[i,"High"]
#         low = d.loc[i,"Low"]
#         if i==0:
#             running += high-low
#             continue
#         if i>0:
#             y_close = d.loc[i,"Close"]
        
#         TR = max(high-low,max(math.abs(high-y_close),math.abs(low-y_close)))
#         running+=TR
#     return running/len(d)

# df_n['ATR'] = df_n.rolling(window=14).apply(atr_func)
# df_n

In [None]:
#define ATR columns
high = df_n["High"]
low = df_n["Low"]
close = df_n["Close"]

prev_close = close.shift(1)


tr = pd.concat([
    high - low,
    (high - prev_close).abs(),
    (low - prev_close).abs()
], axis=1).max(axis=1)

df_n["ATR"] = tr.rolling(7).mean()

In [26]:
import numpy as np

log_diff = np.log(df_n["Close"]/df_n["Close"].shift(1))
df_n["SD_Log_Close"] = log_diff.rolling(7).std()
df_n["ATR_normalized"] = (df_n["ATR"] - df_n["ATR"].mean())/df_n["ATR"].std()
df_n["SD_normalized"] = (df_n["SD_Log_Close"] - df_n["SD_Log_Close"].mean())/df_n["SD_Log_Close"].std()
#xlb, xle, 
df_n

Price,Date,Open,High,Low,Close,Volume,Adj Close,ATR,SD_Log_Close,ATR_normalized,SD_normalized,Delta_Days
0,2000-01-12,144.59,144.59,142.88,143.06,6907700,90.64,4.024286,,0.422937,,0
1,2000-01-13,144.47,145.75,143.28,145.00,5158300,91.87,3.580000,,0.247474,,1
2,2000-01-14,146.53,147.47,145.97,146.97,7437300,93.11,3.321429,,0.145356,,2
3,2000-01-18,145.34,146.62,145.19,145.81,6488500,92.38,3.040000,,0.034211,,6
4,2000-01-19,145.31,147.00,145.00,147.00,6157900,93.13,2.182857,,-0.304303,,7
...,...,...,...,...,...,...,...,...,...,...,...,...
6403,2025-06-30,617.38,619.22,615.04,617.85,92502500,617.85,5.868571,0.005666,1.151305,-0.588181,9301
6404,2025-07-01,616.36,618.83,615.52,617.65,70030100,617.65,5.398571,0.004345,0.965687,-0.765501,9302
6405,2025-07-02,617.24,620.49,616.61,620.45,66510400,620.45,4.717143,0.003906,0.696569,-0.824331,9303
6406,2025-07-03,622.45,626.28,622.43,625.34,51065800,625.34,4.450000,0.003185,0.591065,-0.921087,9304


In [None]:
#drop na values (first 6 rows)
df_n = df_n.dropna()
df_n = df_n.reset_index()


ValueError: cannot insert level_0, already exists

In [None]:
df_n["index"] = df_n["index"]%7

In [31]:
df_n

Price,level_0,index,Date,Open,High,Low,Close,Volume,Adj Close,ATR,SD_Log_Close,ATR_normalized,SD_normalized,Delta_Days
0,0,0,2000-01-24,145.66,145.84,139.41,140.34,7896900,88.92,2.857143,0.015872,-0.038005,0.781583,12
1,1,1,2000-01-25,140.52,141.94,139.00,141.94,9942500,89.93,2.892857,0.015525,-0.023901,0.735084,13
2,2,2,2000-01-26,141.00,141.55,140.09,140.81,5158100,89.21,2.804286,0.013730,-0.058880,0.494139,14
3,3,3,2000-01-27,141.84,142.22,138.12,140.25,10922700,88.86,3.135714,0.013724,0.072011,0.493374,15
4,4,4,2000-01-28,139.44,140.06,135.53,135.88,11916200,86.08,3.524286,0.015265,0.225471,0.700118,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6396,6396,5,2025-06-30,617.38,619.22,615.04,617.85,92502500,617.85,5.868571,0.005666,1.151305,-0.588181,9301
6397,6397,6,2025-07-01,616.36,618.83,615.52,617.65,70030100,617.65,5.398571,0.004345,0.965687,-0.765501,9302
6398,6398,0,2025-07-02,617.24,620.49,616.61,620.45,66510400,620.45,4.717143,0.003906,0.696569,-0.824331,9303
6399,6399,1,2025-07-03,622.45,626.28,622.43,625.34,51065800,625.34,4.450000,0.003185,0.591065,-0.921087,9304


In [None]:
# df_n["Delta_Days"] = [(df_n.loc[i,"Date"] - df_n.loc[0,"Date"]).days for i in range(len(df_n))]
# #save clean data
# df_n.to_csv("Clean_Data.csv")

In [None]:
#define lstm model
import torch
from torch.nn import *
import torch.nn.functional as F

class NN_LSTM(Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.lstm = LSTM(input_size=input_size,hidden_size=30)
        self.fc = Linear(30,output_size)
    def activation(self,X):
        return F.relu(X)
    def forward(self,input):
        input,_ = self.lstm(input)
        input = self.fc(input[-1,:,:])
        return input #return the last prediction
#lstm_layer = LSTM(input_size=4,hidden_size=30)

In [None]:
#define PDE loss
def PDE_loss(v_hat,):
    return 0
    #use torch.autograd to get PDE loss

In [39]:
def tt_split(df_n,vol_metric):
    train = df_n.loc[[i<=len(df_n)*4/5 for i in range(len(df_n))]]
    X_train = train[["index","Open","Close","High","Low", vol_metric]].to_numpy()
    y_train = train[vol_metric].to_numpy()

    test = df_n.loc[[i>len(df_n)*4/5 for i in range(len(df_n))]]
    X_test = test[["index","Open","Close","High","Low", vol_metric]].to_numpy()
    y_test = test[vol_metric].to_numpy()
    return X_train,y_train,X_test,y_test

X_train,y_train,X_test,y_test = tt_split(df_n,"ATR_normalized")

#bollinger bands
#try moving median instead of moving average
#try moving quartiles instead of std; q3-q2  *1/2
#take longer time period - five years

#lstm,cnn,and rnn cant either predict directional volatiliy for xl stocks


In [40]:
def make_seq(X_train,y_train,X_test,y_test):
    T = 30  # sequence length (window size)
    X_seq = []
    y_seq = []
    X_seq_test = []
    y_seq_test = []

    for i in range(len(X_train) - T):
        X_seq.append(X_train[i:i+T])  # shape: [T, 6] <- what is wanted in lstm
        y_seq.append(y_train[i+T])    # predict next ATR value
    for i in range(len(X_test)-T):    
        X_seq_test.append(X_test[i:i+T])
        y_seq_test.append(y_test[i+T])


    X_seq = torch.tensor(X_seq, dtype=torch.float32)
    y_seq = torch.tensor(y_seq, dtype=torch.float32).unsqueeze(1)
    X_seq_test = torch.tensor(X_seq_test, dtype=torch.float32)
    y_seq_test = torch.tensor(y_seq_test, dtype=torch.float32).unsqueeze(1)
    
    return X_seq,y_seq,X_seq_test,y_seq_test
X_seq,y_seq,X_seq_test,y_seq_test = make_seq(X_train,y_train,X_test,y_test)

In [41]:
from torch.utils.data import DataLoader, TensorDataset

def create_loaders(X_seq,y_seq,X_seq_test,y_seq_test,batch_size=64):
    dataset = TensorDataset(X_seq, y_seq)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    dataset_test = TensorDataset(X_seq_test,y_seq_test)
    loader_test = DataLoader(dataset_test,batch_size=batch_size,shuffle=True)
    return loader,loader_test
loader,loader_test = create_loaders(X_seq,y_seq,X_seq_test,y_seq_test)

In [None]:
# #training loop
# model = NN_LSTM(input_size=5,output_size=1)
# epochs = 100
# optim = torch.optim.Adam(params = model.parameters())
# crit = MSELoss()
# losses = []
# losses_test = []

# for i in range(epochs):
#     running_loss = 0
#     for x_window,y_atr in loader:
#         #print("Running")
#         input = x_window.permute(1,0,2) #shape = [seq_length,batch_length,4]
#         out = model(input)
#         #print(y_atr.shape)
#         #print(out,y_atr)
#         #break
#         loss = crit(out,y_atr)
#         running_loss+=loss.item()
#         optim.zero_grad()
#         loss.backward()
#         optim.step()
#             # could try loss += (i+1)/period/sum(j/period for j in range(period))crit(out,y_train[i]); adds a coeff to give more weigt to recent ones
#     #break
#     running_loss/=(len(loader))
#     #print(f"Training Loss: {running_loss}")
#     losses.append(running_loss)

#     with torch.no_grad():
#         testing_loss = 0
#         for x_window_test,y_atr_test in loader_test:
#             out_test = model(x_window_test.permute(1,0,2))
#             #print(y_atr_test.shape)
#             loss = crit(out_test,y_atr_test)
#             testing_loss+=loss.item()
#         losses_test.append(testing_loss/(len(loader_test)))
    


In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# sns.lineplot(x=[i for i in range(len(losses))],y=losses)
# plt.title("Training Loss of LSTM (ATR_normalized) across 60 epochs")

In [None]:
# sns.lineplot(losses_test)
# plt.title("Testing Loss of LSTM (ATR_normalized) across 60 epochs")

In [None]:
# X_train_sd,y_train_sd,X_test_sd,y_test_sd = tt_split(df_n,vol_metric="SD_normalized")
# X_seq_sd,y_seq_sd,X_seq_test_sd,y_seq_test_sd = make_seq(X_train_sd,y_train_sd,X_test_sd,y_test_sd)
# loader_sd,loader_test_sd = create_loaders(X_seq_sd,y_seq_sd,X_seq_test_sd,y_seq_test_sd)

In [None]:
# #training loop
# model_sd = NN_LSTM(input_size=5,output_size=1)
# epochs = 100
# optim = torch.optim.Adam(params = model_sd.parameters())
# crit = MSELoss()
# losses = []
# losses_test = []

# for i in range(epochs):
#     running_loss = 0
#     for x_window,y_sd in loader_sd:
#         #print("Running")
#         input = x_window.permute(1,0,2) #shape = [seq_length,batch_length,4]
#         out = model_sd(input)
#         #print(y_sd.shape)
#         loss = crit(out,y_sd)
#         running_loss+=loss.item()
#         optim.zero_grad()
#         loss.backward()
#         optim.step()
#             # could try loss += (i+1)/period/sum(j/period for j in range(period))crit(out,y_train[i]); adds a coeff to give more weigt to recent ones
#     running_loss/=(len(loader_sd))
#     #print(f"Training Loss: {running_loss}")
#     losses.append(running_loss)

#     with torch.no_grad():
#         testing_loss = 0
#         for x_window_test,y_sd_test in loader_test_sd:
#             out_test = model_sd(x_window_test.permute(1,0,2))
#             #print(y_sd_test.shape)
#             loss = crit(out_test,y_sd_test)
#             testing_loss+=loss.item()
#         losses_test.append(testing_loss/(len(loader_test_sd)))


In [None]:
# sns.lineplot(losses)
# plt.title("Training Loss of LSTM (SD__normalized) across 60 epochs")

In [None]:
# sns.lineplot(losses_test)
# plt.title("Testing Loss of LSTM (SD_normalized) across 60 epochs")

In [None]:
#use past atrs
#try transformations of atr log(atr)

In [60]:
def pipeline(ticker,start,end,metric):
    df = get_stock(ticker,start_date=start,end_date=end,s_window=14,l_window=50)
    df_n= df.xs(ticker,axis=1,level=1)
    
        #define ATR columns
    high = df_n["High"]
    low = df_n["Low"]
    close = df_n["Close"]

    prev_close = close.shift(1)


    tr = pd.concat([
        high - low,
        (high - prev_close).abs(),
        (low - prev_close).abs()
    ], axis=1).max(axis=1)

    df_n["ATR"] = tr.rolling(7).mean()
    
    log_diff = np.log(df_n["Close"]/df_n["Close"].shift(1))
    df_n["SD_Log_Close"] = log_diff.rolling(7).std()
    df_n["ATR_normalized"] = (df_n["ATR"] - df_n["ATR"].mean())/df_n["ATR"].std()
    df_n["SD_normalized"] = (df_n["SD_Log_Close"] - df_n["SD_Log_Close"].mean())/df_n["SD_Log_Close"].std()

    df_n = df_n.dropna()
    df_n = df_n.reset_index()
    df_n["index"] = df_n.index%7 
    print(df_n)

    X_train,y_train,X_test,y_test = tt_split(df_n, metric)

    X_seq,y_seq,X_seq_test,y_seq_test = make_seq(X_train,y_train,X_test,y_test)

    loader,loader_test = create_loaders(X_seq,y_seq,X_seq_test,y_seq_test)

    #training loop
    
    model = NN_LSTM(input_size=6,output_size=1)
    epochs = 100
    optim = torch.optim.Adam(params = model.parameters())
    crit = MSELoss()
    losses = []
    losses_test = []

    for i in range(epochs):
        running_loss = 0
        for x_window,y_atr in loader:
            #print("Running")
            input = x_window.permute(1,0,2) #shape = [seq_length,batch_length,4]
            out = model(input)
            #print(y_atr.shape)
            #print(out,y_atr)
            #break
            loss = crit(out,y_atr)
            running_loss+=loss.item()
            optim.zero_grad()
            loss.backward()
            optim.step()
                # could try loss += (i+1)/period/sum(j/period for j in range(period))crit(out,y_train[i]); adds a coeff to give more weigt to recent ones
        #break
        running_loss/=(len(loader))
        #print(f"Training Loss: {running_loss}")
        losses.append(running_loss)

        with torch.no_grad():
            testing_loss = 0
            for x_window_test,y_atr_test in loader_test:
                out_test = model(x_window_test.permute(1,0,2))
                #print(y_atr_test.shape)
                loss = crit(out_test,y_atr_test)
                testing_loss+=loss.item()
            losses_test.append(testing_loss/(len(loader_test)))

    sns.lineplot(x=[i for i in range(len(losses))],y=losses)
    plt.title(f"Training Loss of LSTM ({metric}) across 60 epochs for {ticker}")
    plt.show()

    sns.lineplot(losses_test)
    plt.title(f"Testing Loss of LSTM ({metric}) across 60 epochs for {ticker}")
    plt.show()

    return min(losses),min(losses_test)

    

In [61]:
start = "2000-01-01"
end = "2025-07-14"
#pipeline("SPY",start,end,metric="ATR_normalized")

In [62]:
#pipeline("XLP",start,end,metric="SD_normalized")
import pandas as pd

In [63]:
import warnings
warnings.filterwarnings('ignore')

In [64]:
etfs = ['XLB','XLE','XLF','XLI','XLP','XLV','XLY','XLU']
metrics = ["ATR_normalized","SD_normalized"]
df = pd.DataFrame({"etf":[],"metric":[],"train_loss":[],"test_loss":[]})
for etf in etfs[:2]:
    for metric in metrics:
        train_loss_atr, test_loss_atr = pipeline(etf,start,end,metric=metric)
        df.loc[len(df)] = [etf,metric,train_loss_atr,test_loss_atr]


[*********************100%***********************]  1 of 1 completed


read  6419  lines of data for ticker:  XLB
Price       Date   Open   High    Low  Close   Volume  Adj Close       ATR  \
0     2000-01-12  26.61  27.03  26.61  26.64   135300      15.04  0.640000   
1     2000-01-13  26.89  26.97  26.62  26.75    45900      15.11  0.627143   
2     2000-01-14  26.52  26.72  26.44  26.56    76000      15.00  0.515714   
3     2000-01-18  26.22  26.31  25.69  25.92    34400      14.64  0.504286   
4     2000-01-19  25.78  25.95  25.42  25.42   125700      14.36  0.510000   
...          ...    ...    ...    ...    ...      ...        ...       ...   
6407  2025-07-07  90.90  91.22  90.01  90.49  5880800      90.49  1.254286   
6408  2025-07-08  90.68  91.52  90.57  91.23  8173800      91.23  1.224286   
6409  2025-07-09  91.47  91.88  90.92  91.78  6717800      91.78  1.235714   
6410  2025-07-10  92.00  92.86  91.81  92.27  6432800      92.27  1.297143   
6411  2025-07-11  91.28  91.68  90.81  91.52  5781600      91.52  1.108571   

Price  SD_Log_Close 

: 

In [None]:
df