# Retrieve Stock Data and Save to DF

In [1]:
from pandas_datareader import data as web
import os
import pandas as pd
import yfinance as yf

def get_stock(ticker, start_date, end_date, s_window, l_window):
    try:
        #yf.pdr_override()
        df = yf.download(ticker, start=start_date, end=end_date,auto_adjust=False)
        #print("DF: ",df)
# can use this as well        df = web.get_data_yahoo(ticker, start=start_date, end=end_date)
        df['Return'] = df['Adj Close'].pct_change()
        df['Return'].fillna(0, inplace = True)
        df['Date'] = df.index
        df['Date'] = pd.to_datetime(df['Date'])
        df['Month'] = df['Date'].dt.month
        df['Year'] = df['Date'].dt.year 
        df['Day'] = df['Date'].dt.day
        for col in ['Open', 'High', 'Low', 'Close', 'Adj Close']:
            df[col] = df[col].round(2)
        df['Weekday'] = df['Date'].dt.day_name()
        df['Week_Number'] = df['Date'].dt.strftime('%U')
        df['Year_Week'] = df['Date'].dt.strftime('%Y-%U')
        df['Short_MA'] = df['Adj Close'].rolling(window=s_window, min_periods=1).mean()
        df['Long_MA'] = df['Adj Close'].rolling(window=l_window, min_periods=1).mean()        
        col_list = ['Date', 'Year', 'Month', 'Day', 'Weekday', 
                    'Week_Number', 'Year_Week', 'Open', 
                    'High', 'Low', 'Close', 'Volume', 'Adj Close',
                    'Return', 'Short_MA', 'Long_MA']
        num_lines = len(df)
        df = df[col_list]
        print('read ', num_lines, ' lines of data for ticker: ' , ticker)
        return df
    except Exception as error:
        print(error)
        return None

In [11]:
try:
    ticker='SPY'
    input_dir = os.getcwd()
    output_file = os.path.join(input_dir, ticker + '.csv')
    df = get_stock(ticker, start_date='2017-01-01', end_date='2025-07-08', 
               s_window=14, l_window=50)
    df.to_csv(output_file, index=False)
    print('wrote ' + str(len(df)) + ' lines to file: ' + output_file)
except Exception as e:
    print(e)
    print('failed to get Yahoo stock data for ticker: ', ticker)

[*********************100%***********************]  1 of 1 completed

read  2138  lines of data for ticker:  SPY
wrote 2138 lines to file: c:\Users\aksha\Documents\BU_RISE\Final_Project\stock-data\SPY.csv



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Return'].fillna(0, inplace = True)


In [None]:
#psuedocode
'''
1. create ATR column
2. create SD column
3. build LSTM columnn
4. split into train and test
5. train lstm on training data for ATR and test
6. train lstm on training data for SD and test
7. plot results and avg. returns based on decisions (buy if delta_volatility<-alpha, sell if delta_volatility>alpha)
'''

'\n1. create ATR column\n2. create SD column\n3. build LSTM columnn\n4. split into train and test\n5. train lstm on training data for ATR and test\n6. train lstm on training data for SD and test\n7. plot results and avg. returns based on decisions (buy if delta_volatility<-alpha, sell if delta_volatility>alpha)\n'

In [17]:
df_n= df.xs('SPY',axis=1,level=1) #.xs returns a cross section; selects only values within the SPY indexes at level=1
df_n.head()
#cleaned up data

Price,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,225.04,225.83,223.88,225.24,91366500,196.12
2017-01-04,225.62,226.75,225.61,226.58,78744400,197.28
2017-01-05,226.27,226.58,225.48,226.4,78379000,197.13
2017-01-06,226.53,227.75,225.9,227.21,71559900,197.83
2017-01-09,226.91,227.07,226.42,226.46,46939700,197.18


In [19]:
import math
#challenge of how to deal with missing values in time series data
#choose sliding window of length=N (could be 20?) N_lstmtraining>N_atrslidingwindow
def atr_func(d):
    running = 0
    for i in range(len(d)):
        high = d.loc[i,"High"]
        low = d.loc[i,"Low"]
        if i==0:
            running += high-low
            continue
        if i>0:
            y_close = d.loc[i,"Close"]
        
        TR = max(high-low,max(math.abs(high-y_close),math.abs(low-y_close)))
        running+=TR
    return running/len(d)

df_n['ATR'] = df_n.rolling(window=14).apply(atr_func)
df_n

IndexingError: Too many indexers

In [None]:
#define ATR columns
high = df_n["High"]
low = df_n["Low"]
close = df_n["Close"]

prev_close = close.shift(1)


tr = pd.concat([
    high - low,
    (high - prev_close).abs(),
    (low - prev_close).abs()
], axis=1).max(axis=1)

df_n["ATR"] = tr.rolling(7).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_n["ATR"] = tr.rolling(7).mean()


In [27]:
import numpy as np

log_diff = np.log(df_n["Close"]/df_n["Close"].shift(1))
df_n["SD_Log_Close"] = log_diff.rolling(7).std()
df_n

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_n["SD_Log_Close"] = log_diff.rolling(7).std()


Price,Open,High,Low,Close,Volume,Adj Close,ATR,SD_Log_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-03,225.04,225.83,223.88,225.24,91366500,196.12,,
2017-01-04,225.62,226.75,225.61,226.58,78744400,197.28,,
2017-01-05,226.27,226.58,225.48,226.40,78379000,197.13,,
2017-01-06,226.53,227.75,225.90,227.21,71559900,197.83,,
2017-01-09,226.91,227.07,226.42,226.46,46939700,197.18,,
...,...,...,...,...,...,...,...,...
2025-06-30,617.38,619.22,615.04,617.85,92502500,617.85,5.868571,0.005666
2025-07-01,616.36,618.83,615.52,617.65,70030100,617.65,5.398571,0.004345
2025-07-02,617.24,620.49,616.61,620.45,66510400,620.45,4.717143,0.003906
2025-07-03,622.45,626.28,622.43,625.34,51065800,625.34,4.450000,0.003185


In [39]:
#drop na values (first 6 rows)
df_n = df_n.dropna()
df_n = df_n.reset_index()

In [41]:
(df_n.loc[1,"Date"] - df_n.loc[0,"Date"]).days

1

In [47]:
df_n["Delta_Days"] = [(df_n.loc[i,"Date"] - df_n.loc[0,"Date"]).days for i in range(len(df_n))]

In [None]:
#define lstm model
import torch
from torch.nn import *
import torch.nn.functional as F

class NN_LSTM(Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.lstm = LSTM(input_size=input_size,hidden_size=30)
        self.second = Linear(30,output_size)
    def activation(self,X):
        return F.relu(X)
    def forward(self,input,h_t,c_t):
        input,h_t,c_t = self.lstm(input,h_t,c_t)
        input = self.second(input)
        return self.activation(input),h_t,c_t
#lstm_layer = LSTM(input_size=4,hidden_size=30)

In [49]:
#define PDE loss
def PDE_loss(v_hat):
    return 0
    #use torch.autograd to get PDE loss

In [63]:
train = df_n.loc[[i<=len(df_n)*4/5 for i in range(len(df_n))]]
X_train = train[["Open","Close","High","Low"]].to_numpy()
vol_metric = "ATR"
y_train = train[vol_metric].to_numpy()

test = df_n.loc[[i>len(df_n)*4/5 for i in range(len(df_n))]]
X_test = test[["Open","Close","High","Low"]].to_numpy()
y_test = test[vol_metric].to_numpy()

In [None]:
#training loop
model = NN_LSTM(input_size=4,output_size=1)
epochs = 100
optim = torch.optim.Adam(params = model.parameters())
crit = MSELoss()

for i in range(epochs):
    period = 7
    loss = 0
    for i in range(len(df_n)):
        out,h_t,c_t = model(X_train[i],h_t,c_t)
        loss += crit(out,y_train[i]) + PDE_loss(out) #accumulate loss over period
        if (i+1)%7==0: #abt to be new period
            optim.zero_grad()
            loss.backward()
            optim.step()
            loss = 0
            #reset the hidden state?
        # could try loss += (i+1)/period/sum(j/period for j in range(period))crit(out,y_train[i]); adds a coeff to give more weigt to recent ones
    print("Training Loss")

    with torch.no_grad():
        test_out = model(X_test)
        test_loss = sum(crit(test_out,y_train))/len(df_n)
        print(f"Test Loss: {test}")
