# 1. Download S&P 500 price data from Yahoo Finance

In [8]:
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import os
import pandas_ta
import warnings

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Local CSV cache: lets the cell be re-run without hitting the network again.
data_file = 'stock_data.csv'

if os.path.exists(data_file):
    # The first two CSV columns are the two index levels written by to_csv() below.
    df = pd.read_csv(data_file, index_col=[0, 1], header=0, parse_dates=True)
else:
    # No cache yet: fetch the current constituents table from Wikipedia.
    sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

    # Some symbols contain a dot "."; replace it with "-" so yfinance can read them.
    # regex=False is required: as a regular expression "." matches ANY character,
    # which (under the regex default of older pandas versions) would turn every
    # symbol into a run of dashes.
    sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-', regex=False)

    # De-duplicated list of all the stock symbols.
    symbols_list = sp500['Symbol'].unique().tolist()

    end_date = '2023-10-31'
    # 365*8 offset from the end date, intended as ~8 years of history.
    start_date = pd.to_datetime(end_date)-pd.DateOffset(365*8)

    # Download the prices and stack() the result so the index becomes two levels
    # (named date / ticker below).
    # auto_adjust=False keeps the separate 'Adj Close' column that the feature
    # calculations rely on, regardless of the installed yfinance default.
    df = yf.download(tickers=symbols_list,
                     start=start_date,
                     end=end_date,
                     auto_adjust=False).stack()

    # Persist the download so later runs take the cached branch above.
    df.to_csv(data_file)


# Name the two index levels and normalise column names to lower case.
df.index.names = ['date', 'ticker']
df.columns = df.columns.str.lower()

df

Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-11-02,A,36.128700,38.590000,38.619999,37.799999,37.869999,1810800.0
2015-11-02,AAL,44.313274,46.470001,46.820000,46.200001,46.200001,6189300.0
2015-11-02,AAPL,27.498978,30.295000,30.340000,29.902500,30.200001,128813200.0
2015-11-02,ABBV,44.862263,63.380001,64.199997,61.439999,61.599998,17008700.0
2015-11-02,ABT,39.108433,45.430000,45.500000,44.599998,44.880001,5477800.0
...,...,...,...,...,...,...,...
2023-10-30,YUM,119.870003,119.870003,120.639999,119.260002,120.290001,1551900.0
2023-10-30,ZBH,103.410004,103.410004,104.099998,102.330002,103.760002,1309800.0
2023-10-30,ZBRA,209.770004,209.770004,211.210007,204.470001,207.500000,970000.0
2023-10-30,ZION,29.980000,29.980000,30.180000,29.320000,29.840000,2837100.0


# 2. Calculate features and technical indicators for each stock
- Garman-Klass Volatility (intraday volatility for a given asset)
- RSI
- Bollinger Bands
- ATR
- MACD
- Dollar Volume

Garman-Klass Volatility
\begin{equation}
\frac{(\ln(\text{High}) - \ln(\text{Low}))^{2}}{2} - (2\ln(2) - 1)(\ln(\text{Adj Close}) - \ln(\text{Open}))^{2}
\end{equation}



In [11]:
## Garman-Klass Volatility
# Build the two log spreads first, then combine them as in the formula above.
log_high_low = np.log(df['high']) - np.log(df['low'])
log_close_open = np.log(df['adj close']) - np.log(df['open'])
df['garman_klass_vol'] = (log_high_low**2)/2 - (2*np.log(2)-1)*(log_close_open**2)


## RSI - computed per ticker (index level 1), never across tickers
def _rsi_20(prices):
    """Return the 20-period RSI of one ticker's adjusted-close series."""
    return pandas_ta.rsi(close=prices, length=20)


df['rsi'] = df.groupby(level=1)['adj close'].transform(_rsi_20)

df

NameError: name 'pandas_ta' is not defined