**1.Download S&P 500 Data**

In [2]:
import pandas as pd
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta as ta
import warnings
warnings.filterwarnings('ignore')



In [3]:
sp500=pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

sp500['Symbol'] = sp500['Symbol'].str.replace('.','-')

symbols_list = sp500['Symbol'].unique().tolist()

end_date= '2024-09-30'

start_date = pd.to_datetime(end_date)-pd.DateOffset(365*10)
print(start_date)

2014-10-03 00:00:00


In [4]:
df= yf.download(tickers=symbols_list,start=start_date,end=end_date).stack()


[*********************100%***********************]  503 of 503 completed


working with all these huge rows and colums data would be inefficient so we would use df.stack()

In [5]:

df.index.names=['date','ticker']
df.columns=df.columns.str.lower()
df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-10-03 00:00:00+00:00,A,37.466999,40.708157,40.779686,40.171673,40.221745,2205065.0
2014-10-03 00:00:00+00:00,AAPL,22.113325,24.905001,25.052500,24.760000,24.860001,173878400.0
2014-10-03 00:00:00+00:00,ABBV,38.515568,58.689999,58.779999,57.340000,57.419998,11675600.0
2014-10-03 00:00:00+00:00,ABT,34.355885,41.790001,41.860001,41.279999,41.410000,3430900.0
2014-10-03 00:00:00+00:00,ACGL,18.469999,18.469999,18.490000,18.320000,18.333332,1470300.0
...,...,...,...,...,...,...,...
2024-09-27 00:00:00+00:00,XYL,134.509995,134.509995,136.289993,133.779999,134.660004,691300.0
2024-09-27 00:00:00+00:00,YUM,139.919998,139.919998,140.020004,136.990005,137.520004,2778300.0
2024-09-27 00:00:00+00:00,ZBH,107.980003,108.220001,110.190002,108.190002,108.769997,1166200.0
2024-09-27 00:00:00+00:00,ZBRA,368.600006,368.600006,374.029999,368.269989,372.299988,213600.0


<!-- # ### # ###### 2.CALCULATE FEATURES AND TECHNICAL INDICATORS FOR EACH STOCK
1**) garman-klass volatility
2)RSI
3)Bollinger bands
4)ATR
5)MACD
6)Dollar volume**
GK = sqrt((0.5 * (log(high / low))^2) - ((2 * log(2) - 1) * (log(close / open))^2))(garman klass simplified in code)

 -->


In [6]:
# **** GARMAN-KLASS VOLATILITY****

df['garman_klass_vol']=((np.log(df['high'])-np.log(df['low']))**2)/2-(2*np.log(2)-1)*(np.log(df['adj close'])-(np.log(df['open'])))**2
df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-10-03 00:00:00+00:00,A,37.466999,40.708157,40.779686,40.171673,40.221745,2205065.0,-0.001832
2014-10-03 00:00:00+00:00,AAPL,22.113325,24.905001,25.052500,24.760000,24.860001,173878400.0,-0.005226
2014-10-03 00:00:00+00:00,ABBV,38.515568,58.689999,58.779999,57.340000,57.419998,11675600.0,-0.061293
2014-10-03 00:00:00+00:00,ABT,34.355885,41.790001,41.860001,41.279999,41.410000,3430900.0,-0.013375
2014-10-03 00:00:00+00:00,ACGL,18.469999,18.469999,18.490000,18.320000,18.333332,1470300.0,0.000021
...,...,...,...,...,...,...,...,...
2024-09-27 00:00:00+00:00,XYL,134.509995,134.509995,136.289993,133.779999,134.660004,691300.0,0.000172
2024-09-27 00:00:00+00:00,YUM,139.919998,139.919998,140.020004,136.990005,137.520004,2778300.0,0.000124
2024-09-27 00:00:00+00:00,ZBH,107.980003,108.220001,110.190002,108.190002,108.769997,1166200.0,0.000147
2024-09-27 00:00:00+00:00,ZBRA,368.600006,368.600006,374.029999,368.269989,372.299988,213600.0,0.000082


In [7]:
#  RSI 
# A technical indicator that measures the speed and change of price movements of an asset,
#  such as a stock, index, or ETF. The RSI is displayed on a scale of 0 to 100, 
#  with readings above 70 indicating an overbought asset and readings below 30 indicating an oversold asset

df['rsi']=df.groupby(level=1)['adj close'].transform(lambda x : ta.rsi(close=x , length=20))
# df.xs('AAPL',level=1)['rsi'].plot()(#to ckeck the rsi yes its correct)

In [18]:
# bollinger bandsBollinger Bands are a technical analysis tool used 
# to measure market volatility and identify potential trading opportunities.
# They are made up of three lines: an upper band, a middle band, and a lower band. 
# The middle band is a moving average, while the upper and lower bands are based on standard deviation,
# which is a measure of price volatility.


# Check the output columns of the Bollinger Bands calculation
sample_output = ta.bbands(close=np.log1p(df['adj close']), length=20)
print(sample_output.columns)


Index(['BBL_20_2.0', 'BBM_20_2.0', 'BBU_20_2.0', 'BBB_20_2.0', 'BBP_20_2.0'], dtype='object')


In [26]:
# Calculate the lower Bollinger Band
df['bb_low'] = df.groupby(level=1)['adj close'].transform(
    lambda x: ta.bbands(close=np.log1p(x), length=20)['BBL_20_2.0'] if ta.bbands(close=np.log1p(x), length=20) is not None else np.nan
)

# Calculate the middle Bollinger Band
df['bb_middle'] = df.groupby(level=1)['adj close'].transform(
    lambda x: ta.bbands(close=np.log1p(x), length=20)['BBM_20_2.0'] if ta.bbands(close=np.log1p(x), length=20) is not None else np.nan
)

# Calculate the upper Bollinger Band
df['bb_high'] = df.groupby(level=1)['adj close'].transform(
    lambda x: ta.bbands(close=np.log1p(x), length=20)['BBU_20_2.0'] if ta.bbands(close=np.log1p(x), length=20) is not None else np.nan
)
df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_middle,bb_high
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2014-10-03 00:00:00+00:00,A,37.466999,40.708157,40.779686,40.171673,40.221745,2205065.0,-0.001832,,,,
2014-10-03 00:00:00+00:00,AAPL,22.113325,24.905001,25.052500,24.760000,24.860001,173878400.0,-0.005226,,,,
2014-10-03 00:00:00+00:00,ABBV,38.515568,58.689999,58.779999,57.340000,57.419998,11675600.0,-0.061293,,,,
2014-10-03 00:00:00+00:00,ABT,34.355885,41.790001,41.860001,41.279999,41.410000,3430900.0,-0.013375,,,,
2014-10-03 00:00:00+00:00,ACGL,18.469999,18.469999,18.490000,18.320000,18.333332,1470300.0,0.000021,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-27 00:00:00+00:00,XYL,134.509995,134.509995,136.289993,133.779999,134.660004,691300.0,0.000172,52.810998,4.849547,4.891114,4.932681
2024-09-27 00:00:00+00:00,YUM,139.919998,139.919998,140.020004,136.990005,137.520004,2778300.0,0.000124,62.536185,4.867805,4.902218,4.936632
2024-09-27 00:00:00+00:00,ZBH,107.980003,108.220001,110.190002,108.190002,108.769997,1166200.0,0.000147,48.599348,4.623476,4.685195,4.746914
2024-09-27 00:00:00+00:00,ZBRA,368.600006,368.600006,374.029999,368.269989,372.299988,213600.0,0.000082,64.223784,5.759747,5.851378,5.943010


Average True Range (ATR) is a technical analysis tool that measures the volatility of a market by averaging the true ranges over a specified period: 
ATR is a key component of many traders' risk management strategies because it helps them:
Set stop losses
ATR helps traders set more accurate stop-loss levels that account for an asset's natural price fluctuations.
Determine position sizes
ATR helps traders determine position sizes that align with their risk tolerance.
Identify breakouts or trend reversals
Traders can monitor changes in ATR values to identify potential breakouts or trend reversals

In [33]:
#ATR
def compute_atr(stock_data):
    atr = ta.atr(high=stock_data['high'], low=stock_data['low'], close=stock_data['close'], length=14)
    # Check if ATR calculation is successful before proceeding
    if atr is not None:
        return atr.sub(atr.mean()).div(atr.std())
    else:
        return pd.Series([np.nan] * len(stock_data), index=stock_data.index)

df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

#atr.sub(atr.mean()): Subtracts the mean of the ATR values from each ATR value, 
# centering the data around zero.
#.div(atr.std()): Divides each centered ATR value by the standard deviation of the ATR values, 
# scaling the data. This standardizes the ATR, which now has a mean of 0 and a standard deviation of 1.




**Moving Average Convergence Divergence (MACD) **is a technical indicator that helps investors identify entry points for buying or selling. It tracks the convergence and divergence of two exponential moving averages (EMAs) to gauge the strength of stock price movement: 
Calculation
The MACD line is calculated by subtracting the 26-period EMA from the 12-period EMA. 
Signal line
A nine-period EMA of the MACD line is plotted on the MACD chart as the signal line. 


In [41]:
#MACD
def compute_macd(close):
    # Calculate MACD
    macd = ta.macd(close=close, length=20)
    
    # Check if MACD calculation is successful
    if macd is not None and not macd.empty:
        macd_values = macd.iloc[:, 0]  # Get the MACD values
        return macd_values.sub(macd_values.mean()).div(macd_values.std())
    else:
        return pd.Series([np.nan] * len(close), index=close.index)

# Apply the MACD computation by grouping by the second level
df['macd'] = df.groupby(level=1, group_keys=False)['adj close'].apply(compute_macd)


In [43]:
df['dollar_volume']=(df['adj close']*df['volume'])/1e6
df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_middle,bb_high,atr,macd,dollar_volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2014-10-03 00:00:00+00:00,A,37.466999,40.708157,40.779686,40.171673,40.221745,2205065.0,-0.001832,,,,,,,82.617168
2014-10-03 00:00:00+00:00,AAPL,22.113325,24.905001,25.052500,24.760000,24.860001,173878400.0,-0.005226,,,,,,,3845.029590
2014-10-03 00:00:00+00:00,ABBV,38.515568,58.689999,58.779999,57.340000,57.419998,11675600.0,-0.061293,,,,,,,449.692363
2014-10-03 00:00:00+00:00,ABT,34.355885,41.790001,41.860001,41.279999,41.410000,3430900.0,-0.013375,,,,,,,117.871604
2014-10-03 00:00:00+00:00,ACGL,18.469999,18.469999,18.490000,18.320000,18.333332,1470300.0,0.000021,,,,,,,27.156440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-27 00:00:00+00:00,XYL,134.509995,134.509995,136.289993,133.779999,134.660004,691300.0,0.000172,52.810998,4.849547,4.891114,4.932681,0.935011,0.088067,92.986759
2024-09-27 00:00:00+00:00,YUM,139.919998,139.919998,140.020004,136.990005,137.520004,2778300.0,0.000124,62.536185,4.867805,4.902218,4.936632,1.023418,0.131712,388.739731
2024-09-27 00:00:00+00:00,ZBH,107.980003,108.220001,110.190002,108.190002,108.769997,1166200.0,0.000147,48.599348,4.623476,4.685195,4.746914,-0.623002,-0.377126,125.926280
2024-09-27 00:00:00+00:00,ZBRA,368.600006,368.600006,374.029999,368.269989,372.299988,213600.0,0.000082,64.223784,5.759747,5.851378,5.943010,0.350878,1.174472,78.732961
