In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Analysis and Holdings
earnings_estimate = pd.read_csv('data/analysis_and_holdings/AAPL_earnings_estimate.csv')
earnings_history = pd.read_csv('data/analysis_and_holdings/AAPL_earnings_history.csv')
eps_revisions = pd.read_csv('data/analysis_and_holdings/AAPL_eps_revisions.csv')
eps_trend = pd.read_csv('data/analysis_and_holdings/AAPL_eps_trend.csv')
growth_estimates = pd.read_csv('data/analysis_and_holdings/AAPL_growth_estimates.csv')
insider_purchases = pd.read_csv('data/analysis_and_holdings/AAPL_insider_purchases.csv')
insider_roster_holders = pd.read_csv('data/analysis_and_holdings/AAPL_insider_roster_holders.csv')
institutional_holders = pd.read_csv('data/analysis_and_holdings/AAPL_institutional_holders.csv')
major_holders = pd.read_csv('data/analysis_and_holdings/AAPL_major_holders.csv')
mutualfunds_holders = pd.read_csv('data/analysis_and_holdings/AAPL_mutualfund_holders.csv')
reccomendations = pd.read_csv('data/analysis_and_holdings/AAPL_recommendations.csv')
revenue_estimate = pd.read_csv('data/analysis_and_holdings/AAPL_revenue_estimate.csv')
sustainability = pd.read_csv('data/analysis_and_holdings/AAPL_sustainability.csv')
upgrades_downgrades = pd.read_csv('data/analysis_and_holdings/AAPL_upgrades_downgrades.csv')

# Financials
balance_sheet = pd.read_csv('data/financials/AAPL_balance_sheet.csv', index_col=0)
cashflow = pd.read_csv('data/financials/AAPL_cashflow.csv', index_col=0)
income_statement = pd.read_csv('data/financials/AAPL_income_statement.csv', index_col=0)

# Info
actions = pd.read_csv('data/info/AAPL_actions.csv', index_col=0)
capital_gains = pd.read_csv('data/info/AAPL_capital_gains.csv', index_col=0)
dividends = pd.read_csv('data/info/AAPL_dividends.csv', index_col=0)
fast_info = pd.read_csv('data/info/AAPL_fast_info.csv', index_col=0)
history = pd.read_csv('data/info/AAPL_history.csv', index_col=0)
info = pd.read_csv('data/info/AAPL_info.csv')
splits = pd.read_csv('data/info/AAPL_splits.csv', index_col=0)

# Price and Volume
ohlc = pd.read_csv('data/price/AAPL_price_volume.csv', index_col=0, skiprows=2)
vix = pd.read_csv('../VIX/data/price/VIX_price_volume.csv', header=None, skiprows=3)

# Macro data
to_year_treasury_index = pd.read_csv('../market_data/data_transformed/10_year_treasury_index.csv')
dow_jones = pd.read_csv('../market_data/data_transformed/Dow_Jones_index.csv')
nasdaq = pd.read_csv('../market_data/data_transformed/Nasdaq_index.csv')
nyse_composite = pd.read_csv('../market_data/data_transformed/NYSE_composite_index.csv')
oil = pd.read_csv('../market_data/data_transformed/Oil_index.csv')
phlx_semiconductor_index = pd.read_csv('../market_data/data_transformed/phlx_semiconductor_index.csv')
russell_200 = pd.read_csv('../market_data/data_transformed/russell2000.csv')
sp_500 = pd.read_csv('../market_data/data_transformed/sp500.csv')

# Data preprocessing

In [3]:
numeric_cols = ['Open', 'High', 'Low', 'Close']

ohlc.columns = ['Open', 'High', 'Low', 'Close', 'Volume']

for col in numeric_cols:
    ohlc[col] = pd.to_numeric(ohlc[col], errors='coerce')

In [4]:
ohlc['Volume'] = pd.to_numeric(ohlc['Volume'], errors='coerce')
ohlc['High'] = pd.to_numeric(ohlc['High'], errors='coerce')
ohlc['Low'] = pd.to_numeric(ohlc['Low'], errors='coerce')
ohlc['Open'] = pd.to_numeric(ohlc['Open'], errors='coerce')
ohlc['Close'] = pd.to_numeric(ohlc['Close'], errors='coerce')


all_time_low = ohlc['Low'].min()
all_time_low_date = ohlc['Low'].idxmin()
print(f'All time low: {all_time_low} on {all_time_low_date}')

all_time_high = ohlc['High'].max()
all_time_high_date = ohlc['High'].idxmax()
print(f'All time high: {all_time_high} on {all_time_high_date}')

all_time_low_vol = ohlc['Volume'].min()
all_time_low_vol_date = ohlc['Volume'].idxmin()
print(f'All time low volume: {all_time_low_vol} on {all_time_low_vol_date}')

all_time_high_vol = ohlc['Volume'].max()
all_time_high_vol_date = ohlc['Volume'].idxmax()
print(f'All time high volume: {all_time_high_vol} on {all_time_high_vol_date}')

All time low: 0.037681121379137 on 1982-07-08
All time high: 259.239990234375 on 2025-10-03
All time low volume: 0 on 1981-08-10
All time high volume: 7421640800 on 2000-09-29


In [5]:
data = ohlc.copy()

data.index = pd.to_datetime(data.index, format='%Y-%m-%d')

Adding percentage change of the closing price# Feature Engineering

In [6]:
data['Return'] = data['Close'].pct_change()

Adding volatility indicators

In [7]:
data['Volatility_20'] = data['Close'].rolling(window=20).std()
data['Volatility_50'] = data['Close'].rolling(window=50).std()

Adding ATR (Average True Range) — captures volatility from high/low ranges

In [8]:
high_low = data['High'] - data['Low']
high_close = (data['High'] - data['Close'].shift()).abs()
low_close = (data['Low'] - data['Close'].shift()).abs()

data['TR'] = high_low.combine(high_close, max).combine(low_close, max)
data['ATR_14'] = data['TR'].rolling(window=14).mean()

Adding moving averages

In [9]:
windows = [5, 20, 50, 200]

for w in windows:
    data[f'SMA_{w}'] = data['Close'].rolling(window=w).mean()
    data[f'EMA_{w}'] = data['Close'].ewm(span=w, adjust=False).mean()

Momentum indicators

In [10]:
# RSI (Relative Strength Index)
delta = data['Close'].diff()
gain = delta.clip(lower=0)
loss = -delta.clip(upper=0)

avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()

rs = avg_gain / avg_loss
data['RSI_14'] = 100 - (100 / (1 + rs))

# MACD (Moving Average Convergence Divergence)
ema_12 = data['Close'].ewm(span=12, adjust=False).mean()
ema_26 = data['Close'].ewm(span=26, adjust=False).mean()

data['MACD'] = ema_12 - ema_26
data['MACD_signal'] = data['MACD'].ewm(span=9, adjust=False).mean()

# Stochastic Oscillator
low_14 = data['Low'].rolling(14).min()
high_14 = data['High'].rolling(14).max()

data['Stochastic'] = 100 * (data['Close'] - low_14) / (high_14 - low_14)

# ADX (Average Directional Index)
high = data['High']
low = data['Low']
close = data['Close']

plus_dm = high.diff()
minus_dm = low.diff().abs()
plus_dm[plus_dm < 0] = 0
minus_dm[minus_dm < 0] = 0

tr = pd.concat([high - low, (high - close.shift()).abs(), (low - close.shift()).abs()], axis=1).max(axis=1)
atr = tr.rolling(14).mean()

plus_di = 100 * (plus_dm.rolling(14).sum() / atr)
minus_di = 100 * (minus_dm.rolling(14).sum() / atr)

dx = 100 * (abs(plus_di - minus_di) / (plus_di + minus_di))
data['ADX_14'] = dx.rolling(14).mean()

# On-Balance Volume (OBV)
data['OBV'] = (np.sign(data['Close'].diff()) * data['Volume']).fillna(0).cumsum()

# Money Flow Index (MFI) - fully vectorized
typical_price = (data['High'] + data['Low'] + data['Close']) / 3
money_flow = typical_price * data['Volume']
tp_diff = typical_price.diff()

positive_mf = money_flow.where(tp_diff > 0, 0).rolling(14).sum()
negative_mf = money_flow.where(tp_diff < 0, 0).rolling(14).sum()
data['MFI_14'] = 100 * (positive_mf / (positive_mf + negative_mf))

# Basic Candlestick Patterns
data['Doji'] = abs(data['Close'] - data['Open']) <= 0.1 * (data['High'] - data['Low'])
data['Hammer'] = ((data['High'] - data['Low']) > 3 * (data['Open'] - data['Close'])) & \
                 ((data['Close'] - data['Low']) / (0.001 + data['High'] - data['Low']) > 0.6)
data['Engulfing'] = ((data['Close'] > data['Open'].shift()) &
                     (data['Open'] < data['Close'].shift()) &
                     (data['Close'] - data['Open'] > data['Open'].shift() - data['Close'].shift()))

Fundamental features

Dividends (0/1)

In [11]:
# Make sure data index is timezone-naive
data.index = pd.to_datetime(data.index).tz_localize(None)

# Convert dividends index to datetime (force conversion and make tz-naive safely)
dividend_dates = pd.to_datetime(dividends.index, utc=True, errors='raise')  # convert all to UTC first
dividend_dates = dividend_dates.tz_convert(None)  # then drop tz info

# Remove any invalid (NaT) entries
dividend_dates = dividend_dates.dropna()

# Compare only by date (normalize removes the time part)
dividend_dates = set(dividend_dates.normalize())

# Create Dividends (0/1) column in data
data['Dividends'] = data.index.normalize().isin(dividend_dates).astype(int)

Split (0/1)

In [12]:
splits_dates = pd.to_datetime(splits.index, utc=True, errors='raise')
splits_dates = splits_dates.tz_convert(None)

splits_dates = splits_dates.dropna()

splits_dates = set(splits_dates.normalize())

data['Splits'] = data.index.normalize().isin(splits_dates).astype(int)

Macro context

VIX

The VIX (Volatility Index) is often called the “fear gauge” of the market. It represents the market’s expectation of 30-day forward-looking volatility in the S&P 500 index, derived from options prices.
* High VIX → Market expects high volatility, often during market stress or uncertainty.
* Low VIX → Market expects low volatility, generally in calm or bullish periods.
* VIX is expressed in percentage points (e.g., VIX = 20 means the market expects ~20% annualized volatility over the next 30 days).

Important: VIX is not a price index; it’s a measure of expected volatility.

In [13]:
vix.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
vix['Date'] = pd.to_datetime(vix['Date'])
vix.set_index('Date', inplace=True)

vix.drop(columns=['Volume'], inplace=True)

data = data.join(vix, how='left', rsuffix='_VIX')

In [14]:
# VIX returns
data['VIX_Return'] = data['Close_VIX'].pct_change()

# Volatility
data['VIX_Volatility_20'] = data['Close_VIX'].rolling(window=20).std()
data['VIX_Volatility_50'] = data['Close_VIX'].rolling(window=50).std()

# Trend
data['VIX_SMA_20'] = data['Close_VIX'].rolling(window=20).mean()
data['VIX_EMA_20'] = data['Close_VIX'].ewm(span=20, adjust=False).mean()
data['VIX_RollingVol'] = data['Close_VIX'].rolling(5).std()
data['VIX_Change'] = data['Close_VIX'] - data['Close_VIX'].shift(1)

# Lagged features
for lag in range(1, 6):
    data[f'VIX_Close_lag{lag}'] = data['Close_VIX'].shift(lag)

S&P 500

In [15]:
sp_500.tail()

Unnamed: 0.1,Unnamed: 0,Date,Close,High,Low,Open,Volume
9588,9592,2025-10-06,2486.350098,2501.919922,2481.340088,2489.139893,5604460000.0
9589,9593,2025-10-07,2458.419922,2493.100098,2451.310059,2490.429932,5546150000.0
9590,9594,2025-10-08,2483.98999,2487.639893,2458.669922,2467.629883,5383130000.0
9591,9595,2025-10-09,2468.850098,2490.75,2459.629883,2485.73999,5385020000.0
9592,9596,2025-10-10,2415.836914,2484.346191,2413.170654,2473.294189,0.0


In [16]:
sp_500.dtypes

Unnamed: 0      int64
Date           object
Close         float64
High          float64
Low           float64
Open          float64
Volume        float64
dtype: object

In [17]:
sp_500.drop(sp_500.columns[0], axis=1, inplace=True)

sp_500.tail()

Unnamed: 0,Date,Close,High,Low,Open,Volume
9588,2025-10-06,2486.350098,2501.919922,2481.340088,2489.139893,5604460000.0
9589,2025-10-07,2458.419922,2493.100098,2451.310059,2490.429932,5546150000.0
9590,2025-10-08,2483.98999,2487.639893,2458.669922,2467.629883,5383130000.0
9591,2025-10-09,2468.850098,2490.75,2459.629883,2485.73999,5385020000.0
9592,2025-10-10,2415.836914,2484.346191,2413.170654,2473.294189,0.0


In [18]:
sp_500['Date'] = pd.to_datetime(sp_500['Date'])

sp_500.dtypes

Date      datetime64[ns]
Close            float64
High             float64
Low              float64
Open             float64
Volume           float64
dtype: object

In [19]:
sp_500.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,1987-09-14,170.429993,170.949997,170.149994,170.529999,154400000.0
1,1987-09-15,169.199997,170.440002,169.130005,170.419998,136200000.0
2,1987-09-16,168.919998,170.029999,168.820007,169.309998,195700000.0
3,1987-09-17,168.820007,169.25,168.539993,168.919998,150700000.0
4,1987-09-18,168.75,169.039993,168.75,168.820007,188100000.0


In [20]:
sp_500.set_index('Date', inplace=True)

In [21]:
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return,Volatility_20,Volatility_50,TR,ATR_14,...,VIX_Volatility_50,VIX_SMA_20,VIX_EMA_20,VIX_RollingVol,VIX_Change,VIX_Close_lag1,VIX_Close_lag2,VIX_Close_lag3,VIX_Close_lag4,VIX_Close_lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-09-29,254.429993,255.0,253.009995,254.559998,40127700,0.00181,9.206906,13.902666,1.990005,6.644289,...,1.075057,15.748,15.795826,0.574439,0.830001,15.29,16.74,16.18,16.639999,16.1
2025-09-30,254.630005,255.919998,253.110001,254.860001,37704300,0.001179,9.230117,14.150006,2.809998,6.055717,...,1.070034,15.7035,15.841938,0.524995,0.16,16.120001,15.29,16.74,16.18,16.639999
2025-10-01,255.449997,258.790009,254.929993,255.039993,48713900,0.000706,9.544292,14.373807,3.930008,5.940717,...,1.06743,15.7005,15.884611,0.530311,0.01,16.280001,16.120001,15.29,16.74,16.18
2025-10-02,257.130005,258.179993,254.149994,256.579987,42630200,0.006038,9.90092,14.642702,4.029999,5.683575,...,1.073287,15.767,15.9556,0.500869,0.339998,16.290001,16.280001,16.120001,15.29,16.74
2025-10-03,258.019989,259.23999,253.949997,254.669998,49107000,-0.007444,10.104221,14.770066,5.289993,5.420718,...,1.078989,15.8405,16.021733,0.234584,0.02,16.629999,16.290001,16.280001,16.120001,15.29


In [22]:
print(type(data.index))
print(type(sp_500.index))

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>


In [23]:
sp_500.loc['2025-09-29']

Close     2.435250e+03
High      2.449260e+03
Low       2.428300e+03
Open      2.442660e+03
Volume    5.358760e+09
Name: 2025-09-29 00:00:00, dtype: float64

In [24]:
sp_500_renamed = sp_500.add_suffix('_SP500')

data = data.join(sp_500_renamed, how='left')
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return,Volatility_20,Volatility_50,TR,ATR_14,...,VIX_Close_lag1,VIX_Close_lag2,VIX_Close_lag3,VIX_Close_lag4,VIX_Close_lag5,Close_SP500,High_SP500,Low_SP500,Open_SP500,Volume_SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-09-29,254.429993,255.0,253.009995,254.559998,40127700,0.00181,9.206906,13.902666,1.990005,6.644289,...,15.29,16.74,16.18,16.639999,16.1,2435.25,2449.26001,2428.300049,2442.659912,5358760000.0
2025-09-30,254.630005,255.919998,253.110001,254.860001,37704300,0.001179,9.230117,14.150006,2.809998,6.055717,...,16.120001,15.29,16.74,16.18,16.639999,2436.47998,2438.330078,2413.280029,2432.639893,6057210000.0
2025-10-01,255.449997,258.790009,254.929993,255.039993,48713900,0.000706,9.544292,14.373807,3.930008,5.940717,...,16.280001,16.120001,15.29,16.74,16.18,2442.350098,2447.459961,2424.879883,2428.22998,6037950000.0
2025-10-02,257.130005,258.179993,254.149994,256.579987,42630200,0.006038,9.90092,14.642702,4.029999,5.683575,...,16.290001,16.280001,16.120001,15.29,16.74,2458.48999,2458.550049,2435.790039,2448.97998,5416130000.0
2025-10-03,258.019989,259.23999,253.949997,254.669998,49107000,-0.007444,10.104221,14.770066,5.289993,5.420718,...,16.629999,16.290001,16.280001,16.120001,15.29,2476.179932,2497.360107,2466.679932,2466.679932,5713110000.0


In [25]:
dow_jones.drop(dow_jones.columns[0], axis=1, inplace=True)

dow_jones.tail()

dow_jones['Date'] = pd.to_datetime(dow_jones['Date'])

dow_jones.dtypes

dow_jones.set_index('Date', inplace=True)

dow_jones_renamed = dow_jones.add_suffix('_DOWJONES')

data = data.join(dow_jones_renamed, how='left')
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return,Volatility_20,Volatility_50,TR,ATR_14,...,Close_SP500,High_SP500,Low_SP500,Open_SP500,Volume_SP500,Close_DOWJONES,High_DOWJONES,Low_DOWJONES,Open_DOWJONES,Volume_DOWJONES
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-09-29,254.429993,255.0,253.009995,254.559998,40127700,0.00181,9.206906,13.902666,1.990005,6.644289,...,2435.25,2449.26001,2428.300049,2442.659912,5358760000.0,46316.070312,46387.058594,46149.050781,46306.339844,479740000.0
2025-09-30,254.630005,255.919998,253.110001,254.860001,37704300,0.001179,9.230117,14.150006,2.809998,6.055717,...,2436.47998,2438.330078,2413.280029,2432.639893,6057210000.0,46397.890625,46425.300781,46103.390625,46282.628906,584780000.0
2025-10-01,255.449997,258.790009,254.929993,255.039993,48713900,0.000706,9.544292,14.373807,3.930008,5.940717,...,2442.350098,2447.459961,2424.879883,2428.22998,6037950000.0,46441.101562,46528.78125,46276.390625,46366.78125,552790000.0
2025-10-02,257.130005,258.179993,254.149994,256.579987,42630200,0.006038,9.90092,14.642702,4.029999,5.683575,...,2458.48999,2458.550049,2435.790039,2448.97998,5416130000.0,46519.71875,46589.308594,46283.570312,46461.109375,442010000.0
2025-10-03,258.019989,259.23999,253.949997,254.669998,49107000,-0.007444,10.104221,14.770066,5.289993,5.420718,...,2476.179932,2497.360107,2466.679932,2466.679932,5713110000.0,46758.28125,47049.640625,46566.871094,46583.949219,425050000.0


In [26]:
nasdaq.drop(nasdaq.columns[0], axis=1, inplace=True)

nasdaq.tail()

nasdaq['Date'] = pd.to_datetime(nasdaq['Date'])

nasdaq.dtypes

nasdaq.set_index('Date', inplace=True)

nasdaq_renamed = nasdaq.add_suffix('_NASDAQ')

data = data.join(nasdaq_renamed, how='left')
data.tail()


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return,Volatility_20,Volatility_50,TR,ATR_14,...,Close_DOWJONES,High_DOWJONES,Low_DOWJONES,Open_DOWJONES,Volume_DOWJONES,Close_NASDAQ,High_NASDAQ,Low_NASDAQ,Open_NASDAQ,Volume_NASDAQ
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-09-29,254.429993,255.0,253.009995,254.559998,40127700,0.00181,9.206906,13.902666,1.990005,6.644289,...,46316.070312,46387.058594,46149.050781,46306.339844,479740000.0,22591.150391,22704.900391,22536.050781,22605.300781,9046720000.0
2025-09-30,254.630005,255.919998,253.110001,254.860001,37704300,0.001179,9.230117,14.150006,2.809998,6.055717,...,46397.890625,46425.300781,46103.390625,46282.628906,584780000.0,22660.009766,22671.390625,22493.960938,22580.359375,8928070000.0
2025-10-01,255.449997,258.790009,254.929993,255.039993,48713900,0.000706,9.544292,14.373807,3.930008,5.940717,...,46441.101562,46528.78125,46276.390625,46366.78125,552790000.0,22755.160156,22782.580078,22516.740234,22530.949219,10043980000.0
2025-10-02,257.130005,258.179993,254.149994,256.579987,42630200,0.006038,9.90092,14.642702,4.029999,5.683575,...,46519.71875,46589.308594,46283.570312,46461.109375,442010000.0,22844.050781,22900.599609,22729.75,22885.900391,9633200000.0
2025-10-03,258.019989,259.23999,253.949997,254.669998,49107000,-0.007444,10.104221,14.770066,5.289993,5.420718,...,46758.28125,47049.640625,46566.871094,46583.949219,425050000.0,22780.509766,22925.429688,22695.820312,22886.160156,10515730000.0


In [27]:
# NYSE Composite
nyse_composite.drop(nyse_composite.columns[0], axis=1, inplace=True)
nyse_composite['Date'] = pd.to_datetime(nyse_composite['Date'])
nyse_composite.set_index('Date', inplace=True)
nyse_composite_renamed = nyse_composite.add_suffix('_NYSE')
data = data.join(nyse_composite_renamed, how='left')

# Oil
oil.drop(oil.columns[0], axis=1, inplace=True)
oil['Date'] = pd.to_datetime(oil['Date'])
oil.set_index('Date', inplace=True)
oil_renamed = oil.add_suffix('_OIL')
data = data.join(oil_renamed, how='left')

# PHLX Semiconductor Index
phlx_semiconductor_index.drop(phlx_semiconductor_index.columns[0], axis=1, inplace=True)
phlx_semiconductor_index['Date'] = pd.to_datetime(phlx_semiconductor_index['Date'])
phlx_semiconductor_index.set_index('Date', inplace=True)
phlx_semiconductor_index_renamed = phlx_semiconductor_index.add_suffix('_PHLX')
data = data.join(phlx_semiconductor_index_renamed, how='left')

# Russell 2000
russell_200.drop(russell_200.columns[0], axis=1, inplace=True)
russell_200['Date'] = pd.to_datetime(russell_200['Date'])
russell_200.set_index('Date', inplace=True)
russell_200_renamed = russell_200.add_suffix('_RUSSELL')
data = data.join(russell_200_renamed, how='left')

data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Return,Volatility_20,Volatility_50,TR,ATR_14,...,Close_PHLX,High_PHLX,Low_PHLX,Open_PHLX,Volume_PHLX,Close_RUSSELL,High_RUSSELL,Low_RUSSELL,Open_RUSSELL,Volume_RUSSELL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-09-29,254.429993,255.0,253.009995,254.559998,40127700,0.00181,9.206906,13.902666,1.990005,6.644289,...,,,,,,2435.25,2449.26001,2428.300049,2442.659912,5358760000.0
2025-09-30,254.630005,255.919998,253.110001,254.860001,37704300,0.001179,9.230117,14.150006,2.809998,6.055717,...,,,,,,2436.47998,2438.330078,2413.280029,2432.639893,6057210000.0
2025-10-01,255.449997,258.790009,254.929993,255.039993,48713900,0.000706,9.544292,14.373807,3.930008,5.940717,...,,,,,,2442.350098,2447.459961,2424.879883,2428.22998,6037950000.0
2025-10-02,257.130005,258.179993,254.149994,256.579987,42630200,0.006038,9.90092,14.642702,4.029999,5.683575,...,,,,,,2458.48999,2458.550049,2435.790039,2448.97998,5416130000.0
2025-10-03,258.019989,259.23999,253.949997,254.669998,49107000,-0.007444,10.104221,14.770066,5.289993,5.420718,...,,,,,,2476.179932,2497.360107,2466.679932,2466.679932,5713110000.0
