In [25]:
!pip install yfinance --upgrade
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import seaborn as sns
from scipy.optimize import minimize
import datetime as dt
sns.set_style('darkgrid')
import yfinance as yf

sp500_tickers = [
"WMT", "AMZN", "AAPL", "CVS", "TSLA", "GOOGL", "META", "JPM", "COST", "KR",
"WBA", "TGT", "UPS", "CNC", "CI", "MSFT", "VZ", "IBM", "UNH", "XOM",
"MCK", "CVX", "CAH", "HD", "WBA", "MPC", "COR", "F", "C", "DELL",
"GM", "NVDA", "PFE", "PG", "CMCSA", "JNJ", "WFC", "BAC", "ABBV", "SBUX",
"CSCO", "T", "PEP", "INTC", "DIS", "BA", "GS", "MS", "HON", "CRM"
]

# Get the data for tickers from yahoo finance

stock_data = yf.download(sp500_tickers,'2023-1-1','2025-10-1', auto_adjust=True)['Close']
SP500 = yf.download('^GSPC','2023-1-1','2025-10-1', auto_adjust=True)['Close']




[*********************100%***********************]  49 of 49 completed
[*********************100%***********************]  1 of 1 completed


In [26]:
#Version with NaN values for weekends/holidays.
# Create a date range including weekends
all_days = pd.date_range(start=stock_data.index.min(), end=stock_data.index.max(), freq='D')

# Reindex the DataFrame with this date range
reindexed_stock_data = stock_data.reindex(all_days)
reindexed_SP500 = SP500.reindex(all_days)

#Function to test if a date is a trading day
def is_trading_day(date):
    return date in stock_data.index

is_trading_day('2025-1-1')

def next_trading_day(day):
    date = dt.datetime.strptime(day, '%Y-%m-%d')
    next_day = date + dt.timedelta(1)
    max_lookahead = 365  # prevent infinite loop going 1 year ahead
    count = 0
    while not is_trading_day(next_day):
        next_day += dt.timedelta(1)
        count += 1
        if count > max_lookahead:
            raise ValueError("No trading day found within 1 year from start date")
    return next_day.strftime('%Y-%m-%d')

next_trading_day('2025-1-1')


'2025-01-02'

In [27]:
import pandas as pd

dfs = {k: v for k, v in globals().items() if isinstance(v, pd.DataFrame)}
for name, df in dfs.items():
    print(f"{name:25s} -> shape={df.shape} | cols={list(df.columns)[:6]}")

__                        -> shape=(5, 50) | cols=['Date', 'AAPL', 'ABBV', 'AMZN', 'BA', 'BAC']
stock_data                -> shape=(688, 49) | cols=['AAPL', 'ABBV', 'AMZN', 'BA', 'BAC', 'C']
SP500                     -> shape=(688, 1) | cols=['^GSPC']
log_returns               -> shape=(665, 49) | cols=['AAPL', 'ABBV', 'AMZN', 'BA', 'BAC', 'C']
SP_log_returns            -> shape=(687, 1) | cols=['^GSPC']
SP_pct_change             -> shape=(687, 1) | cols=['^GSPC']
pct_change                -> shape=(687, 49) | cols=['AAPL', 'ABBV', 'AMZN', 'BA', 'BAC', 'C']
SP_weekly_variance        -> shape=(683, 1) | cols=['^GSPC']
weekly_variance           -> shape=(661, 49) | cols=['AAPL', 'ABBV', 'AMZN', 'BA', 'BAC', 'C']
SP_monthly_variance       -> shape=(667, 1) | cols=['^GSPC']
monthly_variance          -> shape=(645, 49) | cols=['AAPL', 'ABBV', 'AMZN', 'BA', 'BAC', 'C']
SP_biweekly_variance      -> shape=(678, 1) | cols=['^GSPC']
biweekly_variance         -> shape=(656, 49) | cols=['AAPL', 'A

In [28]:
#log returns
log_returns = np.log(stock_data/stock_data.shift(1)).dropna()
SP_log_returns = np.log(SP500/SP500.shift(1)).dropna()
#Percentage change in stock returns day-to-day (in lieu of trading day variance)
SP_pct_change = SP500.pct_change().dropna()
pct_change = stock_data.pct_change().dropna()
#variance of stock returns over each week in the period (5 trading days) 
SP_weekly_variance = SP_log_returns.rolling(window=5).var().dropna()
weekly_variance = log_returns.rolling(window=5).var().dropna()
#variance of stock returns over each month in the period (21 trading days)
SP_monthly_variance = SP_log_returns.rolling(window=21).var().dropna()
monthly_variance = log_returns.rolling(window=21).var().dropna()
#variance of stock returns over each 2 weeks in the period (10 trading days)
SP_biweekly_variance = SP_log_returns.rolling(window=10).var().dropna()
biweekly_variance = log_returns.rolling(window=10).var().dropna()


#super averages
#average percent change in a single day
Av_SP_pct_change = SP_pct_change.mean()
av_pct_change = pct_change.mean()
#average weekly variance
av_weekly_variance = weekly_variance.mean()
Av_SP_weekly_variance = SP_weekly_variance.mean()
#average monthly variance
av_monthly_variance = monthly_variance.mean()
Av_SP_monthly_variance = SP_monthly_variance.mean()
#average biweekly variance
av_biweekly_variance = biweekly_variance.mean()
Av_SP_biweekly_variance = SP_biweekly_variance.mean()

#Note: variance windows are in trading days, not calendar days. Moreover, these are backwards looking rolling variances.

  pct_change = stock_data.pct_change().dropna()


In [16]:
#  listing dataframe 
import pandas as pd
dfs = {k:v for k,v in globals().items() if isinstance(v, pd.DataFrame)}
print(list(dfs.keys()))


['stock_data', 'SP500', 'log_returns', 'SP_log_returns', 'SP_pct_change', 'pct_change', 'SP_weekly_variance', 'weekly_variance', 'SP_monthly_variance', 'monthly_variance', 'SP_biweekly_variance', 'biweekly_variance']


In [18]:
# Fix: moving index (which is datetime) into a proper column
stock_data = stock_data.reset_index()

# Renaming it to match  merge code later
stock_data = stock_data.rename(columns={"index": "Date"})

# Confirming it worked
print(stock_data.columns)
stock_data.head()


Index(['Date', 'AAPL', 'ABBV', 'AMZN', 'BA', 'BAC', 'C', 'CAH', 'CI', 'CMCSA',
       'CNC', 'COR', 'COST', 'CRM', 'CSCO', 'CVS', 'CVX', 'DELL', 'DIS', 'F',
       'GM', 'GOOGL', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'JPM', 'KR',
       'MCK', 'META', 'MPC', 'MS', 'MSFT', 'NVDA', 'PEP', 'PFE', 'PG', 'SBUX',
       'T', 'TGT', 'TSLA', 'UNH', 'UPS', 'VZ', 'WBA', 'WFC', 'WMT', 'XOM'],
      dtype='object', name='Ticker')


Ticker,Date,AAPL,ABBV,AMZN,BA,BAC,C,CAH,CI,CMCSA,...,T,TGT,TSLA,UNH,UPS,VZ,WBA,WFC,WMT,XOM
0,2023-01-03,123.33065,146.532654,85.82,195.389999,31.135239,41.294094,72.897873,306.059662,32.576893,...,15.827339,138.432388,108.099998,494.281006,153.092575,32.78289,32.070889,38.841858,46.241467,96.910591
1,2023-01-04,124.602715,147.714767,85.139999,203.639999,31.720583,42.358459,73.154381,295.415955,33.511063,...,16.165173,139.335632,113.639999,480.805054,154.690918,33.608173,32.329601,39.641197,46.29298,97.192657
2,2023-01-05,123.281334,147.534302,83.120003,204.990005,31.655556,42.169041,72.793358,289.598999,33.868248,...,16.224293,140.740662,110.339996,466.947968,151.834839,34.07394,30.346182,39.427425,46.135201,99.367256
3,2023-01-06,127.81736,150.295654,86.080002,213.0,31.971447,42.674168,74.712479,288.161469,34.701675,...,16.494556,146.114426,113.059998,466.986084,156.298004,34.474323,31.570726,39.780613,47.265465,100.568275
4,2023-01-09,128.339981,145.882889,87.360001,208.570007,31.488304,42.881634,73.695915,288.50415,34.390285,...,16.312922,142.647507,119.769997,467.043304,158.691193,34.335724,31.346514,39.39954,46.67617,98.693954


In [23]:
import pandas as pd
import numpy as np

# 1) If your price matrix is still wide (tickers as columns, dates as index):
#    move index to a column and reshape to long
if not isinstance(stock_data.index, pd.RangeIndex) and "Date" not in stock_data.columns:
    stock_data = stock_data.reset_index().rename(columns={"index": "Date"})

# Now: columns look like: ['Date', 'AAPL','AMZN','MSFT', ...]
# Convert to long
stock_long = stock_data.melt(id_vars="Date", var_name="Ticker", value_name="Close")

# 2) clean + sort
stock_long["Date"] = pd.to_datetime(stock_long["Date"], errors="coerce")
stock_long = stock_long.sort_values(["Ticker", "Date"]).dropna(subset=["Close"])

# 3) per-ticker log returns + rolling variances (5≈week, 21≈month; trading days)
# Corrected log return calculation
stock_long["log_return"] = (
    stock_long.groupby("Ticker")["Close"].transform(lambda x: np.log(x).diff())
)


stock_long["weekly_variance"] = (
    stock_long.groupby("Ticker")["log_return"]
    .rolling(5).var().reset_index(level=0, drop=True)
)

stock_long["monthly_variance"] = (
    stock_long.groupby("Ticker")["log_return"]
    .rolling(21).var().reset_index(level=0, drop=True)
)

print(stock_long.shape)
display(stock_long.head(100))

# 4) save for your Bluesky EDA notebook
OUT = "../data_aquisition/clean_stock_data.csv"
stock_long.to_csv(OUT, index=False)
print("Saved:", OUT)


(33690, 6)


Unnamed: 0,Date,Ticker,Close,log_return,weekly_variance,monthly_variance
0,2023-01-03,AAPL,123.330650,,,
1,2023-01-04,AAPL,124.602715,0.010261,,
2,2023-01-05,AAPL,123.281334,-0.010661,,
3,2023-01-06,AAPL,127.817360,0.036133,,
4,2023-01-09,AAPL,128.339981,0.004080,,
...,...,...,...,...,...,...
95,2023-05-19,AAPL,173.227036,0.000628,0.000040,0.000182
96,2023-05-22,AAPL,172.277649,-0.005496,0.000049,0.000177
97,2023-05-23,AAPL,169.666748,-0.015271,0.000115,0.000193
98,2023-05-24,AAPL,169.943649,0.001631,0.000111,0.000186


Saved: ../data_aquisition/clean_stock_data.csv


Notes: Need to account for weekend social media activity