In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from IPython.display import display

## Fill isolated gaps with the average of the neighboring values


In [2]:

# 1) Read the raw price table from disk (adjust the path if the CSV lives elsewhere).
#    The first CSV column becomes the index and is parsed as dates.
prices_raw = pd.read_csv(
    "data/sp500_prices_raw.csv",
    index_col=0,
    parse_dates=True,
)


# 2) Coverage = fraction of rows with a non-missing value, computed per ticker column.
#    Only tickers with at least 95% coverage are carried forward.
coverage = prices_raw.count().div(len(prices_raw))
keep = coverage.index[coverage.ge(0.95).to_numpy()]
prices_filt = prices_raw[keep]

# 3) Fill isolated gaps (average of neighbors)
def fill_iso(s):
    """Fill isolated missing values with the mean of their two neighbors.

    A missing entry counts as "isolated" when the entries immediately before
    and immediately after it (by position) are both present. Runs of two or
    more consecutive missing values are left as they are, and so are missing
    values at the first or last position, because they lack a neighbor on
    one side.

    Parameters
    ----------
    s : pandas.Series
        Series of numeric values to repair. It is NOT modified; pandas does
        not support functions that mutate the object handed to ``apply``.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``s`` in which every isolated
        missing value has been replaced by ``(previous + next) / 2``.
    """
    # Positional neighbors; shift() pads the ends with NaN, so the first and
    # last positions can never qualify as isolated gaps.
    prev_vals = s.shift(1)
    next_vals = s.shift(-1)

    isolated = s.isna() & prev_vals.notna() & next_vals.notna()

    # Nothing to fill: return an untouched copy so the dtype is preserved.
    if not isolated.any():
        return s.copy()

    # A gap is only filled when both neighbors were present in the input, so a
    # filled value never feeds into another fill; one vectorized pass suffices.
    return s.mask(isolated, (prev_vals + next_vals) / 2)

# Run the gap filler on each ticker column, then discard every column that
# still contains at least one NaN afterwards.
prices_clean = (
    prices_filt
    .apply(fill_iso, axis="index")
    .dropna(axis="columns")
)

# 4) Persist the cleaned table and report its dimensions
prices_clean.to_csv("data/sp500_prices_clean.csv")
print("Cleaned data saved with shape:", prices_clean.shape)


Cleaned data saved with shape: (2515, 469)


In [3]:


# 1. Reload the cleaned prices written by the previous step
#    (first CSV column is the index, parsed as dates)
prices_clean = pd.read_csv(
    "data/sp500_prices_clean.csv",
    index_col=0,
    parse_dates=True,
)

# 2. Daily simple returns: row-over-row percentage change.
#    The first row has no predecessor, so it is NaN and gets dropped.
returns = prices_clean.pct_change().dropna()

# 3. Standardize each ticker column to zero mean and unit variance,
#    then re-attach the original row and column labels to the resulting array
scaler = StandardScaler().fit(returns)
returns_scaled_array = scaler.transform(returns)
returns_scaled = pd.DataFrame(returns_scaled_array, index=returns.index, columns=returns.columns)

# 4. Sanity check on the first five tickers: mean should be ~0 and std ~1.
#    describe() reports the sample std (ddof=1) while StandardScaler divides by
#    the population std (ddof=0), so the displayed std sits slightly above 1.
example_tickers = returns_scaled.columns[:5]
summary_scaled = returns_scaled[example_tickers].describe().loc[['mean', 'std']].T
display(summary_scaled)

# 5. Report the dimensions of the standardized matrix
print("Standardized returns matrix shape:", returns_scaled.shape)


Unnamed: 0,mean,std
A,-1.97844e-17,1.000199
AAPL,-1.39904e-16,1.000199
ABBV,9.185616e-18,1.000199
ABT,4.7341250000000005e-17,1.000199
ACGL,3.3562830000000005e-17,1.000199


Standardized returns matrix shape: (2514, 469)
