In [3]:
from google.colab import drive
import os
import pandas as pd
import yfinance as yf
import numpy as np
from sklearn.decomposition import PCA

In [4]:
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
project_path = '/content/drive/My Drive/market-regime-forecasting'
os.chdir(project_path)

In [6]:
print(os.getcwd())

/content/drive/My Drive/market-regime-forecasting


## Loading data

In [None]:
DATA_DIR = Path("../History")
csv_files = sorted(DATA_DIR.glob("*.csv"))
len(csv_files)

In [None]:
frames = []

for fp in tqdm(csv_files, desc="Reading 15-min bars"):
    ticker = fp.stem.upper()

    df = (
        pd.read_csv(fp,
                    usecols=["timestamp", "close"],
                    parse_dates=["timestamp"],
                    infer_datetime_format=True)
          .rename(columns={"close": ticker})
          .set_index("timestamp")
    )

    df[ticker] = df[ticker].astype("float32")
    frames.append(df)

prices = (
    pd.concat(frames, axis=1)
      .sort_index()
      .loc[lambda x: ~x.index.duplicated(keep="first")]
)

prices.info(memory_usage="deep")
prices.head()

In [None]:
parquet_path = DATA_DIR.parent / "all_prices.parquet"
prices.to_parquet(parquet_path, engine="pyarrow", compression="zstd")

print(f"Saved tidy dataset → {parquet_path}")

In [7]:
prices = pd.read_parquet('all_prices.parquet')

We identify and rank the worst tickers by their longest consecutive streak of missing values, in order to later exclude them and prevent excessive row loss when computing returns and dropping NaNs.


In [48]:
def max_nan_streak(s):
    # boolean → int run-length encoding
    return (
        s.isna()
         .astype(int)
         .groupby(s.notna().cumsum())
         .sum()
         .max()
    )

coverage = prices.notna().mean()            # per-ticker coverage ratio (0-1)
streak   = prices.apply(max_nan_streak)     # longest NaN run (bars)

diag = pd.DataFrame({
    "coverage": coverage,
    "max_streak": streak
}).sort_values("max_streak",ascending=False)

display(diag.head(40))        # worst tickers

Unnamed: 0,coverage,max_streak
JPST,0.642056,9963
SPXS,0.790172,9585
CVNA,0.745771,9558
SNAP,0.820242,8485
ESGE,0.648385,4374
SH,0.883526,3861
ICSH,0.622534,204
SPLG,0.728399,90
ARKK,0.77125,67
KOLD,0.745316,32


In [49]:
COV_MIN     = 0.6       # keep tickers with ≥60 % observed bars
STREAK_MAX   = 10000   # drop many consecutive NaNs

# find bad tickers
bad = diag.index[
      (diag.coverage < COV_MIN) |
      (diag.max_streak > STREAK_MAX)
]

In [50]:
len(bad)

0

In [51]:
prices=prices.drop(bad,axis=1)


In [52]:
prices.shape

(48354, 170)

In [53]:
prices.index.normalize().nunique()

1791

In [67]:
# raw log returns
returns = np.log(prices).diff()

# drop rows that are >20 % NaN (very early sparse history)
min_valid = int(0.8 * returns.shape[1])    # keep rows with ≥80 % populated
returns   = returns.dropna(thresh=min_valid)

# patch 1-bar outages, then final drop of any leftover NaNs
returns   = returns.ffill(limit=1).dropna()
returns.shape



(13057, 170)

In [68]:
returns.index.normalize().nunique()

1164

In [69]:
returns

Unnamed: 0_level_0,AAL,AAPL,ABEV,ACWI,AGG,AMC,AMD,AMZN,ARKK,AUY,...,XLF,XLI,XLK,XLP,XLU,XLV,XLY,XOM,XRT,YANG
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-10-25 19:45:00,0.001210,0.000810,-0.001631,0.000159,-0.000103,-0.001768,-0.007675,0.000206,-0.001893,0.000000,...,-0.000418,0.000916,0.000526,0.000216,0.000000,0.000264,0.000234,0.001580,-0.000806,-0.001535
2018-01-05 20:45:00,0.001365,0.000000,0.000697,0.000891,0.000309,-0.005222,0.001096,0.002607,0.000540,0.004692,...,0.001563,0.000988,0.001119,0.001408,0.000000,0.000256,0.001554,0.001802,0.001414,-0.001210
2018-01-08 21:00:00,0.000394,-0.000483,0.000000,-0.000297,0.000103,0.003559,0.001627,-0.000963,0.002981,-0.009299,...,-0.001173,0.000000,0.000636,0.000602,0.000453,0.000128,0.000207,0.000000,0.001406,0.002041
2018-01-12 20:00:00,-0.000880,-0.000475,-0.001907,-0.001026,0.000103,0.010195,-0.002493,-0.000613,-0.000534,-0.001264,...,-0.000381,-0.001094,-0.001428,-0.001010,-0.002329,-0.000754,-0.001807,-0.000894,-0.000681,0.002171
2018-01-16 19:45:00,0.001245,0.000954,0.001724,0.000734,-0.000412,-0.003584,0.003291,-0.001231,0.001374,0.001252,...,0.001909,0.001244,0.001274,0.000805,0.001872,0.000750,0.000708,0.002699,0.002557,-0.001962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-11 20:00:00,-0.001957,0.002111,-0.003683,0.000508,-0.000070,-0.010103,0.003286,0.004889,0.001863,-0.001675,...,0.000280,0.001276,0.000895,0.000535,0.000555,0.000389,0.001890,0.000536,0.000997,-0.001241
2023-01-11 20:15:00,-0.000980,0.000842,0.003683,0.000113,0.000754,-0.002489,0.000729,0.001410,0.000645,0.000000,...,-0.000700,-0.001129,0.000273,-0.000200,0.000243,-0.000167,0.000581,0.000719,0.000690,0.000000
2023-01-11 20:30:00,0.000980,0.000099,0.003670,-0.000068,0.000151,0.004215,-0.000291,0.001687,-0.001368,0.000000,...,-0.000700,0.000000,0.000078,-0.000869,-0.000520,-0.000222,0.000000,-0.001754,-0.001379,0.009366
2023-01-11 20:45:00,0.001631,0.004091,-0.001833,0.001817,0.000401,0.030059,0.006971,0.001631,0.004762,-0.002518,...,0.002099,0.001816,0.004580,0.002070,-0.000139,0.001702,0.002391,0.002923,0.003367,-0.000622


In [70]:
yf_tickers = {
    "^VIX":  "VIX",        # implied vol
    "DX-Y.NYB": "USD_Index",  # dollar strength
    "^IRX":  "TBill_13w",  # 3-month Treasury yield
    "^TNX":  "TNote_10y",  # 10-year yield
    "GLD":   "Gold",       # gold ETF
    "USO":   "Oil",        # crude proxy
    "XLK":   "Tech_ETF",   # sector ETFs give risk-on/off flavour
    "XLF":   "Fin_ETF",
    "XLU":   "Util_ETF",
}

# returns already cleaned & dense
model_idx = returns.index

# download daily macro / ETF levels
macro_daily = (
    yf.download(
        list(yf_tickers.keys()),
        start=model_idx.min().date().isoformat(),
        end  =model_idx.max().date().isoformat(),
        interval="1d",
        progress=False
    )["Close"]
      .rename(columns=yf_tickers)
)

# forward-fill each daily value down through the intraday bars,
macro = (
    macro_daily
      .reindex(model_idx.union(macro_daily.index))   # add missing 15-min stamps
      .sort_index()
      .ffill()                                       # carry each day's value downward
      .reindex(model_idx)                            #  align to returns.index
      .astype("float32")
)

# quick sanity
assert macro.index.equals(model_idx)        # True
assert macro.isna().sum().sum() == 0        # no NaNs                         # exact 15-min grid

In [58]:
macro

Ticker,USD_Index,Gold,Oil,Fin_ETF,Tech_ETF,Util_ETF,TBill_13w,TNote_10y,VIX
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-10-25 19:45:00,93.709999,121.349998,83.839996,23.042749,56.082111,43.001713,1.083,2.444,11.23
2018-01-05 20:45:00,91.949997,125.330002,98.480003,24.678598,61.535160,40.728580,1.370,2.476,9.22
2018-01-08 21:00:00,92.330002,125.309998,99.040001,24.643856,61.767223,41.109444,1.380,2.480,9.52
2018-01-12 20:00:00,90.970001,126.959999,102.959999,25.390898,62.008530,39.879566,1.410,2.552,10.16
2018-01-16 19:45:00,90.389999,127.169998,102.080002,25.321407,61.767223,39.792294,1.395,2.544,11.66
...,...,...,...,...,...,...,...,...,...
2023-01-11 20:00:00,103.239998,174.740005,65.900002,34.126892,124.791985,66.590424,4.565,3.621,20.58
2023-01-11 20:15:00,103.239998,174.740005,65.900002,34.126892,124.791985,66.590424,4.565,3.621,20.58
2023-01-11 20:30:00,103.239998,174.740005,65.900002,34.126892,124.791985,66.590424,4.565,3.621,20.58
2023-01-11 20:45:00,103.239998,174.740005,65.900002,34.126892,124.791985,66.590424,4.565,3.621,20.58


In [71]:
# 5-factor PCA on the dense returns matrix
pca = PCA(n_components=5, random_state=42).fit(returns)
pca_scores = pd.DataFrame(
    pca.transform(returns).astype("float32"),
    index=returns.index,
    columns=[f"PC{i+1}" for i in range(5)]
)

In [60]:
# --- 1-day cross-section dispersion ---------------------------------
xsec_disp  = returns.std(axis=1)
dispersion = (
    xsec_disp
      .rolling("1D", closed="left")    # past 24 h, exclude current bar
      .mean()
      .rename("CrossDisp1d")
)

# --- 1-day average pair-wise correlation ----------------------------
avg_corr = (
    returns
      .rolling("1D", closed="left")
      .corr()
      .groupby(level=0).mean()         # DataFrame  (timestamp × ticker)
      .mean(axis=1)                    # Series     (timestamp)
      .rename("AvgCorr1d")
)

WIN = 26
# cross-section dispersion
dispersion = (returns.std(axis=1)
                     .rolling(WIN, min_periods=WIN)
                     .mean()
                     .rename("CrossDisp1d"))

avg_corr = (returns.rolling(WIN, min_periods=WIN)
                     .corr()
                     .groupby(level=0).mean()
                     .mean(axis=1)
                     .rename("AvgCorr1d"))

features["CrossDisp1d"] = dispersion
features["AvgCorr1d"]   = avg_corr


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


In [61]:
features = (
    pd.concat([macro, pca_scores, dispersion, avg_corr], axis=1)
      .ffill()                    # rare macro gaps
      .astype("float32")
)

In [65]:
features.isna().sum()

Unnamed: 0,0
USD_Index,0
Gold,0
Oil,0
Fin_ETF,0
Tech_ETF,0
Util_ETF,0
TBill_13w,0
TNote_10y,0
VIX,0
PC1,0


In [63]:
features.isna().sum().sum()

np.int64(11)

In [72]:
bars_per_day = returns.groupby(returns.index.date).size()   # Series (index = yyyy-mm-dd)

# show “how many days have X bars?”
dist = bars_per_day.value_counts().sort_index()             # e.g. {2: 11 days, 3: 38 days, …}
print(dist)

1      59
2      59
3      56
4      61
5      57
6      71
7      70
8      64
9      61
10     69
11     69
12     51
13     37
14     43
15     28
16     26
17     18
18     10
19     14
20     11
21     33
22    132
23      9
24      4
25      2
26     46
27      4
Name: count, dtype: int64


In [73]:
6.5*4

26.0

In [75]:
prices = pd.read_parquet("all_prices.parquet")

# drop hopeless tickers
def max_nan_streak(s):
    return (
        s.isna().astype(int)
          .groupby(s.notna().cumsum())
          .sum().max()
    )

cov      = prices.notna().mean()             # % bars observed
streak   = prices.apply(max_nan_streak)      # longest consecutive NaNs
BARS_DAY = 26

bad = cov[cov < 0.60].index.union(           # <60 % history
      streak[streak > BARS_DAY*5].index)     # ≥1 week missing
prices = prices.drop(columns=bad)
print(f" Dropped {len(bad)} tickers – {prices.shape[1]} remain")

 Dropped 49 tickers – 163 remain


In [76]:


# log-returns & row filter
returns = np.log(prices).diff()

ROW_KEEP = 0.70                              # keep rows with ≥70 % cols filled
returns  = returns[returns.notna().mean(axis=1) >= ROW_KEEP]
returns  = returns.ffill(limit=1).dropna()   # 1-bar patch : fully dense

print("After trim:", returns.shape)

After trim: (13657, 163)


In [77]:
returns.index.normalize().nunique()

1256

In [83]:
#  macro context (daily : ffill to 15-min)
yf_map = {
    "^VIX":  "VIX",
    "GLD":   "Gold",
    "USO":   "Oil",
    "DXY":   "USD",
    "^IRX":  "TBill_3m",
    "^TNX":  "TNote_10y",
    "XLK":   "Tech",
    "XLF":   "Fin",
    "XLU":   "Util",
}
macro_d = ( yf.download(list(yf_map.keys()),
                        start=returns.index.min().date().isoformat(),
                        end  =returns.index.max().date().isoformat(),
                        interval="1d", progress=False)["Close"]
              .rename(columns=yf_map) )

macro = ( macro_d
            .reindex(returns.index.union(macro_d.index))
            .sort_index()
            .ffill()
            .reindex(returns.index)
            .astype("float32") )

In [81]:
# cross-section factors
WIN = BARS_DAY                                  # previous trading session
disp = (returns.std(axis=1)
                .rolling(WIN, min_periods=WIN)
                .mean().rename("CrossDisp1d"))
avgc = (returns.rolling(WIN, min_periods=WIN)
                 .corr()
                 .groupby(level=0).mean()
                 .mean(axis=1).rename("AvgCorr1d"))

In [82]:
# PCA factors (5 components)
pca = PCA(n_components=5, random_state=42).fit(returns)
pcs = pd.DataFrame(pca.transform(returns).astype("float32"),
                   index=returns.index,
                   columns=[f"PC{i+1}" for i in range(5)])


In [84]:
#  merge & final NaN check
features = pd.concat([macro, pcs, disp, avgc], axis=1).ffill()
nan_total = features.isna().sum().sum()
if nan_total:
    # drop the very first <WIN rows with rolling NaNs
    features  = features.dropna()
    returns   = returns.loc[features.index]
print("NaNs after merge:", nan_total)

NaNs after merge: 50


In [94]:
returns.tail(26)

Unnamed: 0_level_0,AAL,AAPL,ABEV,ACWI,AGG,AMC,AMD,AMZN,ARKK,AUY,...,XLF,XLI,XLK,XLP,XLU,XLV,XLY,XOM,XRT,YANG
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-11 14:45:00,0.009445,-1.5e-05,-0.007463,-0.00102,0.000654,0.033987,-0.00242,0.002258,0.003814,-0.008797,...,-0.002668,-0.001477,-0.002826,-0.003327,0.000835,-0.001747,0.001609,-0.000135,0.003006,-0.000616
2023-01-11 15:00:00,0.002011,0.003675,0.005602,0.0,0.000101,0.017103,-0.007964,0.003539,0.007291,0.005029,...,-0.000985,-0.000197,0.001414,-0.001267,-0.001253,0.001672,0.003574,-0.00969,0.0043,0.008594
2023-01-11 15:15:00,-0.001959,0.005012,0.00186,0.001134,-0.000503,-0.021851,0.008846,-0.000281,0.006234,-0.006734,...,0.001406,-0.001035,0.003917,-0.001803,-0.001255,0.001262,0.001455,0.001,0.000154,0.003052
2023-01-11 15:30:00,-0.006536,-0.005545,-0.000632,-0.001815,0.0,-0.01289,-0.008696,-0.006217,-0.013203,-0.000845,...,-0.001406,-0.000247,-0.003093,-0.00107,0.001603,-0.000112,-0.005505,-0.004465,-0.005609,-0.00061
2023-01-11 15:45:00,-0.000991,0.001408,-0.001228,-0.001932,-0.000403,0.00216,-0.003757,-0.00151,-0.007817,0.005059,...,-0.001123,-0.002865,-0.001766,-0.000669,0.000905,-0.002042,-0.002232,-0.002514,-0.003473,0.003652
2023-01-11 16:00:00,0.00589,0.00124,-0.001864,0.001818,0.000567,0.02556,0.007966,0.007457,0.006616,-0.001683,...,0.002811,0.00089,0.003451,-0.000201,0.00139,-0.002978,0.003438,0.005432,0.002779,-0.006094
2023-01-11 16:15:00,-0.002221,0.000356,0.001864,0.000227,1.7e-05,-0.019109,-0.001549,0.003063,0.00278,0.004203,...,0.000843,0.000395,0.000861,-0.001072,-0.002503,0.000373,0.001241,0.001092,0.0,-0.004902
2023-01-11 16:30:00,0.000588,0.002655,-0.001864,0.000737,-0.000548,0.0,0.001731,0.000347,0.002627,0.000839,...,0.000702,0.000297,0.001173,-0.00235,-0.000418,-0.000484,0.000146,0.001454,0.002157,-0.005544
2023-01-11 16:45:00,0.00098,0.001929,0.006805,-0.000113,6.4e-05,-0.008611,-0.000245,0.000827,-0.001021,0.001675,...,0.00014,0.000395,0.000391,0.000471,-0.000697,-0.000336,-0.000365,0.00118,-0.00077,-0.002474
2023-01-11 17:00:00,0.002608,0.002114,-0.001224,0.001191,0.0,0.009683,0.005452,0.002094,0.002623,0.000836,...,0.00028,0.001973,0.001873,0.000873,0.000279,0.001714,0.001531,0.003351,0.002922,-0.003101
