In [38]:
import numpy as np
import pandas as pd
import sys
from pathlib import Path

In [39]:
# Treat the parent of the current working directory as the project root and put
# it on the import path so that the `src.*` imports used below can be resolved.
PROJECT_ROOT = Path.cwd().parent
# Guard the append: re-running this cell must not stack duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [40]:
from src.config import DATA_DIR

In [41]:
# Load the raw price table and index it by its parsed "Date" column.
path1 = DATA_DIR / "raw" / "prices.csv"
if not path1.exists():
    # Fail early and name the script that the message says must be run first.
    raise FileNotFoundError(f"Price data not found at {path1}. Run data_loader.py first.")
prices = pd.read_csv(path1, parse_dates=["Date"]).set_index("Date")

In [42]:
# Load the processed monthly returns table and index it by its parsed "Date" column.
path2 = DATA_DIR / "processed" / "monthly_returns.csv"
if not path2.exists():
    # Fail early and name the script that the message says must be run first.
    raise FileNotFoundError(f"Monthly returns not found at {path2}. Run returns.py first.")
returns = pd.read_csv(path2, parse_dates=["Date"]).set_index("Date")

In [43]:
# Market return series stored under the "portfolio" data folder, indexed by "Date".
path3 = DATA_DIR / "portfolio" / "market_returns.csv"
if path3.exists():
    market_frame = pd.read_csv(path3, parse_dates=["Date"])
    market_returns = market_frame.set_index("Date")
else:
    # The file is missing: stop here and point at the script named in the message.
    raise FileNotFoundError(f"Portfolio data not found at {path3}. Run capm_params.py first.")

In [44]:
from src.betas import compute_rolling_beta
from src.risk_metrics import max_drawdown, rolling_volatility

In [45]:
# Rolling betas from the project helper, using a 12-row window of the returns
# table. The result is later looked up as betas.loc[date, stock].
# NOTE(review): the exact output layout is defined in src.betas - confirm there.
betas = compute_rolling_beta(
    returns,
    market_returns,
    window=12,
)

In [46]:
# Row labels of the returns table; the feature loop walks them by position.
dates = returns.index

In [47]:
# Accumulator for the per-(date, stock) feature rows; each entry is one dict.
features = list()

In [48]:
# Window lengths, counted in rows of the returns table:
#   J - look-back rows used to build the features
#   K - look-ahead rows used to build the label
J, K = 6, 6

In [49]:
# Build one feature row per (date, stock): momentum, risk and exposure features
# from the J rows before each date, plus the compounded return over the K rows
# after it (used later to derive the label).

# Start from an empty list so that re-running this cell does not append
# duplicate rows to a list left over from a previous run.
features = []

# `t` below is a position in `returns`, so slicing `prices` with it is only
# valid when both frames share the same row index. If they do not (for example
# prices sampled more often than returns, or carrying one extra leading row),
# take the last available price at or before each return date instead.
if prices.index.equals(dates):
    prices_aligned = prices
else:
    prices_aligned = prices.sort_index().reindex(dates, method="ffill")

for t in range(J, len(dates) - K):
    date = dates[t]

    # Formation window: the J rows strictly before position t.
    past = returns.iloc[t - J:t]
    # Holding window: the K rows strictly after position t
    # (row t itself belongs to neither window).
    future = returns.iloc[t + 1:t + 1 + K]
    # Prices over the same rows as the formation window.
    past_price_window = prices_aligned.iloc[t - J:t]

    for stock in returns.columns:
        past_ret = past[stock].dropna()
        future_ret = future[stock].dropna()

        # Require complete windows: any missing return drops the observation.
        if len(past_ret) < J or len(future_ret) < K:
            continue

        beta_val = betas.loc[date, stock]
        if pd.isna(beta_val):
            continue

        past_prices = past_price_window[stock].dropna()
        if len(past_prices) < J:
            continue

        row = {
            "Date": date,
            "stock": stock,

            # Momentum
            "ret_1m": past_ret.iloc[-1],
            "ret_3m": (1 + past_ret.iloc[-3:]).prod() - 1,
            "ret_6m": (1 + past_ret).prod() - 1,
            # Number of positive-return rows in the formation window.
            "pos_months_6m": (past_ret > 0).sum(),

            # Risk (last value of each helper's rolling output)
            "vol_3m": rolling_volatility(past_ret, window=3).iloc[-1],
            "vol_6m": rolling_volatility(past_ret, window=6).iloc[-1],
            "max_dd_6m": max_drawdown(past_ret),

            # Exposure & trend
            "beta_12m": beta_val,
            # Latest price in the window relative to the window's mean price.
            "ma_ratio_6m": (past_prices.iloc[-1] / past_prices.mean()) - 1,

            # Label input: compounded return over the holding window.
            "future_6m_return": (1 + future_ret).prod() - 1,
        }

        features.append(row)

In [50]:
# Turn the list of row dicts into a table with one column per dict key.
df = pd.DataFrame(data=features)

In [51]:
# Binary label: 1 when a stock's forward return is at or above the 70th
# percentile of all forward returns sharing the same "Date", otherwise 0.
cutoff_by_date = df.groupby("Date")["future_6m_return"].transform(
    lambda grp: grp.quantile(0.7)
)
df["label"] = (df["future_6m_return"] >= cutoff_by_date).astype(int)

In [52]:
# Remove the forward return once the label has been derived from it, so it
# cannot be used as a model input. Rebinding (instead of inplace=True) together
# with errors="ignore" keeps this cell safe to re-run after the column is gone.
df = df.drop(columns=["future_6m_return"], errors="ignore")

In [53]:
# Write the dataset without the positional index. The file name is derived from
# the window lengths so it stays in sync if J or K are changed; with J = K = 6
# it is still "ml_dataset_j6k6.csv".
output_path = DATA_DIR / "processed" / f"ml_dataset_j{J}k{K}.csv"
df.to_csv(output_path, index=False)