In [2]:
import pandas as pd
import numpy as np

In [None]:
"""df = pd.read_csv('../data/processed/master_panel.csv')
df['date'] = pd.to_datetime(df['date'])"""

In [7]:
from __future__ import annotations
import numpy as np
import pandas as pd
from typing import Optional

def reduce_panel_to_fraction_per_year(
    panel: pd.DataFrame,
    *,
    date_col: str = "date",
    id_col: str = "permno",
    frac_per_year: float = 0.05,
    random_seed: int = 42,
    include_prev_december: bool = True
) -> pd.DataFrame:
    """
    Create a smaller panel by sampling ~`frac_per_year` of ids independently in each calendar year.

    For each id sampled in year Y, ALL of its rows in year Y are kept. Optionally, the rows from the
    previous December (Y-1, month=12) of those same ids are kept as well, so that a January formation
    step still has its lagged observation.

    Every input row appears at most once in the result: a December row that qualifies both as
    "sampled in its own year" and as "previous December of the following year" is not duplicated.

    Args:
        panel: Full panel with at least the columns [id_col, date_col]. It is not modified.
        date_col: Name of the date column (anything `pd.to_datetime` can parse).
        id_col: Security identifier column (e.g., 'permno').
        frac_per_year: Fraction of ids to keep in each year, in (0, 1]. At least one id is
            sampled per year.
        random_seed: Base seed; year Y is sampled with seed `random_seed + Y`, so each year's
            draw is reproducible and independent of which other years are present.
        include_prev_december: If True, also keep Y-1 December rows of the ids sampled in year Y.

    Returns:
        A new DataFrame with the same columns as `panel` (with `date_col` converted to datetime),
        restricted to the selected rows, sorted by [date_col, id_col] with a fresh RangeIndex.
        Rows whose date is missing (NaT) belong to no calendar year and are never selected.

    Raises:
        ValueError: If `frac_per_year` is not in (0, 1].
    """
    if not 0 < frac_per_year <= 1:
        raise ValueError("frac_per_year must be in (0, 1].")

    df = panel.copy()
    df[date_col] = pd.to_datetime(df[date_col])

    # Keep year/month as standalone Series instead of temporary columns, so that pre-existing
    # "year" / "month" columns in the panel are neither overwritten nor dropped.
    row_year = df[date_col].dt.year
    row_month = df[date_col].dt.month
    is_december = (row_month == 12).to_numpy()

    # One positional boolean mask accumulates the selection; OR-ing into it guarantees that a row
    # selected for several reasons is still emitted only once.
    keep = np.zeros(len(df), dtype=bool)

    # dropna(): NaT dates give a NaN year, which cannot be used as a seed offset.
    for y in sorted(row_year.dropna().unique().tolist()):
        y = int(y)  # the year Series is float when the column contains NaT
        rng = np.random.RandomState(random_seed + y)

        # All ids active in this year, in order of first appearance (the draw depends on it).
        in_year = (row_year == y).to_numpy()
        permnos_in_year = df.loc[in_year, id_col].unique()

        # Sample the requested fraction of ids for this year (at least 1).
        sample_size = max(1, int(np.ceil(len(permnos_in_year) * frac_per_year)))
        sampled_permnos = rng.choice(permnos_in_year, size=sample_size, replace=False)
        is_sampled = df[id_col].isin(sampled_permnos).to_numpy()

        # All rows of the sampled ids within this year.
        keep |= in_year & is_sampled

        # Previous December of the SAME ids, to enable the January formation step.
        if include_prev_december:
            in_prev_december = (row_year == (y - 1)).to_numpy() & is_december
            keep |= in_prev_december & is_sampled

    reduced = df.loc[keep]
    return reduced.sort_values([date_col, id_col]).reset_index(drop=True)


In [None]:
"""import pandas as pd
from pathlib import Path

# Reduce to ~5% per year, include previous December rows for those permnos
small_panel = reduce_panel_to_fraction_per_year(
    df,
    date_col="date",
    id_col="permno",
    frac_per_year=0.05,
    random_seed=42,
    include_prev_december=True
)

small_panel_path = Path('../data/processed/master_panel_reduced.csv')
small_panel.to_csv(small_panel_path, index=False)
print(f"Saved reduced panel to: {small_panel_path}")
print(small_panel.head())
"""

In [3]:
# Load the reduced panel from disk; `df` is the frame the following cells work on.
df = pd.read_csv('../data/processed/master_panel_reduced.csv')
# Decile grid 0.0, 0.1, ..., 1.0 (the 1.1 stop is exclusive).
quantiles = np.arange(0.0, 1.1, 0.1)
# Compute the quantiles first, then report them.
spread_quantiles = df['BidAskSpread'].quantile(quantiles)
print(f"Quantiles of BidAskSpread in reduced panel:\n{spread_quantiles}")

  df = pd.read_csv('../data/processed/master_panel_reduced.csv')


Quantiles of BidAskSpread in reduced panel:
0.0   -0.999779
0.1   -0.685332
0.2   -0.409513
0.3   -0.161538
0.4    0.000000
0.5    0.000000
0.6    0.132496
0.7    0.349844
0.8    0.566118
0.9    0.783006
1.0    1.000000
Name: BidAskSpread, dtype: float64


In [7]:
from ml_research_kills_alpha.support.constants import PREDICTED_COL
# Drop the ROWS (not columns) whose PREDICTED_COL value is NaN.
# NOTE: `df` is rebound here, so the unfiltered frame loaded earlier is no longer reachable
# without re-running the loading cell.
df = df.dropna(subset=PREDICTED_COL)

In [None]:
from ml_research_kills_alpha.modeling.rolling_trainer import RollingTrainer
from ml_research_kills_alpha.modeling.algorithms.elastic_net import ElasticNetModel
from ml_research_kills_alpha.modeling.algorithms.huber_ols import HuberRegressorModel
from ml_research_kills_alpha.modeling.algorithms.neural_networks import FFNNModel

# Candidate models: elastic net, Huber regression, and four FFNN variants built with the
# arguments 2 through 5 (presumably the network depth; confirm against FFNNModel).
models = [
    ElasticNetModel(),
    HuberRegressorModel(),
    *(FFNNModel(size) for size in range(2, 6)),
]
# Rolling trainer over the filtered panel, with 2007 as the end year.
trainer = RollingTrainer(models=models, data=df, end_year=2007)

2025-10-01 16:59:10,091 [INFO] RollingTrainer initialized for years 2005 to 2007 with target column: 'ret'


In [9]:
# Run the trainer built in the previous cell and keep whatever it returns.
results = trainer.run()
# Bare expression as the last line so the notebook renders `results` as the cell output.
results

2025-10-01 16:59:10,170 [INFO] Training models for test year 2005...
2025-10-01 16:59:10,178 [INFO] Year 2005: Using 102 valid features out of 212 total features.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_data[self.year_col] = pd.to_datetime(features_data[self.year_col])
2025-10-01 16:59:13,693 [INFO] Year 2005: Train data from start to 1998, Validation data from 1999 to 2004, Test data for 2005.
2025-10-01 16:59:14,734 [INFO] Training model: ENET for year 2005
2025-10-01 16:59:30,787 [INFO] Evaluating model: ENET for year 2005, month 2005-02
2025-10-01 16:59:30,819 [INFO] Generated predictions for ENET
2025-10-01 16:59:30,854 [INFO] Evaluating model: ENET for year 2005, month 2005-03
2025-10-01 16:59:30,873 [INFO] Generated predictions for ENET
2025-10-01 1

KeyboardInterrupt: 