In [3]:
import pandas as pd
import numpy as np
import re
import os

In [4]:
# -------------------------
# Parameters (change if needed)
# -------------------------
# Input CSVs. Each is read by load_and_clean(): the first column becomes the row
# index and every remaining column header must be parseable as a month.
RET_FILE = "/content/Stock-Returns-Cleaned.csv"
MCAP_FILE = "/content/Market-Capitalization-Cleaned.csv"
BM_FILE = "/content/BV-to-MV-Cleaned.csv"   # must be book-to-market (book / market). If you have book values instead, compute BM externally.
# Calendar month (1-12) whose market-cap and BM columns are used to sort firms each year.
FORMATION_MONTH = 9            # September by default
# Number of months following the formation month whose returns are assigned
# to that year's portfolios.
HOLD_MONTHS = 12
# Number of size / value buckets; portfolio cells are labelled "S<size>_V<value>".
SIZE_GROUPS = 2
VALUE_GROUPS = 3
# Cells with fewer member firms than this get NaN as their portfolio return.
MIN_STOCKS_PER_PORTFOLIO = 5
REQUIRE_POSITIVE_BM = True     # exclude bm <= 0 firms when forming portfolios
# Directory into which main() writes the three result CSVs.
OUTPUT_DIR = "."

In [5]:
# -------------------------
# Helper utilities
# -------------------------
def clean_colname(c):
    """Normalise a raw column header to a clean string.

    Steps: coerce to ``str``, collapse every whitespace run to a single space,
    trim both ends, drop a trailing ``.<digits>`` suffix (e.g. ``.1``, ``.2`` --
    presumably the suffix pandas appends to repeated headers; verify against
    the input files), then trim again so that whitespace which sat in front of
    the removed suffix does not survive.

    Parameters
    ----------
    c : object
        Raw column label; any object accepted by ``str()``.

    Returns
    -------
    str
        The cleaned label.
    """
    name = re.sub(r'\s+', ' ', str(c)).strip()
    name = re.sub(r'\.\d+$', '', name)      # remove .1, .2 suffixes
    # "Sep-20 .1" -> "Sep-20 " after the suffix is gone; trim the leftover blank.
    return name.rstrip()

def _parse_month_label(text):
    """Convert one column label into a monthly ``pd.Period`` or raise ``ValueError``."""
    # Explicit formats are tried first; ``None`` means "let pandas infer".
    for fmt in ("%b-%y", "%b-%Y", "%Y-%m", "%Y-%b", None):
        try:
            if fmt is None:
                stamp = pd.to_datetime(text)
            else:
                stamp = pd.to_datetime(text, format=fmt)
        except Exception:
            # Deliberate best-effort: any failure just means "try the next format".
            continue
        # pandas returns NaT (instead of raising) for null-like strings such as
        # "nan" or "NaT"; treat that as "not parsed" rather than building a NaT period.
        if not pd.isna(stamp):
            return pd.Period(stamp, freq='M')
    raise ValueError(f"Could not parse column name into date: '{text}'")


def load_and_clean(path):
    """Read a wide CSV and return it with a monthly ``PeriodIndex`` on the columns.

    The first CSV column is used as the row index. Column headers are passed
    through ``clean_colname``, columns whose cleaned name starts with
    ``'Unnamed:'`` are dropped, and every remaining header is parsed as a month.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Same cell values as the file, columns as ``PeriodIndex(freq='M')``.

    Raises
    ------
    ValueError
        If a remaining column header cannot be parsed as a date (including
        headers that pandas would silently map to ``NaT``).
    """
    df = pd.read_csv(path, index_col=0)
    df.columns = [clean_colname(c) for c in df.columns]
    # Drop columns that start with 'Unnamed:'
    df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
    # NOTE(review): stripping ".N" suffixes in clean_colname can produce repeated
    # month labels; nothing here de-duplicates them -- confirm inputs are unique.
    df.columns = pd.PeriodIndex([_parse_month_label(c) for c in df.columns], freq='M')
    return df

def label(sz, vm):
    """Return the portfolio cell name for size bucket ``sz`` and value bucket ``vm``."""
    return "S{}_V{}".format(sz, vm)

In [7]:
# -------------------------
# Build portfolios function
# -------------------------
def _resolve_breakpoints(breakpoints, n_groups, legacy_default, what):
    """Return the increasing quantile probabilities that separate ``n_groups`` buckets."""
    if n_groups < 1:
        raise ValueError(f"{what}_groups must be >= 1, got {n_groups}")
    if breakpoints is None:
        if n_groups == len(legacy_default) + 1:
            # Keep the historical split for the historical group count.
            probs = list(legacy_default)
        else:
            # Any other group count: equally sized quantile buckets.
            probs = [k / n_groups for k in range(1, n_groups)]
    else:
        probs = [float(p) for p in breakpoints]
    if len(probs) != n_groups - 1:
        raise ValueError(
            f"{what}_breakpoints needs {n_groups - 1} value(s) for {n_groups} group(s), got {len(probs)}"
        )
    out_of_range = any(not 0.0 < p < 1.0 for p in probs)
    not_increasing = any(b <= a for a, b in zip(probs, probs[1:]))
    if out_of_range or not_increasing:
        raise ValueError(f"{what}_breakpoints must be strictly increasing values in (0, 1): {probs}")
    return probs


def _quantile_ranks(values, probs):
    """Rank each value 1..len(probs)+1 by the quantile bucket (right-closed) it falls in."""
    cutoffs = np.asarray(values.quantile(list(probs)), dtype=float)
    # side='left' puts a value equal to a cutoff into the lower bucket, i.e. the
    # same right-closed intervals pd.cut uses, but without a fixed lower edge
    # (non-positive values land in bucket 1) and without failing on tied cutoffs.
    ranks = np.searchsorted(cutoffs, values.to_numpy(), side='left') + 1
    return pd.Series(ranks, index=values.index, dtype=int)


def _nested_dict_to_frame(series_by_label):
    """Turn ``{label: {timestamp: value}}`` into a date-sorted DataFrame (one column per label)."""
    stamps = sorted({m for d in series_by_label.values() for m in d.keys()})
    frame = pd.DataFrame(index=pd.to_datetime(stamps))
    for name, ts in series_by_label.items():
        s = pd.Series(ts)
        s.index = pd.to_datetime(list(s.index))
        frame[name] = s
    return frame.sort_index()


def build_ff25(returns_df, mcap_df, bm_df,
               formation_month=FORMATION_MONTH,
               hold_months=HOLD_MONTHS,
               size_groups=SIZE_GROUPS,
               value_groups=VALUE_GROUPS,
               min_stocks=MIN_STOCKS_PER_PORTFOLIO,
               require_positive_bm=REQUIRE_POSITIVE_BM,
               size_breakpoints=None,
               value_breakpoints=None):
    """Form size x value portfolios once a year and compute SMB / HML.

    For every year that has a ``formation_month`` column, firms with non-missing
    market cap and book-to-market (and, optionally, BM > 0) are sorted
    independently into ``size_groups`` size buckets and ``value_groups`` value
    buckets using cross-sectional quantiles at the formation month. Each cell's
    return in each of the following ``hold_months`` months is the average of
    member returns weighted by formation-month market cap, re-normalised over
    members that have a return that month.

    Parameters
    ----------
    returns_df, mcap_df, bm_df : pandas.DataFrame
        Rows are firms, columns are monthly periods. Only rows and columns
        present in all three frames are used.
    formation_month : int
        Calendar month (1-12) at which firms are sorted.
    hold_months : int
        Number of months after formation covered by each sort.
    size_groups, value_groups : int
        Number of buckets on each dimension.
    min_stocks : int
        A cell with fewer members than this gets NaN returns.
    require_positive_bm : bool
        If true, firms with BM <= 0 at formation are excluded.
    size_breakpoints, value_breakpoints : sequence of float, optional
        Strictly increasing quantile probabilities in (0, 1), one fewer than
        the number of groups. Defaults: ``(0.8,)`` when ``size_groups == 2`` and
        ``(0.3, 0.6)`` when ``value_groups == 3`` (the historical splits);
        equally spaced quantiles for any other group count.

    Returns
    -------
    (df_ports, factors, df_counts) : tuple of pandas.DataFrame
        ``df_ports``  -- cell returns, columns ``"S<s>_V<v>"``, indexed by month start.
        ``factors``   -- ``SMB`` (mean over value buckets of smallest minus largest
        size bucket) and ``HML`` (mean over size buckets of highest minus lowest
        value bucket); the row-wise mean uses pandas' default NaN skipping.
        ``df_counts`` -- members per cell, columns ``"S<s>_V<v>_count"``.

    Raises
    ------
    ValueError
        If a group count is < 1 or the breakpoints are inconsistent with it.
    """
    size_probs = _resolve_breakpoints(size_breakpoints, size_groups, (0.8,), "size")
    value_probs = _resolve_breakpoints(value_breakpoints, value_groups, (0.3, 0.6), "value")

    # align indices and columns (intersection)
    common_idx = returns_df.index.intersection(mcap_df.index).intersection(bm_df.index)
    common_cols = returns_df.columns.intersection(mcap_df.columns).intersection(bm_df.columns)
    R = returns_df.loc[common_idx, common_cols].copy()
    M = mcap_df.loc[common_idx, common_cols].copy()
    BM = bm_df.loc[common_idx, common_cols].copy()

    all_periods = sorted(R.columns)
    period_set = set(all_periods)          # O(1) membership tests below
    years = sorted({p.year for p in all_periods})

    portfolio_ts = {}   # { "S1_V1": {timestamp: ret, ...}, ... }
    counts_ts = {}      # counts per cell: { "S1_V1_count": {timestamp: n, ...}, ... }

    for year in years:
        formation_period = pd.Period(f"{year}-{formation_month:02d}", freq='M')
        if formation_period not in period_set:
            continue

        # holding months: formation + 1 .. formation + hold_months (those present in the data)
        hold_months_list = [formation_period + i for i in range(1, hold_months + 1)]
        hold_months_list = [m for m in hold_months_list if m in period_set]
        if not hold_months_list:
            continue

        mcap_at_form = M[formation_period]
        bm_at_form = BM[formation_period]

        # eligibility mask
        valid = mcap_at_form.notna() & bm_at_form.notna()
        if require_positive_bm:
            valid = valid & (bm_at_form > 0)

        eligible = valid[valid].index.tolist()
        # skip formation years with fewer than 10 eligible firms
        if len(eligible) < 10:
            continue

        mc_cs = mcap_at_form.loc[eligible]
        bm_cs = bm_at_form.loc[eligible]

        group_df = pd.DataFrame({
            'size_q': _quantile_ranks(mc_cs, size_probs),
            'value_q': _quantile_ranks(bm_cs, value_probs),
            'mcap': mc_cs
        })

        # formation weights (by marketcap at formation)
        group_df['w_form'] = group_df['mcap'] / group_df['mcap'].sum()

        # Membership is fixed for the whole holding period, so resolve it once
        # per formation year instead of once per month.
        cells = []
        for s in range(1, size_groups + 1):
            for v in range(1, value_groups + 1):
                in_cell = (group_df['size_q'] == s) & (group_df['value_q'] == v)
                members = group_df.index[in_cell].tolist()
                cells.append((label(s, v), members, group_df.loc[members, 'w_form']))

        # iterate holding months
        for hm in hold_months_list:
            stamp = hm.to_timestamp()
            ret_month = R[hm].reindex(group_df.index)
            for cell, members, w_cell in cells:
                count = len(members)
                counts_ts.setdefault(cell + "_count", {})[stamp] = count

                port_ret = np.nan
                if count >= min_stocks:
                    r = ret_month.loc[members]
                    has_ret = r.notna()
                    if has_ret.sum() != 0:
                        # re-normalise weights over firms that report a return this month
                        w = w_cell[has_ret]
                        w = w / w.sum()
                        port_ret = (w * r[has_ret]).sum()
                portfolio_ts.setdefault(cell, {})[stamp] = port_ret

    # to DataFrame
    df_ports = _nested_dict_to_frame(portfolio_ts)
    df_counts = _nested_dict_to_frame(counts_ts)

    # compute SMB and HML
    # SMB = average_over_v( S_small_v - S_big_v )
    smb_parts = []
    for v in range(1, value_groups + 1):
        small = label(1, v)
        big = label(size_groups, v)
        if (small in df_ports.columns) and (big in df_ports.columns):
            smb_parts.append(df_ports[small] - df_ports[big])
    SMB = pd.concat(smb_parts, axis=1).mean(axis=1) if smb_parts else pd.Series(index=df_ports.index, dtype=float)

    # HML = average_over_s ( high_v - low_v )
    hml_parts = []
    for s in range(1, size_groups + 1):
        lowv = label(s, 1)
        highv = label(s, value_groups)
        if (lowv in df_ports.columns) and (highv in df_ports.columns):
            hml_parts.append(df_ports[highv] - df_ports[lowv])
    HML = pd.concat(hml_parts, axis=1).mean(axis=1) if hml_parts else pd.Series(index=df_ports.index, dtype=float)

    factors = pd.DataFrame({'SMB': SMB, 'HML': HML})

    return df_ports, factors, df_counts

In [8]:
# -------------------------
# Main execution
# -------------------------
def main():
    """Run the full pipeline: load the three input CSVs, build portfolios, save results.

    Reads ``RET_FILE``, ``MCAP_FILE`` and ``BM_FILE`` via ``load_and_clean``,
    passes them to ``build_ff25`` and writes three CSVs into ``OUTPUT_DIR``
    (portfolio returns, SMB/HML factors, per-cell member counts). Progress is
    reported with ``print``.

    Raises
    ------
    FileNotFoundError
        If any of the three input files does not exist.
    """
    # load
    print("Loading files...")
    for path in (RET_FILE, MCAP_FILE, BM_FILE):
        if not os.path.exists(path):
            raise FileNotFoundError(f"{path} not found in working directory.")

    # Make sure the destination exists before the expensive work, so a missing
    # directory does not surface only at the final to_csv calls.
    # (An empty OUTPUT_DIR means "current directory" for os.path.join; nothing to create.)
    if OUTPUT_DIR:
        os.makedirs(OUTPUT_DIR, exist_ok=True)

    returns_df = load_and_clean(RET_FILE)
    mcap_df = load_and_clean(MCAP_FILE)
    bm_df = load_and_clean(BM_FILE)

    print("Shapes after load and alignment suggestions:")
    print("returns:", returns_df.shape, "mcap:", mcap_df.shape, "bm:", bm_df.shape)

    print("Building FF25 portfolios ... (this might take a moment)")
    ports, factors, counts = build_ff25(returns_df, mcap_df, bm_df)

    # Save outputs
    ports_out = os.path.join(OUTPUT_DIR, "FF25_portfolios.csv")
    factors_out = os.path.join(OUTPUT_DIR, "FF_factors_SMB_HML.csv")
    counts_out = os.path.join(OUTPUT_DIR, "FF25_counts.csv")

    ports.to_csv(ports_out, index_label="date")
    factors.to_csv(factors_out, index_label="date")
    counts.to_csv(counts_out, index_label="date")

    print(f"Saved portfolio returns -> {ports_out}")
    print(f"Saved SMB/HML factors    -> {factors_out}")
    print(f"Saved counts per cell    -> {counts_out}")
    print("Done.")

# Run the pipeline when this code executes as the main module (the captured
# output that follows this cell shows the guard passing in this notebook run).
if __name__ == "__main__":
    main()

Loading files...
Shapes after load and alignment suggestions:
returns: (982, 132) mcap: (982, 132) bm: (982, 132)
Building FF25 portfolios ... (this might take a moment)
Saved portfolio returns -> ./FF25_portfolios.csv
Saved SMB/HML factors    -> ./FF_factors_SMB_HML.csv
Saved counts per cell    -> ./FF25_counts.csv
Done.
