In [3]:
import pandas as pd
pd.set_option("display.max_seq_items", None)  
pd.set_option("display.width", None)         

In [None]:
raw_data = pd.read_csv("C:/Users/alexi/OneDrive/Documents/école/McGill-FIAM/2025/Hackathon-Final-2025/DATA ASSET MANAGEMENT HACKATHON 2025 FINALS/MAIN DATA and SUPPORTING CODES/ret_sample_update.csv")

In [None]:
data = raw_data.copy()

In [None]:
# for 2005 to 2025, read pickle file get gvkeys into a dictionary and delete the dataframe to free up memory
gvkeys_dict = {}
for year in range(2005, 2026):
    text_data = pd.read_pickle(f"C:\\Users\\alexi\\OneDrive\\Documents\\école\\McGill-FIAM\\2025\\Hackathon-Final-2025\\DATA ASSET MANAGEMENT HACKATHON 2025 FINALS\\TEXT DATA US by YEAR\\{year}\\text_us_{year}.pkl")
    gvkeys_dict[year] = text_data['gvkey'].unique().tolist()
    del text_data

In [None]:
# For every gvkey in the dictionary, filter the data dataframe to only include rows with those gvkeys for the corresponding year
filtered_data_list = []
for year in range(2005, 2026):
    # data from that year contains only gvkeys in gvkeys_dict[year]
    filtered_data = data[(data['year'] == year) & (data['gvkey'].isin(gvkeys_dict[year]))]
    filtered_data_list.append(filtered_data)

# concatenate all filtered dataframes
filtered_data = pd.concat(filtered_data_list, ignore_index=True)
del filtered_data_list

In [None]:
# Keep columns: date, excntry, stock_ret, year, month, char_date, market_equity, be_me, ni_me, at_gr1, tangibility, at_be, debt_me, div12m_me, eqpo_me, eqnetis_at, debt_iss, ni_be, profit_sale, gp_at, turnover_126d
filtered_data = filtered_data[['date', 'gvkey', 'excntry', 'stock_ret', 'year', 'month', 'char_date', 'market_equity', 'be_me', 'ni_me', 'at_gr1', 'tangibility', 'at_be', 'debt_me', 'div12m_me', 'eqpo_me', 'eqnetis_at', 'dbnetis_at', 'ni_be', 'ebit_sale', 'gp_at', 'turnover_126d']]

In [None]:
joining_table = pd.read_csv("C:/Users/alexi/OneDrive/Documents/école/McGill-FIAM/2025/Hackathon-Final-2025/DATA ASSET MANAGEMENT HACKATHON 2025 FINALS/MAIN DATA and SUPPORTING CODES/North America Company Name Merge by DataDate-GVKEY-IID.csv")
# rename datadate to date
joining_table = joining_table.rename(columns={"datadate": "date"})

In [None]:
# === Ajout de 'tic' et 'conm' de façon robuste (week-ends / jours fériés) ===
# Problème: 'char_date' peut tomber sur un week-end / jour férié (ex: 2021-02-28, 2021-05-31),
# alors que joining_table est souvent indexée sur le dernier jour *ouvrable* du mois.
# Un merge exact sur la date peut donc échouer et créer des mois manquants.
#
# Solution: pour chaque (gvkey, char_date), prendre la dernière ligne de joining_table
# dont joining_table.date <= char_date (as-of join), en utilisant np.searchsorted (robuste, rapide),
# au lieu de pd.merge_asof (qui impose une contrainte de tri très stricte).

import numpy as np
import pandas as pd

fd = filtered_data.copy()
fd["_row_id"] = np.arange(len(fd))

# Keys côté filtered_data
fd["gvkey_key"] = pd.to_numeric(fd["gvkey"], errors="coerce")
fd["char_date_key"] = pd.to_datetime(fd["char_date"].astype(str), format="%Y%m%d", errors="coerce")

# Keys côté joining_table
jt = joining_table[["gvkey", "date", "tic", "conm"]].copy()
jt["gvkey_key"] = pd.to_numeric(jt["gvkey"], errors="coerce")
jt["jt_date_key"] = pd.to_datetime(jt["date"], errors="coerce")

jt = jt.dropna(subset=["gvkey_key", "jt_date_key"]).copy()
jt = (jt.sort_values(["gvkey_key", "jt_date_key"])
        .drop_duplicates(["gvkey_key", "jt_date_key"], keep="last")
        .reset_index(drop=True))

# Grouper les lignes de joining_table par gvkey pour lookup rapide
jt_groups = {k: g.reset_index(drop=True) for k, g in jt.groupby("gvkey_key", sort=False)}

def _attach_tic_conm(g: pd.DataFrame) -> pd.DataFrame:
    t = jt_groups.get(g.name)
    if t is None or t.empty:
        g["tic"] = pd.NA
        g["conm"] = pd.NA
        return g

    rdates = t["jt_date_key"].to_numpy()
    ldates = g["char_date_key"].to_numpy()

    pos = np.searchsorted(rdates, ldates, side="right") - 1

    tic = np.full(len(g), pd.NA, dtype=object)
    conm = np.full(len(g), pd.NA, dtype=object)

    ok = pos >= 0
    if ok.any():
        sel = pos[ok]
        tic[ok] = t.loc[sel, "tic"].to_numpy()
        conm[ok] = t.loc[sel, "conm"].to_numpy()

    g["tic"] = tic
    g["conm"] = conm
    return g

invalid_mask = fd["gvkey_key"].isna() | fd["char_date_key"].isna()
fd_invalid = fd.loc[invalid_mask].copy()
fd_valid   = fd.loc[~invalid_mask].copy()

# Tri à l'intérieur de chaque gvkey
fd_valid = fd_valid.sort_values(["gvkey_key", "char_date_key"])
fd_valid = (fd_valid.groupby("gvkey_key", group_keys=False, sort=False)
                    .apply(_attach_tic_conm))

# Recombiner et restaurer l'ordre original
out = pd.concat([fd_valid, fd_invalid], ignore_index=True)
out = out.sort_values("_row_id").drop(columns=["_row_id"], errors="ignore")

# Nettoyage des clés auxiliaires si tu ne veux pas les garder:
# out = out.drop(columns=["gvkey_key", "char_date_key"], errors="ignore")

filtered_data = out

print("[OK] tic/conm ajoutés via as-of join (searchsorted). Couverture tic:", filtered_data["tic"].notna().mean())


In [None]:
sector_mapping = pd.read_csv("C:/Users/alexi/OneDrive/Documents/école/McGill-FIAM/2025/Hackathon-Final-2025/DATA ASSET MANAGEMENT HACKATHON 2025 FINALS/MAIN DATA and SUPPORTING CODES/Sector Info SIC and GIC codes All Countries to merge by GVKEY and Date.csv")

In [None]:
filtered_data['gvkey_key'] = pd.to_numeric(filtered_data['gvkey'], errors='coerce').astype('Int64')
filtered_data['date_key']  = pd.to_datetime(filtered_data['date'].astype(str), format='%Y%m%d', errors='coerce')

# --- Keys sector_mapping (date = YYYYMMDD en int) ---
sm = sector_mapping[['gvkey','date','gics','sic','naics']].copy()
sm['gvkey_key'] = pd.to_numeric(sm['gvkey'], errors='coerce').astype('Int64')
sm['date_key']  = pd.to_datetime(sm['date'].astype(str), format='%Y%m%d', errors='coerce')

# (optionnel) éviter les duplications si plusieurs lignes par gvkey-date
sm = sm.drop_duplicates(subset=['gvkey_key','date_key'])

# --- Merge ---
filtered_data = filtered_data.merge(
    sm[['gvkey_key','date_key','gics','sic','naics']],
    on=['gvkey_key','date_key'],
    how='left'
).drop(columns=['gvkey_key','date_key'])

In [None]:
# 1) Extraire le code secteur GICS (2 premiers chiffres) à partir du code GICS 8 chiffres
# ex: 20101010 -> 20
filtered_data['gics_sector_code'] = (
    pd.to_numeric(filtered_data['gics'], errors='coerce')
      .floordiv(10**6)
      .astype('Int64')
)

# 2) Mapping MSCI / GICS (11 secteurs)
gics_sector_map = {
    10: "Energy",
    15: "Materials",
    20: "Industrials",
    25: "Consumer Discretionary",
    30: "Consumer Staples",
    35: "Health Care",
    40: "Financials",
    45: "Information Technology",
    50: "Communication Services",
    55: "Utilities",
    60: "Real Estate",
}

# 3) Ajouter le nom du secteur
filtered_data['gics_sector_name'] = filtered_data['gics_sector_code'].map(gics_sector_map)
filtered_data

In [None]:
from pathlib import Path
import pandas as pd
import ast
import re

# Répertoire (relatif au notebook)
SP500_DIR = Path("sp500-master/sp500_constituants_2005_2024")

def norm_tic(s):
    if pd.isna(s):
        return pd.NA
    s = str(s).strip().upper()
    # optionnel: harmoniser BRK.B vs BRK-B, BF.B vs BF-B, etc.
    s = s.replace("-", ".")
    return s

# 1) Charger tous les fichiers annuels et construire (year, tic_norm)
members = []
for fp in sorted(SP500_DIR.glob("*-sp500-ticker-list.csv")):
    # année depuis le nom du fichier (ex: 2006-sp500-ticker-list.csv)
    year = int(fp.name.split("-")[0])

    df = pd.read_csv(fp)
    # tickers est une string du type "['A', 'AAPL', ...]"
    tickers = ast.literal_eval(df.loc[0, "tickers"]) if isinstance(df.loc[0, "tickers"], str) else df.loc[0, "tickers"]
    tickers = [norm_tic(t) for t in tickers]

    members.append(pd.DataFrame({"year": year, "tic_norm": tickers}))

sp500_members = pd.concat(members, ignore_index=True).dropna().drop_duplicates()

# 2) Filtrer filtered_data par année (S&P500 de l'année correspondante)
fd = filtered_data.copy()
fd["tic_norm"] = fd["tic"].map(norm_tic)

filtered_data = (
    fd.merge(sp500_members, on=["year", "tic_norm"], how="inner")
      .drop(columns=["tic_norm"])
)


In [None]:
# Forward fill values par gvkey, trié par date pour les colonnes: stock_ret, 'market_equity', 'be_me', 'ni_me', 'at_gr1', 'tangibility', 'at_be', 'debt_me', 'div12m_me', 'eqpo_me', 'eqnetis_at', 'dbnetis_at', 'ni_be', 'ebit_sale', 'gp_at', 'turnover_126d', 'gics', 'sic', 'naics', 'gics_sector_code', 'gics_sector_name'
filtered_data = filtered_data.sort_values(by=['gvkey', 'date'])
cols_to_ffill = ['stock_ret', 'market_equity', 'be_me', 'ni_me', 'at_gr1', 'tangibility', 'at_be', 'debt_me', 'div12m_me', 'eqpo_me', 'eqnetis_at', 'dbnetis_at', 'ni_be', 'ebit_sale', 'gp_at', 'turnover_126d', 'gics', 'sic', 'naics', 'gics_sector_code', 'gics_sector_name']
filtered_data[cols_to_ffill] = (filtered_data.groupby('gvkey')[cols_to_ffill].ffill())

In [None]:
# Réduction de la taille des données
# Garder données entre 2021 et 2023 inclus
# Éliminer la moitié des tickers

filtered_data = filtered_data[(filtered_data['year'] >= 2021) & (filtered_data['year'] <= 2023)]
filtered_data = filtered_data.sort_values(by=['gvkey'])
# unique_gvkeys = filtered_data['gvkey'].unique()
# reduced_gvkeys = unique_gvkeys[::2]  # garder un gvkey sur deux
# filtered_data = filtered_data[filtered_data['gvkey'].isin(reduced_gvkeys)]

In [None]:
# Afficher le nombre de tickers unique par année
for year in range(2005, 2026):
    n_unique_tickers = filtered_data[filtered_data['year'] == year]['gvkey'].nunique()
    print(f"Year {year}: {n_unique_tickers} unique tickers")

In [None]:
# === Add latest available summarized filing JSON (summary_json) as-of each monthly observation ===
# This reads the summarized pickles created by summarize_text_reports_v3.py (e.g., C:\TEXT DATA US SUMMARIZED\2021\text_us_2021.pkl)
# and attaches, for each (gvkey, date) in filtered_data, the most recent filing summary_json with report_date <= date.

import os
import sys
import numpy as np
import pandas as pd

# --- Pickle compatibility (some pickles reference numpy._core) ---
try:
    import numpy._core as _ncore
    sys.modules.setdefault("numpy._core", _ncore)
    try:
        sys.modules.setdefault("numpy._core._multiarray_umath", _ncore._multiarray_umath)
    except Exception:
        pass
except Exception:
    import numpy as _np
    sys.modules.setdefault("numpy._core", _np.core)
    try:
        sys.modules.setdefault("numpy._core._multiarray_umath", _np.core._multiarray_umath)
    except Exception:
        pass

TEXT_SUM_ROOT = r"C:\TEXT DATA US SUMMARIZED"

def _parse_yyyymmdd(x):
    """Parse YYYYMMDD-like values (int/str) to pandas.Timestamp."""
    if pd.isna(x):
        return pd.NaT
    s = str(x)
    s_digits = "".join(ch for ch in s if ch.isdigit())
    if len(s_digits) == 8:
        return pd.to_datetime(s_digits, format="%Y%m%d", errors="coerce")
    return pd.to_datetime(s, errors="coerce")

# Decide which years to load based on filtered_data
min_y = int(pd.to_numeric(filtered_data["year"], errors="coerce").min())
max_y = int(pd.to_numeric(filtered_data["year"], errors="coerce").max())

parts = []
available_years = []
for y in range(max(2005, min_y - 1), min(2025, max_y) + 1):
    p = os.path.join(TEXT_SUM_ROOT, str(y), f"text_us_{y}.pkl")
    if os.path.exists(p):
        df_y = pd.read_pickle(p)
        if "summary_json" not in df_y.columns:
            continue
        available_years.append(y)
        tmp = df_y[["gvkey", "date", "summary_json"]].copy()
        tmp["gvkey"] = pd.to_numeric(tmp["gvkey"], errors="coerce")
        tmp["report_date"] = tmp["date"].apply(_parse_yyyymmdd)
        tmp = tmp.dropna(subset=["gvkey", "report_date"]).copy()
        parts.append(tmp[["gvkey", "report_date", "summary_json"]])

print("Summarized years found:", available_years)

if not parts:
    print("[WARN] No summarized pickles with summary_json found under:", TEXT_SUM_ROOT)
    if "summary_json" not in filtered_data.columns:
        filtered_data["summary_json"] = pd.NA
else:
    filings = pd.concat(parts, ignore_index=True)
    filings = filings.sort_values(["gvkey", "report_date"]).drop_duplicates(subset=["gvkey", "report_date"], keep="last")

    # Build per-gvkey arrays for fast as-of matching
    filing_groups = {}
    for gv, g in filings.groupby("gvkey", sort=False):
        g = g.sort_values("report_date")
        filing_groups[gv] = (
            g["report_date"].to_numpy(dtype="datetime64[ns]"),
            g["summary_json"].to_numpy(dtype=object),
        )

    # Prepare filtered_data keys
    fd = filtered_data.copy()
    fd["gvkey_key"] = pd.to_numeric(fd["gvkey"], errors="coerce")
    fd["date_dt"] = fd["date"].apply(_parse_yyyymmdd)
    fd["__row_id"] = np.arange(len(fd))

    valid = fd.dropna(subset=["gvkey_key", "date_dt"])[["__row_id", "gvkey_key", "date_dt"]].copy()

    out = pd.Series(pd.NA, index=fd["__row_id"], dtype=object)

    for gv, g in valid.groupby("gvkey_key", sort=False):
        bundle = filing_groups.get(gv)
        if bundle is None:
            continue
        rdates, rjson = bundle
        ldates = g["date_dt"].to_numpy(dtype="datetime64[ns]")
        pos = np.searchsorted(rdates, ldates, side="right") - 1
        ok = pos >= 0
        if np.any(ok):
            rid = g.loc[ok, "__row_id"].to_numpy(dtype=int)
            out.loc[rid] = rjson[pos[ok]]

    fd["summary_json"] = fd["__row_id"].map(out)
    fd = fd.drop(columns=["gvkey_key", "date_dt", "__row_id"], errors="ignore")

    filtered_data = fd
    print("Added column: summary_json")
    print("Coverage:", pd.Series(filtered_data["summary_json"]).notna().mean())


In [None]:
filtered_data.to_csv("yfinance/filtered_sp500_data.csv", index=False)

In [5]:
filtered_data = pd.read_csv("yfinance/filtered_sp500_data.csv")
filtered_data.head()

Unnamed: 0,date,gvkey,excntry,stock_ret,year,month,char_date,market_equity,be_me,ni_me,...,turnover_126d,char_date_key,tic,conm,gics,sic,naics,gics_sector_code,gics_sector_name,summary_json
0,20210129,1045.0,USA,0.088776,2021,1,20201231,9800.739885,0.031038,-0.353953,...,0.141481,2020-12-31,AAL,AMERICAN AIRLINES GROUP INC,20302010.0,4512.0,481111.0,20,Industrials,
1,20220930,1045.0,USA,-0.073133,2022,9,20220831,8441.499391,0.031038,-0.281704,...,0.05721,2022-08-31,AAL,AMERICAN AIRLINES GROUP INC,20302010.0,4512.0,481111.0,20,Industrials,"{""summary"": ""The SEC filing outlines various r..."
2,20221031,1045.0,USA,0.177741,2022,10,20220930,7824.350495,0.031038,-0.303923,...,0.05179,2022-09-30,AAL,AMERICAN AIRLINES GROUP INC,20302010.0,4512.0,481111.0,20,Industrials,"{""summary"": ""The SEC filing outlines various r..."
3,20221130,1045.0,USA,0.01763,2022,11,20221031,9215.596378,0.031038,-0.208451,...,0.051799,2022-10-31,AAL,AMERICAN AIRLINES GROUP INC,20302010.0,4512.0,481111.0,20,Industrials,"{""summary"": ""The SEC filing outlines various r..."
4,20221230,1045.0,USA,-0.118503,2022,12,20221130,9378.071628,0.031038,-0.20484,...,0.049972,2022-11-30,AAL,AMERICAN AIRLINES GROUP INC,20302010.0,4512.0,481111.0,20,Industrials,"{""summary"": ""The SEC filing outlines various r..."


In [None]:
filtered_data['date'].dtypes

In [None]:
filtered_data.columns