In [1]:
import os
from typing import List
import pandas as pd

# =========================
# Load Compustat data
# =========================
def load_compustat_data(path: str) -> pd.DataFrame:
    return pd.read_csv(path, dtype=str)


# =========================
# Load Macro data
# =========================
def load_macro_data(path: str) -> pd.DataFrame:
    return pd.read_csv(path, sep=";", dtype=str, na_values=["NA"])


# =========================
# Preprocess Compustat
# =========================
def preprocess_compustat(df: pd.DataFrame) -> pd.DataFrame:
    useful: List[str] = [
        "gvkey", "datadate", "fyear", "conm", "tic",
        "sic", "fyr",
        "act", "lct", "at", "lt",
        "seq", "teq", "ceq",
        "dlc", "dltt",
        "revt", "ebit", "xint", "oancf",
        "dlrsn",
    ]

    df = df[[c for c in useful if c in df.columns]]

    if "datadate" in df.columns:
        df["datadate"] = pd.to_datetime(df["datadate"], errors="coerce")

    for c in ["fyear", "fyr", "dlrsn"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")

    num_cols = [
        "act", "lct", "at", "lt", "seq", "teq", "ceq",
        "dlc", "dltt", "revt", "ebit", "xint", "oancf",
    ]
    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    return df


# =========================
# Compute 5 KPIs (clean format)
# =========================
def compute_kpis(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()

    total_debt = d["dltt"] + d["dlc"] if {"dltt", "dlc"}.issubset(d.columns) else None
    ebt = d["ebit"] - d["xint"] if {"ebit", "xint"}.issubset(d.columns) else None

    equity = None
    for eq in ["seq", "ceq", "teq"]:
        if eq in d.columns:
            equity = d[eq]
            break

    if ebt is not None and "at" in d.columns:
        d["roa"] = ebt / d["at"]

    if total_debt is not None and equity is not None:
        d["total_debt_to_equity"] = total_debt / equity

    if {"act", "lct"}.issubset(d.columns):
        d["current_ratio"] = d["act"] / d["lct"]

    if {"oancf", "revt"}.issubset(d.columns):
        d["cfo_margin"] = d["oancf"] / d["revt"]

    if {"revt", "at"}.issubset(d.columns):
        d["asset_turnover"] = d["revt"] / d["at"]

    return d


# =========================
# Main
# =========================
if __name__ == "__main__":
    base = "/files/financial-kpis-analysis-and-distress-prediction/"

    # Paths
    comp_in = base + "data/raw/compustat_data.csv"
    comp_out = base + "data/processed/compustat_kpis.csv"
    macro_in = base + "data/raw/macro_data.csv"
    macro_out = base + "data/processed/macro_data.csv"

    os.makedirs(os.path.dirname(comp_out), exist_ok=True)

    # ----- Compustat -----
    df_raw = load_compustat_data(comp_in)
    df_prep = preprocess_compustat(df_raw)
    df_kpis = compute_kpis(df_prep)
    df_kpis.to_csv(comp_out, index=False)
    print(f"Saved Compustat KPIs → {comp_out}")

    # ----- Macro -----
    df_macro = load_macro_data(macro_in)

    if "Name" in df_macro.columns:
        df_macro = df_macro[df_macro["Name"] != "2025"]

    missing_ratio = df_macro.isna().mean()
    cols_to_keep = missing_ratio[missing_ratio < 0.20].index.tolist()
    df_macro = df_macro[cols_to_keep]

    df_macro.to_csv(macro_out, index=False)
    print(f"Saved cleaned macro data → {macro_out}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["datadate"] = pd.to_datetime(df["datadate"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")
A value is tryin

Saved Compustat KPIs → /files/financial-kpis-analysis-and-distress-prediction/data/processed/compustat_kpis.csv
Saved cleaned macro data → /files/financial-kpis-analysis-and-distress-prediction/data/processed/macro_data.csv
