In [1]:
# === target_engineering.py ===
"""Label the raw table with forward-return targets and fundamental ratios.

Pipeline: load ``df_raw.parquet`` -> make sure a ``ticker`` column exists ->
convert text columns that hold numbers -> sort by ticker/date -> add binary
forward-return targets -> add pairwise compound targets -> add fundamental
ratios -> drop rows with too few populated features -> save CSV + Parquet.

The transformation steps are plain functions so they can be imported and
exercised on small frames; file I/O only happens in ``main()``.
"""

import itertools

import numpy as np
import pandas as pd

# Folder that holds the raw input and receives the labelled outputs.
DATA_DIR = "C:/Users/flass/OneDrive/AI Financial Model/S&P 500 Chatgpt Version"

# Added to denominators so an exactly-zero divisor does not divide by zero.
EPSILON = 1e-6

# Label suffix -> (look-ahead in rows, minimum forward return).
# NOTE(review): the row counts look like trading-day horizons (10 ~ 2 weeks,
# 21 ~ 1 month, 252 ~ 1 year); that only holds if the data has one row per
# trading day per ticker -- confirm against the raw file.
thresholds = {
    "2w_5": (10, 0.05),
    "2w_7": (10, 0.07),
    "1m_5": (21, 0.05),
    "1m_7": (21, 0.07),
    "1m_10": (21, 0.10),
    "3m_10": (63, 0.10),
    "3m_15": (63, 0.15),
    "3m_20": (63, 0.20),
    "6m_15": (126, 0.15),
    "6m_20": (126, 0.20),
    "6m_25": (126, 0.25),
    "1y_30": (252, 0.30),
}

# Feature columns counted when deciding whether a row is usable.
required_cols = [
    "pe_ttm", "ev_ebitda", "sales_growth_1y",
    "sales_growth_3y", "eps_growth_3y", "adjusted_close"
]


def ensure_ticker_column(frame):
    """Guarantee that ``frame`` has a ``ticker`` column.

    If no column is literally named ``ticker``, the first column whose name
    contains "ticker" (case-insensitive) is copied into a new ``ticker``
    column. ``frame`` is modified in place and returned.

    Raises:
        ValueError: if no column name contains "ticker".
    """
    if "ticker" in frame.columns:
        return frame
    for col in frame.columns:
        # str() so non-string column labels do not break the lookup.
        if "ticker" in str(col).lower():
            frame["ticker"] = frame[col]
            return frame
    raise ValueError("❌ 'ticker' column not found.")


def coerce_numeric_columns(frame, skip=("ticker", "date")):
    """Return a copy of ``frame`` with numeric-looking text columns converted.

    Only object/string columns are touched. ``pd.to_numeric`` applied to a
    datetime64 column returns integer timestamps, so blindly applying it to
    every column would silently destroy a real ``date`` column; key columns
    listed in ``skip`` are therefore never converted.

    A column is converted only if *all* of its values parse; otherwise it is
    left exactly as it was (the semantics of the deprecated
    ``errors="ignore"`` option).
    """
    out = frame.copy()
    for col in out.columns:
        if col in skip:
            continue
        dtype = out[col].dtype
        is_text = (pd.api.types.is_object_dtype(dtype)
                   or pd.api.types.is_string_dtype(dtype))
        if not is_text:
            continue
        try:
            out[col] = pd.to_numeric(out[col])
        except (ValueError, TypeError):
            # Genuinely non-numeric column (names, date strings, ...): keep.
            continue
    return out


def add_forward_return_targets(frame, horizons, price_col="adjusted_close"):
    """Add one ``target_<name>`` column per entry of ``horizons``.

    For each ``name -> (periods, thresh)`` the forward return is
    ``(price[t + periods] - price[t]) / price[t]`` within the same ticker,
    and the target is 1.0 when that return is >= ``thresh``, else 0.0.

    Rows whose forward return cannot be computed (fewer than ``periods``
    later rows for that ticker, or a missing price) get NaN instead of 0,
    so unknown outcomes are not mistaken for negative examples.

    ``frame`` must already be ordered by date within each ticker. It is
    modified in place and returned.
    """
    prices = frame[price_col]
    # Group once; shift() keeps the original index so results align by row.
    prices_by_ticker = frame.groupby("ticker")[price_col]
    for name, (periods, thresh) in horizons.items():
        future_price = prices_by_ticker.shift(-periods)
        forward_return = (future_price - prices) / prices
        hit = (forward_return >= thresh).astype(float)
        frame[f"target_{name}"] = hit.where(forward_return.notna())
    return frame


def add_compound_targets(frame, target_cols):
    """Append ``<t1>_and_<t2>`` columns for every unordered pair of targets.

    The combination is a three-valued AND: 0.0 if either target is 0,
    1.0 if both are 1, and NaN when the outcome is still undetermined
    (a missing leg whose partner is not 0). Returns a new frame.
    """
    combos = {}
    for first, second in itertools.combinations(target_cols, 2):
        left, right = frame[first], frame[second]
        combos[f"{first}_and_{second}"] = pd.Series(
            np.select(
                [(left == 0) | (right == 0), (left == 1) & (right == 1)],
                [0.0, 1.0],
                default=np.nan,
            ),
            index=frame.index,
        )
    if not combos:
        return frame
    # Single concat instead of many column inserts.
    return pd.concat([frame, pd.DataFrame(combos)], axis=1)


def add_fundamental_ratios(frame, periods_1y=252, periods_3y=756):
    """Add EPS, P/E, EV/EBITDA and growth columns to ``frame``.

    Growth columns are per-ticker percentage changes over ``periods_1y`` /
    ``periods_3y`` rows. ``frame`` is modified in place and returned.
    """
    shares = frame["bal_commonStockSharesOutstanding"]
    price = frame["adjusted_close"]

    frame["eps_ttm"] = frame["inc_netIncome"] / (shares + EPSILON)
    frame["pe_ttm"] = price / (frame["eps_ttm"] + EPSILON)

    market_cap = price * shares
    enterprise_value = (
        market_cap
        + frame["bal_longTermDebt"]
        + frame["bal_shortTermDebt"]
        - frame["bal_cashAndShortTermInvestments"]
    )
    frame["ev_ebitda"] = enterprise_value / (frame["inc_ebitda"] + EPSILON)

    by_ticker = frame.groupby("ticker")
    frame["sales_growth_1y"] = by_ticker["inc_totalRevenue"].pct_change(periods=periods_1y)
    frame["sales_growth_3y"] = by_ticker["inc_totalRevenue"].pct_change(periods=periods_3y)
    # NOTE(review): named "eps" growth but computed from net income, not from
    # eps_ttm -- kept as in the original; confirm which one is intended.
    frame["eps_growth_3y"] = by_ticker["inc_netIncome"].pct_change(periods=periods_3y)
    return frame


def drop_sparse_rows(frame, required, min_valid=4):
    """Keep rows with at least ``min_valid`` non-null values in ``required``."""
    populated = frame[required].notna().sum(axis=1)
    return frame[populated >= min_valid]


def build_labeled_frame(df, horizons=None, required=None):
    """Run every in-memory step of the pipeline on ``df`` and return the result.

    ``df`` must contain ``ticker``, ``date``, ``adjusted_close`` and the
    ``inc_*`` / ``bal_*`` columns read by ``add_fundamental_ratios``.
    """
    horizons = thresholds if horizons is None else horizons
    required = required_cols if required is None else required

    # === Ensure correct types ===
    print("\U0001F522 Forcing numeric conversion...")
    df = coerce_numeric_columns(df)

    # === Sort for time-series operations ===
    df = df.sort_values(by=["ticker", "date"])

    # === Calculate forward returns ===
    print("\U0001F4C8 Calculating forward returns...")
    df = add_forward_return_targets(df, horizons)

    # === Compound Targets ===
    print("\U0001F501 Creating compound targets...")
    target_cols = [f"target_{k}" for k in horizons]
    df = add_compound_targets(df, target_cols)

    # === Fundamental Ratios ===
    print("\U0001F4CA Calculating fundamental ratios...")
    df = add_fundamental_ratios(df)

    # === Clean NaNs ===
    print("\U0001F9F9 Cleaning NaNs for model input...")
    return drop_sparse_rows(df, required)


def main():
    """Load the raw parquet, label it, and write CSV + Parquet outputs."""
    # === Load raw data ===
    print("\U0001F504 Loading df_raw.parquet...")
    df = pd.read_parquet(f"{DATA_DIR}/df_raw.parquet")

    # Ensure 'ticker' column is present
    df = ensure_ticker_column(df)
    print(f"✅ Unique tickers loaded: {df['ticker'].nunique()}")

    df = build_labeled_frame(df)
    print(f"✅ Final shape: {df.shape}")

    # === Save ===
    output_path = f"{DATA_DIR}/df_labeled.csv"
    df.to_csv(output_path, index=False)
    print(f"💾 Saved labeled dataset to: {output_path}")

    # === Also save to Parquet for pipeline compatibility ===
    parquet_path = f"{DATA_DIR}/df_labeled.parquet"
    df.to_parquet(parquet_path, index=False)
    print(f"💾 Also saved labeled dataset to: {parquet_path}")
    return df


if __name__ == "__main__":
    # Bound at module level so later notebook cells can keep using `df`.
    df = main()



🔄 Loading df_raw.parquet...


KeyboardInterrupt: 