In [1]:
# === fundamental_features.py ===

import pandas as pd
import numpy as np
import os

# === CONFIG ===
BASE_DIR = r"C:\Users\flass\OneDrive\AI Financial Model\S&P 500 Chatgpt Version"
LABELED_DATA = os.path.join(BASE_DIR, "df_labeled.csv")
INDEX_PE_PATH = os.path.join(BASE_DIR, "sp500_pe_index_timeseries.csv")
STOCK_PE_PATH = os.path.join(BASE_DIR, "sp500_pe_timeseries.csv")
TICKER_INFO_PATH = os.path.join(BASE_DIR, "sp500_tickers.csv")
OUTPUT_PATH = os.path.join(BASE_DIR, "df_fundamental_features.csv")

# === Load Labeled Core Data ===
print("📂 Loading core data...")
df = pd.read_csv(LABELED_DATA, low_memory=False)
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["ticker"] = df["ticker"].str.strip().str.upper()

# === Load and Normalize Index PE ===
index_pe = pd.read_csv(INDEX_PE_PATH, parse_dates=["date"])
index_pe.columns = index_pe.columns.str.strip().str.lower()

# === Load and Normalize Stock PE Timeseries ===
stock_pe = pd.read_csv(STOCK_PE_PATH, parse_dates=["date"])
stock_pe.columns = stock_pe.columns.str.strip().str.lower()
stock_pe["ticker"] = stock_pe["ticker"].str.strip().str.upper()

# ✅ Confirm required column
if "pe_ttm" not in stock_pe.columns:
    raise KeyError("❌ 'pe_ttm' not found in stock_pe CSV. Please double-check column names.")

# === Merge PE data ===
print("🔁 Merging index and stock PE data...")
df = df.merge(index_pe[["date", "pe_index"]], on="date", how="left")

# Drop pre-existing pe_ttm if present to avoid merge conflicts
if "pe_ttm" in df.columns:
    df.drop(columns=["pe_ttm"], inplace=True)

df = df.merge(stock_pe[["ticker", "date", "pe_ttm"]], on=["ticker", "date"], how="left")

# Debug info
print("🧪 Columns after merge:", df.columns.tolist())
if "pe_ttm" not in df.columns:
    raise KeyError("❌ 'pe_ttm' missing after merge. Check for merge conflicts or suffixes.")
if df["pe_ttm"].isna().all():
    raise ValueError("❌ All 'pe_ttm' values are NaN after merge. Check ticker/date alignment.")

df["pe_ttm"] = pd.to_numeric(df["pe_ttm"], errors="coerce")

# === Relative Valuation Metrics ===
print("📈 Calculating PE-relative metrics...")
df["pe_rel_index"] = df["pe_ttm"] / (df["pe_index"] + 1e-6)
df["pe_self_3y_avg"] = df.groupby("ticker")["pe_ttm"].transform(lambda x: x.rolling(756, min_periods=200).mean())
df["pe_rel_self_3y_avg"] = df["pe_ttm"] / (df["pe_self_3y_avg"] + 1e-6)
df["pe_rel_index_rolling_avg"] = df.groupby("ticker")["pe_rel_index"].transform(lambda x: x.rolling(756, min_periods=200).mean())
df["pe_rel_rel_self_vs_index_3y"] = df["pe_rel_index"] / (df["pe_rel_index_rolling_avg"] + 1e-6)

# === Forward EPS Growth ===
print("🔮 Estimating forward EPS growth...")
df["eps_ttm"] = df["inc_netIncome"] / (df["bal_commonStockSharesOutstanding"] + 1e-6)
df["eps_fwd_1y"] = df.groupby("ticker")["eps_ttm"].shift(-252)
df["fwd_eps_growth"] = (df["eps_fwd_1y"] - df["eps_ttm"]) / (df["eps_ttm"] + 1e-6)

# === Financial Ratios ===
print("🧮 Computing final features...")
df["market_cap"] = df["adjusted_close"] * df["bal_commonStockSharesOutstanding"]
df["debt_to_equity"] = (df["bal_longTermDebt"] + df["bal_shortTermDebt"]) / (df["bal_totalStockholderEquity"] + 1e-6)
df["free_cash_flow_yield"] = df["cf_totalCashFromOperatingActivities"] / (df["market_cap"] + 1e-6)
df["return_on_equity"] = df["inc_netIncome"] / (df["bal_totalStockholderEquity"] + 1e-6)

# === Select Features ===
feature_cols = [
    "ticker", "date",
    "pe_ttm", "ev_ebitda",
    "sales_growth_1y", "sales_growth_3y", "eps_growth_3y", "fwd_eps_growth",
    "pe_rel_index", "pe_rel_self_3y_avg", "pe_rel_rel_self_vs_index_3y",
    "market_cap", "debt_to_equity", "free_cash_flow_yield", "return_on_equity"
]

# === Filter Rows with Sufficient Data ===
print("🔎 Filtering rows with >=10 valid features...")
df["valid_count"] = df[feature_cols[2:]].notna().sum(axis=1)
df_out = df[df["valid_count"] >= 10][feature_cols].copy()
print(f"✅ Final shape: {df_out.shape}")

# === Save Output ===
print(f"💾 Saving to {OUTPUT_PATH}")
df_out.to_csv(OUTPUT_PATH, index=False)
print("✅ Done.")


📂 Loading core data...
🔁 Merging index and stock PE data...
🧪 Columns after merge: ['date', 'open', 'high', 'low', 'close', 'adjusted_close', 'volume', 'ticker_x', 'ticker_y', 'inc_filing_date', 'inc_currency_symbol', 'inc_researchDevelopment', 'inc_effectOfAccountingCharges', 'inc_incomeBeforeTax', 'inc_minorityInterest', 'inc_netIncome', 'inc_sellingGeneralAdministrative', 'inc_sellingAndMarketingExpenses', 'inc_grossProfit', 'inc_reconciledDepreciation', 'inc_ebit', 'inc_ebitda', 'inc_depreciationAndAmortization', 'inc_nonOperatingIncomeNetOther', 'inc_operatingIncome', 'inc_otherOperatingExpenses', 'inc_interestExpense', 'inc_taxProvision', 'inc_interestIncome', 'inc_netInterestIncome', 'inc_extraordinaryItems', 'inc_nonRecurring', 'inc_otherItems', 'inc_incomeTaxExpense', 'inc_totalRevenue', 'inc_totalOperatingExpenses', 'inc_costOfRevenue', 'inc_totalOtherIncomeExpenseNet', 'inc_discontinuedOperations', 'inc_netIncomeFromContinuingOps', 'inc_netIncomeApplicableToCommonShares', 'i

In [1]:
import pandas as pd

file_path = "C:/Users/flass/OneDrive/AI Financial Model/S&P 500 Chatgpt Version/df_fundamental_features.csv"
df = pd.read_csv(file_path, parse_dates=["date"])

print(f"✅ Loaded df_fundamental_features.csv with shape: {df.shape}\n")

# Show column names
print("📋 Columns:")
for col in df.columns:
    print("•", col)

# Check for nulls
print("\n🔍 Missing values per column:")
print(df.isna().sum())

# Preview sample rows
print("\n🔎 Sample rows:")
print(df.head())


✅ Loaded df_fundamental_features.csv with shape: (2894127, 15)

📋 Columns:
• ticker
• date
• pe_ttm
• ev_ebitda
• sales_growth_1y
• sales_growth_3y
• eps_growth_3y
• fwd_eps_growth
• pe_rel_index
• pe_rel_self_3y_avg
• pe_rel_rel_self_vs_index_3y
• market_cap
• debt_to_equity
• free_cash_flow_yield
• return_on_equity

🔍 Missing values per column:
ticker                              0
date                                0
pe_ttm                              0
ev_ebitda                      636920
sales_growth_1y                     0
sales_growth_3y                 31591
eps_growth_3y                   31591
fwd_eps_growth                 125665
pe_rel_index                        0
pe_rel_self_3y_avg               2241
pe_rel_rel_self_vs_index_3y      2241
market_cap                          0
debt_to_equity                 631369
free_cash_flow_yield             2787
return_on_equity                 1059
dtype: int64

🔎 Sample rows:
  ticker       date    pe_ttm  ev_ebitda  sales_grow

In [2]:
import pandas as pd

# Load the labeled dataset
file_path = r"C:\Users\flass\OneDrive\AI Financial Model\S&P 500 Chatgpt Version\df_labeled.csv"
df = pd.read_csv(file_path, low_memory=False)

# Lowercase and strip all column names for consistency
df.columns = df.columns.str.strip()

# Check for relevant columns
relevant_cols = [
    "ev", "ebitda", 
    "bal_longTermDebt", "bal_shortTermDebt", "bal_totalStockholderEquity"
]

# Print which ones exist
print("✅ Available Columns:")
for col in relevant_cols:
    print(f"{col:30} {'✅' if col in df.columns else '❌ MISSING'}")

# Show null counts
print("\n🔍 Missing Values:")
existing = [col for col in relevant_cols if col in df.columns]
print(df[existing].isna().sum())


✅ Available Columns:
ev                             ❌ MISSING
ebitda                         ❌ MISSING
bal_longTermDebt               ✅
bal_shortTermDebt              ✅
bal_totalStockholderEquity     ✅

🔍 Missing Values:
bal_longTermDebt              501843
bal_shortTermDebt             320825
bal_totalStockholderEquity      1561
dtype: int64
