In [33]:
#This notebook performs comprehensive ML diagnostics including:
#Feature importance analysis
#SHAP (SHapley Additive exPlanations) analysis
#Partial Dependence Plots (PDP)
#Model performance comparison


SyntaxError: invalid syntax (1910357536.py, line 1)

In [None]:
import warnings
import shap
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
from datetime import datetime


In [None]:

rom sklearn.experimental import enable_halving_search_cv  # for HalvingGridSearchCV
from sklearn.model_selection import (
    TimeSeriesSplit, RandomizedSearchCV, HalvingGridSearchCV
)
from sklearn.pipeline     import Pipeline
from sklearn.impute       import SimpleImputer
from sklearn.metrics      import r2_score
from sklearn.ensemble     import RandomForestRegressor    
from scipy.stats          import randint, loguniform
from sklearn.ensemble     import HistGradientBoostingRegressor
from joblib               import Memory

warnings.filterwarnings("ignore")
RANDOM_STATE = 0

In [None]:
# Global settings
RANDOM_STATE = 0
TIMESTAMP = "2025-05-21 05:11:46"
USER = "EricLu"


In [None]:
# %% Helpers & caching
mem = Memory("__cache__", verbose=0)

def build_pipeline(model):
    return Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("model",   model)
    ], memory=mem)

tscv = TimeSeriesSplit(n_splits=3)


In [None]:
# Configure warnings and visualization settings
warnings.filterwarnings('ignore')

# JupyterLab specific settings
%matplotlib inline

In [118]:
# Set visualization style using seaborn directly instead of matplotlib style
sns.set_style("whitegrid")  # This replaces plt.style.use('seaborn')
sns.set_palette("husl")

In [120]:

print(f"Analysis started by {USER} at {TIMESTAMP}")

Analysis started by EricLu at 2025-05-21 05:11:46


In [122]:
# DATA
df = pd.read_csv("clean_panel.csv")
df.sort_values(["fyear","gvkey"], inplace=True)

# Winsorize at 1st/99th pctile, log1p for positive series
for tgt in ["invest_cap_sum","Tobins_Q","DisagreementInvFCompDir","debt_to_asset"]:
    lo, hi = df[tgt].quantile([0.01,0.99])
    df[tgt] = df[tgt].clip(lo, hi)
    if (df[tgt] > 0).all():
        df[f"log_{tgt}"] = np.log1p(df[tgt])

# Split train/test (80/20)
train = df[df["fyear"] <= 2020]
test  = df[df["fyear"] >  2020]



In [124]:
# %%
features_invest = [
    "lag_Tobins_Q",         # Tobin's Q at t-1
    "lag_invest_cap_sum",   # invest_cap_sum at t-1
    "lag_cash_flow",        # cash_flow at t-1
    "lag_debt_to_asset",    # debt_to_asset at t-1
    "lag_equity_issuance",  # equity_issuance at t-1
    "lag_ROA",              # ROA at t-1
    "lag_sales_growth",     # sales_growth at t-1
    "baa_rate",         # baa_rate at t-1 (compute below)
    "lag_DisagreementInvFCompDir"
]

features_disagr = [
    "lag_Tobins_Q",
    "lag_invest_cap_sum",
    "lag_cash_flow",
    "lag_debt_to_asset",
    "lag_equity_issuance",
    "lag_ROA",
    "lag_sales_growth",
    "baa_rate"
]

features_q = [
    "lag_invest_cap_sum",
    "lag_cash_flow",
    "lag_debt_to_asset",
    "lag_equity_issuance",
    "lag_ROA",
    "lag_sales_growth",
    "baa_rate",
    "lag_DisagreementInvFCompDir"
]

# And if you want a stand-alone leverage model:
features_debt = [
    "lag_Tobins_Q",
    "lag_invest_cap_sum",
    "lag_cash_flow",
    "lag_ROA",
    "lag_sales_growth",
    "baa_rate",
    "lag_DisagreementInvFCompDir"
]

# Make sure you actually create the `lag_baa_rate` column:
df["lag_baa_rate"] = df.groupby("gvkey")["baa_rate"].shift(1)

# Now your target-sets dictionary (use {} not []):
target_sets = {
    "invest": (features_invest,   "invest_cap_sum"),
    "disagr": (features_disagr,   "DisagreementInvFCompDir"),
    "q":      (features_q,        "Tobins_Q"),
    "debt":   (features_debt,     "debt_to_asset")
}

In [126]:
# Helpers
tscv = TimeSeriesSplit(n_splits=5)

def build_pipeline(model):
    return Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("model",   model)
    ])

# Random Forest grid (kept small for notebook speed)
rf_grid = {
    "model__n_estimators":   [300, 600],
    "model__max_depth":      [None, 10],
    "model__max_features":   ["sqrt", 0.5],
    "model__min_samples_leaf":[1, 5]
}

# LightGBM expanded grid
lgb_grid = {
    "model__n_estimators":      [400, 800],
    "model__learning_rate":     [0.05, 0.01],
    "model__num_leaves":        [31, 63, 127],
    "model__min_child_samples": [5, 20],
    "model__min_gain_to_split": [0.0, 0.001],
    "model__subsample":         [0.8, 1.0],
    "model__colsample_bytree":  [0.8, 1.0]
}


In [128]:
def tune_lgb(pipe, param_grid, X, y):
    best_estimators = []
    for train_idx, val_idx in tscv.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Use sklearn’s GridSearch but with our own single-split CV
        gs = GridSearchCV(
            pipe, param_grid,
            cv=[(train_idx, val_idx)],
            scoring="r2",
            n_jobs=-1, refit=True, error_score="raise"
        )
        # Early-stopping callback—works for all LightGBM versions:
        early_cb = lgb.early_stopping(stopping_rounds=50, verbose=False)
        gs.fit(
            X, y,
            **{
                "model__eval_set":   [(X_val, y_val)],
                "model__callbacks": [early_cb]
            }
        )
        best_estimators.append((gs.best_estimator_, gs.best_score_))
    # pick the fold with highest validation R²
    best = max(best_estimators, key=lambda x: x[1])[0]
    print(f"  → LGBM best val R²: {max(e for _,e in best_estimators):.3f}")
    return best

def tune_rf(pipe, param_grid, X, y):
    gs = GridSearchCV(pipe, param_grid, cv=tscv,
                      scoring="r2", n_jobs=-1, refit=True,
                      error_score="raise")
    gs.fit(X, y)
    print(f"  → RF best CV R²: {gs.best_score_:.3f}")
    return gs.best_estimator_

In [130]:
# ## 6. Train & Evaluate in Notebook
results = {}

for name, (feat, tgt) in target_sets.items():
    print(f"\n### Target: {tgt}")
    X_tr, y_tr = train[feat], train[tgt]
    X_te, y_te = test[feat],  test[tgt]

    # RF
    rf_pipe = build_pipeline(RandomForestRegressor(
        random_state=RANDOM_STATE, n_jobs=-1
    ))
    best_rf = tune_rf(rf_pipe, rf_grid, X_tr, y_tr)
    rf_r2  = r2_score(y_te, best_rf.predict(X_te))

    # LGBM (silent & col-wise)
    lgb_pipe = build_pipeline(LGBMRegressor(
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbosity=-1,
        force_col_wise=True
    ))
    best_lgb = tune_lgb(lgb_pipe, lgb_grid, X_tr, y_tr)
    lgb_r2   = r2_score(y_te, best_lgb.predict(X_te))

    print(f"  ▶ Hold-out R²: RF = {rf_r2:.3f}  |  LGBM = {lgb_r2:.3f}")
    results[name] = (rf_r2, lgb_r2)


### Target: invest_cap_sum
  → RF best CV R²: 0.542


KeyboardInterrupt: 

In [44]:
# Diagnostics for 'invest'
    if key == "invest":
        # 1) RF Feature Importance
        rf_model = best_rf.named_steps['model']
        importances = rf_model.feature_importances_
        order = np.argsort(importances)[::-1]
        plt.figure(figsize=(6,4))
        plt.bar(range(len(feat)), importances[order])
        plt.xticks(range(len(feat)), [feat[i] for i in order], rotation=45, ha='right')
        plt.title('RF Feature Importance – invest_cap_sum')
        plt.tight_layout()
        plt.show()


IndentationError: unexpected indent (3953759278.py, line 2)

In [46]:
# 2) SHAP summary (LGBM)
        bg = X_tr.sample(min(len(X_tr), 1000), random_state=RANDOM_STATE)
        expl = shap.TreeExplainer(best_lgb.named_steps['model'], bg)
        shap_vals = expl.shap_values(X_tr)
        shap.summary_plot(shap_vals, X_tr, show=False)
        plt.tight_layout()
        plt.show()

IndentationError: unexpected indent (2609668813.py, line 2)

In [48]:
# 3) Partial Dependence (top 3 features)
        topN = 3
        top_feats = [feat[i] for i in order[:topN]]
        fig, axes = plt.subplots(1, topN, figsize=(4*topN, 3))
        PartialDependenceDisplay.from_estimator(best_rf, X_tr, top_feats, ax=axes)
        fig.suptitle('RF PDP – invest_cap_sum')
        plt.tight_layout()
        plt.show()

IndentationError: unexpected indent (2857601287.py, line 2)

In [50]:
 # 5. SUMMARY
print("\n=====  Test‑set R² summary  =====")
for k,v in results.items():
    print(f"{k:8s} | RF = {v[2]['rf_r2']:.3f} | LGBM = {v[2]['lgb_r2']:.3f}")


=====  Test‑set R² summary  =====
invest   | RF = 0.493 | LGBM = 0.500
disagr   | RF = 0.034 | LGBM = 0.021
q        | RF = 0.115 | LGBM = 0.123
