In [None]:
# model_training.ipynb — FINAL & BULLETPROOF
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import matplotlib.pyplot as plt
import os

# === LOAD FROM CSV ===
csv_path = '../../data/processed/merged_data.csv'
df = pd.read_csv(csv_path)

print(f"Loaded {len(df)} companies from merged_data.csv")
print("Columns:", list(df.columns))

# === ENSURE risk_flag EXISTS ===
if 'risk_flag' not in df.columns:
    print("ERROR: 'risk_flag' missing! Check data_cleaning.ipynb")
    print("Adding dummy risk_flag = 1 for all (FOR TESTING ONLY)")
    df['risk_flag'] = 1  # ← TEMP FIX (REMOVE LATER)

# === FEATURES & TARGET ===
features = [
    'Total ESG Risk score', 'Environment Risk Score',
    'Social Risk Score', 'Governance Risk Score',
    'debt_to_equity', 'roe'
]

# Ensure all features exist
missing_features = [f for f in features if f not in df.columns]
if missing_features:
    print(f"Missing features: {missing_features} → filling with 0")
    for f in missing_features:
        df[f] = 0

X = df[features].fillna(0)
y = df['risk_flag']

# === TRAIN MODEL ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# === EVALUATE ===
preds = model.predict(X_test)
acc = accuracy_score(y_test, preds)
print(f"Accuracy: {acc:.2f}")

# === SHAP (Optional) ===
try:
    import shap
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    
    # Plot
    plt.figure()
    shap.summary_plot(shap_values[1], X_test, show=False)
    plt.title("SHAP Feature Importance")
    plt.tight_layout()
    plt.show()

    # Save
    os.makedirs('../../python/scripts', exist_ok=True)
    joblib.dump(model, '../../python/scripts/rf_model.pkl')
    joblib.dump(explainer, '../../python/scripts/shap_explainer.pkl')
    print("Model + SHAP explainer saved!")
except Exception as e:
    print(f"SHAP failed (OK for demo): {e}")
    os.makedirs('../../python/scripts', exist_ok=True)
    joblib.dump(model, '../../python/scripts/rf_model.pkl')
    print("Model saved (no SHAP)")

print("Training complete! Ready for dashboard.")

shap not available at import time (will try again during SHAP step): Could not find/load shared object file
Loaded 379 companies from merged_data.csv
Columns: ['Symbol', 'Sector_esg', 'Total ESG Risk score', 'Environment Risk Score', 'Social Risk Score', 'Governance Risk Score', 'Controversy Level', 'Controversy Score', 'ESG Risk Percentile', 'ESG Risk Level', 'risk_flag', 'Sector_fin', 'Price', 'Price/Earnings', 'Dividend Yield', 'Earnings/Share', 'Market Cap', 'EBITDA', 'Price/Sales', 'Price/Book', 'debt_to_equity', 'roe', 'name']
Accuracy: 0.97
SHAP failed (OK for demo): Could not find/load shared object file
Model saved (no SHAP)
Training complete! Ready for dashboard.
SHAP failed (OK for demo): Could not find/load shared object file
Model saved (no SHAP)
Training complete! Ready for dashboard.


In [16]:


pip install pandas scikit-learn joblib matplotlib shap





[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
