In [None]:
# 4_industry_comparison.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from utils.helpers import evaluate_basic_metrics, expected_loss, find_best_threshold, plot_confusion, plot_roc_curves

# Read the cleaned firm table, discard rows with no target label, and store
# the target as an integer column.
data = (
    pd.read_csv("data/bisnode_firms_clean.csv")
    .dropna(subset=["fast_growth"])
    .astype({"fast_growth": int})
)

# Predictor columns, grouped by how evaluate_industry() treats them:
# numerical and binary columns are used as-is, categorical columns are
# one-hot encoded.
numerical_features = [
    "sales_mil_log",
    "sales_mil_log_sq",
    "age",
    "age2",
    "growth_1y",
    "growth_1y_sq",
    "ceo_age",
    "foreign",
]
categorical_features = [
    "ind2_cat",
    "urban_m",
    "gender_m",
    "m_region_loc",
]
binary_features = [
    "new",
    "ceo_young",
    "foreign_management",
]

# Estimator class fitted for every industry; change this to whichever class
# performed best in the classification results.
best_model_class = GradientBoostingClassifier

# Evaluate per industry
def evaluate_industry(industry_name):
    """Train and evaluate the selected classifier on one industry group.

    Filters the module-level ``data`` frame to rows whose ``industry_group``
    equals *industry_name*, one-hot encodes the categorical features, holds
    out a stratified 20% test split, fits the model, and scores the test
    split at the threshold returned by ``find_best_threshold``. A confusion
    matrix for the test split is plotted as a side effect.

    Args:
        industry_name: Value of the ``industry_group`` column to select.

    Returns:
        The mapping returned by ``evaluate_basic_metrics``, extended with the
        keys ``threshold``, ``expected_loss``, ``y_true``, ``y_pred`` and
        ``y_proba``.

    Raises:
        ValueError: If no rows match *industry_name*.
    """
    subset = data[data["industry_group"] == industry_name]
    if subset.empty:
        # Fail early with a readable message instead of an opaque error from
        # train_test_split on an empty frame.
        raise ValueError(f"No rows found for industry group {industry_name!r}")

    feature_columns = numerical_features + categorical_features + binary_features
    # get_dummies returns a new frame, so the filtered slice is never mutated.
    X = pd.get_dummies(
        subset[feature_columns], columns=categorical_features, drop_first=True
    )
    y = subset["fast_growth"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Impute missing values with statistics from the training split only, so
    # that no information from the held-out rows leaks into model fitting.
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    if best_model_class is LogisticRegression:
        # Logistic regression is fitted on standardized features.
        model = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(random_state=42, max_iter=1000))
        ])
    else:
        model = best_model_class(random_state=42)

    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]

    # NOTE(review): the threshold and the reported "expected_loss" come from
    # the training split; find_best_threshold is not visible here -- confirm
    # that an in-sample loss is what the summary is meant to report.
    best_thresh, min_loss = find_best_threshold(model, X_train, y_train)
    y_pred = (y_proba >= best_thresh).astype(int)

    metrics = evaluate_basic_metrics(y_test, y_pred, y_proba)
    metrics.update({
        "threshold": best_thresh,
        "expected_loss": min_loss,
        "y_true": y_test,
        "y_pred": y_pred,
        "y_proba": y_proba
    })

    plot_confusion(y_test, y_pred, ["Normal Growth", "Fast Growth"], f"Confusion Matrix - {industry_name}")
    return metrics

# Fit and score one model per industry group, announcing each run.
_industry_results = {}
for _industry in ("Manufacturing", "Services"):
    print(f"\nEvaluating {_industry}...")
    _industry_results[_industry] = evaluate_industry(_industry)

mfg_results = _industry_results["Manufacturing"]
srv_results = _industry_results["Services"]

# Overlay the ROC curves of both industries.
plot_roc_curves({
    "Manufacturing": mfg_results,
    "Services": srv_results,
})

# Result keys shown with four decimals, in the order of the "Metric" column.
_SCORE_KEYS = ("roc_auc", "accuracy", "precision", "recall", "f1", "threshold")


def _summary_column(results):
    """Format one industry's results as the string cells of the summary table."""
    cells = [f"{results[key]:.4f}" for key in _SCORE_KEYS]
    cells.append(f"${results['expected_loss']}")
    return cells


# Side-by-side table of the headline metrics.
summary = pd.DataFrame({
    "Metric": ["AUC", "Accuracy", "Precision", "Recall", "F1 Score", "Threshold", "Expected Loss"],
    "Manufacturing": _summary_column(mfg_results),
    "Services": _summary_column(srv_results),
})

print("\nIndustry Comparison:")
print(summary.to_string(index=False))