In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from src.modeling.data_prep import prepare_features
from src.modeling.train_models import train_models
from src.modeling.evaluate_models import evaluate_models
from src.modeling.interpret_model import explain_model

# Load the raw data set (pipe-delimited text file).
df = pd.read_csv("data/raw/MachineLearningRating_v3.txt", sep="|")

# Build the regression target: cast TotalClaims to float and keep only rows
# with a strictly positive claim amount (risk modeling only).
df["TotalClaims"] = df["TotalClaims"].astype(float)
# .copy() detaches the filtered rows from the original frame, so any column
# assignment done downstream (e.g. inside prepare_features) acts on an
# independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
df = df[df["TotalClaims"] > 0].copy()

# Prepare features and hold out 20% of the rows for evaluation.
# random_state is fixed so the split is reproducible between runs.
X, y, preprocessor = prepare_features(df, "TotalClaims")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train models
models = train_models(X_train, y_train, preprocessor)

# Evaluate every trained model on the held-out split and report its metrics.
results = evaluate_models(models, X_test, y_test)
print("📊 Evaluation Results:")
for model_name, metrics in results.items():
    print(f"{model_name} - RMSE: {metrics['RMSE']:.2f}, R²: {metrics['R2']:.3f}")

# Interpret the XGBoost model (optional SHAP) on a reproducible subsample.
# NOTE: the model is chosen by name, not selected from `results`.
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the test-set size.
n_explain = min(100, len(X_test))
explain_model(models["XGBoost"], X_test.sample(n_explain, random_state=42))


ModuleNotFoundError: No module named 'sklearn'