In [9]:
import sys
import os

def find_project_root(start_dir, marker="src"):
    """Return the nearest ancestor of ``start_dir`` that contains ``marker``.

    The search starts at ``start_dir`` itself and walks upward one directory
    at a time until the filesystem root is reached.

    Parameters
    ----------
    start_dir : str
        Directory to start searching from.
    marker : str, default "src"
        Name of the sub-directory that identifies the project root.

    Returns
    -------
    str or None
        Absolute path of the first directory containing ``marker``, or
        ``None`` when no ancestor contains it.
    """
    current = os.path.abspath(start_dir)
    while True:
        if os.path.isdir(os.path.join(current, marker)):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # dirname() is a fixed point at the filesystem root
            return None
        current = parent


# Go from notebooks/modeling → notebooks → project root
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# The two-levels-up guess is only right when the kernel's working directory is
# notebooks/modeling. If it does not contain src/, search upward from the
# working directory instead; keep the original guess when nothing is found so
# the diagnostic prints below still show what was tried.
if not os.path.isdir(os.path.join(project_root, "src")):
    project_root = find_project_root(os.getcwd()) or project_root

# Add to sys.path if not already
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print("Using project root:", project_root)
print("src exists:", os.path.isdir(os.path.join(project_root, "src")))

# Project-local modules from the src package. These imports are placed after
# the sys.path setup above because project_root must be on sys.path for the
# src package to be importable.
from src.features.build_features import load_and_preprocess_data, prepare_model_data
from src.models.train_model import train_models, explain_model

print("Imports loaded successfully!")


Using project root: d:\KAIM\weak-3\insurance-risk-analytics
src exists: True
Imports loaded successfully!


In [12]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.features.build_features import load_and_preprocess_data, prepare_model_data
from src.models.train_model import train_models, explain_model
from src.data.load import load_data

import joblib
import shap

# Set up visualization
sns.set_theme(style="whitegrid")
%matplotlib inline

# 1. Load and preprocess data
print("Loading and preprocessing data...")
df = load_and_preprocess_data('../data/MachineLearningRating_v3.txt')

# 2. Prepare data for claim severity model (only rows with claims > 0)
print("\nPreparing claim severity model data...")
# Severity is modelled on rows with a positive claim amount only; .copy()
# detaches the subset from df before it is handed to prepare_model_data.
has_positive_claim = df['TotalClaims'] > 0
claims_df = df.loc[has_positive_claim].copy()
severity_split = prepare_model_data(claims_df, target_col='TotalClaims')
X_train_sev, X_test_sev, y_train_sev, y_test_sev, preprocessor_sev = severity_split

# 3. Train claim severity models
print("\nTraining claim severity models...")
# severity_results is indexed by model name further down in this cell
# (e.g. severity_results['XGBoost']['pipeline']).
severity_results = train_models(
    X_train_sev, X_test_sev, y_train_sev, y_test_sev, preprocessor_sev
)

# 4. Prepare data for claim probability model
print("\nPreparing claim probability model data...")
# Binary target: 1 for rows with a positive TotalClaims value, 0 otherwise.
df['HasClaim'] = df['TotalClaims'].gt(0).astype(int)
# NOTE(review): df still contains TotalClaims, from which HasClaim is derived.
# Confirm that prepare_model_data excludes it from the features, otherwise the
# classifier sees the target through that column.
probability_split = prepare_model_data(df, target_col='HasClaim')
X_train_prob, X_test_prob, y_train_prob, y_test_prob, preprocessor_prob = probability_split

# 5. Train claim probability models
print("\nTraining claim probability models...")
# Classifiers and metrics intended for the probability model; nothing in this
# cell uses them yet.
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# TODO: (Add classification model training code here)

# 6. Save models and results
print("\nSaving models and results...")
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (os is imported in the path-setup cell).
os.makedirs('../models', exist_ok=True)
joblib.dump(severity_results, '../models/severity_models.pkl')
# Save other models and results

# 7. Generate SHAP explanations
print("\nGenerating SHAP explanations...")
# NOTE(review): the model to explain is fixed to the 'XGBoost' entry rather
# than chosen from the metrics in severity_results — confirm this is intended.
best_severity_model = severity_results['XGBoost']['pipeline']
# Use a sample for faster computation. Cap the sample at the number of test
# rows: DataFrame.sample raises ValueError when n exceeds the population and
# replace is False. For test sets of 100+ rows the result is unchanged.
n_explain = min(100, len(X_test_sev))
X_explain = X_test_sev.sample(n=n_explain, random_state=42)
shap_values = explain_model(best_severity_model, X_explain, X_explain.columns)

# 8. Generate model comparison
print("\nGenerating model comparison...")
# One (name, rmse, r2) row per trained severity model, in the order the
# models appear in severity_results.
comparison_rows = [
    (model_name, metrics['rmse'], metrics['r2'])
    for model_name, metrics in severity_results.items()
]
results_df = pd.DataFrame(comparison_rows, columns=['Model', 'RMSE', 'R2'])
print("\nModel Comparison:")
print(results_df)

# 9. Feature importance analysis
print("\nTop 10 Features for XGBoost Model:")
xgb_feature_importance = severity_results['XGBoost']['feature_importance']
print(xgb_feature_importance.head(10))

print("\n✅ Model training and evaluation complete!")

Loading and preprocessing data...


ModuleNotFoundError: No module named 'src.data.load_data'