In [None]:
# Import libraries
import os
import sys
import warnings

sys.path.append('../scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

# Import custom modules
from data_loader import DataLoader
from preprocessing import DataPreprocessor
from ml_models import ClaimSeverityModel, PremiumOptimizationModel, build_claim_severity_model
from shap_analysis import SHAPAnalyzer, perform_shap_analysis

# Plot defaults: seaborn 'whitegrid' theme and a (12, 6) default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully")

## 1. Load and Prepare Data

In [None]:
# Read the pipe-separated source file through the project loader
loader = DataLoader()
df = loader.load_data(
    '../data/MachineLearningRating_v3.txt',
    sep='|',
)

# Sanity check: dimensions and column names of what was loaded
print(f"Data loaded: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

In [None]:
# Preprocess data
preprocessor = DataPreprocessor()

# Remember the incoming column names up front so the features added below can be
# reported from the data itself rather than from a hard-coded list (this also
# stays correct if a preprocessing step happens to modify `df` in place).
raw_columns = set(df.columns)

# Handle missing values
df_clean = preprocessor.handle_missing_values(df, strategy='median')

# Convert data types
df_clean = preprocessor.convert_data_types(df_clean)

# Create features
df_clean = preprocessor.create_features(df_clean)

# Columns present after preprocessing that were not in the raw frame
new_features = [col for col in df_clean.columns if col not in raw_columns]

print(f"Preprocessed data: {df_clean.shape}")
print(f"\nNew features created: {', '.join(map(str, new_features)) or 'none'}")

In [None]:
# Check target variable distribution
# Rows with a strictly positive claim amount; later cells model on this subset
df_with_claims = df_clean[df_clean['TotalClaims'] > 0]

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
raw_ax, log_ax = axes

# Left panel: TotalClaims over every row
raw_ax.hist(df_clean['TotalClaims'], bins=50, edgecolor='black')
raw_ax.set_title('Distribution of TotalClaims', fontsize=14, fontweight='bold')
raw_ax.set_xlabel('TotalClaims')
raw_ax.set_ylabel('Frequency')

# Right panel: log1p of TotalClaims for the positive-claim rows only
log_ax.hist(
    np.log1p(df_with_claims['TotalClaims']),
    bins=50,
    edgecolor='black',
    color='orange',
)
log_ax.set_title('Log-transformed TotalClaims (Claims > 0)', fontsize=14, fontweight='bold')
log_ax.set_xlabel('log(TotalClaims + 1)')
log_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Row counts with and without a positive claim
print(f"\nTotal policies: {len(df_clean):,}")
print(f"Policies with claims: {len(df_with_claims):,} ({len(df_with_claims)/len(df_clean)*100:.2f}%)")
print(f"Policies without claims: {len(df_clean) - len(df_with_claims):,}")

## 2. Build Claim Severity Models

We'll predict TotalClaims for policies that have claims (TotalClaims > 0).

In [None]:
# Candidate model inputs; a name is kept only if that column exists in the
# claims subset, so missing columns are skipped here instead of failing later.
numerical_features = [
    name
    for name in (
        'SumInsured', 'CalculatedPremiumPerTerm', 'Kilowatts',
        'Cubiccapacity', 'VehicleAge', 'RegistrationYear',
    )
    if name in df_with_claims.columns
]

categorical_features = [
    name
    for name in (
        'Province', 'VehicleType', 'Make', 'CoverType',
        'Gender', 'MaritalStatus',
    )
    if name in df_with_claims.columns
]

print(f"Numerical features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

In [None]:
# Claim severity workflow object; its helpers are used by the cells that follow
claim_model = ClaimSeverityModel()

# Build the feature matrix and the TotalClaims target from the claims subset
X, y = claim_model.prepare_features(
    df_with_claims,
    numerical_cols=numerical_features,
    categorical_cols=categorical_features,
    target_col='TotalClaims',
)

print(
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"\nTarget statistics:",
    y.describe(),
    sep='\n',
)

In [None]:
# Train/test split with test_size=0.2 and a fixed random_state for repeatability
X_train, X_test, y_train, y_test = claim_model.train_test_split_data(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(f"Training set: {X_train.shape}", f"Test set: {X_test.shape}", sep='\n')

In [None]:
# Scale features
# Passes both splits to the model's scaling helper and keeps the two scaled outputs.
# NOTE(review): presumably the scaler is fit on X_train only and then applied to
# X_test — confirm in ml_models.py to rule out test-set leakage.
X_train_scaled, X_test_scaled = claim_model.scale_features(X_train, X_test)

print("Features scaled successfully")

### 2.1 Train All Models

In [None]:
# Train all models
# `models` is used by later cells as a name -> estimator mapping: its keys are
# listed below and the best model is looked up by name before calling .predict().
models = claim_model.train_all_models(X_train_scaled, y_train)

print(f"\nTrained models: {list(models.keys())}")

### 2.2 Evaluate Models

In [None]:
# Score every trained model on the scaled test split
results = claim_model.evaluate_all_models(X_test_scaled, y_test)

# One row per model (after the transpose), ordered from lowest to highest RMSE
results_df = pd.DataFrame(results).T.sort_values('RMSE')

print("\nModel Performance Comparison:", results_df, sep='\n')

In [None]:
# Visualize model comparison
comparison_fig_path = '../reports/figures/model_comparison.png'
# plt.savefig does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(comparison_fig_path), exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One bar chart per metric, laid out row by row on the 2x2 grid
metrics = ['RMSE', 'MAE', 'R2', 'MSE']
for idx, metric in enumerate(metrics):
    ax = axes[idx // 2, idx % 2]
    results_df[metric].plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)

    # Add value labels. Bars for negative values (e.g. a negative R2) extend
    # downwards, so anchor their label below the bar end to avoid overlap.
    for i, v in enumerate(results_df[metric]):
        ax.text(i, v, f'{v:.2f}', ha='center', va='bottom' if v >= 0 else 'top')

plt.tight_layout()
plt.savefig(comparison_fig_path, dpi=300, bbox_inches='tight')
plt.show()

# results_df is sorted by RMSE ascending, so the first row is the best model
print(f"\nBest model by RMSE: {results_df.index[0]} (RMSE: {results_df.iloc[0]['RMSE']:.2f})")

### 2.3 Feature Importance Analysis

In [None]:
# Ten highest-ranked features according to the 'random_forest' model
feature_importance_rf = claim_model.get_feature_importance(
    'random_forest',
    top_n=10,
)

print("\nTop 10 Features (Random Forest):", feature_importance_rf, sep='\n')

In [None]:
# Visualize feature importance
rf_importance_fig_path = '../reports/figures/feature_importance_rf.png'
# plt.savefig does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(rf_importance_fig_path), exist_ok=True)

plt.figure(figsize=(12, 6))
plt.barh(feature_importance_rf['feature'], feature_importance_rf['importance'], color='teal', edgecolor='black')
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('Top 10 Feature Importance (Random Forest)', fontsize=14, fontweight='bold')
# Flip the y-axis so the first row of the table is drawn at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(rf_importance_fig_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Ten highest-ranked features according to the 'xgboost' model
feature_importance_xgb = claim_model.get_feature_importance(
    'xgboost',
    top_n=10,
)

print("\nTop 10 Features (XGBoost):", feature_importance_xgb, sep='\n')

### 2.4 Predictions vs Actual

In [None]:
# Select best model: results_df is sorted by RMSE ascending, so row 0 is the winner
best_model_name = results_df.index[0]
best_model = models[best_model_name]

# Make predictions
y_pred = best_model.predict(X_test_scaled)

predictions_fig_path = '../reports/figures/predictions_vs_actual.png'
# plt.savefig does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(predictions_fig_path), exist_ok=True)

# Plot predictions vs actual
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Scatter plot with a dashed y = x reference line spanning the actual value range
axes[0].scatter(y_test, y_pred, alpha=0.5, s=10)
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0].set_xlabel('Actual TotalClaims', fontsize=12)
axes[0].set_ylabel('Predicted TotalClaims', fontsize=12)
axes[0].set_title(f'Predictions vs Actual ({best_model_name})', fontsize=14, fontweight='bold')

# Residual plot: actual minus predicted, against the prediction
residuals = y_test - y_pred
axes[1].scatter(y_pred, residuals, alpha=0.5, s=10)
axes[1].axhline(y=0, color='r', linestyle='--', lw=2)
axes[1].set_xlabel('Predicted TotalClaims', fontsize=12)
axes[1].set_ylabel('Residuals', fontsize=12)
axes[1].set_title('Residual Plot', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(predictions_fig_path, dpi=300, bbox_inches='tight')
plt.show()

## 3. SHAP Analysis for Model Interpretability

In [None]:
# Set up SHAP for the best model
import shap
shap.initjs()

# Wrap both scaled splits in DataFrames labelled with the model's feature names
X_train_df, X_test_df = (
    pd.DataFrame(matrix, columns=claim_model.feature_names)
    for matrix in (X_train_scaled, X_test_scaled)
)

print("Initialized SHAP")

In [None]:
# Create SHAP analyzer
# The best model is whichever has the lowest RMSE, so it may not be tree-based
# (e.g. if a linear model is among the trained models). Derive the explainer
# type from the estimator instead of hard-coding 'tree'; tree-based estimators
# keep the original 'tree' setting.
# NOTE(review): assumes SHAPAnalyzer accepts model_type='linear' for non-tree
# estimators — confirm against shap_analysis.py.
shap_model_type = 'tree' if hasattr(best_model, 'feature_importances_') else 'linear'
shap_analyzer = SHAPAnalyzer(best_model, model_type=shap_model_type)

# Create explainer
shap_analyzer.create_explainer(X_train_df)

# Calculate SHAP values (use subset for speed): at most 1000 test rows, seeded
X_test_sample = X_test_df.sample(min(1000, len(X_test_df)), random_state=42)
shap_values = shap_analyzer.calculate_shap_values(X_test_sample)

print(f"SHAP values calculated for {len(X_test_sample)} samples")

In [None]:
# SHAP summary plot for the sampled test rows (max_display=10), written to save_path
shap_analyzer.plot_summary(
    X_test_sample,
    max_display=10,
    save_path='../reports/figures/shap_summary.png',
)

In [None]:
# SHAP bar plot for the sampled test rows (max_display=10), written to save_path
shap_analyzer.plot_bar(
    X_test_sample,
    max_display=10,
    save_path='../reports/figures/shap_bar.png',
)

In [None]:
# Tabulate the top 10 features as ranked by the SHAP analyzer on the sample
shap_feature_importance = shap_analyzer.get_feature_importance_df(
    X_test_sample,
    top_n=10,
)

print("\nTop 10 Features by SHAP:", shap_feature_importance, sep='\n')

In [None]:
# Generate interpretation report
interpretation_report = shap_analyzer.generate_interpretation_report(X_test_sample, top_n=10)

print(interpretation_report)

# Save report
report_path = '../reports/shap_interpretation.md'
# open(..., 'w') fails if the parent directory is missing, so create it first
os.makedirs(os.path.dirname(report_path), exist_ok=True)
# Explicit UTF-8: the platform default encoding can fail on non-ASCII characters
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(interpretation_report)

print("\nSHAP interpretation report saved")

In [None]:
# Waterfall plot for one prediction: row 0 of the sampled test rows
shap_analyzer.plot_waterfall(
    X_test_sample,
    sample_index=0,
    save_path='../reports/figures/shap_waterfall_example.png',
)

## 4. Premium Optimization Model

In [None]:
# Initialize premium optimization model
premium_model = PremiumOptimizationModel()

# The premium target is also listed in `numerical_features` (it is an input for
# the claim severity model). Remove it from the inputs here so the premium model
# is not given the value it is supposed to predict.
premium_target = 'CalculatedPremiumPerTerm'
premium_numerical_features = [col for col in numerical_features if col != premium_target]

# Prepare features for premium prediction
X_premium, y_premium = premium_model.prepare_features(
    df_clean,
    target_col=premium_target,
    categorical_cols=categorical_features,
    numerical_cols=premium_numerical_features
)

print(f"Premium model features: {X_premium.shape}")
print(f"Premium model target: {y_premium.shape}")

In [None]:
# Split the premium data, then scale both splits.
# NOTE(review): unlike the claim-severity split, no test_size/random_state is
# passed here, so the helper's defaults apply — confirm the split is seeded.
X_train_p, X_test_p, y_train_p, y_test_p = premium_model.train_test_split_data(
    X_premium,
    y_premium,
)
X_train_p_scaled, X_test_p_scaled = premium_model.scale_features(
    X_train_p,
    X_test_p,
)

print(
    f"Premium training set: {X_train_p_scaled.shape}",
    f"Premium test set: {X_test_p_scaled.shape}",
    sep='\n',
)

In [None]:
# Train claim probability model
# NOTE(review): y_train_p holds the 'CalculatedPremiumPerTerm' target prepared
# for the premium model, not a claim / no-claim label. Confirm that this is what
# train_claim_probability_model expects; otherwise a claim indicator aligned
# with X_train_p_scaled should be passed instead.
claim_prob_model = premium_model.train_claim_probability_model(X_train_p_scaled, y_train_p)

# Train premium prediction models
# `premium_models` is used below as a name -> estimator mapping.
premium_models = premium_model.train_premium_models(X_train_p_scaled, y_train_p)

print(f"\nTrained premium models: {list(premium_models.keys())}")

In [None]:
# Evaluate premium models
premium_results = {}

# Loop temporaries carry premium-specific names so they do not overwrite the
# notebook-level `metrics` list of metric names used by the model-comparison
# plot cell.
for model_name, premium_estimator in premium_models.items():
    y_pred_p = premium_estimator.predict(X_test_p_scaled)
    premium_metrics = premium_model.evaluate_regression(y_test_p, y_pred_p)
    premium_results[model_name] = premium_metrics
    print(f"{model_name} - RMSE: {premium_metrics['RMSE']:.2f}, R2: {premium_metrics['R2']:.4f}")

# Create comparison: one row per model, sorted from lowest to highest RMSE
premium_results_df = pd.DataFrame(premium_results).T
premium_results_df = premium_results_df.sort_values('RMSE')

print("\nPremium Model Performance:")
print(premium_results_df)

## 5. Save Models

In [None]:
# Persist the selected models together with their scalers and label encoders
import os
import joblib

os.makedirs('../models', exist_ok=True)

# Claim severity artifacts: best model, scaler, label encoders
for artifact, target_path in (
    (best_model, f'../models/claim_severity_{best_model_name}.pkl'),
    (claim_model.scaler, '../models/claim_severity_scaler.pkl'),
    (claim_model.label_encoders, '../models/claim_severity_encoders.pkl'),
):
    joblib.dump(artifact, target_path)

# Premium artifacts: premium_results_df is sorted by RMSE, so row 0 is the best model
best_premium_model_name = premium_results_df.index[0]
best_premium_model = premium_models[best_premium_model_name]
for artifact, target_path in (
    (best_premium_model, f'../models/premium_{best_premium_model_name}.pkl'),
    (premium_model.scaler, '../models/premium_scaler.pkl'),
    (premium_model.label_encoders, '../models/premium_encoders.pkl'),
):
    joblib.dump(artifact, target_path)

print("Models saved successfully")

## 6. Summary and Key Insights

### Model Performance
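- **Best Claim Severity Model**: the model with the lowest test-set RMSE in Section 2.2; its metrics are in the first row of `results_df`.
- **Best Premium Model**: the model with the lowest test-set RMSE in Section 4; its metrics are in the first row of `premium_results_df`.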

### Key Findings from SHAP Analysis
1. Most important features for claim prediction identified
2. Feature interactions and their impact quantified
3. Model decisions made interpretable for business stakeholders

### Business Recommendations
1. Use models to optimize premium pricing based on risk factors
2. Focus on top features identified by SHAP for risk assessment
3. Implement dynamic pricing based on customer segments
4. Monitor model performance and retrain periodically