# In [None]:
# Step 1: Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor, XGBClassifier

# Step 2: Data Preparation
def load_and_preprocess_data(filepath, delimiter=',', reference_year=2025):
    """Load a delimited data file and derive date, age and ratio features.

    Args:
        filepath: Path or file-like object handed to ``pd.read_csv``.
        delimiter: Field separator of the file. Defaults to ``','``, the
            ``pd.read_csv`` default, so existing single-argument calls behave
            as before.
        reference_year: Year from which ``RegistrationYear`` is subtracted to
            obtain ``vehicle_age``.

    Returns:
        A DataFrame where every listed date column that is present has been
        replaced by ``<col>_year`` and ``<col>_month``; ``vehicle_age`` and
        ``premium_to_insured_ratio`` are added when their source columns exist.
    """
    # Load data
    df = pd.read_csv(filepath, sep=delimiter)

    # Convert date columns to datetime and extract features
    date_cols = ['TransactionMonth', 'VehicleIntroDate']  # Add all date columns here
    for col in date_cols:
        if col not in df.columns:
            continue
        parsed = pd.to_datetime(df[col])
        df[f'{col}_year'] = parsed.dt.year
        df[f'{col}_month'] = parsed.dt.month
        # The raw date column is dropped; only the derived parts are kept.
        df = df.drop(columns=col)

    # Feature Engineering
    if 'RegistrationYear' in df.columns:
        df['vehicle_age'] = reference_year - df['RegistrationYear']
    if 'TotalPremium' in df.columns and 'SumInsured' in df.columns:
        # A zero SumInsured would yield +/-inf; mask it so the ratio becomes
        # NaN, which downstream imputers can fill.
        insured = df['SumInsured'].where(df['SumInsured'] != 0)
        df['premium_to_insured_ratio'] = df['TotalPremium'] / insured

    return df

# Load and preprocess data (pipe-delimited text file)
df = load_and_preprocess_data(
    '../data/MachineLearningRating_v3.txt',
    delimiter='|',
)

# Step 3: Define features and targets
# Numerical features: int64/float64 columns, minus the target and the row id.
numerical_cols = [
    col
    for col in df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if col not in ('TotalClaims', 'PolicyID')
]
# Categorical features: object/category columns, minus the row id.
categorical_cols = [
    col
    for col in df.select_dtypes(include=['object', 'category']).columns.tolist()
    if col != 'PolicyID'
]

# Create preprocessing pipelines
# Numbers: fill gaps with the median, then standardise.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categories: fill gaps with the most frequent value, then one-hot encode;
# categories unseen during fitting are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column list to its transformer; columns in neither list are not
# passed to any transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Step 4: Prepare data for modeling
# Columns that are never model inputs: the target and the row identifier.
non_feature_cols = ['TotalClaims', 'PolicyID']
has_claim = df['TotalClaims'] > 0

# For Claim Severity Model (only policies with claims)
severity_df = df[has_claim].copy()
# errors='ignore': the feature-column lists tolerate a file without PolicyID,
# so dropping it here must not raise a KeyError either.
X_sev = severity_df.drop(columns=non_feature_cols, errors='ignore')
y_sev = severity_df['TotalClaims']

# For Claim Probability Model (all policies)
X_prob = df.drop(columns=non_feature_cols, errors='ignore')
y_prob = has_claim.astype(int)  # Binary target: 1 when the policy has a claim

# Train-test splits
X_sev_train, X_sev_test, y_sev_train, y_sev_test = train_test_split(
    X_sev, y_sev, test_size=0.3, random_state=42)
# Stratify on the binary target so both partitions keep the same share of
# claim / no-claim policies.
X_prob_train, X_prob_test, y_prob_train, y_prob_test = train_test_split(
    X_prob, y_prob, test_size=0.3, random_state=42, stratify=y_prob)

# Step 5: Model Building with Pipelines
# Claim Severity Models (regressors for the claim amount)
models_sev = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Claim Probability Models (classifiers for claim / no claim)
models_prob = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Train and evaluate severity models
# NOTE: every pipeline below wraps the same `preprocessor` object, so each
# fit re-fits that shared object in place.
severity_results = {}
for name, model in models_sev.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)])

    pipeline.fit(X_sev_train, y_sev_train)
    preds = pipeline.predict(X_sev_test)

    severity_results[name] = {
        # Take the root explicitly: the `squared=False` flag of
        # mean_squared_error is deprecated and absent from recent
        # scikit-learn releases.
        'RMSE': np.sqrt(mean_squared_error(y_sev_test, preds)),
        'R2': r2_score(y_sev_test, preds)
    }

# Train and evaluate probability models
probability_results = {}
for name, model in models_prob.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)])

    pipeline.fit(X_prob_train, y_prob_train)
    preds = pipeline.predict(X_prob_test)

    # Keep overall accuracy plus the support-weighted averages of the report.
    report = classification_report(y_prob_test, preds, output_dict=True)
    probability_results[name] = {
        'Accuracy': report['accuracy'],
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1': report['weighted avg']['f1-score']
    }

# Step 6: Feature Importance Analysis (Alternative to SHAP)
def plot_feature_importance(pipeline, title):
    """Plot the ten largest feature importances of a fitted pipeline.

    The pipeline must have a 'preprocessor' step whose 'cat' transformer ends
    in an 'onehot' step, and a 'model' step. Feature labels are the
    module-level ``numerical_cols`` followed by the one-hot names derived from
    ``categorical_cols``.

    Returns the top-10 importance DataFrame (columns 'feature' and
    'importance'), or None when the model exposes no ``feature_importances_``
    or when anything fails (the failure is printed, not raised).
    """
    try:
        # Feature names as they look after one-hot encoding
        encoder = (
            pipeline.named_steps['preprocessor']
            .named_transformers_['cat']
            .named_steps['onehot']
        )
        feature_names = [
            *numerical_cols,
            *encoder.get_feature_names_out(categorical_cols),
        ]

        estimator = pipeline.named_steps['model']
        if not hasattr(estimator, 'feature_importances_'):
            # Nothing to plot for models without importances.
            return None

        # Rank features and keep the ten most important
        ranked = pd.DataFrame({
            'feature': feature_names,
            'importance': estimator.feature_importances_,
        })
        importance_df = ranked.sort_values('importance', ascending=False).head(10)

        # Horizontal bar chart, most important feature at the top
        plt.figure(figsize=(10, 6))
        plt.barh(importance_df['feature'], importance_df['importance'])
        plt.title(f'Feature Importance - {title}')
        plt.xlabel('Importance Score')
        plt.gca().invert_yaxis()
        plt.show()

        return importance_df
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print(f"Could not plot feature importance for {title}: {str(e)}")
        return None

# Plot feature importance for severity models, then for probability models.
# Each entry: (heading, title prefix, models, training X, training y, names to skip)
for heading, prefix, candidates, X_fit, y_fit, skipped in (
    (
        "\nFeature Importance for Severity Models:",
        "Severity",
        models_sev,
        X_sev_train,
        y_sev_train,
        ('Linear Regression',),  # Only tree-based models have feature importance
    ),
    (
        "\nFeature Importance for Probability Models:",
        "Probability",
        models_prob,
        X_prob_train,
        y_prob_train,
        (),
    ),
):
    print(heading)
    for name, model in candidates.items():
        if name in skipped:
            continue
        # Fit a fresh pipeline so preprocessor and model match each other.
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)])
        pipeline.fit(X_fit, y_fit)
        importance_df = plot_feature_importance(pipeline, f"{prefix} - {name}")
        if importance_df is None:
            continue
        print(f"\nTop 10 Features for {name}:")
        print(importance_df)

# Step 7: Results Presentation
# One table per task: rows are model names, columns are metrics.
for heading, results in (
    ("\nClaim Severity Model Results:", severity_results),
    ("\nClaim Probability Model Results:", probability_results),
):
    print(heading)
    print(pd.DataFrame(results).T)

# Step 8: Premium Optimization Concept
def calculate_premium(model_prob, model_sev, preprocessor, X, expense_loading=0.2, profit_margin=0.1):
    """Price policies as expected claim cost plus expense and profit loadings.

    Premium = P(claim) * predicted severity * (1 + expense_loading + profit_margin).

    Args:
        model_prob: Fitted classifier exposing ``predict_proba``; column 1 is
            taken as the claim probability.
        model_sev: Fitted regressor exposing ``predict``.
        preprocessor: Fitted transformer applied to ``X`` for bare estimators.
            It is not used for an estimator that exposes ``named_steps``
            (a pipeline), which is given the raw ``X`` instead.
        X: Feature rows to price.
        expense_loading: Fractional loading for expenses.
        profit_margin: Fractional loading for profit.

    Returns:
        One premium per row of ``X``.
    """
    transformed = None

    def _inputs_for(model):
        # Pipelines preprocess internally; feeding them already-transformed
        # data would preprocess twice, so they get the raw rows.
        nonlocal transformed
        if hasattr(model, 'named_steps'):
            return X
        if transformed is None:
            transformed = preprocessor.transform(X)
        return transformed

    prob = model_prob.predict_proba(_inputs_for(model_prob))[:, 1]
    severity = model_sev.predict(_inputs_for(model_sev))
    return (prob * severity) * (1 + expense_loading + profit_margin)

# Example usage
sample_data = X_prob_test.sample(5, random_state=42)

# Both estimators must see features from the SAME fitted preprocessor,
# otherwise their encoded feature spaces can differ. Fit dedicated clones so
# the shared `preprocessor` and the model dictionaries are left untouched.
premium_preprocessor = clone(preprocessor).fit(X_prob_train)
premium_prob_model = clone(models_prob['XGBoost']).fit(
    premium_preprocessor.transform(X_prob_train), y_prob_train)
premium_sev_model = clone(models_sev['XGBoost']).fit(
    premium_preprocessor.transform(X_sev_train), y_sev_train)

# Bare estimators are passed because calculate_premium applies the
# preprocessor itself.
premiums = calculate_premium(
    premium_prob_model,
    premium_sev_model,
    premium_preprocessor,
    sample_data)

# Existing per-term premium for comparison, NaN when the column is missing.
if 'CalculatedPremiumPerTerm' in sample_data.columns:
    existing_premium = sample_data['CalculatedPremiumPerTerm'].values
else:
    existing_premium = [np.nan] * len(sample_data)

print("\nSample Premium Calculations:")
print(pd.DataFrame({
    'CalculatedPremium': existing_premium,
    'ModelPremium': premiums
}))

# Recorded error: ImportError: Numba needs NumPy 2.2 or less. Got NumPy 2.3.