In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Reload data (optional)
df = pd.read_csv("../data/MachineLearningRating_v3.txt", delimiter="|")

# Filter to only rows where a claim occurred
df_claims = df[df['TotalClaims'] > 0].copy()

# Drop columns that are identifiers or mostly irrelevant for severity
cols_to_drop = [
    'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'Title', 'Language',
    'Bank', 'Product', 'Country', 'ItemType', 'Model', 'VehicleIntroDate'
]
df_claims.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Remove target column from features
target = 'TotalClaims'

# Identify categorical columns
categorical_cols = df_claims.select_dtypes(include='object').columns.tolist()

# One-hot encode categoricals (drop_first=True to avoid dummy variable trap)
df_encoded = pd.get_dummies(df_claims, columns=categorical_cols, drop_first=True)

# Fill any remaining missing values
df_encoded.fillna(0, inplace=True)

# Split features and target
X = df_encoded.drop(columns=[target])
y = df_encoded[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)


  df = pd.read_csv("../data/MachineLearningRating_v3.txt", delimiter="|")


Training shape: (2230, 388)
Testing shape: (558, 388)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Store results
results = {}

def evaluate_model(model, name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    results[name] = {"RMSE": rmse, "R²": r2}
    print(f"\n{name}")
    print("RMSE:", rmse)
    print("R²:", r2)

# 1. Linear Regression
linreg = LinearRegression()
evaluate_model(linreg, "Linear Regression")

# 2. Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
evaluate_model(rf, "Random Forest")

# 3. XGBoost
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
evaluate_model(xgb, "XGBoost")
