In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import statsmodels.api as sm


# -----------------------------
# Cleaning
# -----------------------------
def clean_data(df, drop_exp_above_1=True):
    """Return a cleaned copy of ``df``.

    Args:
        df: Input frame. It is copied first, so the caller's object is
            left untouched.
        drop_exp_above_1: When true, keep only rows whose ``"Exposure"``
            value is at most 1.

    Returns:
        A new DataFrame with the optional exposure filter applied and every
        row containing a missing value removed.
    """
    cleaned = df.copy()
    if not drop_exp_above_1:
        return cleaned.dropna()
    within_limit = cleaned["Exposure"] <= 1
    return cleaned[within_limit].dropna()


# -----------------------------
# Feature selection (project choices)
# -----------------------------
def feature_selection(df):
    """Return a copy of ``df`` without the columns excluded by project choice.

    ``IDpol`` and ``Density`` are removed when present (the project keeps
    Area and drops Density/Urbanicity). Columns that are absent are simply
    skipped, and the input frame is not modified.
    """
    unwanted = ("IDpol", "Density")
    present = [name for name in unwanted if name in df.columns]
    return df.copy().drop(columns=present)

def map_topk(series, k=15, other_label="Other"):
    """Collapse all but the ``k`` most frequent values into ``other_label``.

    Args:
        series: Series of category values.
        k: Number of most frequent values to keep unchanged.
        other_label: Replacement for every value outside the top ``k``.

    Returns:
        A Series in which entries outside the ``k`` most frequent values
        are replaced by ``other_label``.
    """
    frequencies = series.value_counts()
    top_values = set(frequencies.nlargest(k).index)
    is_top = series.isin(top_values)
    return series.where(is_top, other_label)

# -----------------------------
# Final preprocessing for M1 tree
# -----------------------------

def encode_categoricals(df, cat_cols=None):
    """One-hot encode the categorical columns of ``df``.

    Args:
        df: Input frame; a new frame is returned and ``df`` is not modified.
        cat_cols: List of column names to encode. Defaults to the project's
            categorical columns: ``Area``, ``VehBrand``, ``VehGas`` and
            ``Region``.

    Returns:
        A DataFrame in which each listed column is replaced by indicator
        columns named ``<column>_<level>``. The first level of every column
        is dropped (``drop_first=True``) and acts as the reference level.

    Raises:
        KeyError: Raised by pandas if a listed column is not in ``df``.
    """
    if cat_cols is None:
        cat_cols = ["Area", "VehBrand", "VehGas", "Region"]
    return pd.get_dummies(df, columns=cat_cols, drop_first=True)

def scale_features(df, features):
    """Standardize the given columns with a freshly fitted ``StandardScaler``.

    Args:
        df: Input frame. It is copied first so the caller's frame is not
            modified, matching ``clean_data`` and ``feature_selection``.
        features: List of column names to scale.

    Returns:
        A ``(scaled_df, scaler)`` tuple: the copy of ``df`` with the listed
        columns replaced by their scaled values, and the fitted scaler so
        the same transformation can be reused on other data.
    """
    # Work on a copy: assigning into the argument would silently change the
    # caller's DataFrame (and can trigger SettingWithCopyWarning on slices).
    df = df.copy()
    scaler = StandardScaler()
    df[features] = scaler.fit_transform(df[features])
    return df, scaler


# Build the training frame for the M1 tree model.
train = pd.read_csv("../data/claims_train.csv")
train = feature_selection(train)
# Clean BEFORE encoding: get_dummies gives a missing categorical value no
# indicator column, so after encoding the row would look like the reference
# level and the dropna() inside clean_data could no longer remove it.
train = clean_data(train)
# train = scale_features(train, features=["VehPower", "VehAge", "DrivAge", "BonusMalus"])[0] #only for M2 or (M3)
train = encode_categoricals(train)
train




(541416, 41) (541416,) (541416,)
Columns example:
 Index(['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Area_B', 'Area_C',
       'Area_D', 'Area_E', 'Area_F', 'VehGas_Regular'],
      dtype='object')
y_rate head:
 0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
dtype: float64
Weights (Exposure) head:
 0    0.43
1    0.10
2    0.33
3    0.56
4    0.27
Name: Exposure, dtype: float64
