In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import statsmodels.api as sm


# -----------------------------
# Cleaning
# -----------------------------
def clean_data(df, drop_exp_above_1=True):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Args:
        df: DataFrame to clean. Must contain an ``Exposure`` column when
            ``drop_exp_above_1`` is true.
        drop_exp_above_1: when true, keep only rows whose ``Exposure`` is
            at most 1.

    Returns:
        The remaining rows with every row that contains a missing value
        removed. Index labels of the surviving rows are preserved.
    """
    cleaned = df.copy()
    if drop_exp_above_1:
        within_limit = cleaned["Exposure"] <= 1
        cleaned = cleaned[within_limit]
    return cleaned.dropna()


# -----------------------------
# Feature selection (project choices)
# -----------------------------
def feature_selection(df):
    """Return a copy of ``df`` without the columns excluded from modelling.

    ``IDpol`` and ``Density`` are removed when present (project choice:
    keep ``Area``, drop ``Density``). Columns that are absent are skipped.
    """
    excluded = ("IDpol", "Density")
    present = [name for name in excluded if name in df.columns]
    return df.copy().drop(columns=present)

def map_topk(series, k=15, other_label="Other"):
    """Collapse infrequent values of ``series`` into a single label.

    The ``k`` most frequent values (ranked via ``value_counts`` and
    ``nlargest``) are kept as they are; every other entry is replaced by
    ``other_label``. ``value_counts`` ignores missing values by default, so
    missing entries are never among the kept values.
    """
    top_values = set(series.value_counts().nlargest(k).index)
    is_top = series.isin(top_values)
    return series.where(is_top, other_label)

# -----------------------------
# Final preprocessing helpers: encoding (M1 tree) and scaling (only M2/M3)
# -----------------------------

def encode_categoricals(df):
    """One-hot encode the categorical rating factors.

    ``Area``, ``VehBrand``, ``VehGas`` and ``Region`` are expanded with
    ``pd.get_dummies(..., drop_first=True)``, so the first level of each
    factor is dropped and serves as the reference category. Factors that
    are not present in ``df`` are skipped instead of raising ``KeyError``,
    in line with how ``feature_selection`` treats missing columns.

    Args:
        df: DataFrame holding some or all of the categorical factors.

    Returns:
        A new DataFrame with the present factors replaced by dummy columns.
        The input frame is not modified.
    """
    cat_cols = [
        c for c in ["Area", "VehBrand", "VehGas", "Region"] if c in df.columns
    ]
    if not cat_cols:
        # Nothing to encode: still hand back a new frame, like get_dummies does.
        return df.copy()
    return pd.get_dummies(df, columns=cat_cols, drop_first=True)

def scale_features(df, features, scaler=None):
    """Standardise the given numeric columns.

    Args:
        df: DataFrame containing the columns listed in ``features``.
        features: list of column names to scale.
        scaler: optional, already-fitted scaler exposing ``transform``. When
            given it is applied as-is, so a test set can be scaled with the
            statistics learned on the training set. When ``None`` a new
            ``StandardScaler`` is fitted on ``df[features]``.

    Returns:
        ``(scaled_df, scaler)``: a scaled copy of ``df`` and the scaler that
        was used. The caller's frame is not modified.
    """
    # Work on a copy so the caller's columns are not overwritten in place.
    df = df.copy()
    if scaler is None:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(df[features])
    else:
        scaled = scaler.transform(df[features])
    df[features] = scaled
    return df, scaler


train = pd.read_csv("../data/claims_train.csv")
train = feature_selection(train)
# Clean BEFORE encoding: get_dummies turns a missing category into an all-zero
# dummy row (indistinguishable from the dropped reference level), so a
# dropna() that runs after encoding can no longer remove those rows.
train = clean_data(train)
# Only for M2 or (M3), which need scaled numeric inputs:
# train = scale_features(train, features=["VehPower", "VehAge", "DrivAge", "BonusMalus"])[0]
train = encode_categoricals(train)





In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# -----------------------------
# Cleaning
# -----------------------------
def clean_data(df, drop_exp_above_1=True):
    """Drop over-exposed and incomplete rows, returning a new DataFrame.

    Args:
        df: DataFrame to clean; it is copied, never modified.
        drop_exp_above_1: when true, rows with ``Exposure`` greater than 1
            are discarded (requires an ``Exposure`` column).

    Returns:
        The surviving rows with any row containing a missing value removed.
    """
    result = df.copy()
    if drop_exp_above_1:
        result = result.loc[result["Exposure"].le(1)]
    return result.dropna()


# -----------------------------
# Feature selection (project choices)
# -----------------------------
def feature_selection(df):
    """Return ``df`` without the ``IDpol`` and ``Density`` columns.

    Project choice: ``Area`` is kept and ``Density`` is dropped. Either
    column may be absent, in which case it is simply skipped.
    """
    candidates = ("IDpol", "Density")
    return df.drop(columns=[col for col in candidates if col in df.columns])


# -----------------------------
# Scaling numerical features (Prep for models that need scaling)
# -----------------------------
def scale_features(df, features, scaler=None):
    """Standardise ``features`` and return the scaled frame with its scaler.

    Args:
        df: DataFrame containing the columns listed in ``features``.
        features: list of column names to scale.
        scaler: optional, already-fitted scaler exposing ``transform``. Pass
            the scaler returned for the training set to scale the test set
            with the same statistics. When ``None`` a new ``StandardScaler``
            is fitted on ``df[features]``.

    Returns:
        ``(scaled_df, scaler)``: a scaled copy of ``df`` (the input frame is
        left unchanged) and the scaler that was applied.
    """
    # Copy first: assigning into the caller's frame would silently change
    # their data.
    scaled_df = df.copy()
    if scaler is None:
        scaler = StandardScaler()
        values = scaler.fit_transform(scaled_df[features])
    else:
        values = scaler.transform(scaled_df[features])
    scaled_df[features] = values
    return scaled_df, scaler

# -----------------------------
# Final preprocessing for M1 tree
# -----------------------------
def preprocess_for_tree(df):
    """
    Clean ``df``, drop the excluded columns and build the tree-model inputs.

    Returns:
      X      -> numeric features plus one-hot encoded categoricals
                (first level of each factor dropped, no scaling)
      y_rate -> ClaimNb / Exposure, as float
      w_expo -> Exposure, as float
    """
    numeric_candidates = ("VehPower", "VehAge", "DrivAge", "BonusMalus")
    categorical_candidates = ("Area", "VehBrand", "VehGas", "Region")

    prepared = feature_selection(clean_data(df)).copy()

    exposure = prepared["Exposure"]
    w_expo = exposure.astype(float)
    y_rate = (prepared["ClaimNb"] / exposure).astype(float)

    # Only use the columns that actually exist in the prepared frame.
    num_cols = [name for name in numeric_candidates if name in prepared.columns]
    cat_cols = [name for name in categorical_candidates if name in prepared.columns]

    features = prepared[num_cols + cat_cols]
    X = pd.get_dummies(features, columns=cat_cols, drop_first=True)
    return X, y_rate, w_expo

# =============================
# Example usage
# =============================
train = pd.read_csv("../data/claims_train.csv")
test  = pd.read_csv("../data/claims_test.csv")

X_tr, y_tr, w_tr = preprocess_for_tree(train)
X_te, y_te, w_te = preprocess_for_tree(test)

# Train and test are one-hot encoded independently, so a category level that
# appears in only one of them would give the two matrices different columns.
# Align the test features to the training layout: levels unseen in test become
# all-zero columns, levels unseen in train are dropped.
X_te = X_te.reindex(columns=X_tr.columns, fill_value=0)




(541416, 41) (135373, 41)
