# Work Package 3 (WP3): Open Kaggle Competition

Supervised Machine Learning Model

Development and Deployment

---

**Universitat de Lleida**  
**Enginyeria Informàtica**  
**Sistemes Intel·ligents**  

---

**Professor:** Mariano Garralda Barrio  

**Authors:**  
- Jordi García Ventura  
- Christian López García  

**Date:** 12/01/2025  


## 0. Setup

In [1]:
!python --version

Python 3.9.13


In [2]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'd:\projects\meinf\meinf-ai-wp-03\env\Scripts\python.exe -m pip install --upgrade pip' command.


In [3]:
%matplotlib inline

In [4]:
import sys
sys.path.append("../")

GPU_ENABLED = True

%load_ext autoreload
%autoreload 2
from lib.cache import cache, DataFrameCache
from lib.estimators import (
    CustomImputer,
    CustomMappingEncoder,
    CustomDateEncoder,
    CustomDropColumns
)

In [5]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import PowerTransformer
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import root_mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [6]:
TRAIN_FILENAME = "train.csv"
TEST_FILENAME = "test.csv"
DATA_PATH = Path.joinpath(Path.cwd().parent, "data")
CACHE_PATH = Path.joinpath(Path.cwd().parent, "cache")
CACHE_IMAGES_PATH = Path.joinpath(CACHE_PATH, "images")
CACHE_MODELS_PATH = Path.joinpath(CACHE_PATH, "models")
CACHE_DATAFRAMES_PATH = Path.joinpath(CACHE_PATH, "dataframes")
CACHE_NUMPY_PATH = Path.joinpath(CACHE_PATH, "numpy")

In [7]:
df = pd.read_csv(Path.joinpath(DATA_PATH, TRAIN_FILENAME))

In [8]:
df.shape

(1200000, 21)

In [9]:
df["Health Score"]

0          22.598761
1          15.569731
2          47.177549
3          10.938144
4          20.376094
             ...    
1199995    13.772907
1199996    11.483482
1199997    14.724469
1199998    18.547381
1199999    10.125323
Name: Health Score, Length: 1200000, dtype: float64

## Parameter grid for XGBoost

In [10]:
# Define column groups
columns_impute_median = [
    "Age",
    "Annual Income",
    "Previous Claims",
    "Vehicle Age",
    "Credit Score",
    "Insurance Duration",
]
columns_impute_mean = ["Health Score"]
columns_impute_mode = ["Marital Status", "Number of Dependents"]
columns_impute_constant = ["Occupation", "Customer Feedback"]
columns_encode_onehot = [
    "Gender",
    "Marital Status",
    "Occupation",
    "Location",
    "Property Type",
    "Customer Feedback",
]

mapping = {
    "Education Level": {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3},
    "Policy Type": {"Basic": 0, "Comprehensive": 1, "Premium": 2},
    "Smoking Status": {"No": 0, "Yes": 1},
    "Exercise Frequency": {"Rarely": 0, "Monthly": 1, "Weekly": 2, "Daily": 3},
}

imputer = Pipeline(
    [
        (
            "impute_constant",
            CustomImputer(columns_impute_constant, strategy="constant"),
        ),
        (
            "impute_median",
            CustomImputer(columns_impute_median, strategy="median"),
        ),
        (
            "impute_mean",
            CustomImputer(columns_impute_mean, strategy="mean"),
        ),
        (
            "impute_mode",
            CustomImputer(columns_impute_mode, strategy="mode"),
        ),
    ]
)

encoder = Pipeline(
    [
        ("encode_mapping", CustomMappingEncoder(mapping)),
        ("encode_date", CustomDateEncoder(["Policy Start Date"])),
        ("drop_columns", CustomDropColumns(["Policy Start Date"])),
        [
            "encode_onehot",
            ColumnTransformer(
                [
                    (
                        "onehot",
                        OneHotEncoder(drop="first", handle_unknown="ignore"),
                        columns_encode_onehot,
                    )
                ],
                remainder="passthrough",
            ),
        ],
    ]
)

preprocessing = Pipeline(
    [
        ("power_transformer", PowerTransformer()),
    ]
)

preprocessor = Pipeline(
    [("imputer", imputer), ("encoder", encoder), ("preprocessing", preprocessing)]
)

params = {
    "n_estimators": [500],
    "min_child_weight": [4, 5],
    "gamma": [i / 10.0 for i in range(3, 6)],
    "subsample": [i / 10.0 for i in range(6, 11)],
    "colsample_bytree": [i / 10.0 for i in range(6, 11)],
    "max_depth": [2, 3, 4, 6, 7],
    "objective": ["reg:squarederror", "reg:tweedie"],
    "booster": ["gbtree", "gblinear"],
    "eval_metric": ["rmse"],
    "eta": [i / 10.0 for i in range(3, 6)],
}

reg = XGBRegressor(nthread=-1)
n_random_search_iterations = 10

random_search = RandomizedSearchCV(
    reg,
    param_distributions=params,
    n_iter=n_random_search_iterations,
    scoring="neg_root_mean_squared_log_error",
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42,
)

pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("random_search", random_search),
    ]
)

pipeline

In [11]:
target = "Premium Amount"

X = df.drop(columns=["id", target])
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Check preprocessor
X_prep = preprocessor.fit_transform(X)
column_names = preprocessor.get_feature_names_out()
column_names = list(map(lambda x: x.rsplit("__")[-1], column_names))
X_prep_df = pd.DataFrame(X_prep, columns=column_names)
X_prep_df

Unnamed: 0,Gender_Male,Marital Status_Married,Marital Status_Single,Occupation_Self-Employed,Occupation_Unemployed,Occupation_Unknown,Location_Suburban,Location_Urban,Property Type_Condo,Property Type_House,Customer Feedback_Good,Customer Feedback_Poor,Customer Feedback_Unknown,Age,Annual Income,Number of Dependents,Education Level,Health Score,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Smoking Status,Exercise Frequency,Policy Start Date_Year
0,-1.004294,1.429421,-0.725646,1.801557,-0.547217,-0.652154,-0.709152,1.420839,-0.706673,1.413289,-0.666044,1.481751,-0.263346,-1.718850,-0.614464,-0.779400,-0.395714,-0.164843,1.174553,1.202672,1.221824,-1.551739,0.054103,-1.003127,0.504957,0.934128
1,-1.004294,-0.699584,-0.725646,-0.555075,-0.547217,1.533380,-0.709152,-0.703809,-0.706673,1.413289,-0.666044,-0.674877,-0.263346,-0.122431,0.336758,0.659496,0.482338,-0.812426,0.085404,0.168250,0.472718,0.710623,-1.173178,0.996883,-0.368269,0.934128
2,0.995724,-0.699584,-0.725646,1.801557,-0.547217,-0.652154,1.410135,-0.703809,-0.706673,1.413289,1.501403,-0.674877,-0.263346,-1.378656,0.136567,0.659496,-1.427187,1.663885,1.174553,0.168250,0.780055,0.000865,-0.738375,0.996883,0.504957,0.934128
3,0.995724,1.429421,-0.725646,-0.555075,-0.547217,1.533380,-0.709152,-0.703809,-0.706673,-0.707570,-0.666044,1.481751,-0.263346,-1.547049,2.139184,-0.048083,-0.395714,-1.298675,-1.269883,0.168250,-1.867336,-1.585660,-1.648818,0.996883,1.283796,1.614988
4,0.995724,-0.699584,1.378082,1.801557,-0.547217,-0.652154,-0.709152,-0.703809,-0.706673,1.413289,-0.666044,1.481751,-0.263346,-1.547049,0.561084,-0.779400,-0.395714,-0.360471,1.174553,-1.459382,-0.182817,0.022209,-0.331618,0.996883,0.504957,-0.422240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,-1.004294,1.429421,-0.725646,-0.555075,1.827428,-0.652154,-0.709152,1.420839,-0.706673,-0.707570,-0.666044,1.481751,-0.263346,-0.347954,0.196232,-1.548893,0.482338,-0.993983,1.174553,0.168250,-0.726253,-1.551739,-0.738375,-1.003127,1.283796,0.934128
1199996,0.995724,-0.699584,-0.725646,1.801557,-0.547217,-0.652154,-0.709152,-0.703809,-0.706673,-0.707570,-0.666044,1.481751,-0.263346,0.955395,0.456856,0.659496,0.482338,-1.238076,0.085404,0.168250,0.152759,0.015093,-0.331618,-1.003127,0.504957,0.255053
1199997,0.995724,-0.699584,-0.725646,-0.555075,-0.547217,1.533380,1.410135,-0.703809,1.415081,-0.707570,1.501403,-0.674877,-0.263346,-1.718850,0.848945,-1.548893,0.482338,-0.896844,-1.269883,-1.459382,1.505626,0.000865,0.423165,-1.003127,-0.368269,-0.422240
1199998,0.995724,-0.699584,1.378082,-0.555075,-0.547217,1.533380,1.410135,-0.703809,-0.706673,-0.707570,-0.666044,1.481751,-0.263346,1.024753,0.074811,-0.779400,1.273570,-0.527330,1.174553,0.168250,-0.357797,-1.313050,-0.331618,-1.003127,1.283796,-0.422240


In [13]:
# Tran the model
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

score = root_mean_squared_log_error(y_test, y_pred)
print(f"RMSLE: {score}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 -1.13424923 -1.13981902 -1.16732017 -1.16733889]


RMSLE: 1.1364201626659542


In [14]:
# Save the pipeline
import cloudpickle
import lib.estimators

cloudpickle.register_pickle_by_value(lib.estimators)
with open(Path.joinpath(CACHE_MODELS_PATH, "pipeline.pkl"), "wb") as f:
    f.write(cloudpickle.dumps(pipeline))

In [15]:
# Load the pipeline
from joblib import load
from pathlib import Path

DATA_PATH = Path.joinpath(Path.cwd().parent, "data")
CACHE_PATH = Path.joinpath(Path.cwd().parent, "cache")
CACHE_MODELS_PATH = Path.joinpath(CACHE_PATH, "models")

pipeline = load(Path.joinpath(CACHE_MODELS_PATH, "pipeline.pkl"))

In [16]:
# Test the pipeline
sample = X.sample(1)
sample_dict = sample.to_dict(orient="records")
print(sample_dict)
sample_df = pd.DataFrame(sample_dict)
pipeline.predict(sample_df)

[{'Age': 40.0, 'Gender': 'Female', 'Annual Income': 37117.0, 'Marital Status': 'Divorced', 'Number of Dependents': 0.0, 'Education Level': "Master's", 'Occupation': 'Self-Employed', 'Health Score': 35.3889756336098, 'Location': 'Urban', 'Policy Type': 'Premium', 'Previous Claims': 1.0, 'Vehicle Age': 10.0, 'Credit Score': 747.0, 'Insurance Duration': 6.0, 'Policy Start Date': '2024-04-18 15:21:39.099739', 'Customer Feedback': 'Good', 'Smoking Status': 'Yes', 'Exercise Frequency': 'Weekly', 'Property Type': 'Condo'}]


array([1209.2244], dtype=float32)

In [17]:
df.sample(1).to_json(orient="records")

'[{"id":184099,"Age":30.0,"Gender":"Male","Annual Income":6701.0,"Marital Status":"Single","Number of Dependents":2.0,"Education Level":"Master\'s","Occupation":"Employed","Health Score":41.0389684989,"Location":"Rural","Policy Type":"Premium","Previous Claims":null,"Vehicle Age":0.0,"Credit Score":552.0,"Insurance Duration":1.0,"Policy Start Date":"2021-02-13 15:21:39.163789","Customer Feedback":"Poor","Smoking Status":"No","Exercise Frequency":"Weekly","Property Type":"Apartment","Premium Amount":229.0}]'