# Work Package 3 (WP3): Open Kaggle Competition

Supervised Machine Learning Model

Development and Deployment

---

**Universitat de Lleida**  
**Enginyeria Informàtica**  
**Sistemes Intel·ligents**  

---

**Professor:** Mariano Garralda Barrio  

**Authors:**  
- Jordi García Ventura  
- Christian López García  

**Date:** 12/01/2025  


## 0. Setup

In [1]:
!python --version

Python 3.9.13


In [1]:
%pip install -r ../requirements.txt

[0mCollecting scikit-learn==1.5.2
  Using cached scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
Collecting pandas==2.2.3
  Using cached pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Collecting scipy==1.13.1
  Using cached scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
Collecting joblib==1.4.2
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Collecting pycaret[full]==3.3.2
  Using cached pycaret-3.3.2-py3-none-any.whl (486 kB)
Collecting category-encoders>=2.4.0
  Using cached category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
Collecting tbats>=1.1.3
  Using cached tbats-1.1.3-py3-none-any.whl (44 kB)
INFO: pip is looking at multiple versions of stopit to determine which version is compatible with other requirements. This could take a while.
Collecting stopit==1.1.2
  Using cached stopit-1.1.2-py3-none-any.whl
INFO: pip is looking at multiple versions of tqdm to determine wh

In [3]:
%matplotlib inline

In [2]:
import sys
sys.path.append("../")

GPU_ENABLED = True

%load_ext autoreload
%autoreload 2
from lib.cache import cache, DataFrameCache
from lib.estimators import (
    CustomImputer,
    CustomMappingEncoder,
    CustomDateEncoder,
    CustomDropColumns
)

In [3]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [4]:
TRAIN_FILENAME = "train.csv"
TEST_FILENAME = "test.csv"
DATA_PATH = Path.joinpath(Path.cwd().parent, "data")
CACHE_PATH = Path.joinpath(Path.cwd().parent, "cache")
CACHE_IMAGES_PATH = Path.joinpath(CACHE_PATH, "images")
CACHE_MODELS_PATH = Path.joinpath(CACHE_PATH, "models")
CACHE_DATAFRAMES_PATH = Path.joinpath(CACHE_PATH, "dataframes")
CACHE_NUMPY_PATH = Path.joinpath(CACHE_PATH, "numpy")

In [5]:
df = pd.read_csv(Path.joinpath(DATA_PATH, TRAIN_FILENAME))

In [15]:
df["Health Score"]

0          22.598761
1          15.569731
2          47.177549
3          10.938144
4          20.376094
             ...    
1199995    13.772907
1199996    11.483482
1199997    14.724469
1199998    18.547381
1199999    10.125323
Name: Health Score, Length: 1200000, dtype: float64

In [7]:
# Define column groups
columns_impute_median = ["Age", "Annual Income", "Previous Claims", "Vehicle Age", "Credit Score", "Insurance Duration"]
columns_impute_mean = ["Health Score"]
columns_impute_mode = ["Marital Status", "Number of Dependents"]
columns_impute_constant = ["Occupation", "Customer Feedback"]
columns_encode_onehot = ["Gender", "Marital Status", "Occupation", "Location", "Property Type", "Customer Feedback"]

mapping = {
    "Education Level": {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3},
    "Policy Type": {"Basic": 0, "Comprehensive": 1, "Premium": 2},
    "Smoking Status": {"No": 0, "Yes": 1},
    "Exercise Frequency": {"Rarely": 0, "Monthly": 1, "Weekly": 2, "Daily": 3},
}

imputer = Pipeline([
    ("impute_constant", CustomImputer(columns_impute_constant, strategy="constant")),
    ("impute_median", CustomImputer(columns_impute_median, strategy="median")),
    ("impute_mean", CustomImputer(columns_impute_mean, strategy="mean")),
    ("impute_mode", CustomImputer(columns_impute_mode, strategy="mode")),
])

encoder = Pipeline([
    ("encode_mapping", CustomMappingEncoder(mapping)),
    ("encode_date", CustomDateEncoder(["Policy Start Date"])),
    ("drop_columns", CustomDropColumns(["Policy Start Date"])),
    ["encode_onehot", ColumnTransformer([
        ("onehot", OneHotEncoder(drop='first', handle_unknown='ignore'), columns_encode_onehot)
    ], remainder='passthrough')]
])

preprocessor = Pipeline([
    ("imputer", imputer),
    ("encoder", encoder),
    ("scaler", StandardScaler())
])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=-1))
])

pipeline

In [8]:
target = "Premium Amount"

X = df.drop(columns=["id", target])
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
# Check preprocessor
X_prep = preprocessor.fit_transform(X)
column_names = preprocessor.get_feature_names_out()
column_names = list(map(lambda x: x.rsplit("__")[-1], column_names))
X_prep_df = pd.DataFrame(X_prep, columns=column_names)
X_prep_df

array([[-1.00429422,  1.42942104, -0.72564604, ..., -1.00312655,
         0.45208563,  0.93436503],
       [-1.00429422, -0.69958394, -0.72564604, ...,  0.99688319,
        -0.44595054,  0.93436503],
       [ 0.99572414, -0.69958394, -0.72564604, ...,  0.99688319,
         0.45208563,  0.93436503],
       ...,
       [ 0.99572414, -0.69958394, -0.72564604, ..., -1.00312655,
        -0.44595054, -0.42064143],
       [ 0.99572414, -0.69958394,  1.37808234, ..., -1.00312655,
         1.35012179, -0.42064143],
       [-1.00429422, -0.69958394, -0.72564604, ...,  0.99688319,
        -0.44595054, -1.09814467]])

In [51]:
# Tran the model
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

score = root_mean_squared_log_error(y_test, y_pred)
print(f"RMSLE: {score}")

RMSLE: 1.1385017745765391


['/mnt/40EE82D1EE82BF22/Users/jordi/Documents/GitHub/meinf-ai-wp-03/cache/models/pipeline.joblib']

In [None]:
# Save the pipeline
import cloudpickle
import lib.estimators

cloudpickle.register_pickle_by_value(lib.estimators)
with open(Path.joinpath(CACHE_MODELS_PATH, "pipeline.pkl"), "wb") as f:
    f.write(cloudpickle.dumps(pipeline))

In [9]:
# Load the pipeline
from joblib import load
from pathlib import Path

DATA_PATH = Path.joinpath(Path.cwd().parent, "data")
CACHE_PATH = Path.joinpath(Path.cwd().parent, "cache")
CACHE_MODELS_PATH = Path.joinpath(CACHE_PATH, "models")

pipeline = load(Path.joinpath(CACHE_MODELS_PATH, "pipeline.pkl"))

In [10]:
# Test the pipeline
sample = X.sample(1)
sample_dict = sample.to_dict(orient="records")
print(sample_dict)
sample_df = pd.DataFrame(sample_dict)
pipeline.predict(sample_df)

[{'Age': 44.0, 'Gender': 'Male', 'Annual Income': 24237.0, 'Marital Status': 'Single', 'Number of Dependents': 3.0, 'Education Level': "Bachelor's", 'Occupation': 'Employed', 'Health Score': 40.774976516430215, 'Location': 'Urban', 'Policy Type': 'Comprehensive', 'Previous Claims': 0.0, 'Vehicle Age': 19.0, 'Credit Score': 501.0, 'Insurance Duration': 1.0, 'Policy Start Date': '2021-04-25 15:21:39.132191', 'Customer Feedback': 'Good', 'Smoking Status': 'Yes', 'Exercise Frequency': 'Rarely', 'Property Type': 'House'}]


array([1076.3657], dtype=float32)

In [12]:
sample_df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,44.0,Male,24237.0,Single,3.0,Bachelor's,Employed,40.774977,Urban,Comprehensive,0.0,19.0,501.0,1.0,2021-04-25 15:21:39.132191,Good,Yes,Rarely,House


In [17]:
df.sample(1).to_json(orient="records")

'[{"id":909942,"Age":21.0,"Gender":"Female","Annual Income":18053.0,"Marital Status":"Divorced","Number of Dependents":2.0,"Education Level":"PhD","Occupation":"Self-Employed","Health Score":17.8897152903,"Location":"Rural","Policy Type":"Premium","Previous Claims":2.0,"Vehicle Age":2.0,"Credit Score":336.0,"Insurance Duration":3.0,"Policy Start Date":"2022-09-17 15:21:39.198406","Customer Feedback":"Poor","Smoking Status":"No","Exercise Frequency":"Daily","Property Type":"Apartment","Premium Amount":705.0}]'