# Work Package 3 (WP3): Open Kaggle Competition

Supervised Machine Learning Model

Development and Deployment

---

**Universitat de Lleida**  
**Enginyeria Informàtica**  
**Sistemes Intel·ligents**  

---

**Professor:** Mariano Garralda Barrio  

**Authors:**  
- Jordi García Ventura  
- Christian López García  

**Date:** 12/01/2025  


## 0. Setup

In [1]:
# Shell escape: print the version of the `python` executable found on the PATH.
!python --version

Python 3.9.13


In [2]:
# Install the dependencies listed in ../requirements.txt into the kernel's environment.
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'd:\projects\meinf\meinf-ai-wp-03\env\Scripts\python.exe -m pip install --upgrade pip' command.


In [3]:
# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline

In [4]:
import sys

# Make the parent folder importable so the project-local `lib` package resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

# Switch intended to toggle GPU usage.
# NOTE(review): no other cell visible in this notebook reads this flag - confirm
# whether it should be wired into the regressor or removed.
GPU_ENABLED = True

# Load IPython's autoreload extension and use mode 2, which reloads imported
# modules before running each cell, so edits to project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
from lib.cache import cache, DataFrameCache
from lib.estimators import (
    CustomImputer,
    CustomMappingEncoder,
    CustomDateEncoder,
    CustomDropColumns
)

In [5]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures, MinMaxScaler
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [6]:
# File names of the competition datasets inside the data folder.
TRAIN_FILENAME = "train.csv"
TEST_FILENAME = "test.csv"

# Data and cache folders live next to the folder the notebook runs from,
# i.e. under the parent of the current working directory.
DATA_PATH = Path.cwd().parent / "data"
CACHE_PATH = Path.cwd().parent / "cache"

# One cache sub-folder per kind of artefact.
CACHE_IMAGES_PATH = CACHE_PATH / "images"
CACHE_MODELS_PATH = CACHE_PATH / "models"
CACHE_DATAFRAMES_PATH = CACHE_PATH / "dataframes"
CACHE_NUMPY_PATH = CACHE_PATH / "numpy"

In [7]:
# Read the training CSV from the data folder into a DataFrame.
df = pd.read_csv(DATA_PATH / TRAIN_FILENAME)

In [8]:
# (rows, columns) of the training frame that was just loaded.
df.shape

(1200000, 21)

In [9]:
# Peek at the "Health Score" column of the training frame.
df["Health Score"]

0          22.598761
1          15.569731
2          47.177549
3          10.938144
4          20.376094
             ...    
1199995    13.772907
1199996    11.483482
1199997    14.724469
1199998    18.547381
1199999    10.125323
Name: Health Score, Length: 1200000, dtype: float64

In [None]:
# Define column groups: each list names the columns handed to one imputation
# strategy, or to the one-hot encoder, further down.
columns_impute_median = ["Age", "Annual Income", "Previous Claims", "Vehicle Age", "Credit Score", "Insurance Duration"]
columns_impute_mean = ["Health Score"]
columns_impute_mode = ["Marital Status", "Number of Dependents"]
columns_impute_constant = ["Occupation", "Customer Feedback"]
columns_encode_onehot = ["Gender", "Marital Status", "Occupation", "Location", "Property Type", "Customer Feedback"]

# Ordinal encodings, keyed by column name: category label -> integer code.
mapping = {
    "Education Level": {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3},
    "Policy Type": {"Basic": 0, "Comprehensive": 1, "Premium": 2},
    "Smoking Status": {"No": 0, "Yes": 1},
    "Exercise Frequency": {"Rarely": 0, "Monthly": 1, "Weekly": 2, "Daily": 3},
}

# Stage 1 - imputation: one CustomImputer per strategy, run in sequence.
# NOTE(review): CustomImputer lives in lib.estimators (not shown here); the fill
# value used by strategy="constant" is defined there - confirm.
imputer = Pipeline([
    ("impute_constant", CustomImputer(columns_impute_constant, strategy="constant")),
    ("impute_median", CustomImputer(columns_impute_median, strategy="median")),
    ("impute_mean", CustomImputer(columns_impute_mean, strategy="mean")),
    ("impute_mode", CustomImputer(columns_impute_mode, strategy="mode")),
])

# Stage 2 - encoding: apply the ordinal mapping, run the date encoder on
# "Policy Start Date" and hand that column to CustomDropColumns (presumably
# removing the raw date - confirm in lib.estimators), then one-hot encode the
# nominal columns while passing every other column through untouched.
encoder = Pipeline([
    ("encode_mapping", CustomMappingEncoder(mapping)),
    ("encode_date", CustomDateEncoder(["Policy Start Date"])),
    ("drop_columns", CustomDropColumns(["Policy Start Date"])),
    # Declared as a (name, estimator) tuple like every other step; the original
    # used a list here, which is inconsistent with the documented step format.
    ("encode_onehot", ColumnTransformer([
        # drop='first' removes one indicator per feature; categories unseen
        # during fit are ignored at transform time instead of raising.
        ("onehot", OneHotEncoder(drop='first', handle_unknown='ignore'), columns_encode_onehot)
    ], remainder='passthrough')),
])

# Stage 3 - numeric feature engineering: power transform (without its built-in
# standardisation), degree-2 polynomial expansion, standard scaling, then PCA
# keeping enough components to explain 95% of the variance.
preprocessing = Pipeline([
    ("power_transformer", PowerTransformer(standardize=False)),
    ("polynomial_features", PolynomialFeatures(degree=2)),
    ("scaler", StandardScaler()),
    ("principal_component_analysis", PCA(n_components=0.95)),
])

# The three stages chained into a single transformer.
preprocessor = Pipeline([
    ("imputer", imputer),
    ("encoder", encoder),
    ("preprocessing", preprocessing),
])

# Full model: preprocessing followed by gradient-boosted regression trees
# (1000 estimators, learning rate 0.05, n_jobs=-1).
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=-1))
])

# Display the assembled pipeline as the cell output.
pipeline

In [27]:
# Column the model learns to predict.
target = "Premium Amount"

# Labels first, then the feature frame: everything except "id" and the target.
y = df[target]
X = df.drop(columns=["id", target])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Check preprocessor
# NOTE: this fits the preprocessor on the complete X, i.e. including the rows
# that were split off into X_test.
X_prep = preprocessor.fit_transform(X)

# Keep only the text after the last "__" of each generated feature name.
column_names = [
    feature_name.rsplit("__")[-1]
    for feature_name in preprocessor.get_feature_names_out()
]

X_prep_df = pd.DataFrame(X_prep, columns=column_names)
X_prep_df

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,pca129,pca130,pca131,pca132,pca133,pca134,pca135,pca136,pca137,pca138
0,4.285705,-7.151787,4.768003,6.185719,-4.578808,-6.145013,-1.822869,-0.819591,-0.162738,-6.204444,...,-0.522807,0.459284,0.308548,-0.122454,-0.041528,1.208966,-0.068031,-0.736918,-0.024212,0.776693
1,1.831679,-4.624554,-1.870381,-0.417982,5.567101,-0.196790,0.527217,-1.145356,4.702247,-0.173023,...,0.049026,-0.084957,0.076515,0.002710,-0.010917,-0.024470,0.088472,-0.015775,0.845943,-0.058177
2,4.198213,-2.245342,-6.328190,-7.072866,-5.053290,-6.972158,-2.045190,5.918317,0.105130,2.195949,...,0.713687,-0.392695,-0.354625,0.063514,-0.944484,0.370035,-0.061082,-0.236472,0.060097,0.180014
3,4.429623,0.129268,1.543827,5.357450,5.401479,-0.088644,-1.283057,-1.710270,0.144576,5.950529,...,0.930775,1.121889,-0.219415,-0.038351,0.133118,0.012782,0.016220,0.509814,-0.031361,0.786099
4,-3.468603,-5.947656,-4.933183,4.826692,-4.386895,-5.959345,-1.895804,1.868728,0.151302,3.899357,...,0.389782,0.450662,-0.122197,-0.025228,0.349844,0.414937,-0.016830,-0.190715,0.045673,-0.387758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,2.385556,-1.833773,6.045049,5.678822,-3.480443,6.072117,-1.518954,-3.403551,-0.116750,-4.623359,...,-0.727043,0.322586,0.354045,0.113730,-0.289383,-0.549743,0.040584,-1.282044,0.027101,0.525967
1199996,-0.308765,-0.352515,-0.493023,5.462381,-4.040046,-5.651538,-1.673466,0.272041,-4.878181,-0.763647,...,-0.177140,0.305975,-0.161620,0.140513,-0.008948,-0.031307,-0.279075,-0.023193,-1.092923,-0.046176
1199997,0.779533,6.905889,-1.944463,-4.939307,5.829317,-0.372678,-1.659019,-3.517617,-4.500978,3.102902,...,-0.323494,0.009823,-0.688081,0.252518,0.079329,0.196720,0.151482,-0.042118,1.414811,-0.393728
1199998,-2.808042,1.384867,-8.405816,5.208675,6.727653,-0.028743,-1.945315,3.104923,-5.466469,-3.163361,...,-0.491190,0.485439,-0.604686,0.508084,-0.143539,-0.183717,-1.020142,-0.298388,-0.044185,0.003221


In [29]:
# Train the model
# Fit on the training split, then predict the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Root mean squared logarithmic error of the held-out predictions.
score = root_mean_squared_log_error(y_true=y_test, y_pred=y_pred)
print(f"RMSLE: {score}")

RMSLE: 1.166180197642342


In [30]:
# Save the pipeline
import cloudpickle
import lib.estimators

# Serialise the custom estimator classes by value, so the pickle embeds their
# definitions instead of a reference to the lib.estimators module.
cloudpickle.register_pickle_by_value(lib.estimators)

# Create the models cache folder if it is missing (e.g. on a fresh checkout);
# otherwise open() below fails with FileNotFoundError.
CACHE_MODELS_PATH.mkdir(parents=True, exist_ok=True)

with open(CACHE_MODELS_PATH / "pipeline.pkl", "wb") as f:
    # Stream straight into the file rather than building the whole payload in
    # memory first.
    cloudpickle.dump(pipeline, f)

In [23]:
# Load the pipeline
from joblib import load
from pathlib import Path

# Rebuild the paths this cell needs (same values as in the setup cell), so the
# cell does not depend on that cell having been run.
DATA_PATH = Path.cwd().parent / "data"
CACHE_PATH = Path.cwd().parent / "cache"
CACHE_MODELS_PATH = CACHE_PATH / "models"

# NOTE(review): the save cell writes this file with cloudpickle while it is read
# back with joblib here. Unpickling can execute code stored in the file, so only
# load files produced by this project.
pipeline = load(CACHE_MODELS_PATH / "pipeline.pkl")

In [24]:
# Test the pipeline
# Draw one feature row at random (no seed, so the row changes on every run).
sample = X.sample(n=1)

# Round-trip the row through plain dict records and print them.
sample_dict = sample.to_dict(orient="records")
print(sample_dict)

# Rebuild a DataFrame from the records and predict on it.
sample_df = pd.DataFrame(data=sample_dict)
pipeline.predict(sample_df)

[{'Age': 62.0, 'Gender': 'Female', 'Annual Income': 18644.0, 'Marital Status': 'Divorced', 'Number of Dependents': 0.0, 'Education Level': "Master's", 'Occupation': 'Unemployed', 'Health Score': nan, 'Location': 'Rural', 'Policy Type': 'Premium', 'Previous Claims': 3.0, 'Vehicle Age': 1.0, 'Credit Score': 626.0, 'Insurance Duration': 3.0, 'Policy Start Date': '2021-10-04 15:21:39.116179', 'Customer Feedback': 'Average', 'Smoking Status': 'No', 'Exercise Frequency': 'Daily', 'Property Type': 'Condo'}]


array([1133.3506], dtype=float32)

In [25]:
# One random row of the raw frame (id and target included) as a JSON records string.
df.sample(n=1).to_json(orient="records")

'[{"id":43421,"Age":61.0,"Gender":"Male","Annual Income":5093.0,"Marital Status":"Divorced","Number of Dependents":3.0,"Education Level":"PhD","Occupation":"Employed","Health Score":12.4880803129,"Location":"Urban","Policy Type":"Premium","Previous Claims":0.0,"Vehicle Age":12.0,"Credit Score":667.0,"Insurance Duration":2.0,"Policy Start Date":"2023-10-05 15:21:39.226954","Customer Feedback":"Good","Smoking Status":"Yes","Exercise Frequency":"Weekly","Property Type":"Condo","Premium Amount":602.0}]'