# Work Package 3 (WP3): Open Kaggle Competition

Supervised Machine Learning Model

Development and Deployment

---

**Universitat de Lleida**  
**Enginyeria Informàtica**  
**Sistemes Intel·ligents**  

---

**Professor:** Mariano Garralda Barrio  

**Authors:**  
- Jordi García Ventura  
- Christian López García  

**Date:** 12/01/2025  


## 0. Setup

In [5]:
!python --version

Python 3.9.13


In [6]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'd:\projects\meinf\meinf-ai-wp-03\env\Scripts\python.exe -m pip install --upgrade pip' command.


In [7]:
%matplotlib inline

In [8]:
import sys
# Make the project root (one level above the notebook folder) importable so
# that the local `lib` package resolves in the imports below.
sys.path.append("../")

# NOTE(review): this flag is not read by any cell visible in this notebook;
# confirm whether it is still needed.
GPU_ENABLED = True

# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
from lib.cache import cache, DataFrameCache
from lib.estimators import (
    CustomImputer,
    CustomMappingEncoder,
    CustomDateEncoder,
    CustomDropColumns
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import PowerTransformer, PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [10]:
# File names of the competition data sets.
TRAIN_FILENAME = "train.csv"
TEST_FILENAME = "test.csv"

# Project directories, resolved relative to the parent of the current working
# directory (the notebook is expected to run from a sub-folder of the project).
DATA_PATH = Path.cwd().parent / "data"
CACHE_PATH = Path.cwd().parent / "cache"

# Cache sub-directories, one per kind of artefact.
CACHE_IMAGES_PATH = CACHE_PATH / "images"
CACHE_MODELS_PATH = CACHE_PATH / "models"
CACHE_DATAFRAMES_PATH = CACHE_PATH / "dataframes"
CACHE_NUMPY_PATH = CACHE_PATH / "numpy"

In [11]:
# Load the raw training set from the data directory.
df = pd.read_csv(DATA_PATH / TRAIN_FILENAME)

In [12]:
df.shape

(1200000, 21)

In [13]:
df["Health Score"]

0          22.598761
1          15.569731
2          47.177549
3          10.938144
4          20.376094
             ...    
1199995    13.772907
1199996    11.483482
1199997    14.724469
1199998    18.547381
1199999    10.125323
Name: Health Score, Length: 1200000, dtype: float64

In [14]:
# Show the compute devices TensorFlow can see in this kernel.
# NOTE(review): `tensorflow.python` is an internal namespace; consider the
# public `tf.config.list_physical_devices()` if the exact repr is not needed.
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 18260168423148286725
 xla_global_id: -1]

In [15]:
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor 

def create_model(input_dim=None, kernel_initializer="glorot_uniform", dropout=0.2, learning_rate=0.001, **kwargs):
    """Build and compile the feed-forward regression network.

    Architecture: Input -> Dense(124, relu) -> Dropout -> Dense(248, relu)
    -> Dropout -> Dense(1, linear).

    Parameters
    ----------
    input_dim : int, optional
        Size of the feature vector; used as the shape of the input layer.
    kernel_initializer : str
        Initializer passed to every Dense layer.
    dropout : float
        Rate shared by both Dropout layers.
    learning_rate : float
        Learning rate given to the Adam optimizer.
    **kwargs
        Accepted for call compatibility and otherwise ignored.

    Returns
    -------
    Sequential
        The model compiled with the mean squared logarithmic error loss.
    """
    hidden_layer_sizes = (124, 248)

    network = Sequential()
    network.add(Input(shape=(input_dim,)))

    # Each hidden block is a ReLU Dense layer followed by Dropout.
    for units in hidden_layer_sizes:
        network.add(Dense(units, activation="relu", kernel_initializer=kernel_initializer))
        network.add(Dropout(dropout))

    # Single linear unit: the network outputs one regression value per row.
    network.add(Dense(1, activation="linear", kernel_initializer=kernel_initializer))

    optimizer = Adam(learning_rate=learning_rate)
    network.compile(loss="mean_squared_logarithmic_error", optimizer=optimizer)

    return network

In [16]:
# Column groups, one per imputation / encoding treatment.
columns_impute_median = [
    "Age", "Annual Income", "Previous Claims",
    "Vehicle Age", "Credit Score", "Insurance Duration",
]
columns_impute_mean = ["Health Score"]
columns_impute_mode = ["Marital Status", "Number of Dependents"]
columns_impute_constant = ["Occupation", "Customer Feedback"]
columns_encode_onehot = [
    "Gender", "Marital Status", "Occupation",
    "Location", "Property Type", "Customer Feedback",
]

# Ordinal encodings: each category maps to its position in the ordered list.
mapping = {
    column: {label: rank for rank, label in enumerate(ordered_labels)}
    for column, ordered_labels in {
        "Education Level": ["High School", "Bachelor's", "Master's", "PhD"],
        "Policy Type": ["Basic", "Comprehensive", "Premium"],
        "Smoking Status": ["No", "Yes"],
        "Exercise Frequency": ["Rarely", "Monthly", "Weekly", "Daily"],
    }.items()
}

# One imputation step per strategy, applied in this order:
# constant -> median -> mean -> mode.
imputer = Pipeline(
    [
        (f"impute_{strategy}", CustomImputer(columns, strategy=strategy))
        for strategy, columns in (
            ("constant", columns_impute_constant),
            ("median", columns_impute_median),
            ("mean", columns_impute_mean),
            ("mode", columns_impute_mode),
        )
    ]
)

# Encoding stage: ordinal mapping, date encoding, removal of the raw date
# column and finally one-hot encoding of the nominal columns.
# Every step is a (name, estimator) tuple, the form scikit-learn documents for
# Pipeline steps (the one-hot step was previously written as a list).
encoder = Pipeline(
    [
        ("encode_mapping", CustomMappingEncoder(mapping)),
        ("encode_date", CustomDateEncoder(["Policy Start Date"])),
        ("drop_columns", CustomDropColumns(["Policy Start Date"])),
        (
            "encode_onehot",
            ColumnTransformer(
                [
                    (
                        "onehot",
                        # drop="first" removes one dummy column per feature.
                        OneHotEncoder(drop="first", handle_unknown="ignore"),
                        columns_encode_onehot,
                    )
                ],
                # Columns not listed above are forwarded unchanged.
                remainder="passthrough",
            ),
        ),
    ]
)

# Numeric post-processing applied to the fully encoded feature matrix.
# The commented-out steps are alternatives that are currently disabled.
preprocessing = Pipeline(
    [
        # ("polynomial_features", PolynomialFeatures(degree=2)),
        ("power_transformer", PowerTransformer()),
        # ("principal_component_analysis", PCA(n_components=0.95, whiten=True)),
        # ("scaler", StandardScaler())
    ]
)

# Full feature preparation: impute -> encode -> transform.
preprocessor = Pipeline(
    [("imputer", imputer), ("encoder", encoder), ("preprocessing", preprocessing)]
)

# End-to-end estimator: feature preparation followed by the Keras regressor.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "regressor",
            KerasRegressor(
                # `model=` is the current SciKeras parameter; the deprecated
                # `build_fn=` alias emits a warning on every fit.
                model=create_model,
                # Must equal the number of columns produced by `preprocessor`
                # (presumably 26 for this data set - verify if the feature
                # engineering changes).
                input_dim=26,
                epochs=10,
                # With the 80% training split of 1.2M rows this gives
                # 96,000 optimisation steps per epoch.
                batch_size=10,
                verbose=1,
                random_state=42,
                learning_rate=0.001,
            ),
        ),
    ]
)

pipeline

In [17]:
# Label column to predict.
target = "Premium Amount"

# Labels first, then the feature matrix (row identifier and label removed).
y = df[target]
X = df.drop(columns=["id", target])

# Hold out 20% of the rows for evaluation; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Sanity-check the preprocessor by transforming the whole feature matrix.
X_prep = preprocessor.fit_transform(X)

# Keep only the part of each generated name after the last "__" separator.
column_names = [
    feature_name.rsplit("__")[-1]
    for feature_name in preprocessor.get_feature_names_out()
]

X_prep_df = pd.DataFrame(X_prep, columns=column_names)
X_prep_df

Unnamed: 0,Gender_Male,Marital Status_Married,Marital Status_Single,Occupation_Self-Employed,Occupation_Unemployed,Occupation_Unknown,Location_Suburban,Location_Urban,Property Type_Condo,Property Type_House,...,Education Level,Health Score,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Smoking Status,Exercise Frequency,Policy Start Date_Year
0,-1.004294,1.429421,-0.725646,1.801557,-0.547217,-0.652154,-0.709152,1.420839,-0.706673,1.413289,...,-0.395714,-0.164843,1.174553,1.202672,1.221824,-1.551739,0.054103,-1.003127,0.504957,0.934128
1,-1.004294,-0.699584,-0.725646,-0.555075,-0.547217,1.533380,-0.709152,-0.703809,-0.706673,1.413289,...,0.482338,-0.812426,0.085404,0.168250,0.472718,0.710623,-1.173178,0.996883,-0.368269,0.934128
2,0.995724,-0.699584,-0.725646,1.801557,-0.547217,-0.652154,1.410135,-0.703809,-0.706673,1.413289,...,-1.427187,1.663885,1.174553,0.168250,0.780055,0.000865,-0.738375,0.996883,0.504957,0.934128
3,0.995724,1.429421,-0.725646,-0.555075,-0.547217,1.533380,-0.709152,-0.703809,-0.706673,-0.707570,...,-0.395714,-1.298675,-1.269883,0.168250,-1.867336,-1.585660,-1.648818,0.996883,1.283796,1.614988
4,0.995724,-0.699584,1.378082,1.801557,-0.547217,-0.652154,-0.709152,-0.703809,-0.706673,1.413289,...,-0.395714,-0.360471,1.174553,-1.459382,-0.182817,0.022209,-0.331618,0.996883,0.504957,-0.422240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,-1.004294,1.429421,-0.725646,-0.555075,1.827428,-0.652154,-0.709152,1.420839,-0.706673,-0.707570,...,0.482338,-0.993983,1.174553,0.168250,-0.726253,-1.551739,-0.738375,-1.003127,1.283796,0.934128
1199996,0.995724,-0.699584,-0.725646,1.801557,-0.547217,-0.652154,-0.709152,-0.703809,-0.706673,-0.707570,...,0.482338,-1.238076,0.085404,0.168250,0.152759,0.015093,-0.331618,-1.003127,0.504957,0.255053
1199997,0.995724,-0.699584,-0.725646,-0.555075,-0.547217,1.533380,1.410135,-0.703809,1.415081,-0.707570,...,0.482338,-0.896844,-1.269883,-1.459382,1.505626,0.000865,0.423165,-1.003127,-0.368269,-0.422240
1199998,0.995724,-0.699584,1.378082,-0.555075,-0.547217,1.533380,1.410135,-0.703809,-0.706673,-0.707570,...,1.273570,-0.527330,1.174553,0.168250,-0.357797,-1.313050,-0.331618,-1.003127,1.283796,-0.422240


In [19]:
# Train the model
# Fit the whole pipeline (preprocessing + regressor) on the training split,
# then predict the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Evaluate with root mean squared logarithmic error on the held-out split.
# NOTE(review): the final Dense layer is linear, so predictions are not
# constrained to be non-negative - confirm the metric's input requirements.
score = root_mean_squared_log_error(y_test, y_pred)
print(f"RMSLE: {score}")

  X, y = self._initialize(X, y)


Epoch 1/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 2ms/step - loss: 1.4490
Epoch 2/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 1ms/step - loss: 1.1886
Epoch 3/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 2ms/step - loss: 1.1871
Epoch 4/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 2ms/step - loss: 1.1868
Epoch 5/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 2ms/step - loss: 1.1796
Epoch 6/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 2ms/step - loss: 1.1658
Epoch 7/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 2ms/step - loss: 1.1621
Epoch 8/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 2ms/step - loss: 1.1593
Epoch 9/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 2ms/step - loss: 1.1572
Epoch 10/10
[1m96000/96000[0m [32m

In [20]:
# Save the pipeline
import cloudpickle
import lib.estimators

# Serialise the custom estimators by value, so the pickle embeds their
# definitions instead of importing `lib.estimators` at load time.
cloudpickle.register_pickle_by_value(lib.estimators)

# Create the target directory if it is missing (e.g. on a fresh checkout);
# otherwise open() would raise FileNotFoundError.
CACHE_MODELS_PATH.mkdir(parents=True, exist_ok=True)

with open(Path.joinpath(CACHE_MODELS_PATH, "pipeline.pkl"), "wb") as f:
    f.write(cloudpickle.dumps(pipeline))

In [21]:
# Load the pipeline
import pickle
from pathlib import Path

# Paths are re-declared here so this cell does not depend on the
# configuration cell near the top of the notebook.
DATA_PATH = Path.joinpath(Path.cwd().parent, "data")
CACHE_PATH = Path.joinpath(Path.cwd().parent, "cache")
CACHE_MODELS_PATH = Path.joinpath(CACHE_PATH, "models")

# The file is written with cloudpickle, whose output is a regular pickle
# stream, so it is read back with the standard pickle module.
# SECURITY: unpickling executes code embedded in the file - only load
# artefacts produced by this project.
with open(Path.joinpath(CACHE_MODELS_PATH, "pipeline.pkl"), "rb") as f:
    pipeline = pickle.load(f)

In [22]:
# Test the pipeline
sample = X.sample(1)
sample_dict = sample.to_dict(orient="records")
print(sample_dict)
sample_df = pd.DataFrame(sample_dict)
pipeline.predict(sample_df)

[{'Age': 33.0, 'Gender': 'Female', 'Annual Income': 1315.0, 'Marital Status': 'Single', 'Number of Dependents': 0.0, 'Education Level': 'PhD', 'Occupation': 'Unemployed', 'Health Score': 16.901924767302017, 'Location': 'Rural', 'Policy Type': 'Basic', 'Previous Claims': 0.0, 'Vehicle Age': 15.0, 'Credit Score': 668.0, 'Insurance Duration': 6.0, 'Policy Start Date': '2022-09-22 15:21:39.098696', 'Customer Feedback': 'Poor', 'Smoking Status': 'No', 'Exercise Frequency': 'Weekly', 'Property Type': 'Apartment'}]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step


array([631.2059], dtype=float32)

In [None]:
df.sample(1).to_json(orient="records")

'[{"id":1163379,"Age":50.0,"Gender":"Female","Annual Income":95936.0,"Marital Status":"Single","Number of Dependents":4.0,"Education Level":"PhD","Occupation":"Self-Employed","Health Score":41.4498434229,"Location":"Suburban","Policy Type":"Comprehensive","Previous Claims":0.0,"Vehicle Age":2.0,"Credit Score":662.0,"Insurance Duration":5.0,"Policy Start Date":"2019-10-01 15:21:39.278402","Customer Feedback":"Average","Smoking Status":"No","Exercise Frequency":"Monthly","Property Type":"House","Premium Amount":382.0}]'

: 