# Work Package 3 (WP3): Open Kaggle Competition

Supervised Machine Learning Model

Development and Deployment

---

**Universitat de Lleida**  
**Enginyeria Informàtica**  
**Sistemes Intel·ligents**  

---

**Professor:** Mariano Garralda Barrio  

**Authors:**  
- Jordi García Ventura  
- Christian López García  

**Date:** 12/01/2025  


## 0. Setup

In [4]:
!python --version

Python 3.9.13


In [5]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'd:\projects\meinf\meinf-ai-wp-03\env\Scripts\python.exe -m pip install --upgrade pip' command.


In [6]:
%matplotlib inline

In [7]:
import sys
sys.path.append("../")

GPU_ENABLED = True

%load_ext autoreload
%autoreload 2
from lib.cache import cache, DataFrameCache
from lib.estimators import (
    CustomImputer,
    CustomMappingEncoder,
    CustomDateEncoder,
    CustomDropColumns
)

In [8]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import PowerTransformer, PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [9]:
TRAIN_FILENAME = "train.csv"
TEST_FILENAME = "test.csv"
DATA_PATH = Path.joinpath(Path.cwd().parent, "data")
CACHE_PATH = Path.joinpath(Path.cwd().parent, "cache")
CACHE_IMAGES_PATH = Path.joinpath(CACHE_PATH, "images")
CACHE_MODELS_PATH = Path.joinpath(CACHE_PATH, "models")
CACHE_DATAFRAMES_PATH = Path.joinpath(CACHE_PATH, "dataframes")
CACHE_NUMPY_PATH = Path.joinpath(CACHE_PATH, "numpy")

In [10]:
df = pd.read_csv(Path.joinpath(DATA_PATH, TRAIN_FILENAME))

In [11]:
df.shape

(1200000, 21)

In [12]:
df["Health Score"]

0          22.598761
1          15.569731
2          47.177549
3          10.938144
4          20.376094
             ...    
1199995    13.772907
1199996    11.483482
1199997    14.724469
1199998    18.547381
1199999    10.125323
Name: Health Score, Length: 1200000, dtype: float64

In [13]:
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 9986389703988488849
 xla_global_id: -1]

In [19]:
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor 

def create_model(input_dim=None, kernel_initializer="glorot_uniform", dropout=0.2, learning_rate=0.001, **kwargs):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(124, activation="relu", kernel_initializer=kernel_initializer))
    model.add(Dropout(dropout))
    model.add(Dense(248, activation="relu", kernel_initializer=kernel_initializer))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation="linear", kernel_initializer=kernel_initializer))
    
    model.compile(
        loss="mean_squared_error",
        optimizer=Adam(learning_rate=learning_rate),
        metrics=["mean_squared_error"]
    )
    
    return model

In [20]:
# Define column groups
columns_impute_median = [
    "Age",
    "Annual Income",
    "Previous Claims",
    "Vehicle Age",
    "Credit Score",
    "Insurance Duration",
]
columns_impute_mean = ["Health Score"]
columns_impute_mode = ["Marital Status", "Number of Dependents"]
columns_impute_constant = ["Occupation", "Customer Feedback"]
columns_encode_onehot = [
    "Gender",
    "Marital Status",
    "Occupation",
    "Location",
    "Property Type",
    "Customer Feedback",
]

mapping = {
    "Education Level": {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3},
    "Policy Type": {"Basic": 0, "Comprehensive": 1, "Premium": 2},
    "Smoking Status": {"No": 0, "Yes": 1},
    "Exercise Frequency": {"Rarely": 0, "Monthly": 1, "Weekly": 2, "Daily": 3},
}

imputer = Pipeline(
    [
        (
            "impute_constant",
            CustomImputer(columns_impute_constant, strategy="constant"),
        ),
        (
            "impute_median",
            CustomImputer(columns_impute_median, strategy="median"),
        ),
        (
            "impute_mean",
            CustomImputer(columns_impute_mean, strategy="mean"),
        ),
        (
            "impute_mode",
            CustomImputer(columns_impute_mode, strategy="mode"),
        ),
    ]
)

encoder = Pipeline(
    [
        ("encode_mapping", CustomMappingEncoder(mapping)),
        ("encode_date", CustomDateEncoder(["Policy Start Date"])),
        ("drop_columns", CustomDropColumns(["Policy Start Date"])),
        [
            "encode_onehot",
            ColumnTransformer(
                [
                    (
                        "onehot",
                        OneHotEncoder(drop="first", handle_unknown="ignore"),
                        columns_encode_onehot,
                    )
                ],
                remainder="passthrough",
            ),
        ],
    ]
)

preprocessing = Pipeline(
    [
        # ("polynomial_features", PolynomialFeatures(degree=2)),
        # ("power_transformer", PowerTransformer()),
        # ("principal_component_analysis", PCA(n_components=0.95, whiten=True)),
        ("scaler", StandardScaler())
    ]
)

preprocessor = Pipeline(
    [("imputer", imputer), ("encoder", encoder), ("preprocessing", preprocessing)]
)

pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "regressor",
            KerasRegressor(
                build_fn=create_model,
                input_dim=26,
                epochs=10,
                batch_size=10,
                verbose=1,
                random_state=42,
                learning_rate=0.001,
            ),
        ),
    ]
)

pipeline

In [21]:
target = "Premium Amount"

X = df.drop(columns=["id", target])
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Check preprocessor
X_prep = preprocessor.fit_transform(X)
column_names = preprocessor.get_feature_names_out()
column_names = list(map(lambda x: x.rsplit("__")[-1], column_names))
X_prep_df = pd.DataFrame(X_prep, columns=column_names)
X_prep_df

Unnamed: 0,Gender_Male,Marital Status_Married,Marital Status_Single,Occupation_Self-Employed,Occupation_Unemployed,Occupation_Unknown,Location_Suburban,Location_Urban,Property Type_Condo,Property Type_House,...,Education Level,Health Score,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Smoking Status,Exercise Frequency,Policy Start Date_Year
0,-1.004294,1.429421,-0.725646,1.801557,-0.547217,-0.652154,-0.709152,1.420839,-0.706673,1.413289,...,-0.465410,-0.255071,1.221087,1.216739,1.286338,-1.567375,-0.007023,-1.003127,0.452086,0.934365
1,-1.004294,-0.699584,-0.725646,-0.555075,-0.547217,1.533380,-0.709152,-0.703809,-0.706673,1.413289,...,0.433367,-0.849704,-0.003359,-0.002284,0.420713,0.714630,-1.163391,0.996883,-0.445951,0.934365
2,0.995724,-0.699584,-0.725646,1.801557,-0.547217,-0.652154,1.410135,-0.703809,-0.706673,1.413289,...,-1.364188,1.824212,1.221087,-0.002284,0.766963,0.013020,-0.777935,0.996883,0.452086,0.934365
3,0.995724,1.429421,-0.725646,-0.555075,-0.547217,1.533380,-0.709152,-0.703809,-0.706673,-0.707570,...,-0.465410,-1.241521,-1.227805,-0.002284,-1.656788,-1.602810,-1.548847,0.996883,1.350122,1.611868
4,0.995724,-0.699584,1.378082,1.801557,-0.547217,-0.652154,-0.709152,-0.703809,-0.706673,1.413289,...,-0.465410,-0.443102,1.221087,-1.221307,-0.271787,0.034281,-0.392479,0.996883,0.452086,-0.420641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,-1.004294,1.429421,-0.725646,-0.555075,1.827428,-0.652154,-0.709152,1.420839,-0.706673,-0.707570,...,0.433367,-1.001709,1.221087,-0.002284,-0.791163,-1.567375,-0.777935,-1.003127,1.350122,0.934365
1199996,0.995724,-0.699584,-0.725646,1.801557,-0.547217,-0.652154,-0.709152,-0.703809,-0.706673,-0.707570,...,0.433367,-1.195387,-0.003359,-0.002284,0.074463,0.027194,-0.392479,-1.003127,0.452086,0.256862
1199997,0.995724,-0.699584,-0.725646,-0.555075,-0.547217,1.533380,1.410135,-0.703809,1.415081,-0.707570,...,0.433367,-0.921210,-1.227805,-1.221307,1.632588,0.013020,0.378433,-1.003127,-0.445951,-0.420641
1199998,0.995724,-0.699584,1.378082,-0.555075,-0.547217,1.533380,1.410135,-0.703809,-0.706673,-0.707570,...,1.332145,-0.597805,1.221087,-0.002284,-0.444912,-1.319331,-0.392479,-1.003127,1.350122,-0.420641


In [23]:
# Tran the model
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

score = root_mean_squared_log_error(y_test, y_pred)
print(f"RMSLE: {score}")

  X, y = self._initialize(X, y)


Epoch 1/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 2ms/step - loss: 766555.0000 - mean_squared_error: 766555.0000
Epoch 2/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 2ms/step - loss: 749524.4375 - mean_squared_error: 749524.4375
Epoch 3/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 2ms/step - loss: 748113.8750 - mean_squared_error: 748113.8750
Epoch 4/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 2ms/step - loss: 747402.1875 - mean_squared_error: 747402.1875
Epoch 5/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 2ms/step - loss: 745348.1875 - mean_squared_error: 745348.1875
Epoch 6/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 2ms/step - loss: 743294.0625 - mean_squared_error: 743294.0625
Epoch 7/10
[1m96000/96000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 2ms/step - loss: 740791.2500 - mean_squar

In [24]:
# Save the pipeline
import cloudpickle
import lib.estimators

cloudpickle.register_pickle_by_value(lib.estimators)
with open(Path.joinpath(CACHE_MODELS_PATH, "pipeline.pkl"), "wb") as f:
    f.write(cloudpickle.dumps(pipeline))

In [25]:
# Load the pipeline
from joblib import load
from pathlib import Path

DATA_PATH = Path.joinpath(Path.cwd().parent, "data")
CACHE_PATH = Path.joinpath(Path.cwd().parent, "cache")
CACHE_MODELS_PATH = Path.joinpath(CACHE_PATH, "models")

pipeline = load(Path.joinpath(CACHE_MODELS_PATH, "pipeline.pkl"))

In [26]:
# Test the pipeline
sample = X.sample(1)
sample_dict = sample.to_dict(orient="records")
print(sample_dict)
sample_df = pd.DataFrame(sample_dict)
pipeline.predict(sample_df)

[{'Age': 35.0, 'Gender': 'Female', 'Annual Income': 26700.0, 'Marital Status': 'Single', 'Number of Dependents': 1.0, 'Education Level': 'PhD', 'Occupation': 'Unemployed', 'Health Score': 51.09722719999661, 'Location': 'Rural', 'Policy Type': 'Basic', 'Previous Claims': 0.0, 'Vehicle Age': 13.0, 'Credit Score': 534.0, 'Insurance Duration': 9.0, 'Policy Start Date': '2021-08-20 15:21:39.106582', 'Customer Feedback': 'Good', 'Smoking Status': 'No', 'Exercise Frequency': 'Rarely', 'Property Type': 'Condo'}]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step


array([1118.9897], dtype=float32)

In [None]:
df.sample(1).to_json(orient="records")

'[{"id":729009,"Age":40.0,"Gender":"Male","Annual Income":3568.0,"Marital Status":"Divorced","Number of Dependents":2.0,"Education Level":"PhD","Occupation":"Self-Employed","Health Score":34.3103249775,"Location":"Suburban","Policy Type":"Premium","Previous Claims":2.0,"Vehicle Age":1.0,"Credit Score":405.0,"Insurance Duration":8.0,"Policy Start Date":"2022-11-24 15:21:39.114118","Customer Feedback":"Average","Smoking Status":"Yes","Exercise Frequency":"Rarely","Property Type":"House","Premium Amount":2224.0}]'

: 