# Imports & Setup

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import uniform

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import  FunctionTransformer, OneHotEncoder, PolynomialFeatures, StandardScaler

In [2]:
# Enable diagrams to visualize pipelines
from sklearn import set_config
set_config(display="diagram")

# Functions

In [3]:
def split_bmi_in_three(x: float) -> str:
    if x < 25:
        return "underweight_normal"
    if x < 30:
        return "overweight"
    return "obesity"

In [4]:
def apply_bmi_split(X: np.array) -> np.array:
    X[:, 2] = np.apply_along_axis(split_bmi_in_three, 2, X[: 2])
    return X

# Data Loading & Separating Features / Target

In [5]:
df = pd.read_csv("csvs/cleaned_dataset.csv")

In [6]:
y = df.pop("charges")
X = df

### Modifying `y`'s shape

In [7]:
y = np.log(y + 1)

# Preprocessing

## With Binning `bmi` Inside PipeLine

### Hold-Out

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    shuffle=True,
                                                    train_size=0.85,
                                                    random_state=42,
                                                    stratify=X['smoker'])

### Pipeline

In [9]:
bmi_categorizer = FunctionTransformer(apply_bmi_split)
ohe_nom = OneHotEncoder(drop="first", handle_unknown="ignore")
ohe_bin = OneHotEncoder(drop="if_binary", handle_unknown="ignore")
poly = PolynomialFeatures(degree=2)
std = StandardScaler()

In [10]:
en = ElasticNet(random_state=42, 
                max_iter=10_000, tol=1e-3
)

In [12]:
pipe_bmi = make_pipeline(bmi_categorizer, ohe_nom)
pipe_bmi

In [13]:
encoding = ColumnTransformer([
    ("bmi", pipe_bmi, ["bmi"]),
    ("bin", ohe_bin, ["sex", "smoker"]),
    ("ohe", ohe_nom, ["region"])
], remainder="passthrough")
encoding

In [14]:
model = make_pipeline(encoding, poly, std, en)
model

### Training & Score

In [15]:
%time

params = {
    "elasticnet__alpha": uniform(0, 2),
    "elasticnet__l1_ratio": uniform(0, 1)
}

random_search = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_iter=2_000,
    cv=10,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs


ValueError: 
All the 20000 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20000 fits failed with the following error:
Traceback (most recent call last):
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 423, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 377, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 754, in fit_transform
    result = self._fit_transform(X, y, _fit_transform_one)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 681, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/utils/parallel.py", line 65, in __call__
    return super().__call__(iterable_with_config)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/joblib/parallel.py", line 1863, in __call__
    return output if self.return_generator else list(output)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/joblib/parallel.py", line 1792, in _get_sequential_output
    res = func(*args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/utils/parallel.py", line 127, in __call__
    return self.function(*args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 471, in fit_transform
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 377, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/base.py", line 919, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/preprocessing/_function_transformer.py", line 240, in transform
    return self._transform(X, func=self.func, kw_args=self.kw_args)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/preprocessing/_function_transformer.py", line 312, in _transform
    return func(X, **(kw_args if kw_args else {}))
  File "/tmp/ipykernel_53457/1467478837.py", line 2, in apply_bmi_split
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/numpy/lib/shape_base.py", line 361, in apply_along_axis
    axis = normalize_axis_index(axis, nd)
numpy.exceptions.AxisError: axis 2 is out of bounds for array of dimension 2


In [None]:
best_model = random_search.best_estimator_
best_model

## With Binning `bmi` Outside Pipeline

In [15]:
X_bmi_nom = X.copy()

In [16]:
X_bmi_nom.bmi = X_bmi_nom.bmi.apply(split_bmi_in_three)

### Hold-Out

In [17]:
X_bmi_nom_train, X_bmi_nom_test, y_train, y_test =\
train_test_split(X_bmi_nom, y,
                 shuffle=True,
                 train_size=0.85,
                 random_state=42,
                 stratify=X['smoker'])

### Pipeline

In [18]:
encoder = ColumnTransformer(
    transformers=[
        ("bin", ohe_bin, ["sex", "smoker"]),
        ("nom", ohe_nom, ["bmi", "region"])
    ],
    remainder="passthrough"
)
encoder

In [19]:
model = make_pipeline(encoder, poly, std, en)
model

### Training & Score

In [24]:
%%time

params = {
    "elasticnet__alpha": uniform(0, 2),
    "elasticnet__l1_ratio": uniform(0, 1)
}

random_search = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_iter=2_000,
    cv=10,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_bmi_nom_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


CPU times: user 24.9 s, sys: 1.61 s, total: 26.5 s
Wall time: 32.8 s


In [25]:
best_model = random_search.best_estimator_
best_model

In [26]:
best_model.fit(X_bmi_nom_train, y_train)
best_model.score(X_bmi_nom_test, y_test)

0.9177335980741432

# 💿 Save model

In [27]:
joblib.dump(best_model, "model.joblib")

['model.joblib']