In [29]:
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import loguniform, uniform
import seaborn as sns
from tqdm import tqdm

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import make_scorer, mean_squared_error as mse, r2_score as r2
from sklearn.model_selection import cross_validate, learning_curve, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import  FunctionTransformer, OneHotEncoder, PolynomialFeatures, RobustScaler, StandardScaler

In [2]:
# Switch scikit-learn's estimator display to "diagram" mode so that pipelines
# shown as a cell's last expression render as a visual diagram.
from sklearn import set_config
set_config(display="diagram")

In [3]:
def split_bmi_in_three(x: float) -> str:
    """Classify a single BMI value into one of three weight categories.

    Args:
        x: One scalar BMI value.

    Returns:
        ``"underweight_normal"`` when ``x < 25``, ``"overweight"`` when
        ``25 <= x < 30``, and ``"obesity"`` for everything else (any value
        that is not below 30, which includes NaN since NaN compares false).
    """
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = (
        (25, "underweight_normal"),
        (30, "overweight"),
    )
    for bound, label in upper_bounds:
        if x < bound:
            return label
    return "obesity"

In [30]:
def tqdm_callback(desc, total):
    """Create a progress callback backed by a fresh ``tqdm`` bar.

    Args:
        desc: Label passed to ``tqdm`` as the bar description.
        total: Expected number of callback invocations, passed to ``tqdm`` as
            ``total``.

    Returns:
        A function that accepts (and ignores) any positional arguments; every
        call advances the bar by one step.
    """
    # Imported locally because the notebook's import cell does not provide
    # `time`; without it the first callback invocation raised NameError.
    import time

    pbar = tqdm(total=total, desc=desc)

    def update(*args):
        pbar.update()
        time.sleep(0.1)  # Small sleep to see the progress increment
        # Close the bar once it has reached its total so it is finalised
        # instead of being left open for the rest of the session.
        if pbar.total is not None and pbar.n >= pbar.total:
            pbar.close()

    return update

# Data Loading & Separating Features / Target

In [4]:
# Load the dataset (presumably already cleaned, going by the file name) from a
# path relative to the notebook's working directory.
df = pd.read_csv("csvs/cleaned_dataset.csv")

In [5]:
# Separate the target from the features without mutating `df`, so this cell can
# be re-run safely (`df.pop` removed the column in place and raised KeyError on
# a second run).
y = df["charges"]
X = df.drop(columns="charges")

### Log-Transforming `y` (Modifying Its Distribution's Shape)

In [6]:
# Model log(1 + charges) instead of raw charges; `np.log1p` computes exactly
# that in a single, numerically careful call (its inverse is `np.expm1`).
# NOTE: this rebinds `y`, so re-running the cell applies the transform again —
# re-run the target-extraction cell first.
y = np.log1p(y)

# Preprocessing

## With Binning `bmi` Inside Pipeline

### Hold-Out

In [7]:
# Hold out 15% of the rows for testing; the split is shuffled with a fixed seed
# and stratified on the `smoker` column.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.85,
    shuffle=True,
    stratify=X["smoker"],
    random_state=42,
)

### Pipeline

In [8]:
# List the feature names that the column-wise encoders below refer to.
X.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')

In [9]:
def _bin_bmi_elementwise(bmi_values):
    """Map every BMI value of an array-like to its category label.

    ``FunctionTransformer`` hands the whole selected column block to its
    function in a single call, whereas ``split_bmi_in_three`` only understands
    one scalar (``if x < 25`` is ambiguous for a DataFrame/array and raises
    ValueError). The scalar rule is therefore vectorised here.

    Args:
        bmi_values: Array-like (e.g. a one-column DataFrame) of BMI values.

    Returns:
        An object-dtype ndarray of the same shape holding the category labels.
    """
    # otypes=[object] fixes the output dtype up front, so labels are never
    # truncated and empty input is accepted.
    return np.vectorize(split_bmi_in_three, otypes=[object])(bmi_values)


# Bins raw BMI values into three categories (element-wise).
bmi_categorizer = FunctionTransformer(_bin_bmi_elementwise)
# Nominal features: one-hot encode, dropping the first level of each feature.
ohe_nom = OneHotEncoder(drop="first", handle_unknown="ignore")
# Binary features: collapse each two-level feature into a single 0/1 column.
ohe_bin = OneHotEncoder(drop="if_binary", handle_unknown="ignore")
# Degree-2 polynomial expansion (squares and pairwise interactions).
poly = PolynomialFeatures(degree=2)
# Standardise every expanded feature before the penalised regression.
std = StandardScaler()

In [10]:
# Elastic-net regressor shared by the pipelines below; alpha and l1_ratio stay
# at their defaults here and are tuned later by the hyperparameter search.
en = ElasticNet(
    max_iter=10_000,
    tol=1e-3,
    random_state=42,
)

In [11]:
# BMI branch: categorise the raw BMI values first, then one-hot encode the
# resulting labels.
pipe_bmi = make_pipeline(
    bmi_categorizer,
    ohe_nom,
)
pipe_bmi

In [12]:
# Column-wise encoding for the raw feature frame; columns not listed here are
# forwarded unchanged (remainder="passthrough").
encoding = ColumnTransformer(
    transformers=[
        ("bmi", pipe_bmi, ["bmi"]),           # bin, then one-hot encode
        ("bin", ohe_bin, ["sex", "smoker"]),  # binary categoricals
        ("ohe", ohe_nom, ["region"]),         # nominal categorical
    ],
    remainder="passthrough",
)
encoding

In [13]:
# End-to-end pipeline for the "binning inside the pipeline" variant.
# NOTE: the name `model` is rebound further down by the "binning outside the
# pipeline" variant, which is the one that actually gets trained.
model = make_pipeline(
    encoding,  # column-wise categorical encoding
    poly,      # polynomial expansion
    std,       # standardisation
    en,        # elastic-net regressor
)
model

## With Binning `bmi` Outside Pipeline

In [14]:
# Work on a copy so the original feature frame `X` keeps its numeric `bmi` column.
X_bmi_nom = X.copy()

In [15]:
# Derive the categories from the untouched numeric `X["bmi"]` rather than from
# `X_bmi_nom` itself, so re-running this cell never tries to bin labels that
# were already binned. Bracket assignment is used instead of attribute-style
# column assignment.
X_bmi_nom["bmi"] = X["bmi"].apply(split_bmi_in_three)

### Hold-Out

In [16]:
# Same hold-out settings as the split of the raw frame (85% train, seed 42,
# stratified on `smoker`), applied to the frame with pre-binned `bmi`.
X_bmi_nom_train, X_bmi_nom_test, y_train, y_test = train_test_split(
    X_bmi_nom,
    y,
    train_size=0.85,
    shuffle=True,
    stratify=X["smoker"],
    random_state=42,
)

### Pipeline

In [17]:
# `bmi` is already categorical in this variant, so it is one-hot encoded next to
# `region`; columns not listed are forwarded unchanged.
encoder = ColumnTransformer(
    [
        ("bin", ohe_bin, ["sex", "smoker"]),
        ("nom", ohe_nom, ["bmi", "region"]),
    ],
    remainder="passthrough",
)
encoder

In [18]:
# Pipeline for the "binning outside the pipeline" variant; this rebinds `model`
# and is the estimator handed to the hyperparameter search.
model = make_pipeline(
    encoder,  # column-wise categorical encoding
    poly,     # polynomial expansion
    std,      # standardisation
    en,       # elastic-net regressor
)
model

# Training

In [25]:
%%time

params = {
    "elasticnet__alpha": uniform(0, 2),
    "elasticnet__l1_ratio": uniform(0, 1)
}

random_search = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_iter=2_000,
    cv=10,
    n_jobs=-1
)

random_search.fit(X_bmi_nom_train, y_train)

CPU times: user 27.4 s, sys: 2.83 s, total: 30.3 s
Wall time: 36 s


In [26]:
# Pipeline carrying the best-scoring sampled hyperparameters; with the search's
# default `refit=True` it has already been refit on the full training split.
best_model = random_search.best_estimator_
best_model

In [27]:
# `best_model` was already refit on the full training split by the randomized
# search (default `refit=True`), so fitting it again on the same data would
# only repeat that work. Report R^2 on the held-out split; note the target is
# on the log(1 + charges) scale.
best_model.score(X_bmi_nom_test, y_test)

0.9175747598130599