# Imports & Setup

In [150]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import chi2, uniform
import statsmodels.api as sm

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import  FunctionTransformer, OneHotEncoder, PolynomialFeatures, StandardScaler

In [2]:
# Enable diagrams to visualize pipelines
from sklearn import set_config
set_config(display="diagram")

# Functions

In [3]:
def split_bmi_in_three(x: float) -> str:
    if x < 25:
        return "underweight_normal"
    if x < 30:
        return "overweight"
    return "obesity"

**Each following suggestion gives a different TraceBack...**

In [5]:
# def apply_bmi_split(X: np.array) -> np.array:
#     X[:, 2] = np.apply_along_axis(split_bmi_in_three, 1, X[: 2])
#     return X

In [3]:
# def apply_bmi_split(X: np.array) -> np.array:
#     return (np
#             .apply_along_axis(split_bmi_in_three, 1, X[: 2])
#             .reshape(-1, 1)
#            )

In [4]:
# def apply_bmi_split(column):
#     return column.apply(split_bmi_in_three)

In [2]:
# def apply_bmi_split(column):
#     return np.array([
#         split_bmi_in_three(float(x)) for x in column
#     ])

In [14]:
# def apply_bmi_split(column):
#     new_column = []
#     for elem in column:
#         if elem < 25:
#             new_column.append("underweight_normal")
#         elif elem < 30:
#             new_column.append("overweight")
#         else:
#             new_column.append("obestity")
#     return np.array(new_column)

In [4]:
def apply_bmi_split(column):
    new_column = []
    for elem in column:
        if isinstance(elem, str):
            new_column.append(elem)
        elif elem < 25:
            new_column.append("underweight_normal")
        elif elem < 30:
            new_column.append("overweight")
        else:
            new_column.append("obestity")
    return np.array(new_column).reshape(-1, 1)

# Data Loading & Separating Features / Target

In [5]:
df = pd.read_csv("csvs/cleaned_dataset.csv")

In [6]:
y = df.pop("charges")
X = df

### Modifying `y`'s shape

In [7]:
y = np.log(y + 1)

# Preprocessing

## With Binning `bmi` Inside PipeLine

### Hold-Out

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    shuffle=True,
                                                    train_size=0.85,
                                                    random_state=42,
                                                    stratify=X['smoker'])

### Pipeline

In [9]:
bmi_categorizer = FunctionTransformer(apply_bmi_split)
ohe_nom = OneHotEncoder(drop="first", handle_unknown="ignore")
ohe_bin = OneHotEncoder(drop="if_binary", handle_unknown="ignore")
poly = PolynomialFeatures(degree=2)
std = StandardScaler()

In [10]:
en = ElasticNet(random_state=42, 
                max_iter=10_000, tol=1e-3
)

In [11]:
pipe_bmi = make_pipeline(bmi_categorizer, ohe_nom)
pipe_bmi

In [12]:
encoder_1 = ColumnTransformer(
    transformers = [
        ("bmi", pipe_bmi, ["bmi"]),
        ("bin", ohe_bin, ["sex", "smoker"]),
        ("ohe", ohe_nom, ["region"])
    ],
    remainder="passthrough")

encoder_1

In [13]:
model_1 = make_pipeline(encoder_1, poly, std, en)
model_1

### Training & Score

In [28]:
%time

params = {
    "elasticnet__alpha": uniform(0, 2),
    "elasticnet__l1_ratio": uniform(0, 1)
}

random_search_1 = RandomizedSearchCV(
    model_1,
    param_distributions=params,
    n_iter=2_000,
    cv=10,
    n_jobs=-1,
    random_state=42
)

random_search_1.fit(X_train, y_train)

CPU times: user 12 µs, sys: 0 ns, total: 12 µs
Wall time: 22.2 µs


ValueError: 
All the 20000 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12000 fits failed with the following error:
Traceback (most recent call last):
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 423, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 377, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 778, in fit_transform
    return self._hstack(list(Xs))
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 903, in _hstack
    return np.hstack(Xs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/numpy/core/shape_base.py", line 359, in hstack
    return _nx.concatenate(arrs, 1, dtype=dtype, casting=casting)
ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 1022

--------------------------------------------------------------------------------
8000 fits failed with the following error:
Traceback (most recent call last):
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 423, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 377, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 778, in fit_transform
    return self._hstack(list(Xs))
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 903, in _hstack
    return np.hstack(Xs)
  File "/home/martin/Documents/PolicyPriceAI/venv/lib/python3.10/site-packages/numpy/core/shape_base.py", line 359, in hstack
    return _nx.concatenate(arrs, 1, dtype=dtype, casting=casting)
ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 1023


In [None]:
best_model_1 = random_search_1.best_estimator_
best_model_1

## With Binning `bmi` Outside Pipeline

In [15]:
X_bmi_nom = X.copy()

In [16]:
X_bmi_nom.bmi = X_bmi_nom.bmi.apply(split_bmi_in_three)

### Hold-Out

In [17]:
X_bmi_nom_train, X_bmi_nom_test, y_train, y_test =\
train_test_split(X_bmi_nom, y,
                 shuffle=True,
                 train_size=0.85,
                 random_state=42,
                 stratify=X['smoker'])

### Pipeline

In [18]:
encoder_2 = ColumnTransformer(
    transformers=[
        ("bin", ohe_bin, ["sex", "smoker"]),
        ("nom", ohe_nom, ["bmi", "region"])
    ],
    remainder="passthrough"
)
encoder_2

In [20]:
model_2 = make_pipeline(encoder_2, poly, std, en)
model_2

### Training & Score

In [24]:
%%time

params = {
    "elasticnet__alpha": uniform(0, 2),
    "elasticnet__l1_ratio": uniform(0, 1)
}

random_search_2 = RandomizedSearchCV(
    model_2,
    param_distributions=params,
    n_iter=2_000,
    cv=10,
    n_jobs=-1,
    random_state=42
)

random_search_2.fit(X_bmi_nom_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


CPU times: user 35.2 s, sys: 829 ms, total: 36 s
Wall time: 1min 17s


In [25]:
best_model_2 = random_search_2.best_estimator_
best_model_2

In [26]:
best_model_2.fit(X_bmi_nom_train, y_train)
best_model_2.score(X_bmi_nom_test, y_test)

0.9177335980741432

# 💿 Save model

In [27]:
joblib.dump(best_model_2, "model.joblib")

['model.joblib']

# Cook's Distance

In [30]:
X_train_preproc = best_model_2[:-1].fit_transform(X_bmi_nom_train)
X_train_preproc.shape

(1136, 55)

## With `statsmodels`

In [31]:
sm_model = sm.OLS(y_train, sm.add_constant(X_train_preproc)).fit()

In [33]:
dir(sm_model)

['HC0_se',
 'HC1_se',
 'HC2_se',
 'HC3_se',
 '_HCCM',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abat_diagonal',
 '_cache',
 '_data_attr',
 '_data_in_cache',
 '_get_robustcov_results',
 '_get_wald_nonlinear',
 '_is_nested',
 '_transform_predict_exog',
 '_use_t',
 '_wexog_singular_values',
 'aic',
 'bic',
 'bse',
 'centered_tss',
 'compare_f_test',
 'compare_lm_test',
 'compare_lr_test',
 'condition_number',
 'conf_int',
 'conf_int_el',
 'cov_HC0',
 'cov_HC1',
 'cov_HC2',
 'cov_HC3',
 'cov_kwds',
 'cov_params',
 'cov_type',
 'df_model',
 'df_resid',
 'eigenvals',
 'el_test',
 'ess',
 'f_pvalue',
 'f_test',
 'fittedvalues',
 'fvalue',
 'get_influence',
 'get_predi

In [128]:
influence = sm_model.get_influence()
cook_distance = influence.cooks_distance[0]

In [129]:
type(cook_distance)

numpy.ndarray

In [130]:
cook_distance.shape

(1136,)

In [125]:
n, p = X_train_preproc.shape

In [127]:
cook_threshold = 4 / (n - p)
cook_threshold

0.0037002775208140612

In [132]:
(cook_distance > cook_threshold).sum()

46

<font color="orangered">**There are 46 influent values. Let's retrieve their indexes.**</font>

In [133]:
(cook_distance > cook_threshold).index()

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [134]:
condition = cook_distance > cook_threshold

### Retrieving Indexes

In [136]:
indexes = np.where(condition)
indexes

(array([  15,   45,   70,   82,   93,  105,  130,  193,  199,  301,  320,
         330,  387,  412,  418,  445,  466,  481,  488,  536,  554,  568,
         642,  706,  715,  716,  779,  794,  798,  802,  821,  847,  893,
         906,  931,  944,  946,  986,  987, 1039, 1055, 1077, 1092, 1100,
        1119, 1124]),)

### Retrieving Records

In [141]:
df = pd.read_csv("csvs/cleaned_dataset.csv")
df_influents = df.iloc[indexes]
df_influents

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
15,19,male,24.6,1,no,southwest,1837.237
45,55,male,37.3,0,no,southwest,20630.28351
70,27,female,24.75,0,yes,southeast,16577.7795
82,22,male,37.62,1,yes,southeast,37165.1638
93,35,male,34.77,2,no,northwest,5729.0053
105,20,male,28.025,1,yes,northwest,17560.37975
130,59,female,26.505,0,no,northeast,12815.44495
193,56,female,26.6,1,no,northwest,12044.342
199,64,female,39.33,0,no,northeast,14901.5167
301,53,female,22.61,3,yes,northeast,24873.3849


### 💿 Exporting Influents

In [142]:
df_influents.to_csv("csvs/influents.csv")

## With `sklearn`

In [44]:
sl_model = LinearRegression().fit(X_train_preproc, y_train)

In [46]:
n, p = X_train_preproc.shape

### GPT's Suggestion

In [48]:
alpha = 0.05
cook_threshold = chi2.ppf(1 - alpha / ( 2 * n), df=p)

In [49]:
cook_threshold

108.65133684155941

In [51]:
residuals = y_train - sl_model.predict(X_train_preproc)

In [53]:
hat_matrix = (
    X_train_preproc @
    np.linalg.inv(X_train_preproc.T @  X_train_preproc ) @ 
    X_train_preproc.T
)

LinAlgError: Singular matrix

In [56]:
np.linalg.det(X_train_preproc.T @ X_train_preproc)

0.0

### Ugly Formula

In [97]:
X_tp = X_train_preproc.copy()

In [98]:
X_tp.shape

(1136, 55)

In [99]:
# Reduced Train Sets: X_red, y_red
# X_reds[i] is X, y without the ith row
X_y_reds = []
for idx in range(len(X_tp)):
    X_y_reds.append((np.delete(X, i, axis=0), np.delete(y_train, i, axis=0)))

In [100]:
X_reds[0].shape

(1135, 55)

In [101]:
betas_reds = []
for X_red, y_red in X_y_reds:
    model = LinearRegression().fit(X_red, y_red)
    betas_reds.append(model.coef_)  

In [102]:
beta = LinearRegression().fit(X, y_train).coef_

In [104]:
beta.shape

(55,)

In [105]:
deltas_beta = [beta - beta_red for beta_red in betas_reds]

In [124]:
all(deltas_beta[0] == deltas_beta[2])

True

In [107]:
len(deltas_beta)

1136

In [108]:
residuals = y_train - sl_model.predict(X_train_preproc)

In [109]:
n, p = X_train_preproc.shape

In [110]:
residuals ** 2

1094    0.166202
94      0.033765
349     0.057924
410     0.080895
981     0.048910
          ...   
674     0.001376
752     0.054482
319     0.048461
934     0.013979
1113    0.025091
Name: charges, Length: 1136, dtype: float64

In [111]:
residuals_variance = 1 / (n - p) * np.sum(residuals ** 2)

In [113]:
mat = X_tp.T @ X_tp

In [114]:
mat.shape

(55, 55)

In [121]:
np.round(mat)

array([[   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0., 1136.,   62., ...,  -26.,  -21.,   -5.],
       [   0.,   62., 1136., ...,  -46.,    7.,  -11.],
       ...,
       [   0.,  -26.,  -46., ..., 1136.,  230.,  -31.],
       [   0.,  -21.,    7., ...,  230., 1136.,  947.],
       [   0.,   -5.,  -11., ...,  -31.,  947., 1136.]])

In [115]:
cooks = []
for db in deltas_beta:
    cooks.append(1 / p * (db.T @ mat @ db) / residuals_variance)

In [116]:
cooks

[0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.36666561469684356,
 0.3666656

In [144]:
4 / (n - p)

0.0037002775208140612

### Jeremy's Method

In [146]:
model_j = LinearRegression()
model_j.fit(X_tp, y_train)

In [148]:
y_tp = model_j.predict(X_tp)

In [151]:
mse = mean_squared_error(y_train, y_tp)

In [152]:
# Leverage Matrix H
H = X_tp @ np.linalg.inv(X_tp.T @ X_tp) @ X_tp

LinAlgError: Singular matrix

# 💿 Removing Influent Outliers 

In [167]:
df_std = df.iloc[list(set(df.index) - set(indexes[0]))]
df_std

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1332,50,male,30.970,3,no,northwest,10600.54830
1333,18,female,31.920,0,no,northeast,2205.98080
1334,18,female,36.850,0,no,southeast,1629.83350
1335,21,female,25.800,0,no,southwest,2007.94500


In [168]:
df_std.to_csv("csvs/standard.csv")

# Training Again on *Inliers*

In [169]:
y2 = df_std.pop("charges")
X2 = df_std

y2 = np.log(y2 + 1)

In [171]:
X2_bmi_nom = X2.copy()
X2_bmi_nom.bmi = X2_bmi_nom.bmi.apply(split_bmi_in_three)

In [176]:
X2_bmi_nom_train, X2bmi_nom_test, y2_train, y2_test = train_test_split(
    X2, y2,
    shuffle=True,
    train_size=0.85,
    random_state=42,
    stratify=X2_bmi_nom['smoker']
)

In [181]:
ohe_nom_2 = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)
ohe_bin_2 = OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse_output=False)

In [182]:
encoder_3 = ColumnTransformer(
    transformers=[
        ("bin", ohe_bin_2, ["sex", "smoker"]),
        ("nom", ohe_nom_2, ["bmi", "region"])
    ],
    remainder="passthrough"
)
encoder_3

In [183]:
model_3 = make_pipeline(encoder_3, poly, std, en)
model_3

In [None]:
%%time

params = {
    "elasticnet__alpha": uniform(0, 2),
    "elasticnet__l1_ratio": uniform(0, 1)
}

random_search_3 = RandomizedSearchCV(
    model_3,
    param_distributions=params,
    n_iter=2_000,
    cv=10,
    n_jobs=-1,
    random_state=42
)

random_search_3.fit(X2_bmi_nom_train, y2_train)



In [180]:
best_model_3 = random_search_3.best_estimator_
best_model_3

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

In [None]:
best_model_3.fit(X2_bmi_nom_train, y2_train)
best_model_3.score(X2_bmi_nom_test, y2_test)