# Imports & Setup

In [1]:
from typing import List

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import chi2, uniform
import statsmodels.api as sm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import  FunctionTransformer, OneHotEncoder, PolynomialFeatures, StandardScaler

In [2]:
# Enable diagrams to visualize pipelines
from sklearn import set_config
set_config(display="diagram")

# Shut down warnings for output readability
import warnings
warnings.filterwarnings("ignore")

# Functions and Classes

In [3]:
class ThresholdBinningTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column: str, bins: List[float], labels=List[str]):
        self.column = column
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        # No fitting necessary for this transformer
        return self

    def transform(self, X):
        if self.column in X.columns:
            X_binned = pd.cut(X[self.column],
                              bins=self.bins, labels=self.labels,
                              right=False)  # left edge inclusive, right edge exclusive
            X_transformed = X.copy()
            X_transformed[self.column] = X_binned
            return X_transformed
        else:
            raise ValueError(f"Column {self.column} not in input")

    def get_feature_names_out(self, input_features=None):
        return [self.column]

# Data Loading & Separating Features / Target

In [4]:
df = pd.read_csv("csvs/cleaned_dataset.csv")

In [5]:
y = df.pop("charges")
X = df

### Modifying `y`'s shape

In [6]:
y = np.log(y + 1)

# Preprocessing

## Hold-Out

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    shuffle=True,
                                                    train_size=0.85,
                                                    random_state=42,
                                                    stratify=X['smoker'])

## Pipeline

In [8]:
bmi_edges = [0.0, 25.0, 30.0, np.inf]
bmi_cats = ["underweight_normal", "overweight", "obesity"]

bmi_categorizer = ThresholdBinningTransformer(column="bmi", bins=bmi_edges, labels=bmi_cats)    

In [9]:
ohe_nom = OneHotEncoder(drop="first", handle_unknown="ignore")
ohe_bin = OneHotEncoder(drop="if_binary", handle_unknown="ignore")
poly = PolynomialFeatures(degree=2)
std = StandardScaler()

In [21]:
en = ElasticNet(random_state=42, max_iter=10_000, tol=1e-3)

In [11]:
pipe_bmi = make_pipeline(bmi_categorizer, ohe_nom)
pipe_bmi

In [20]:
encoder = ColumnTransformer(
    transformers = [
        ("bmi", pipe_bmi, ["bmi"]),
        ("bin", ohe_bin, ["sex", "smoker"]),
        ("ohe", ohe_nom, ["region"])
    ],
    remainder="passthrough",
    # Avoid prefixing output features by transformer's name
    verbose_feature_names_out=False
)

encoder

In [13]:
model = make_pipeline(encoder, poly, std, en)
model

### Training & Score

In [14]:
%time

params = {
    "elasticnet__alpha": uniform(0, 2),
    "elasticnet__l1_ratio": uniform(0, 1)
}

random_search = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_iter=2_000,
    cv=10,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 4.05 µs


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [15]:
best_model = random_search.best_estimator_
best_model

In [16]:
best_model.fit(X_train, y_train)
best_model.score(X_test, y_test)

0.9177282544723908

# 💿 Save model

In [17]:
joblib.dump(best_model, "model.joblib")

# Coefficients Analysis

In [19]:
coefs = best_model[-1].coef_

feats = best_model[:-1].get_feature_names_out()

scores = {feat: coef for feat, coef in zip(feats, coefs)}

coefs = (pd.DataFrame(scores, index=["coef"])
         .T
         .sort_values(by="coef", key=lambda x: abs(x), ascending=False)
         .reset_index()
         .rename(columns={"index": "features"})
        )
coefs

Unnamed: 0,features,coef
0,smoker_yes,0.742355
1,age,0.669828
2,smoker_yes age,-0.542532
3,smoker_yes^2,0.491877
4,children,0.381042
5,age children,-0.191016
6,age^2,-0.161262
7,region_southeast age,0.157041
8,bmi_underweight_normal smoker_yes,-0.144435
9,region_southeast,-0.139555


# Cook's Distance With `statsmodels`

## Cook's Distance Over Threshold

In [29]:
X_train_preproc = best_model_1[:-1].fit_transform(X_train)
X_train_preproc.shape

(1136, 55)

In [30]:
sm_model = sm.OLS(y_train, sm.add_constant(X_train_preproc)).fit()

In [32]:
influence = sm_model.get_influence()
cook_distance = influence.cooks_distance[0]

In [35]:
n, p = X_train_preproc.shape

In [36]:
cook_threshold = 4 / (n - p)
cook_threshold

0.0037002775208140612

In [37]:
(cook_distance > cook_threshold).sum()

47

<font color="orangered">**There are 47 influent values. Let's retrieve their indexes.**</font>

## Retrieving Indexes

In [38]:
condition = cook_distance > cook_threshold

In [74]:
condition[:10]

array([False, False, False, False, False, False, False, False, False,
       False])

In [72]:
cook_distance[:7]

array([2.35005115e-03, 2.73841021e-04, 3.34498866e-04, 4.49087339e-04,
       3.52868123e-04, 2.38067281e-04, 2.03658633e-05])

In [75]:
indexes = np.where(condition)[0]
indexes

array([  15,   33,   45,   82,   93,  105,  130,  170,  193,  199,  301,
        320,  330,  387,  412,  418,  445,  466,  481,  488,  515,  536,
        554,  568,  642,  700,  706,  715,  716,  779,  794,  798,  802,
        821,  847,  893,  906,  931,  946,  987, 1039, 1055, 1077, 1092,
       1100, 1119, 1124])

## Retrieving Records

In [79]:
df = pd.read_csv("csvs/cleaned_dataset.csv")
df_influents = df.iloc[indexes]
df_influents

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
15,19,male,24.6,1,no,southwest,1837.237
33,63,male,28.31,0,no,northwest,13770.0979
45,55,male,37.3,0,no,southwest,20630.28351
82,22,male,37.62,1,yes,southeast,37165.1638
93,35,male,34.77,2,no,northwest,5729.0053
105,20,male,28.025,1,yes,northwest,17560.37975
130,59,female,26.505,0,no,northeast,12815.44495
170,63,male,41.47,0,no,southeast,13405.3903
193,56,female,26.6,1,no,northwest,12044.342
199,64,female,39.33,0,no,northeast,14901.5167


## 💿 Exporting Influents

In [80]:
df_influents.to_csv("csvs/influents.csv")

# 💿 Removing Influent Outliers 

In [88]:
df_std = df.iloc[list(set(df.index) - set(indexes))]
df_std

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1332,50,male,30.970,3,no,northwest,10600.54830
1333,18,female,31.920,0,no,northeast,2205.98080
1334,18,female,36.850,0,no,southeast,1629.83350
1335,21,female,25.800,0,no,southwest,2007.94500


In [43]:
df_std.to_csv("csvs/standard.csv")

# Training Again on *Inliers*

## Full Set

In [50]:
y_std = df_std.pop("charges")
X_std = df_std

y_std = np.log(y_std + 1)

In [51]:
X_std_train, X_std_test, y_std_train, y_std_test = train_test_split(
    X_std, y_std,
    shuffle=True,
    train_size=0.85,
    random_state=42,
    stratify=X_std['smoker']
)

In [52]:
model_1

In [53]:
%time

params = {
    "elasticnet__alpha": uniform(0, 2),
    "elasticnet__l1_ratio": uniform(0, 1)
}

random_search_std = RandomizedSearchCV(
    model_1,
    param_distributions=params,
    n_iter=2_000,
    cv=10,
    n_jobs=-1,
    random_state=42
)

random_search_std.fit(X_std_train, y_std_train)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [54]:
best_model_std = random_search_std.best_estimator_
best_model_std

In [55]:
best_model_std.fit(X_std_train, y_std_train)
best_model_std.score(X_std_test, y_std_test)

0.8744644131171115

❗ <font color="orangered">**Unfortunately, the score decreased on the full set of inliers.**</font>

## Removing Less Influent Outliers

In [83]:
indexes_distances = [(idx, cook_distance[idx]) for idx in indexes]
indexes_distances

[(15, 0.008306850127064848),
 (33, 0.0037997847502153135),
 (45, 0.015184846179995801),
 (82, 0.01203380250882871),
 (93, 0.005346311188249809),
 (105, 0.006959410704969607),
 (130, 0.007336209543726537),
 (170, 0.003708339258721471),
 (193, 0.0039059761259713733),
 (199, 0.025904353001741615),
 (301, 0.004863616474957394),
 (320, 0.028445742042196277),
 (330, 0.014952546669430503),
 (387, 0.004314863890328738),
 (412, 0.00950938684896455),
 (418, 0.004189455371781017),
 (445, 0.01786760788344977),
 (466, 0.009182577819122765),
 (481, 0.03890431718588168),
 (488, 0.011018843583291337),
 (515, 0.0037412068560326284),
 (536, 0.006299967925762568),
 (554, 0.009318906131777504),
 (568, 0.013509841715765587),
 (642, 0.014734447195287492),
 (700, 0.0038750826347385805),
 (706, 0.016518577801527753),
 (715, 0.008666514863767988),
 (716, 0.011613622135727059),
 (779, 0.017120173903313857),
 (794, 0.005311037233317975),
 (798, 0.005799237670267378),
 (802, 0.006625783739215365),
 (821, 0.006703

In [84]:
len(indexes_distances)

47

In [103]:
# Storing indexes for influent outliers
# by decreasing Cook distance order
sorted_indexes = [
    tup[0]
    for tup in sorted(indexes_distances, key = lambda tup: tup[1], reverse=True)
]

In [None]:
for size in range(40, 0, -5):
    df_std = df.iloc[list(set(df.index) - set(sorted_indexes[:-size]))]
    

In [105]:
df_inliers = []

for size in range(len(sorted_indexes) - 1):
    df_inliers.append(df.iloc[list(set(df.index) - set(sorted_indexes[:size + 1]))])

In [106]:
df_inliers

[      age     sex     bmi  children smoker     region      charges
 0      19  female  27.900         0    yes  southwest  16884.92400
 1      18    male  33.770         1     no  southeast   1725.55230
 2      28    male  33.000         3     no  southeast   4449.46200
 3      33    male  22.705         0     no  northwest  21984.47061
 4      32    male  28.880         0     no  northwest   3866.85520
 ...   ...     ...     ...       ...    ...        ...          ...
 1332   50    male  30.970         3     no  northwest  10600.54830
 1333   18  female  31.920         0     no  northeast   2205.98080
 1334   18  female  36.850         0     no  southeast   1629.83350
 1335   21  female  25.800         0     no  southwest   2007.94500
 1336   61  female  29.070         0    yes  northwest  29141.36030
 
 [1336 rows x 7 columns],
       age     sex     bmi  children smoker     region      charges
 0      19  female  27.900         0    yes  southwest  16884.92400
 1      18    male  

In [107]:
params = {
    "elasticnet__alpha": uniform(0, 2),
    "elasticnet__l1_ratio": uniform(0, 1)
}

for idx, df_inl in enumerate(df_inliers, 1):
    print(f"Removing {idx} influent outlier...")

    y_inl = df_inl.pop("charges")
    X_inl = df_inl

    y_inl = np.log(y_inl + 1)

    X_inl_train, X_inl_test, y_inl_train, y_inl_test = train_test_split(
        X_inl, y_inl,
        shuffle=True,
        train_size=0.85,
        random_state=42,
        stratify=X_inl["smoker"]
    )

    random_search_inl = RandomizedSearchCV(
        model_1,
        param_distributions=params,
        n_iter=2_000,
        cv=10,
        n_jobs=-1,
        random_state=42
    )
    random_search_inl.fit(X_inl_train, y_inl_train)

    best_model_inl = random_search_inl.best_estimator_

    best_model_inl.fit(X_inl_train, y_inl_train)
    print(f"Score on test set: {best_model_inl.score(X_inl_test, y_inl_test)}")    

Removing 1 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.9206453256185483
Removing 2 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8392845486056807
Removing 3 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8445403187438726
Removing 4 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8424606624635211
Removing 5 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8404350663848302
Removing 6 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8366288602510121
Removing 7 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.876069356177881
Removing 8 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8342930917951539
Removing 9 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8249583277517898
Removing 10 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8188553790807344
Removing 11 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.838695284570108
Removing 12 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8373359497922428
Removing 13 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8633151385654595
Removing 14 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8644056345660249
Removing 15 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8766972252589065
Removing 16 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8522096412198021
Removing 17 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8408706801636163
Removing 18 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8460315769910007
Removing 19 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8462004911181027
Removing 20 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8385057490193117
Removing 21 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.7936713836721414
Removing 22 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8310459719057413
Removing 23 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8187619949394133
Removing 24 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8236966001032939
Removing 25 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.7939811512847306
Removing 26 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.789802592113808
Removing 27 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8051238209217547
Removing 28 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8175332821511268
Removing 29 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8331709486603943
Removing 30 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.7705165125863747
Removing 31 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8023519563901491
Removing 32 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8062251643003536
Removing 33 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8034612896399805
Removing 34 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.7924730468589006
Removing 35 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.7912838917926104
Removing 36 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.7870915312857132
Removing 37 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8840829175921164
Removing 38 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.880655898141904
Removing 39 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8813448426675599
Removing 40 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8793359150302352
Removing 41 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8033741735649231
Removing 42 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8243419494768763
Removing 43 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8151539091802426
Removing 44 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8016442130019144
Removing 45 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8618870284068494
Removing 46 influent outlier...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8558063226859249


In [94]:
%%time

df_inliers = [
    df.iloc[list(set(df.index) - set(sorted_indexes[:-size]))]
    for size in range(45, 0, -5)
]

n_outliers = 45
params = {
    "elasticnet__alpha": uniform(0, 2),
    "elasticnet__l1_ratio": uniform(0, 1)
}

for df_inl in df_inliers:
    print(f"Nb of influent outliers removed: {n_outliers}")
    y_inl = df_inl.pop("charges")
    X_inl = df_inl

    y_inl = np.log(y_inl + 1)

    X_inl_train, X_inl_test, y_inl_train, y_inl_test = train_test_split(
        X_inl, y_inl,
        shuffle=True,
        train_size=0.85,
        random_state=42,
        stratify=X_inl["smoker"]
    )

    random_search_inl = RandomizedSearchCV(
        model_1,
        param_distributions=params,
        n_iter=2_000,
        cv=10,
        n_jobs=-1,
        random_state=42
    )
    random_search_inl.fit(X_inl_train, y_inl_train)

    best_model_inl = random_search_inl.best_estimator_

    best_model_inl.fit(X_inl_train, y_inl_train)
    print(f"Score on test set: {best_model_inl.score(X_inl_test, y_inl_test)}")
    n_outliers -= 5

Nb of influent outliers removed: 45


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8296846358906406
Nb of influent outliers removed: 40


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.7811590767736788
Nb of influent outliers removed: 35


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8384940317260465
Nb of influent outliers removed: 30


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8593530739023003
Nb of influent outliers removed: 25


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8538988977252228
Nb of influent outliers removed: 20


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.7794676233725021
Nb of influent outliers removed: 15


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.7929996720714829
Nb of influent outliers removed: 10


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8395684073093069
Nb of influent outliers removed: 5


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Score on test set: 0.8017353176140976
CPU times: user 5min 39s, sys: 39 s, total: 6min 18s
Wall time: 6min 29s
