In [15]:
import pandas as pd

In [16]:
df = pd.read_csv("insurance.csv")
df.dropna(axis=0, inplace=True)

In [17]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [18]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [19]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


# Split Testing

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=["charges"]),
    df["charges"],
    test_size=0.2,
    random_state=42,
)

In [23]:
x_train, x_test

(      age     sex     bmi  children smoker     region
 560    46  female  19.950         2     no  northwest
 1285   47  female  24.320         0     no  northeast
 1142   52  female  24.860         0     no  southeast
 969    39  female  34.320         5     no  southeast
 486    54  female  21.470         3     no  northwest
 ...   ...     ...     ...       ...    ...        ...
 1095   18  female  31.350         4     no  northeast
 1130   39  female  23.870         5     no  southeast
 1294   58    male  25.175         0     no  northeast
 860    37  female  47.600         2    yes  southwest
 1126   55    male  29.900         0     no  southwest
 
 [1070 rows x 6 columns],
       age     sex     bmi  children smoker     region
 764    45  female  25.175         2     no  northeast
 887    36  female  30.020         0     no  northwest
 890    64  female  26.885         0    yes  northwest
 1293   46    male  25.745         3     no  northwest
 259    19    male  31.920         0 

In [24]:
y_train, y_test

(560      9193.83850
 1285     8534.67180
 1142    27117.99378
 969      8596.82780
 486     12475.35130
            ...     
 1095     4561.18850
 1130     8582.30230
 1294    11931.12525
 860     46113.51100
 1126    10214.63600
 Name: charges, Length: 1070, dtype: float64,
 764      9095.06825
 887      5272.17580
 890     29330.98315
 1293     9301.89355
 259     33750.29180
            ...     
 109     47055.53210
 575     12222.89830
 535      6067.12675
 543     63770.42801
 846      9872.70100
 Name: charges, Length: 268, dtype: float64)

In [25]:
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [26]:
len(df_train), len(df_test)

(1070, 268)

# Encoder

In [27]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.metrics import median_absolute_error, mean_absolute_error, mean_absolute_percentage_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
import sklearn
import optuna

In [28]:
encoder = ColumnTransformer(
    [
        ('onehot', OneHotEncoder(sparse_output=False, min_frequency=5, handle_unknown='infrequent_if_exist'), ["sex", "smoker", "region"])
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    force_int_remainder_cols=False
)

In [29]:
regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

In [30]:
pipe = Pipeline([
    ("encoder", encoder),
    ("standardization", StandardScaler()),
    ("regressor", regressor)
])

In [31]:
pipe.get_params()
pipe.fit(x_train, y_train)

In [32]:
y_test_pred = pipe.predict(x_test)

In [33]:
mean_absolute_error(y_test, y_test_pred)

2536.3051130123727

In [34]:
mean_absolute_percentage_error(y_test, y_test_pred)

0.3037062973809438

# Test Optuna

In [35]:
study = optuna.create_study(storage="sqlite:///model_selection.db", study_name="Exodia", direction="minimize")

[I 2025-06-07 15:16:38,443] A new study created in RDB with name: Exodia


In [36]:
def objective_func(trial):
    params = {
        'regressor__n_estimators': trial.suggest_int("regressor__n_estimators", low= 10, high=100),
        'regressor__criterion': trial.suggest_categorical("regressor__criterion", ['squared_error', 'absolute_error']),
        'encoder__onehot__min_frequency': trial.suggest_int("encoder__onehot__min_frequency", 1, 10),
        'regressor__max_depth': trial.suggest_int("regressor__max_depth", 3, 20)
    }

    pipe.set_params(**params)
    
    scores = cross_validate(
        pipe,
        x_train,
        y_train,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False),
        cv=KFold(shuffle=True, random_state=42)
    )

    return abs(scores["test_score"].mean())


In [37]:
study.optimize(objective_func, n_trials=100, n_jobs=-1)

[I 2025-06-07 15:16:41,334] Trial 4 finished with value: 2521.8762745511112 and parameters: {'regressor__n_estimators': 23, 'regressor__criterion': 'absolute_error', 'encoder__onehot__min_frequency': 6, 'regressor__max_depth': 11}. Best is trial 4 with value: 2521.8762745511112.
[I 2025-06-07 15:16:43,345] Trial 0 finished with value: 2471.8301677098925 and parameters: {'regressor__n_estimators': 21, 'regressor__criterion': 'absolute_error', 'encoder__onehot__min_frequency': 1, 'regressor__max_depth': 10}. Best is trial 0 with value: 2471.8301677098925.
[I 2025-06-07 15:16:43,646] Trial 3 finished with value: 2573.683261525547 and parameters: {'regressor__n_estimators': 62, 'regressor__criterion': 'absolute_error', 'encoder__onehot__min_frequency': 2, 'regressor__max_depth': 5}. Best is trial 0 with value: 2471.8301677098925.
[I 2025-06-07 15:16:43,667] Trial 5 finished with value: 2689.9128243082314 and parameters: {'regressor__n_estimators': 29, 'regressor__criterion': 'squared_error

In [38]:
pipe.set_params(**study.best_params)

In [39]:
study.best_params

{'regressor__n_estimators': 47,
 'regressor__criterion': 'absolute_error',
 'encoder__onehot__min_frequency': 8,
 'regressor__max_depth': 5}

In [40]:
pipe.fit(x_train, y_train)

In [41]:
preds = pipe.predict(x_test)        

In [42]:
mean_absolute_error(y_test, preds)

1803.8793305408856

In [43]:
mean_absolute_percentage_error(y_test, preds)

0.1334267523960632

# Aggiungere una colonna booleana per salutare o no

In [44]:
df["salutare"] = df["bmi"].between(18.5, 24.9)

In [45]:
df[["bmi", "salutare"]].head()


Unnamed: 0,bmi,salutare
0,27.9,False
1,33.77,False
2,33.0,False
3,22.705,True
4,28.88,False


In [46]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,salutare
0,19,female,27.9,0,yes,southwest,16884.924,False
1,18,male,33.77,1,no,southeast,1725.5523,False
2,28,male,33.0,3,no,southeast,4449.462,False
3,33,male,22.705,0,no,northwest,21984.47061,True
4,32,male,28.88,0,no,northwest,3866.8552,False


In [47]:
df["salutare"].value_counts()

salutare
False    1116
True      222
Name: count, dtype: int64

In [48]:
def categoria_bmi(bmi):
    if bmi < 18.5:
        return "Sottopeso"
    elif bmi < 25:
        return "Normopeso"
    elif bmi < 30:
        return "Sovrappeso"
    else:
        return "Obeso"

df["categoria_bmi"] = df["bmi"].apply(categoria_bmi)


In [49]:
print(df[["bmi", "categoria_bmi"]].head())
print(df["categoria_bmi"].value_counts())


      bmi categoria_bmi
0  27.900    Sovrappeso
1  33.770         Obeso
2  33.000         Obeso
3  22.705     Normopeso
4  28.880    Sovrappeso
categoria_bmi
Obeso         707
Sovrappeso    386
Normopeso     225
Sottopeso      20
Name: count, dtype: int64


In [50]:

import plotly.express as px


# Categorize BMI
def categoria_bmi(bmi):
    if bmi < 18.5:
        return "Sottopeso"
    elif bmi < 25:
        return "Normopeso"
    elif bmi < 30:
        return "Sovrappeso"
    else:
        return "Obeso"

df["categoria_bmi"] = df["bmi"].apply(categoria_bmi)

# Create a scatter plot with Plotly
fig = px.scatter(
    df,
    x="age",
    y="charges",
    color="categoria_bmi",
    title="Age vs Charges grouped by BMI Category",
    labels={"age": "Age", "charges": "Insurance Charges", "categoria_bmi": "BMI Category"},
    hover_data=["bmi"]
)

fig.update_layout(template="plotly_white")
fig.show()
