In [None]:
# standard libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings, gc
warnings.simplefilter('ignore')

# sklearn
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.cluster import *
from sklearn.preprocessing import *
from sklearn.pipeline import *
from sklearn.compose import *
from sklearn.utils import *

### Load Data

In [None]:
# read the raw file and discard its first column (the ID)
CSV = '/kaggle/input/medical-cost-dataset/medical_cost.csv'
df = pd.read_csv(CSV).iloc[:, 1:]

# keep two decimal places on the continuous columns
for rounded_col in ('bmi', 'charges'):
    df[rounded_col] = df[rounded_col].apply(lambda value: round(value, 2))

# categorical features: cast to `category`, then keep only the integer codes
cat_cols = ['sex', 'smoker', 'region']
df[cat_cols] = df[cat_cols].astype('category').apply(lambda col: col.cat.codes)

# view
df.head()

### De-Duplication

In [None]:
# rows whose values in every column except the last repeat an earlier row
feature_cols = list(df.columns[:-1])
duplicate = df.loc[df.duplicated(subset=feature_cols)]
duplicate

**I have two options here:**

1. **drop duplicates (easy)**
2. **merge duplicates (little tricky)**

**I will go with option #2. The aggregation strategy I'm going to use is the MEAN of `charges`.**

In [None]:
# merge duplicates: rows that agree on every column except the last collapse
# into a single row whose `charges` is the mean of the group.
# The aggregation is named by the string 'mean' rather than the `np.mean`
# callable: recent pandas releases raise a FutureWarning for the callable form
# and document the string as the way to keep the current behaviour.
df = df.groupby(df.columns[:-1].tolist()).agg({'charges': 'mean'}).reset_index()

### Cluster Feature

In [None]:
# Build a separate dataset for the cluster feature: one row per distinct
# combination of the feature columns.
# `charges` is the regression target and is deliberately kept OUT of the
# clustering input. Clustering on it would encode the target into
# `cluster_feat`, which is later fed to the regressor as an input (target leakage).
cluster_feat = df.columns.drop('charges').tolist()
cluster_df = df[cluster_feat].drop_duplicates().reset_index(drop=True)


# settings shared by every BisectingKMeans fit below
kwargs = {
        "init": 'k-means++',
        # fixed seed: a freshly drawn random seed made the chosen cluster count
        # (and every downstream metric) change on each run of the notebook
        "random_state": 42,
        "max_iter": 100,
        "bisecting_strategy": 'largest_cluster',
        }

# score each candidate cluster count; keep the labels so the winning
# clustering does not have to be fitted a second time
sil_scores = {}
labels_by_n = {}
for n in range(2, 10):
    labels = BisectingKMeans(n_clusters=n, **kwargs).fit_predict(cluster_df)
    labels_by_n[n] = labels
    sil_scores[n] = round(silhouette_score(cluster_df, labels), 3)


# get the n_cluster with highest silhouette score (smallest n wins a tie)
sil_score_max = max(sil_scores.values())
n_cluster = [key for key, val in sil_scores.items() if val == sil_score_max]


# applying the cluster to the dataset
cluster_df['cluster_feat'] = labels_by_n[n_cluster[0]]


# attach the cluster label to every row of the main dataset.
# `cluster_df` holds only the feature columns plus the label, so no duplicate
# `charges` column is created; `validate` asserts one label per feature combination.
df = pd.merge(df, cluster_df, on=cluster_feat, how='left', validate='many_to_one')


# view
df.head()

### Convert Age to Category

In [None]:
# Age bands are right-closed intervals, stored as integer labels:
#   0 -> Baby/Toddler  (0, 3]
#   1 -> Child         (3, 17]
#   2 -> Adult         (17, 63]
#   3 -> Elderly       (63, 99]
age_edges = [0, 3, 17, 63, 99]
age_codes = [0, 1, 2, 3]

df['age'] = pd.cut(x=df['age'], bins=age_edges, labels=age_codes)

### Feature Engineering

In [None]:
# features & target (`y` stays a one-column DataFrame, as later cells receive it)
x = df.drop(columns='charges')
y = df[['charges']]

# train & validation split: 10% of the rows are held out.
# `random_state` is pinned so the same rows land in each part on every run;
# without it the reported MAE changes each time the notebook is executed.
split = 0.1
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=split, shuffle=True, random_state=42
)
print(f"Training size: {x_train.shape[0]}\nValidation size: {x_val.shape[0]}")

### Base Regressor Pipeline

In [None]:
# column roles: `bmi` is scaled, everything else is one-hot encoded
num_cols = ['bmi']
cat_cols = ['age', 'sex', 'children', 'smoker', 'region', 'cluster_feat']

# base regressor with default hyperparameters.
# `random_state` is pinned so the baseline MAE (and the grid search, which
# clones this pipeline) gives the same numbers on every run.
regressor = PassiveAggressiveRegressor(random_state=42)

# numerical column transformer
num_transformer = Pipeline(steps=[("scale", StandardScaler())])

# categorical column transformer; categories not seen during fit are ignored
# at predict time instead of raising
cat_transformer = Pipeline(steps=[("encode", OneHotEncoder(handle_unknown='ignore'))])

# pre-processor: route each column group to its transformer
preprocessor = ColumnTransformer(transformers=
        [
            ("num", num_transformer, num_cols),
            ("cat", cat_transformer, cat_cols)
        ])

# build a regression pipeline: preprocessing followed by the regressor
reg_pipe = Pipeline(steps=[
                            ("preprocess", preprocessor),
                            ("regressor", regressor)
])

# fit the training data
reg_pipe.fit(x_train, y_train)

# make prediction on the held-out rows and report the mean absolute error
y_pred = reg_pipe.predict(x_val)
mae = f"{mean_absolute_error(y_val, y_pred):.3f}"
print(f"MAE of our base regressor: {mae}")

**That's a pathetic score! I'll try to fine-tune the hyperparameters.**

<br>

### Tuning Hyperparameters

In [None]:
# param grid

param_grid = {
        # PassiveAggressiveRegressor hyperparameters; the "regressor__" prefix
        # addresses the pipeline step named "regressor"
        "regressor__C": np.linspace(1,5, 5, dtype=np.float64).tolist(),
        "regressor__max_iter": np.linspace(1000,3000, 5, dtype=np.int64).tolist(),
        "regressor__tol": [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
        # validation_fraction must lie strictly between 0 and 1: the previous
        # end points 0 and 1 are rejected by the estimator, so every fit using
        # them failed and only wasted search time.
        # NOTE(review): per the scikit-learn docs this setting is only consulted
        # when early_stopping=True, which this grid never enables - confirm
        # whether early stopping was intended.
        "regressor__validation_fraction": [0.1, 0.25, 0.5, 0.75, 0.9]
    }

In [None]:
# exhaustive search over `param_grid`, run silently on every available core
# NOTE(review): no `scoring` is passed, so the estimator's own default score is
# used for model selection (presumably R^2 for a regressor, not the MAE
# reported elsewhere in this notebook) - confirm this is intended.
grid = GridSearchCV(estimator=reg_pipe, param_grid=param_grid, n_jobs=-1, verbose=0)

# run the search on the training split
grid.fit(x_train, y_train)

In [None]:
# show the winning configuration and its score between two 100-character rules
rule = '*' * 100
print(
    rule,
    f"\nBest Params:{grid.best_params_}\n",
    f"Best Score:{grid.best_score_}\n",
    rule,
    sep='\n',
)

### Tuned Base Regressor Pipeline

In [None]:
# best params: read from the fitted grid search rather than copied by hand
# from an earlier run, so this cell cannot drift out of sync with `grid`.
# Grid keys look like "regressor__C"; stripping the step prefix leaves the
# keyword arguments of the regressor itself.
step_prefix = 'regressor__'
best_params = {
    name[len(step_prefix):]: value
    for name, value in grid.best_params_.items()
    if name.startswith(step_prefix)
}

# tuned regressor; seeded so the reported MAE is the same on every run
regressor_tuned = PassiveAggressiveRegressor(random_state=42, **best_params)

# build a new regression pipeline (the `preprocessor` object is the same
# instance used by `reg_pipe`; it is refitted here on the same training data)
reg_pipe_new = Pipeline(steps=[
                            ("preprocess", preprocessor),
                            ("regressor", regressor_tuned)
])

# fit the training data
reg_pipe_new.fit(x_train, y_train)

# make prediction on the held-out rows and report the mean absolute error
y_pred = reg_pipe_new.predict(x_val)
mae = f"{mean_absolute_error(y_val, y_pred):.3f}"
print(f"MAE of our optimized regressor: {mae}")

**Not a great improvement, but it will suffice for now!**