## Insurance Premium Prediction  
  
Given *data about insurance customers*, let's try to predict the **premium charges** a given customer will incur.  
  
We will use a random forest regression model within a scikit-learn pipeline to make our predictions.  
We will design an interactive widget that will allow us to make predictions.

# Getting Started

In [31]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

import ipywidgets as widgets
from IPython.display import display

In [32]:
# Load the insurance dataset from a CSV file located in the working directory.
data = pd.read_csv('insurance.csv')

In [33]:
# Show the loaded DataFrame as this cell's output.
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [34]:
# Summarize each column's dtype and non-null count to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


# Preprocessing

In [35]:
def preprocess_inputs(df, target='charges', train_size=0.7, random_state=1):
    """Separate the target column from the features and split into train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns together with the target column.
        It is not modified.
    target : str, default 'charges'
        Name of the column to predict.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split`` (fraction or absolute number of
        rows placed in the training split).
    random_state : int or None, default 1
        Seed for the shuffle performed before splitting, so the split is
        repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # Neither column selection nor ``drop`` mutates ``df``, so no defensive
    # copy of the whole frame is required.
    y = df[target]
    X = df.drop(target, axis=1)

    # Shuffled split; seeded so repeated runs produce the same partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [36]:
# Split the loaded data into training and test features/targets.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [37]:
# Inspect the training features.
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
744,50,male,26.410,0,no,northwest
363,21,female,26.400,1,no,southwest
10,25,male,26.220,0,no,northeast
970,50,female,28.160,3,no,southeast
634,51,male,39.700,1,no,southwest
...,...,...,...,...,...,...
715,60,male,28.900,0,no,southwest
905,26,female,29.355,2,no,northeast
1096,51,female,34.960,2,yes,northeast
235,40,female,22.220,2,yes,southeast


In [38]:
# Inspect the training targets.
y_train

744      8827.20990
363      2597.77900
10       2721.32080
970     10702.64240
634      9391.34600
           ...     
715     12146.97100
905      4564.19145
1096    44641.19740
235     19444.26580
1061    11554.22360
Name: charges, Length: 936, dtype: float64

# Building the Pipeline and Training

In [39]:
# Categorical columns to one-hot encode; all remaining feature columns are
# passed through the ColumnTransformer unchanged.
nominal_features = ['sex', 'smoker', 'region']

# drop='first' removes one indicator per feature to avoid redundant columns.
# NOTE: the ``sparse`` keyword was renamed to ``sparse_output`` in
# scikit-learn 1.2 and removed in 1.4 -- switch the keyword when upgrading.
nominal_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse=False, drop='first'))
])

preprocessor = ColumnTransformer(transformers=[
    ('nominal', nominal_transformer, nominal_features)
], remainder='passthrough')

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    # Seeded so the fitted forest (and the metrics reported from it) are
    # reproducible across kernel restarts; matches the seed used for the split.
    ('regressor', RandomForestRegressor(random_state=1))
])

In [40]:
# Fit every pipeline step (encoding, scaling, regressor) on the training split.
model.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('nominal',
                                                  Pipeline(memory=None,
                                                           steps=[('onehot',
                                                                   OneHotEncoder(categorical_features=None,
                                                                                 categories=None,
                                                                                 drop='first',
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='error',
                          

# Results

In [41]:
# Predict charges for the held-out test features.
y_pred = model.predict(X_test)

In [42]:
# Hand-computed regression metrics on the held-out split.
squared_errors = (y_test - y_pred)**2            # per-row squared residuals
total_squares = (y_test - y_test.mean())**2      # squared deviation from the mean target

# Root mean squared error, in the same units as the target.
rmse = np.sqrt(np.mean(squared_errors))
# Coefficient of determination: 1 - SS_res / SS_tot.
r2 = 1 - np.sum(squared_errors) / np.sum(total_squares)

for template, value in (("RMSE: {:.2f}", rmse), (" R^2: {:.5f}", r2)):
    print(template.format(value))

RMSE: 5020.09
 R^2: 0.82217


# Interactive Widget

In [43]:
# --- Numeric inputs (sliders) ---
age_widget = widgets.IntSlider(value=38, min=18, max=64, step=1, description="Age:")
bmi_widget = widgets.FloatSlider(value=30, min=15, max=54, step=0.01, description="BMI:")
children_widget = widgets.IntSlider(value=1, min=0, max=5, step=1, description="Children:")

# --- Categorical inputs ---
sex_widget = widgets.ToggleButtons(options=['female', 'male'], description="Sex:")
smoker_widget = widgets.ToggleButtons(options=['no', 'yes'], description="Smoker:")
region_widget = widgets.Dropdown(
    options=['northeast', 'northwest', 'southeast', 'southwest'],
    description="Region:",
)

# --- Trigger button and the area the prediction is printed into ---
predict_btn = widgets.Button(description="Predict")
prediction_out = widgets.Output()


def make_prediction(btn):
    """Button callback: predict the premium for the current widget values.

    Parameters
    ----------
    btn : widgets.Button
        The clicked button, supplied by ``on_click``; not used.

    Returns
    -------
    None
        The formatted prediction is printed into ``prediction_out``.
    """
    # One-row frame whose column names match the features the pipeline was
    # fitted on.
    x = pd.DataFrame({
        'age':      age_widget.value,
        'sex':      sex_widget.value,
        'bmi':      bmi_widget.value,
        'children': children_widget.value,
        'smoker':   smoker_widget.value,
        'region':   region_widget.value
    }, index=[0])

    # Everything that can fail runs inside the Output context so that an
    # exception raised by predict() is rendered in the widget area instead of
    # disappearing inside the callback.
    with prediction_out:
        # wait=True postpones the clear until new output arrives (no flicker).
        prediction_out.clear_output(wait=True)
        prediction = model.predict(x)
        print("Prediction: {:.4f}".format(prediction[0]))


# Run make_prediction each time the Predict button is clicked.
predict_btn.on_click(make_prediction)


# Render the input controls, the button, and the prediction output area in order.
display(age_widget, bmi_widget, children_widget, sex_widget, smoker_widget, region_widget, predict_btn, prediction_out)

IntSlider(value=38, description='Age:', max=64, min=18)

FloatSlider(value=30.0, description='BMI:', max=54.0, min=15.0, step=0.01)

IntSlider(value=1, description='Children:', max=5)

ToggleButtons(description='Sex:', options=('female', 'male'), value='female')

ToggleButtons(description='Smoker:', options=('no', 'yes'), value='no')

Dropdown(description='Region:', options=('northeast', 'northwest', 'southeast', 'southwest'), value='northeast…

Button(description='Predict', style=ButtonStyle())

Output()