### **Insurance Premium Predictions**

**Disclaimer: the notebook needs to be run in a live environment (Jupyter Notebook, Google Colab, etc.) for the interactive widgets to function; on GitHub they are rendered as static output.** 

### **Problem statement: predicting the premium charges a given customer will incur, given data about insurance customers.**

**Import libraries, packages, modules**

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import ipywidgets as widgets
from IPython.display import display

**Import the dataset**

In [None]:
# Location of the CSV file. The path lives under /content/drive, i.e. it presumably expects
# Google Drive to be mounted in Colab; edit this single constant to run the notebook elsewhere.
DATA_PATH='/content/drive/MyDrive/Colab Notebooks/Insurance_premium_predictions/insurance.csv'

data=pd.read_csv(DATA_PATH)

In [None]:
# Preview the loaded DataFrame.
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


**Check datatypes and missing values.**

In [None]:
# Print each column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## **Preprocessing**

**Preprocessing function**

In [None]:
def preprocessing_inputs(df, target='charges', train_size=0.7, random_state=1):
    """Split a DataFrame into shuffled train/test features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and the ``target`` column.
        It is not modified: only new objects derived from it are returned.
    target : str, default 'charges'
        Name of the column to predict.
    train_size : float, default 0.7
        Fraction of the rows assigned to the training split.
    random_state : int, default 1
        Seed for the shuffle, so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # Split data into X (features) and y (target).
    # No defensive copy is needed: drop() returns a new frame and nothing below mutates df.
    y=df[target]
    X=df.drop(target, axis=1)

    # Train - test split
    X_train, X_test, y_train, y_test=train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
 # Split the dataset into train/test features and targets.
 X_train, X_test, y_train, y_test=preprocessing_inputs(data)

In [None]:
# Inspect the training features (the target column has been removed).
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
744,50,male,26.410,0,no,northwest
363,21,female,26.400,1,no,southwest
10,25,male,26.220,0,no,northeast
970,50,female,28.160,3,no,southeast
634,51,male,39.700,1,no,southwest
...,...,...,...,...,...,...
715,60,male,28.900,0,no,southwest
905,26,female,29.355,2,no,northeast
1096,51,female,34.960,2,yes,northeast
235,40,female,22.220,2,yes,southeast


## **Pipeline and Training**

### **Model 1: Linear Regression**

In [None]:
# Nominal (categorical) columns; these are the ones handed to the one-hot encoder.
nominal_features=['sex', 'smoker', 'region']

In [None]:
# One-hot encode the nominal columns. drop='if_binary' keeps a single 0/1 column for
# two-category features and one column per category otherwise.
# The encoder keeps its default (sparse) output: the `sparse=` keyword used previously was
# renamed to `sparse_output=` in newer scikit-learn releases, so neither spelling works on
# every version. Densification is done by the ColumnTransformer below instead.
nominal_transformer=Pipeline(steps=[
                  ('onehot', OneHotEncoder(drop='if_binary'))
                  ])

# Apply the one-hot encoder to the nominal features and pass through, as-is, any column
# not listed in nominal_features (instead of dropping it).
# sparse_threshold=0 forces a dense array, which StandardScaler needs for mean-centering.
preprocessor=ColumnTransformer(transformers=[
                  ('nominal', nominal_transformer, nominal_features)
                  ], remainder='passthrough', sparse_threshold=0)

# Build the model using the sklearn pipeline: preprocess -> scale -> linear regression.

model1=Pipeline(steps=[
                  ('preprocessor', preprocessor),
                  ('scaler', StandardScaler()),
                  ('regressor', LinearRegression())
                  ])

In [None]:
# Fit the linear regression pipeline on the training split.

model1.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('nominal',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(drop='if_binary',
                                                                                 sparse=False))]),
                                                  ['sex', 'smoker',
                                                   'region'])])),
                ('scaler', StandardScaler()),
                ('regressor', LinearRegression())])

**Results for Linear Regression Model**



In [None]:
# Predict charges for the held-out test features with the linear regression pipeline.

y_pred1=model1.predict(X_test)

In [None]:
# Inspect the predicted charges.
y_pred1

array([ 4.63106370e+03,  1.29030637e+04,  1.25590637e+04,  1.31532578e+04,
        5.91795055e+02,  3.23612578e+04,  1.28005264e+04,  1.23190637e+04,
        3.90306370e+03,  2.98230637e+04,  1.09670637e+04,  1.74245264e+04,
        8.61652641e+03,  8.32779505e+03,  3.31906370e+03,  1.02965264e+04,
        3.99906370e+03,  6.84125776e+03,  1.49357951e+04,  1.43750637e+04,
        1.25430637e+04,  3.32012578e+04,  9.18306370e+03,  8.93506370e+03,
        3.23906370e+03,  8.20779505e+03,  9.40125776e+03,  1.07910637e+04,
        7.66452641e+03,  4.46306370e+03,  1.40557951e+04,  5.73652641e+03,
        3.47052578e+04,  2.75110637e+04,  3.34492578e+04,  9.55906370e+03,
        3.07590637e+04,  2.67699891e+04,  1.51670637e+04,  3.42157951e+04,
        6.34452641e+03,  1.40790637e+04,  1.05452578e+04,  1.49517951e+04,
        4.16052641e+03,  1.30390637e+04,  4.45579505e+03,  2.89910637e+04,
        7.30306370e+03,  1.40099891e+04,  1.32150637e+04,  1.21765264e+04,
        2.03106370e+03,  

In [None]:
# Actual charges of the test split, for comparison with the predictions.
y_test

559      1646.42970
1087    11353.22760
1020     8798.59300
460     10381.47870
802      2103.08000
           ...     
323     11566.30055
1268     1880.48700
134      2457.21115
1274    17043.34140
876     26140.36030
Name: charges, Length: 402, dtype: float64

In [None]:
# Compute the Root Mean Squared Error (RMSE): the typical size, in $, of the gap between
# the real and the estimated insurance premium (large errors weigh more than in a plain average).

rmse1=np.sqrt(np.mean((y_test-y_pred1)**2))
rmse1

6061.727159628868

In [None]:
# Summary statistics of the actual test charges, to put the RMSE into context.
y_test.describe()

count      402.000000
mean     13255.808817
std      11919.315462
min       1131.506600
25%       4936.174925
50%       9830.956600
75%      16694.951712
max      60021.398970
Name: charges, dtype: float64

### **Model 2: Random Forest**

In [None]:
# Same preprocessing and scaling steps as model 1, with a random forest as the regressor.
# random_state seeds the forest so the metrics are reproducible across runs
# (same seed value as the train/test split).
model2=Pipeline(steps=[
                  ('preprocessor', preprocessor),
                  ('scaler', StandardScaler()),
                  ('regressor', RandomForestRegressor(random_state=1))
                  ])

In [None]:
# Fit the random forest pipeline on the training split.

model2.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('nominal',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(drop='if_binary',
                                                                                 sparse=False))]),
                                                  ['sex', 'smoker',
                                                   'region'])])),
                ('scaler', StandardScaler()),
                ('regressor', RandomForestRegressor())])

**Results for Random Forest Model**

In [None]:
# Predict charges for the held-out test features with the random forest pipeline.

y_pred2=model2.predict(X_test)

In [None]:
# Inspect the predicted charges.
y_pred2

array([ 1945.8761953 , 12681.3329759 ,  8956.3405665 , 10818.1827208 ,
        2118.0337475 , 39995.5948542 ,  9762.3532666 , 11654.5384965 ,
        4240.3170239 , 20259.6300188 , 17515.8753955 , 14404.1800152 ,
        6870.0338565 ,  6970.4057745 ,  1799.455     , 10192.2305516 ,
        5203.7566295 ,  6547.2814329 , 15987.8555325 , 13612.512978  ,
       10030.0485579 , 40393.2824149 ,  9918.4453164 ,  9076.8350693 ,
       14026.52375633,  6706.9916285 ,  8923.6659612 ,  8989.8192597 ,
        7758.72457   ,  4083.082414  , 12679.8251618 ,  6113.7864093 ,
       25846.3703062 , 35219.8111425 , 25946.9543845 , 15576.494855  ,
       38978.8332149 , 17385.6768175 , 15351.732309  , 44292.4638134 ,
        7188.504706  , 11798.711329  , 11339.5717122 , 19770.0484462 ,
        6581.145381  , 13206.0395139 ,  2007.310068  , 34634.012037  ,
       10640.8617035 , 17849.3843599 , 16248.6736009 , 15517.9501984 ,
        5453.0639352 , 11768.7709082 , 17403.5454568 ,  5753.4844981 ,
      

In [None]:
# RMSE of the random forest predictions on the test split: square the residuals,
# average them, then take the square root to get back to $.

rmse2=np.sqrt(((y_test-y_pred2)**2).mean())

In [None]:
# Baseline model: total sum of squared errors when always predicting the test-set mean.
((y_test-y_test.mean())**2).sum()

56970102513.23877

In [None]:
# Our model: sum of squared errors of the random forest predictions.
((y_test - y_pred2)**2).sum()

9394934917.070051

In [None]:
# R^2 score: one minus the ratio of the model's squared error to the baseline's squared error.
R2 = 1 - ((y_test - y_pred2)**2).sum() / ((y_test - y_test.mean())**2).sum()
R2

0.8350900822955892

In [None]:
# Report the random forest metrics (labels no longer carry a stray apostrophe).
print("RMSE: {:.2f}".format(rmse2))
print("R^2: {:.5f}".format(R2))


RMSE': 4834.30
R^2': 0.83509


### **Interactive widgets**

In [None]:
# Map every object-typed (categorical) column to the list of distinct values it takes in the training set.

{name: X_train[name].unique().tolist() for name in X_train.select_dtypes(include='object').columns}

{'region': ['northwest', 'southwest', 'northeast', 'southeast'],
 'sex': ['male', 'female'],
 'smoker': ['no', 'yes']}

In [None]:
# Summary statistics of the numeric training features; used to pick the slider ranges below.
X_train.describe()

Unnamed: 0,age,bmi,children
count,936.0,936.0,936.0
mean,38.82265,30.682185,1.104701
std,14.029097,6.087997,1.222664
min,18.0,15.96,0.0
25%,26.0,26.2725,0.0
50%,38.0,30.495,1.0
75%,51.0,34.7175,2.0
max,64.0,53.13,5.0


**Widgets for numerical values**

In [None]:
# Age slider widget
# Note: the slider keyword is `step`; `steps` is not a slider trait.

age_widget=widgets.IntSlider(
          value=38, #median of the training data
          min=18, #min
          max=64, #max
          step=1,
          description='Age:'
      )

In [None]:
# BMI slider widget
# BMI is a continuous feature, so a FloatSlider is used: an IntSlider can only produce
# whole numbers, and the keyword for the increment is `step` (not `steps`).

bmi_widget=widgets.FloatSlider(
          value=30.0, #close to the training mean
          min=15, #just below the training min
          max=54, #just above the training max
          step=0.01,
          description='BMI:'
      )

In [None]:
# Children slider widget
# Note: the slider keyword is `step`; `steps` is not a slider trait.

children_widget=widgets.IntSlider(
                value=1, #median of the training data
                min=0, #min
                max=5, #max
                step=1,
                description='Children:'
                )

**Widgets for nominal variables**

In [None]:
# Toggle buttons for the 'sex' feature; the options are the two values found in the training data.

sex_widget=widgets.ToggleButtons(description='Sex:', options=['male', 'female'])

In [None]:
# Toggle buttons for the 'smoker' feature; the options are the two values found in the training data.

smoker_widget=widgets.ToggleButtons(description='Smoker:', options=['no', 'yes'])

In [None]:
# Region dropdown widget
# The options must match the category names seen in training exactly; otherwise the
# one-hot encoder rejects the value at prediction time.

region_widget=widgets.Dropdown(
              options=['northeast', 'northwest', 'southeast', 'southwest'],
              description='Region:'
          )

In [None]:
# Show all input widgets (the BMI slider variable is `bmi_widget`, in lower case).
display(age_widget, bmi_widget, children_widget, sex_widget, smoker_widget, region_widget)

IntSlider(value=38, description='Age:', max=64, min=18)

IntSlider(value=30, description='BMI:', max=54, min=15)

IntSlider(value=1, description='Children:', max=5)

ToggleButtons(description='Sex:', index=1, options=('female', 'male'), value='male')

ToggleButtons(description='Smoker:', index=1, options=('yes', 'no'), value='no')

Dropdown(description='Region:', index=2, options=('northeast', 'northwest', 'southeast', 'outhwest'), value='s…

In [None]:
# Current value of the age slider; re-run this cell after moving the slider to see the new value.
age_widget.value

38

In [None]:
# Create a predict button.
predict_button= widgets.Button(
                description='Predict'
                )
# Output area that will hold the printed prediction.
prediction_out=widgets.Output()

# Create a prediction function.

def make_prediction(btn):
    """Predict the charges for the values currently selected in the widgets.

    Used as the click handler of ``predict_button``. Builds a one-row DataFrame with the
    same column names as the training features, runs it through ``model2`` and prints
    the result inside ``prediction_out``, replacing any previous prediction.

    Parameters
    ----------
    btn : ipywidgets.Button
        The button that was clicked (passed by ``on_click``; not used).
    """
    x=pd.DataFrame({
        'age': age_widget.value,
        'sex': sex_widget.value,
        'bmi': bmi_widget.value,
        'children': children_widget.value,
        'smoker': smoker_widget.value,
        'region': region_widget.value
    }, index=[0])  # scalar values need an explicit index to form a single row

    prediction=model2.predict(x)

    with prediction_out:
        # Clear the previous result so only the latest prediction is shown.
        prediction_out.clear_output()
        print("Prediction: {:.4f}".format(prediction[0]))


predict_button.on_click(make_prediction)

# Display (the BMI slider variable is `bmi_widget`, in lower case)
display(age_widget, bmi_widget, children_widget, sex_widget, smoker_widget, region_widget, predict_button, prediction_out)




IntSlider(value=38, description='Age:', max=64, min=18)

IntSlider(value=30, description='BMI:', max=54, min=15)

IntSlider(value=1, description='Children:', max=5)

ToggleButtons(description='Sex:', index=1, options=('female', 'male'), value='male')

ToggleButtons(description='Smoker:', index=1, options=('yes', 'no'), value='no')

Dropdown(description='Region:', index=2, options=('northeast', 'northwest', 'southeast', 'outhwest'), value='s…

Button(description='Predict', style=ButtonStyle())

Output()