<a href="https://colab.research.google.com/github/190031319PHemanthBhargav/machine-learning-projects/blob/main/ML_PROJECT-Insurance%20premium%20prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Design and implement a web application to predict insurance premiums based on publicly available risk factors




*In this project we predict a customer's insurance premium (the `charges` column) from the available risk factors: age, gender, BMI, number of children, smoking status and region.*

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Location of the insurance CSV. The '/content' prefix is presumably the Colab
# working directory (TODO confirm) -- change this one constant to run elsewhere.
DATA_PATH = '/content/insurance.csv'

# Load the raw data; the bare expression below renders a truncated preview.
dataset = pd.read_csv(DATA_PATH)
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [None]:
# Summarise column names, non-null counts and dtypes (quick check for missing values).
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
def preprocess_data(data, target='charges', train_size=0.7, test_size=0.3, random_state=1):
    """Split a DataFrame into train/test features and target.

    Despite its name, this function performs no feature transformation: it
    only separates the target column from the features and makes a shuffled
    train/test split. Encoding and scaling are left to the caller.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame containing the feature columns and the ``target`` column.
        It is not modified.
    target : str, default 'charges'
        Name of the column to predict.
    train_size, test_size : float, default 0.7 / 0.3
        Fractions forwarded unchanged to ``train_test_split``.
    random_state : int, default 1
        Seed for the shuffle, so the split is reproducible.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    if target not in data.columns:
        raise KeyError("target column {!r} not found in data".format(target))

    # Neither the column lookup nor drop() mutates `data`, so no defensive copy is needed.
    y = data[target]
    x = data.drop(columns=target)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y,
        train_size=train_size,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )

    return x_train, x_test, y_train, y_test

In [None]:
# Split the full dataset into train/test features (x_*) and 'charges' targets (y_*).
x_train, x_test, y_train, y_test = preprocess_data(dataset)

In [None]:
# Inspect the training features; the row labels are the shuffled original indices.
x_train

Unnamed: 0,age,sex,bmi,children,smoker,region
744,50,male,26.410,0,no,northwest
363,21,female,26.400,1,no,southwest
10,25,male,26.220,0,no,northeast
970,50,female,28.160,3,no,southeast
634,51,male,39.700,1,no,southwest
...,...,...,...,...,...,...
715,60,male,28.900,0,no,southwest
905,26,female,29.355,2,no,northeast
1096,51,female,34.960,2,yes,northeast
235,40,female,22.220,2,yes,southeast


In [None]:
# Categorical columns to one-hot encode. All remaining feature columns are
# forwarded untouched by the ColumnTransformer (remainder='passthrough').
nominal_features = ['sex', 'smoker', 'region']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new spelling first and fall back to the
# old one so this cell runs on both old and new installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, drop='first')

nominal_transformer = Pipeline(steps=[
    ('onehot', onehot_encoder)
])

preprocessor = ColumnTransformer(transformers=[
    ('nominal', nominal_transformer, nominal_features)
], remainder='passthrough')


# Full model: encode categoricals -> standardise all columns -> random forest.
# Step names (including 'scalar') are kept as-is because they are part of the
# pipeline's public interface (e.g. model.named_steps[...]).
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scalar', StandardScaler()),
    # Fixed seed so the fitted forest, and therefore the reported RMSE / R^2,
    # is reproducible across Restart & Run All.
    ('regressor', RandomForestRegressor(random_state=1))
])

In [None]:
# Fit the whole pipeline (preprocessing + regressor) on the training split.
# The value returned by fit() is kept in `res` and displayed as the cell output.
res = model.fit(x_train, y_train)
res

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('nominal',
                                                  Pipeline(memory=None,
                                                           steps=[('onehot',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop='first',
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='error',
                                                                                 sparse=False))],
                                  

Predicting on the test set with the fitted model

In [None]:
# Predict charges for the held-out test features using the fitted pipeline.
y_pred = model.predict(x_test)

In [None]:
# Inspect the held-out test features that the predictions above correspond to.
x_test

Unnamed: 0,age,sex,bmi,children,smoker,region
559,19,male,35.530,0,no,northwest
1087,57,male,31.540,0,no,northwest
1020,51,male,37.000,0,no,southwest
460,49,female,36.630,3,no,southeast
802,21,male,22.300,1,no,southwest
...,...,...,...,...,...,...
323,57,male,40.945,0,no,northeast
1268,20,female,33.300,0,no,southwest
134,20,female,28.785,0,no,northeast
1274,26,male,27.060,0,yes,southeast


In [None]:
# Errors between the held-out targets and the model's predictions.
residuals = y_test - y_pred

# Residual and total sums of squares; their ratio gives 1 - R^2.
ss_res = np.sum(residuals ** 2)
ss_tot = np.sum((y_test - y_test.mean()) ** 2)

# Root-mean-squared error, in the same units as 'charges'.
rmse = np.sqrt(np.mean(residuals ** 2))
# Coefficient of determination on the test split.
r2 = 1 - (ss_res / ss_tot)

print("RMSE: {:.2f}".format(rmse))
print(" R^2: {:.5f}".format(r2))

RMSE: 4886.74
 R^2: 0.83149


In [None]:
# Inspect the training targets ('charges'); the index labels line up with x_train.
y_train

744      8827.20990
363      2597.77900
10       2721.32080
970     10702.64240
634      9391.34600
           ...     
715     12146.97100
905      4564.19145
1096    44641.19740
235     19444.26580
1061    11554.22360
Name: charges, Length: 936, dtype: float64

In [None]:
# Total sum of squares of the test targets, i.e. the denominator of the R^2 formula.
np.sum((y_test - y_test.mean())**2)

56970102513.23878

In [None]:
# Show the raw array of predicted charges for the test rows.
y_pred

array([ 2421.6303261 , 12266.3188134 ,  9035.5444136 , 10903.6331273 ,
        2156.172438  , 39904.9704112 , 10273.1867112 , 11639.8147705 ,
        3170.4881357 , 20374.2002905 , 17619.5602133 , 13619.9918653 ,
        6912.7358505 ,  7184.8806619 ,  2023.419089  , 11714.7502506 ,
        6980.3719718 ,  6797.9886865 , 15788.6435766 , 13958.7322076 ,
       10038.3705675 , 39943.2081514 , 11216.3110319 ,  9302.2759989 ,
       15161.08119507,  6767.435854  ,  8610.227741  ,  9796.049981  ,
        6867.8523317 ,  3675.126971  , 12169.7559086 ,  7182.8262328 ,
       25965.022305  , 35371.7295385 , 25306.5945452 , 15127.7529073 ,
       39089.1338799 , 17359.3945615 , 15219.474544  , 46545.9613946 ,
        7470.7991667 , 11827.6410465 , 11000.8075322 , 19343.4894928 ,
        6153.8885412 , 12770.3181117 ,  2000.1332281 , 34892.0404081 ,
       10713.4357814 , 18769.9558524 , 16039.7944599 , 15255.0853969 ,
        5626.5639296 , 12553.5047732 , 16842.3607005 ,  6668.9217221 ,
      

In [None]:
import ipywidgets as widgets
from IPython.display import display

# --- Numeric inputs ---------------------------------------------------------
# Slider bounds presumably mirror the value ranges present in the dataset
# (TODO confirm against dataset.describe()).
age_widget = widgets.IntSlider(value=38, min=18, max=64, step=1, description="Age:")
bmi_widget = widgets.FloatSlider(value=30, min=15, max=54, step=0.01, description="BMI:")
children_widget = widgets.IntSlider(value=1, min=0, max=5, step=1, description="Children:")

# --- Categorical inputs -----------------------------------------------------
# Option strings must match the category labels the model was trained on.
sex_widget = widgets.ToggleButtons(options=['female', 'male'], description="Sex:")
smoker_widget = widgets.ToggleButtons(options=['no', 'yes'], description="Smoker:")
region_widget = widgets.Dropdown(
    options=['northeast', 'northwest', 'southeast', 'southwest'],
    description="Region:",
)

# --- Action and result area -------------------------------------------------
predict_btn = widgets.Button(description="Predict")
prediction_out = widgets.Output()


def make_prediction(btn):
    """Predict the premium for the current widget values and display it.

    Intended as the click handler for ``predict_btn``. Reads the current value
    of every input widget, builds a one-row DataFrame, runs it through the
    fitted ``model`` pipeline and prints the result into ``prediction_out``.

    Parameters
    ----------
    btn
        Argument passed to the click handler by ``on_click``; not used here.
    """
    # One-row frame with the same column names and order as the training features.
    x = pd.DataFrame({
        'age':      age_widget.value,
        'sex':      sex_widget.value,
        'bmi':      bmi_widget.value,
        'children': children_widget.value,
        'smoker':   smoker_widget.value,
        'region':   region_widget.value
    }, index=[0])

    # Everything that can fail runs inside the Output context: on some front
    # ends, output and tracebacks from widget callbacks are not shown anywhere
    # unless they are captured by an Output widget.
    with prediction_out:
        # wait=True keeps the old text until the new result is ready (no flicker).
        prediction_out.clear_output(wait=True)
        prediction = model.predict(x)
        print("Prediction: {:.4f}".format(prediction[0]))


# Run make_prediction every time the "Predict" button is clicked.
predict_btn.on_click(make_prediction)


# Render the input controls, the button and the result area, in this order.
display(age_widget, bmi_widget, children_widget, sex_widget, smoker_widget, region_widget, predict_btn, prediction_out)

IntSlider(value=38, description='Age:', max=64, min=18)

FloatSlider(value=30.0, description='BMI:', max=54.0, min=15.0, step=0.01)

IntSlider(value=1, description='Children:', max=5)

ToggleButtons(description='Sex:', options=('female', 'male'), value='female')

ToggleButtons(description='Smoker:', options=('no', 'yes'), value='no')

Dropdown(description='Region:', options=('northeast', 'northwest', 'southeast', 'southwest'), value='northeast…

Button(description='Predict', style=ButtonStyle())

Output()

In [None]:
import joblib
# Persist the fitted pipeline to 'insurance.model', a path relative to the
# current working directory.
# NOTE(review): joblib files are pickle-based -- only load them from trusted
# sources, and presumably with a compatible scikit-learn version (confirm).
joblib.dump(model,'insurance.model')

['insurance.model']