# **Insurance Premium Predictive Model**

In [None]:
# step 1: import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Step 2: load the insurance premium CSV straight from its GitHub location
DATA_URL = 'https://github.com/YBIFoundation/Dataset/raw/main/Insurance%20Premium.csv'
df = pd.read_csv(DATA_URL)

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected
df.head()

Unnamed: 0,ID,Age,Gender,BMI,Children,Smoker,Region,Premium
0,1,19,female,27.9,0,yes,south,16885
1,2,18,male,33.77,1,no,east,1726
2,3,28,male,33.0,3,no,east,4449
3,4,33,male,22.705,0,no,west,21984
4,5,32,male,28.88,0,no,west,3867


In [None]:
# Column dtypes and non-null counts (used to spot missing values and text columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        1338 non-null   int64  
 1   Age       1338 non-null   int64  
 2   Gender    1338 non-null   object 
 3   BMI       1338 non-null   float64
 4   Children  1338 non-null   int64  
 5   Smoker    1338 non-null   object 
 6   Region    1338 non-null   object 
 7   Premium   1338 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 83.8+ KB


In [None]:
# Summary statistics for every column; include='all' also reports the text (object) columns
df.describe(include = 'all')

Unnamed: 0,ID,Age,Gender,BMI,Children,Smoker,Region,Premium
count,1338.0,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,,2,,,2,4,
top,,,male,,,no,east,
freq,,,676,,,1064,364,
mean,669.5,39.207025,,30.663397,1.094918,,,13270.414798
std,386.391641,14.04996,,6.098187,1.205493,,,12110.012882
min,1.0,18.0,,15.96,0.0,,,1122.0
25%,335.25,27.0,,26.29625,0.0,,,4740.0
50%,669.5,39.0,,30.4,1.0,,,9382.0
75%,1003.75,51.0,,34.69375,2.0,,,16640.0


In [None]:
# step 3: define y and X
# List the available column names before picking the target and the features
df.columns

Index(['ID', 'Age', 'Gender', 'BMI', 'Children', 'Smoker', 'Region',
       'Premium'],
      dtype='object')

In [None]:
# Target: the premium we want to predict
y = df['Premium']

# Features: every remaining column except the row identifier (ID)
feature_columns = ['Age', 'Gender', 'BMI', 'Children', 'Smoker', 'Region']
X = df[feature_columns]

In [None]:
# Display the raw feature frame (the repr is truncated to the first and last rows)
X

Unnamed: 0,Age,Gender,BMI,Children,Smoker,Region
0,19,female,27.900,0,yes,south
1,18,male,33.770,1,no,east
2,28,male,33.000,3,no,east
3,33,male,22.705,0,no,west
4,32,male,28.880,0,no,west
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,west
1334,18,female,31.920,0,no,north
1335,18,female,36.850,0,no,east
1336,21,female,25.800,0,no,south


In [None]:
# Binary encoding of Smoker: first check which labels occur and how often
X.Smoker.value_counts()

no     1064
yes     274
Name: Smoker, dtype: int64

In [None]:
# Encode Smoker as a number: 'yes' -> 0, 'no' -> 1 (note: Smoker == 1 means NON-smoker).
#
# The labels are mapped from the untouched raw frame `df` and attached with assign(),
# which returns a new DataFrame instead of mutating a slice. The earlier chained
# in-place call (X.Smoker.replace(..., inplace=True)) edited a temporary Series:
# it triggers SettingWithCopyWarning and, with pandas Copy-on-Write enabled,
# leaves X unchanged. Reading from `df` also makes this cell safe to re-run.
#
# Any label other than 'yes'/'no' would become NaN; value_counts() above shows
# that only these two labels are present.
X = X.assign(Smoker=df['Smoker'].map({'yes': 0, 'no': 1}))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.Smoker.replace({'yes':0,'no':1},inplace=True)


In [None]:
# Confirm the encoding: the counts should match the earlier 'no'/'yes' counts, now keyed by 1/0
X.Smoker.value_counts()

1    1064
0     274
Name: Smoker, dtype: int64

In [None]:
# One-hot encode the remaining text columns (Gender, Region); numeric columns pass through.
# dtype=int pins the indicator columns to 0/1 integers: without it the dummy dtype
# depends on the pandas version (uint8 before 2.0, bool from 2.0 onward).
X = pd.get_dummies(X, dtype=int)

In [None]:
# Display the fully encoded feature frame (all columns are numeric from here on)
X

Unnamed: 0,Age,BMI,Children,Smoker,Gender_female,Gender_male,Region_east,Region_north,Region_south,Region_west
0,19,27.900,0,0,1,0,0,0,1,0
1,18,33.770,1,1,0,1,1,0,0,0
2,28,33.000,3,1,0,1,1,0,0,0
3,33,22.705,0,1,0,1,0,0,0,1
4,32,28.880,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,1,0,1,0,0,0,1
1334,18,31.920,0,1,1,0,0,1,0,0
1335,18,36.850,0,1,1,0,1,0,0,0
1336,21,25.800,0,1,1,0,0,0,1,0


In [None]:
# Hold out a test set for evaluation. No test_size is given, so the library default applies;
# the fixed random_state makes the partition identical on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2529,
)

In [None]:
# Select the model: a random forest regressor with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) because the forest is
# built from random bootstrap samples and feature subsets; without a seed the
# metrics and predictions below change on every Restart & Run All.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=2529)

In [None]:
# train
# Fit the model on the training split only; the test split stays unseen for evaluation
model.fit(X_train,y_train)

In [None]:
# predict
# Predicted premiums for the held-out test rows
y_pred = model.predict(X_test)

In [None]:
# evaluate
from sklearn.metrics import mean_absolute_percentage_error, r2_score

In [None]:
# Mean absolute percentage error on the test set (reported as a fraction, not a percent)
mean_absolute_percentage_error(y_test,y_pred)

0.3421362119277888

In [None]:
# Coefficient of determination (R^2) of the test-set predictions
r2_score(y_test,y_pred)

0.8335078796287988

**Let's select a sample row to use as a new input value**

In [None]:
# Draw one random row of the encoded features to serve as a template for a "new" input.
# NOTE(review): no random_state is passed, so a different row may be drawn on each run;
# the row recorded in the output here is index 109.
X.sample()

Unnamed: 0,Age,BMI,Children,Smoker,Gender_female,Gender_male,Region_east,Region_north,Region_south,Region_west
109,63,35.09,0,0,0,1,1,0,0,0


In [None]:
# Hand-built single-row input, copied from sampled row 109 of the encoded features.
# It is a one-row DataFrame with explicit column names (same names and order as the
# encoded training frame) rather than a bare positional ndarray: each value is tied
# to its feature, and the model, which was fitted on a DataFrame, receives input
# with matching feature names.
# Smoker follows the notebook's encoding: 0 = 'yes', 1 = 'no'.
X_new = pd.DataFrame([{
    'Age': 63,
    'BMI': 35.09,
    'Children': 0,
    'Smoker': 0,
    'Gender_female': 0,
    'Gender_male': 1,
    'Region_east': 1,
    'Region_north': 0,
    'Region_south': 0,
    'Region_west': 0,
}])

In [None]:
# Sanity check: one row, with one value per encoded feature column
X_new.shape

(1, 10)

In [None]:
# Predicted premium for the hand-built row
model.predict(X_new)



array([47166.76])