In [57]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [58]:
data = pd.read_csv('insurance.csv')

In [59]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [61]:
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

### Finding out the number of categories

In [62]:
data['sex'].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [63]:
data['smoker'].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [64]:
data['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

## Converting categorical columns

In [65]:
# Converting the Sex column

dict_1 = {'male': 1, 'female': 0}

data['sex'] = data['sex'].map(dict_1)

In [66]:
# Converting the smoker column

dict_2 = {'yes': 1, 'no': 0}

data['smoker'] = data['smoker'].map(dict_2)

In [67]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [68]:
# Converting the region column

region_new = pd.get_dummies(data[['region']], drop_first= True)

In [69]:
data = data.drop(['region'], axis=1)

In [70]:
data = pd.concat([data, region_new], axis=1)

In [71]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,1
1,18,1,33.77,1,0,1725.5523,0,1,0
2,28,1,33.0,3,0,4449.462,0,1,0
3,33,1,22.705,0,0,21984.47061,1,0,0
4,32,1,28.88,0,0,3866.8552,1,0,0


## Spiliting data into Train and Test set

In [72]:
features = data.drop(['charges'], axis=1)
labels = data['charges']

In [73]:
features.shape, labels.shape

((1338, 8), (1338,))

In [74]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=42)

In [75]:
X_train.shape, X_test.shape

((1070, 8), (268, 8))

## Training the model

In [78]:
reg_rf = RandomForestRegressor()

In [79]:
grid_params = {'n_estimators' : [100, 200, 500, 800, 1000]}

In [80]:
grid = GridSearchCV(estimator= reg_rf, param_grid= grid_params , cv= 5)

In [81]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'n_estimators': [100, 200, 500, 800, 1000]})

In [82]:
grid.best_estimator_

RandomForestRegressor(n_estimators=200)

## Evaluating the data on the test set

In [85]:
final_model = grid.best_estimator_

y_predict = final_model.predict(X_test)

MSE = mean_squared_error(y_test, y_predict)
RMSE = np.sqrt(MSE)

In [86]:
RMSE

4630.293846005309

## Saving the model

In [87]:
import pickle

file = open('rf_regression.pkl', 'wb')

pickle.dump(final_model, file)