# Medical Expenses Prediction Analysis using Random Forest Regressor

In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [38]:
data = pd.read_csv('med-insurance.csv')
data.shape

(1338, 7)

In [39]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [40]:
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [41]:
# lets check the descriptive summary
data.describe().style.background_gradient(cmap = 'Greens')

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


### Univariate Analysis

In [42]:
import plotly.graph_objects as go
#animals=['giraffes', 'orangutans', 'monkeys']

fig = go.Figure([go.Bar(x=data['smoker'], y=data['smoker'].index, marker_line_color='rgb(8,48,107)')])
fig.show()

In [43]:
fig = go.Figure([go.Bar(x=data['children'], y=data['children'].index, marker_line_color='rgb(100,48,107)')])
fig.show()

In [44]:
fig = go.Figure([go.Bar(x=data['region'], y=data['region'].index, marker_line_color='rgb(80,48,7)')])
fig.show()

In [45]:
fig = px.histogram(data, x="age")
fig.show()

In [48]:
fig = px.violin(data, x="bmi")
fig.show()

### Bivariate Analysis

In [11]:
# lets understand the impact of age on Medical Expenses
px.box(data, y = 'expenses',
           x = 'age')

* With Increasing Age, Expense is expeted to increase, but It is not obvious for all the scenarios.

In [12]:
# lets understand the impact of bmi on Medical Expenses
px.scatter(data, y = 'expenses',
           x = 'bmi')

In [13]:
px.box(data, x ='children',y = 'expenses', points="all")

In [14]:
px.box(data, x ='smoker',y = 'expenses', points="all")

In [50]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


# Multivariate Analysis

In [51]:
fig = px.scatter(data, x="age", y="bmi",
	         size="children", color="expenses",
                 log_x=True, size_max=60)
fig.show()

### Data Processing

In [19]:
# lets perform encoding

# as we know males have higher expense than females, lets encode males as 2, and females as 1, 
# similarly smokers, have highers expense, so we will encode smokers as 2, and non smokers as 1,
# as we know that the south east region has higher expense than other regions

data['sex'] = data['sex'].replace(('male','female'), (2, 1))
data['smoker'] = data['smoker'].replace(('yes','no'), (2, 1))
data['region'] = data['region'].replace(('southeast','southwest','northeast','northwest'),(2, 1, 1, 1))

# let's check whether any categorical column is left
data.select_dtypes('object').columns

Index([], dtype='object')

In [20]:
# now lets check our data again
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,1,27.9,0,2,1,16884.92
1,18,2,33.8,1,1,2,1725.55
2,28,2,33.0,3,1,2,4449.46
3,33,2,22.7,0,1,1,21984.47
4,32,2,28.9,0,1,1,3866.86


In [21]:
y = data['expenses']
X = data.drop(['expenses'], axis = 1)

In [22]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,1,27.9,0,2,1
1,18,2,33.8,1,1,2
2,28,2,33.0,3,1,2
3,33,2,22.7,0,1,1
4,32,2,28.9,0,1,1


In [23]:
y.head()

0    16884.92
1     1725.55
2     4449.46
3    21984.47
4     3866.86
Name: expenses, dtype: float64

In [24]:
# lets perform train test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(1070, 6) (268, 6)
(1070,) (268,)


# Feature Scaling

In [25]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Model Building using Random Forest Regressor

In [73]:
from sklearn.ensemble import RandomForestRegressor
model2 = RandomForestRegressor(n_estimators=100)
model2.fit(X_train, y_train)

RandomForestRegressor()

In [74]:
y_pred = model2.predict(X_test)
y_test1 = y_test.values
data = np.concatenate((y_pred.reshape(len(y_pred),1), y_test1.reshape(len(y_test1),1)),1)
data

array([[11315.5603    ,  9724.53      ],
       [ 9518.4971    ,  8547.69      ],
       [44381.8381    , 45702.02      ],
       [13118.5927    , 12950.07      ],
       [ 9287.5061    ,  9644.25      ],
       [13970.8253    ,  4500.34      ],
       [ 2261.7006    ,  2198.19      ],
       [11795.153     , 11436.74      ],
       [ 7677.5395    ,  7537.16      ],
       [ 5468.5364    ,  5425.02      ],
       [ 7853.2099    ,  6753.04      ],
       [15582.5461    , 10493.95      ],
       [ 8811.6438    ,  7337.75      ],
       [ 6027.4359    ,  4185.1       ],
       [24256.8347    , 18310.74      ],
       [11916.6482    , 10702.64      ],
       [13305.2179    , 12523.6       ],
       [ 6672.9542    ,  3490.55      ],
       [ 6367.2599    ,  6457.84      ],
       [34128.691     , 33475.82      ],
       [23829.8418    , 23967.38      ],
       [13067.5292    , 12643.38      ],
       [10469.2798    , 23045.57      ],
       [27952.8998    , 23065.42      ],
       [ 3520.01

# Model Accuracy

In [75]:
# lets check the Model accuracy
from sklearn.metrics import r2_score, mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE Score :", rmse)

r2_score = r2_score(y_test, y_pred)
print("R2 Score :",r2_score)

RMSE Score : 4517.388382017007
R2 Score : 0.8717601800736396


In [30]:
dataset = pd.DataFrame({'y_test': data[:, 0], 'y_pred': data[:, 1]})
dataset.to_csv('accuracy.csv')

In [31]:
dataset = pd.read_csv('accuracy.csv')

In [32]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,y_test,y_pred
0,0,10888.2914,9724.53
1,1,9578.2185,8547.69
2,2,44207.385,45702.02
3,3,13231.6293,12950.07
4,4,9176.768,9644.25


In [33]:
import plotly.graph_objects as go
fig = go.Figure()

fig.add_trace(go.Scatter(x=dataset['Unnamed: 0'], y=dataset['y_test'], name='Actual Value',
                         line=dict(color='royalblue', width=3)))
fig.add_trace(go.Scatter(x=dataset['Unnamed: 0'], y=dataset['y_pred'], name = 'Predicted Value',
                         line=dict(color='firebrick', width=2)))


## This model can predict Medical Expences of a Person With the Accuracy of 87%