In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [2]:
# fetch data 

insurance_data = pd.read_csv('insurance.csv')

insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.77,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.88,0,no,northwest,3866.86


In [3]:
# subset data to only include columns for smokers

subset_data = insurance_data[insurance_data['smoker'] == 'yes'].drop(columns = ['age', 'sex', 'children', 'smoker', 'region'])

subset_data.head()

len(subset_data)

Unnamed: 0,bmi,charges
0,27.9,16884.92
11,26.29,27808.73
14,42.13,39611.76
19,35.3,36837.47
23,31.92,37701.88


274

In [4]:
X_train, X_test, y_train, y_test = train_test_split(subset_data['bmi'], subset_data['charges'], test_size=0.25)
# Pick a random_state as below and keep using the same number (example 35) to repeat the same test and train data
# X_train, X_test, y_train, y_test = train_test_split(subset_data['bmi'], subset_data['charges'], test_size=0.25, random_state=35)
X_train
X_test
y_train
y_test

843    29.81
375    28.31
85     22.89
292    45.54
373    32.90
        ... 
1323   40.37
420    33.88
549    46.20
607    23.66
966    24.80
Name: bmi, Length: 205, dtype: float64

465    28.38
609    37.80
1321   26.70
1118   35.75
92     29.83
        ... 
94     31.30
1207   33.40
1204   27.28
19     35.30
157    25.18
Name: bmi, Length: 69, dtype: float64

843    27,533.91
375    18,033.97
85     21,098.55
292    42,112.24
373    36,085.22
          ...   
1323   43,896.38
420    46,889.26
549    45,863.21
607    25,678.78
966    23,967.38
Name: charges, Length: 205, dtype: float64

465    19,521.97
609    39,241.44
1321   28,101.33
1118   38,282.75
92     30,184.94
          ...   
94     47,291.06
1207   38,415.47
1204   18,223.45
19     36,837.47
157    15,518.18
Name: charges, Length: 69, dtype: float64

In [5]:
model = LinearRegression(fit_intercept = True)

model.fit(X_train.array.reshape(-1, 1), y_train) # When extending to multiple features remove .array.reshape(-1, 1)

# The following gives the R-square score
model.score(X_train.array.reshape(-1, 1), y_train) # When extending to multiple features remove .array.reshape(-1, 1)

# This is the coefficient Beta_1 (or slope of the Simple Linear Regression line)
model.coef_

# This is the coefficient Beta_0
model.intercept_

LinearRegression()

0.6331612630193303

array([1415.73233291])

-11349.438216737675

In [6]:
test_output = pd.DataFrame(model.predict(X_test.array.reshape(-1, 1)), index = X_test.index, columns = ['pred_charges'])
# When extending to multiple features remove .array.reshape(-1, 1)
test_output.head()

Unnamed: 0,pred_charges
465,28829.05
609,42165.24
1321,26443.54
1118,39262.99
92,30881.86


In [9]:
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
mean_absolute_error = abs(test_output['pred_charges'] - test_output['charges']).mean()
print('Mean absolute error is ')
print(mean_absolute_error)

Unnamed: 0,pred_charges,charges_x,charges_y
465,28829.05,19521.97,19521.97
609,42165.24,39241.44,39241.44
1321,26443.54,28101.33,28101.33
1118,39262.99,38282.75,38282.75
92,30881.86,30184.94,30184.94


KeyError: 'charges'

#### Visualize data

In [8]:
# define function to import viz libraries
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [11]:
plot_data = []
plot_data.append(go.Scatter(x= X_train, y= y_train, name = 'Train data actual', mode = 'markers'))
plot_data.append(go.Scatter(x= X_train, y= model.predict(X_train.array.reshape(-1, 1)), name = 'Train data predicted', mode = 'markers'))
plot_data.append(go.Scatter(x= X_test, y= y_test, name = 'Test data actual', mode = 'markers'))
plot_data.append(go.Scatter(x= X_test, y= model.predict(X_test.array.reshape(-1, 1)), name = 'Test data predicted', mode = 'markers'))
# When extending to multiple features remove .array.reshape(-1, 1) in above (but remember 2-d we cannot draw)

layout = go.Layout(xaxis = dict(title='bmi'), yaxis = dict(title= 'charges'), 
                   title = 'Plot of predicted and actual')
fig = go.Figure(data= plot_data, layout=layout)
plotly.offline.iplot(fig)
