In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ML libraries inladen
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.covariance import EllipticEnvelope
# Evaluation metrics used by the model-evaluation cells
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Voor statistische data visualisatie
import seaborn as sns; sns.set(color_codes=True)

In [4]:
# Read the insurance data from the CSV file in the working directory
# and preview its first five rows.
dataset = pd.read_csv('insurance.csv')
dataset.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Full one-hot encoding of every categorical column.
# A single get_dummies call removes each listed source column and appends its
# indicator columns (prefixed with the source column name) after the remaining
# columns, in the order the columns are listed. Every level is kept, so each
# group of indicator columns sums to 1 per row.
dataset = pd.get_dummies(dataset, columns=['sex', 'smoker', 'region'])



In [6]:
# Preview the first five rows of the frame after encoding.
dataset.head(n=5)

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [12]:
# Feature matrix: every column except the regression target.
X = dataset.drop(columns=['charges'])
# Target vector: the 'charges' column as a NumPy array.
y = dataset.loc[:, 'charges'].values

In [14]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [15]:
# Initialise and train the regression model
# (fit returns the fitted estimator, so creation and training can be chained).
lregmodel = linear_model.LinearRegression().fit(X_train, y_train)

# Coefficients and intercept of the linear regression model
print('coeffs: ', lregmodel.coef_)
print('intercept', lregmodel.intercept_)

coeffs:  [   258.36454867    325.61051663    552.15348059     49.17121211
    -49.17121211 -11653.25334683  11653.25334683    583.5269011
    -66.42019472   -276.01029958   -241.0964068 ]
intercept -645.3065336832678


In [16]:
# Evaluation of the model on the held-out test set.
# The metric functions (mean_absolute_error, mean_squared_error, r2_score)
# come from the notebook's import cell instead of being imported mid-cell.

y_predicted = lregmodel.predict(X_test)

## Mean Absolute Error
MAE = mean_absolute_error(y_test, y_predicted)
print('MAE = ', MAE)

## Mean Squared Error
MSE = mean_squared_error(y_test, y_predicted)
print('MSE = ', MSE)

## Coefficient of determination = r2 score
r2 = r2_score(y_test, y_predicted)
print('r2 score = ', r2)

### Alternative way of obtaining the r2 score: the estimator's own score method
r2 = lregmodel.score(X_test, y_test)
print('r2 score = ', r2)

MAE =  4095.3606481910856
MSE =  34097271.235341355
r2 score =  0.7871002363403499
r2 score =  0.7871002363403499


In [22]:
# Re-read the raw CSV so the second experiment starts from the unencoded
# columns ('dataset' was overwritten by the earlier encoding cell),
# then preview the first five rows.
dataset = pd.read_csv('insurance.csv')
dataset.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [23]:
# One-hot encode the categorical columns, then remove one indicator column per
# feature so that each group of indicators is no longer perfectly collinear
# with the intercept.
# The dropped columns are listed explicitly: drop_first=True is not used
# because it removes the first level of each feature (for 'region' that would
# be region_northeast), whereas this notebook drops region_southwest.
dataset = (
    pd.get_dummies(dataset, columns=['sex', 'smoker', 'region'])
    .drop(columns=['sex_female', 'smoker_no', 'region_southwest'])
)

In [24]:
# Feature matrix: every column except the regression target.
X = dataset.drop(columns=['charges'])
# Target vector: the 'charges' column as a NumPy array.
y = dataset.loc[:, 'charges'].values

In [25]:
# Same 67/33 split and seed as the first experiment, so both models are
# trained and tested on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [26]:
# Initialise and train the regression model
# (fit returns the fitted estimator, so creation and training can be chained).
lregmodel = linear_model.LinearRegression().fit(X_train, y_train)

# Coefficients and intercept of the linear regression model
print('coeffs: ', lregmodel.coef_)
print('intercept', lregmodel.intercept_)

coeffs:  [  258.36454867   325.61051663   552.15348059   -98.34242422
 23306.50669366   824.6233079    174.67621208   -34.91389278]
intercept -12490.485075203977


In [27]:
# Evaluation of the model on the held-out test set.
# The metric functions (mean_absolute_error, mean_squared_error, r2_score)
# come from the notebook's import cell instead of being re-imported here.

y_predicted = lregmodel.predict(X_test)

## Mean Absolute Error
MAE = mean_absolute_error(y_test, y_predicted)
print('MAE = ', MAE)

## Mean Squared Error
MSE = mean_squared_error(y_test, y_predicted)
print('MSE = ', MSE)

## Coefficient of determination = r2 score
r2 = r2_score(y_test, y_predicted)
print('r2 score = ', r2)

### Alternative way of obtaining the r2 score: the estimator's own score method
r2 = lregmodel.score(X_test, y_test)
print('r2 score = ', r2)

MAE =  4095.360648191086
MSE =  34097271.23534136
r2 score =  0.7871002363403498
r2 score =  0.7871002363403498
