### Import libraries

In [1]:
import math
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.initializers import GlorotNormal

### Getting the data

In [3]:
# Read the insurance dataset from the local input folder and preview its first rows.
data = pd.read_csv('./input/insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Number of rows and columns in the raw dataset.
data.shape

(1338, 7)

In [5]:
# Column dtypes, used to tell numeric columns apart from categorical (object) ones.
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [6]:
# Print the class frequencies of each categorical column.
for column in ['sex', 'smoker', 'region']:
    print(data[column].value_counts())

male      676
female    662
Name: sex, dtype: int64
no     1064
yes     274
Name: smoker, dtype: int64
southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64


In [7]:
# Count missing values per column.
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [8]:
# Summary statistics of the numeric columns, extended with a 'range' row (max - min).
# DataFrame.append was removed in pandas 2.0, so the extra row is added by label
# assignment; describe() already exposes the 'max' and 'min' rows it is built from.
stats = data.describe()
stats.loc['range'] = stats.loc['max'] - stats.loc['min']
stats

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801
range,46.0,37.17,5.0,62648.55411


- Al examinar los datos se observa que no es necesario imputar valores, ya que el conjunto de datos no presenta valores faltantes.
- De las 6 variables independientes, 3 son categóricas, por lo que requieren una codificación de valores.
- Las variables numéricas se encuentran en distintas escalas y unidades (años, kg/m^2, dólares, etc.), por lo que conviene normalizarlas.

### Cleaning Data

In [9]:
# One-hot encode the three categorical columns, giving each group of dummies its own prefix.
categorical_columns = ['sex', 'region', 'smoker']
dummy_prefixes = {'sex': 'gender', 'region': 'region', 'smoker': 'smokes'}
data = pd.get_dummies(data, columns = categorical_columns, prefix = dummy_prefixes)
data

Unnamed: 0,age,bmi,children,charges,gender_female,gender_male,region_northeast,region_northwest,region_southeast,region_southwest,smokes_no,smokes_yes
0,19,27.900,0,16884.92400,1,0,0,0,0,1,0,1
1,18,33.770,1,1725.55230,0,1,0,0,1,0,1,0
2,28,33.000,3,4449.46200,0,1,0,0,1,0,1,0
3,33,22.705,0,21984.47061,0,1,0,1,0,0,1,0
4,32,28.880,0,3866.85520,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0,1,0,1,0,0,1,0
1334,18,31.920,0,2205.98080,1,0,1,0,0,0,1,0
1335,18,36.850,0,1629.83350,1,0,0,0,1,0,1,0
1336,21,25.800,0,2007.94500,1,0,0,0,0,1,1,0


### Train Test Split

In [10]:
# Separate the predictors from the target, then hold out 30% of the rows for testing.
features = data.drop(columns = 'charges')
target = data[['charges']]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size = 0.3,
    random_state = 1,
)

In [11]:
# Report the size of the training and test feature sets.
for split in (X_train, X_test):
    print(split.shape)

(936, 11)
(402, 11)


### Data Normalization

In [12]:
# One standardizer for the features and a separate one for the target.
xscaler, yscaler = StandardScaler(), StandardScaler()

In [13]:
# Fit each scaler on the training split only and replace the training data with
# its standardized version.
# NOTE: both names are overwritten, so re-running this cell would re-fit the
# scalers on data that has already been scaled.
X_train = xscaler.fit_transform(X_train)
y_train = yscaler.fit_transform(y_train)

### MLP Model

##### Callbacks

In [14]:
# `freq` is the number of batches needed to finish one epoch; it is used to
# express the checkpoint frequency in batches. Only the non-validation share of
# the training rows is trained on, so with 936 rows and a 0.2 validation split:
# freq = ceil(936 * (1 - 0.2) / batch_size).
batch_size = 32
val_split = 0.2
freq = math.ceil((X_train.shape[0] * (1-val_split)) / batch_size)
# Raw string: the backslashes are literal path separators, not escape sequences
# ('\c' and '\m' are invalid escapes and raise a warning in a normal string).
checkpoint_path = r'.\checkpoints\mlp_cp_{epoch:04d}.ckpt'

In [15]:
# Checkpoint callback: write the model weights to `checkpoint_path` every five
# epochs' worth of batches.
save_every_n_batches = 5 * freq  # default is per epoch; an integer means "every N batches"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_path,
    monitor = 'val_mse',
    save_weights_only = True,
    save_freq = save_every_n_batches,
    verbose = 1,
)

In [16]:
# Stop training once the validation MSE has not improved for 3 epochs and roll
# the model back to its best weights.
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_mse',
    patience = 3,
    restore_best_weights = True,
    verbose = 0,
)

##### Model

In [17]:
# Single-hidden-layer MLP for regression: one ReLU layer followed by a linear output unit.
# The input width is read from the training matrix instead of being hard-coded,
# so the model keeps working if the number of encoded features changes.
n_features = X_train.shape[1]
model = Sequential([
    Dense(units = 32, input_shape = (n_features, ), activation = 'relu', kernel_initializer = GlorotNormal()),
    #Dense(units = 64, activation = 'relu', kernel_initializer = GlorotNormal()),
    Dense(units = 1)
])

In [None]:
#model.save_weights(checkpoint_path.format(epoch=0))

In [18]:
# Configure training: Adam optimizer minimizing mean squared error, with MSE
# also reported as a metric.
model.compile(
    loss = 'mean_squared_error',
    optimizer = 'adam',
    metrics = ['mse'],
)

In [19]:
%%time
# Train the model on the standardized training data for up to 20 epochs.
# `validation_split` sets aside a fraction of the training data for validation,
# and the callbacks handle periodic checkpointing and early stopping.
# The returned History object is kept in `history`.
history = model.fit(
                x = X_train, 
                y = y_train,
                epochs = 20,
                batch_size = batch_size,
                validation_split = val_split,
                callbacks = [cp_callback, earlystop]
        )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 00005: saving model to .\checkpoints\mlp_cp_0005.ckpt
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00010: saving model to .\checkpoints\mlp_cp_0010.ckpt
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 00015: saving model to .\checkpoints\mlp_cp_0015.ckpt
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 00020: saving model to .\checkpoints\mlp_cp_0020.ckpt
Wall time: 18.6 s


### Predictions

In [20]:
# Scale the test features with the already-fitted feature scaler (transform only, no re-fit).
# NOTE: X_test is overwritten, so re-running this cell would scale it a second time.
X_test = xscaler.transform(X_test)

In [21]:
# Predict in the standardized target space, then map the values back to the
# original charges scale.
scaled_predictions = model.predict(X_test)
predictions = yscaler.inverse_transform(scaled_predictions)

In [22]:
# Place the predictions next to the true charges, one row per test observation.
side_by_side = np.concatenate([predictions, y_test], axis = 1)
results = pd.DataFrame(side_by_side, columns = ['charges_prediction', 'charges'])
results.head()

Unnamed: 0,charges_prediction,charges
0,2199.1604,1646.4297
1,12021.620117,11353.2276
2,9701.579102,8798.593
3,13266.162109,10381.4787
4,4224.144043,2103.08


In [23]:
# Mean squared error between the true and predicted charges on the test split.
mse_test = metrics.mean_squared_error(
    y_true = results['charges'],
    y_pred = results['charges_prediction'],
)
print(f'El MSE final para la data de prueba del modelo es {mse_test}')

El MSE final para la data de prueba del modelo es 24608452.819660217
