In [1]:
# import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
#get the data
data = pd.read_csv('https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv')
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
#since we have categorical data, we need to normalize and standardize the data
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer

ct = make_column_transformer(
    (StandardScaler(), ['bmi', 'age', 'children']),
    (OneHotEncoder(), ['sex', 'smoker', 'region'])
)


In [4]:
#Create data(x) and labels
X = data.drop('charges', axis= 1)
y = data['charges']

In [7]:
# build our train and test data

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

In [14]:
# fit the ct to the training data (fitting it on test set will lead to data leakage)
ct.fit(x_train)

# transform the training data
x_train_normal = ct.transform(x_train)
x_test_normal= ct.transform(x_test)

Now we have normalized our data let's see how they look


In [15]:
#let's look at the non-normalized data
x_train.loc[0]

age                19
sex            female
bmi              27.9
children            0
smoker            yes
region      southwest
Name: 0, dtype: object

In [17]:
#then the normalized data
x_train_normal[0]   #note that there is no .loc method because it is a numpy array, so u can just index

array([-1.75652513,  0.47222651,  0.73433626,  1.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ])

In [21]:
#how about the shapes
x_train_normal.shape, x_train.shape

((1070, 11), (1070, 6))

In [23]:
# creating the NN model
tf.random.set_seed(42)

# 1. create the model
model= tf.keras.Sequential([
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(10),
    tf.keras.layers.Dense(1)
])

# 2. compile the model
model.compile(loss= tf.keras.losses.mae,
             optimizer = tf.keras.optimizers.Adam(learning_rate= 0.001),
             metrics= ['mae'])

# 3 fit the model
model.fit(x_train_normal, y_train, epochs= 200, verbose= 0)

<tensorflow.python.keras.callbacks.History at 0x20c620258b0>

In [24]:
# evaluate the model
model_loss, model_mae = model.evaluate(x_test_normal, y_test)



In [25]:
#let's record how the model trains so that we can be able to plot how it learns
history = model.fit(x_train_normal, y_train)



In [27]:
# saving the model
## model.save('name_of_the_model')

In [28]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 100)               1200      
_________________________________________________________________
dense_4 (Dense)              (None, 10)                1010      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 2,221
Trainable params: 2,221
Non-trainable params: 0
_________________________________________________________________
