In [None]:
# Import libraries. You may or may not use all of these.
!pip install -q git+https://github.com/tensorflow/docs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [None]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

From the dataset, we see that the columns 'age', 'bmi', 'children' and 'expenses' are numerical, while the columns 'sex', 'smoker' and 'region' are categorical. We will first encode the categorical columns as numbers. To do this, we need to know which distinct values each of these columns contains, so we look at their value counts.

In [None]:
# How often each label occurs in the 'sex' column.
dataset.loc[:, 'sex'].value_counts()

In [None]:
# How often each label occurs in the 'smoker' column.
dataset.loc[:, 'smoker'].value_counts()

In [None]:
# How often each label occurs in the 'region' column.
dataset.loc[:, 'region'].value_counts()

We set up a dictionary for each categorical column that maps its labels to integer codes, so the column can be converted to a numerical one.

In [None]:
# Label -> integer code for each categorical column; a label's code is its
# position in the tuple it is listed in.
sex_dict = {label: code for code, label in enumerate(('male', 'female'))}
smoker_dict = {label: code for code, label in enumerate(('no', 'yes'))}
region_dict = {
  label: code
  for code, label in enumerate(('northeast', 'southeast', 'southwest', 'northwest'))
}

In [None]:
# Replace the string labels of each categorical column with their integer codes.
#
# Series.map() turns every value that is not a key of the dictionary into NaN,
# so mapping a column that was already encoded by an earlier run of this cell
# would wipe it out. Columns that are already numeric are therefore skipped,
# which makes the cell safe to re-run.
for column, codes in (('sex', sex_dict), ('smoker', smoker_dict), ('region', region_dict)):
  if pd.api.types.is_numeric_dtype(dataset[column]):
    continue  # already encoded
  encoded = dataset[column].map(codes)
  if encoded.isna().any():
    # Fail loudly instead of passing NaN features on to scaling and training.
    unmapped = dataset.loc[encoded.isna(), column].unique().tolist()
    raise ValueError(f"Column '{column}' has values with no code in its mapping: {unmapped}")
  dataset[column] = encoded

dataset.head()

In [None]:
import seaborn as sns

# Grid of pairwise plots over the columns of the encoded dataset.
sns.pairplot(data=dataset)

Now, we separate the features from the target and split the dataset into training and testing sets.

In [None]:
# Features are every column except the target; 'expenses' is what the model predicts.
feature_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
X = dataset.loc[:, feature_columns]
y = dataset.loc[:, 'expenses']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=101)
train_dataset, test_dataset, train_labels, test_labels = split_parts
print(train_dataset.shape, test_dataset.shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescales each feature to the [0, 1] range (MinMaxScaler's default range,
# spelled out here).
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Learn the per-feature min/max from the training rows only, then apply that
# same scaling to both splits so no test-set statistics leak into training.
scaler.fit(train_dataset)
train_dataset = scaler.transform(train_dataset)
test_dataset = scaler.transform(test_dataset)

In [None]:
# Seed Python, NumPy and the Keras/TensorFlow backend so weight initialisation,
# dropout masks and batch shuffling repeat across "Restart & Run All".
# 101 is the same value already used as random_state for the train/test split.
keras.utils.set_random_seed(101)

# Regression network: three 64-unit ReLU hidden layers, each followed by 30%
# dropout, and a single linear output unit for the predicted expense.
model = keras.models.Sequential([
  layers.Dense(64, activation='relu'),
  layers.Dropout(0.3),
  layers.Dense(64, activation='relu'),
  layers.Dropout(0.3),
  layers.Dense(64, activation='relu'),
  layers.Dropout(0.3),
  layers.Dense(1),
])

# Train on mean absolute error. The metrics list is kept as ['mae', 'mse']
# because the evaluation cell unpacks evaluate() as (loss, mae, mse).
model.compile(optimizer='adam', loss='mae', metrics=['mae', 'mse'])

In [None]:
# Train for 600 epochs in batches of 256, with 20% of the training rows held
# out for validation.
model.fit(
  train_dataset,
  train_labels,
  epochs=600,
  batch_size=256,
  validation_split=0.2,
)

In [None]:
# Plot the per-epoch metrics recorded by the most recent fit().
# DataFrame.plot() opens a new figure of its own unless it is handed an Axes,
# so a preceding plt.figure(figsize=...) only produces an empty figure and the
# requested size never reaches the plot. Create the figure and axes up front
# and draw into them instead.
fig, ax = plt.subplots(figsize=(24, 12))
pd.DataFrame(model.history.history).plot(ax=ax)
ax.set_xlabel('Epoch')
plt.show()

In [None]:
# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS.
# Test model by checking how well the model generalizes using the test set.
# evaluate() returns the loss followed by the compiled metrics, unpacked here
# as (loss, mae, mse).
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# Pass/fail threshold of the challenge: the test-set MAE must be below 3500.
if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
# flatten() collapses the prediction array to 1-D for plotting against test_labels.
test_predictions = model.predict(test_dataset).flatten()

# Equal aspect ratio and identical limits on both axes, so the line drawn at
# the end is the y = x diagonal where a prediction equals the true value.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
# Assigning to _ keeps the returned line objects from being echoed as cell output.
_ = plt.plot(lims,lims)
