# Insurance ML

Predict the risk of accidents.

In [None]:
# Request TensorFlow 2.x via the Colab line magic before TensorFlow is imported.
# Best effort: any exception raised here is deliberately ignored.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
tf.__version__

In [None]:
# Fix TensorFlow's global random seed; this cell seeds TensorFlow only.
tf.random.set_seed(42)

In [None]:
import numpy as np
np.__version__

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib as mpl
# Notebook-wide plot defaults: larger figures and larger title/label fonts.
mpl.rcParams.update({
    'figure.figsize': (20, 8),
    'axes.titlesize': 24,
    'axes.labelsize': 20,
})

In [None]:
# !pip install -q dtreeviz

In [None]:
# https://github.com/parrt/dtreeviz
import dtreeviz
dtreeviz.__version__

In [None]:
# https://github.com/AndreasMadsen/python-lrcurve
# !pip install -q lrcurve

In [None]:
from lrcurve import KerasLearningCurve

In [None]:
# XXX: THIS IS VERY GENERAL AND CAN BE USED PRETTY MUCH ANYWHERE

from dtreeviz import clfviz

def plot_decision_boundaries(model, X, y_true, x1_range=None, x2_range=None,
                             feature_names=None):
  """Draw the decision regions of `model` over a two-feature data set.

  Creates a new 8x4 inch, 300 dpi figure and hands everything to dtreeviz's
  `clfviz`, asking it to show instances, class boundaries, probabilities and
  misclassified points. A colour palette is supplied for three classes only.

  Args:
    model: classifier, forwarded unchanged to `clfviz`.
    X: feature data, forwarded unchanged to `clfviz`; presumably two columns
      in the order given by `feature_names` -- confirm against callers.
    y_true: true class labels belonging to `X`.
    x1_range: optional value range for the first feature.
    x2_range: optional value range for the second feature. The ranges are
      only forwarded when both are given; otherwise `ranges=None` is passed.
    feature_names: optional axis labels; defaults to ["Age", "Max Speed"].
  """
  _, ax = plt.subplots(figsize=(8, 4), dpi=300)

  # Compare against None instead of relying on truthiness: a NumPy array
  # passed as a range would raise "truth value ... is ambiguous" otherwise.
  ranges = None
  if x1_range is not None and x2_range is not None:
    ranges = (x1_range, x2_range)

  if feature_names is None:
    feature_names = ["Age", "Max Speed"]

  clfviz(
      model, X, y_true,
      show=['instances', 'boundaries', 'probabilities', 'misclassified'],
      markers=['v', '^', 'd'],
      ntiles=50,
      ax=ax,
      ranges=ranges,
      tile_fraction=1.0,
      boundary_markersize=1.0,
      feature_names=feature_names,
      colors={'class_boundary': 'black',
              'tile_alpha': 0.5,
              #  'warning' : 'yellow',
              # The list is indexed by the number of classes; only the
              # three-class palette is filled in.
              'classes':
                  [None,  # 0 classes
                   None,  # 1 class
                   None,  # 2 classes
                   ['#FF8080', '#FFFF80', '#8080FF'],  # 3 classes
                   ]
              }
  )

# Step 1: Loading and exploring our data set

This is a database of customers of an insurance company. Each data point is one customer. Risk is expressed as a number between 0 and 1, with 1 meaning the highest and 0 the lowest risk of having an accident.

In [None]:
# XXX: why would everyone need to know where the data is being loaded from, and what if that changes? Also: how to even do that?
import pandas as pd

# Location of the raw customer CSV; kept in one place so it is easy to change.
DATA_URL = 'https://raw.githubusercontent.com/DJCordhose/insurance-ml/main/data/insurance-customers-risk-1500.csv'
df = pd.read_csv(DATA_URL)

In [None]:
# XXX: Loading is mandatory, but why analysis of the data in a training notebook?
df.head()

In [None]:
df.describe()

In [None]:
features = ['speed', 'age', 'miles']

In [None]:
import seaborn as sns

In [None]:
# XXX: COLORS ARE WEIRD

plt.figure(figsize=(10, 10))

# Pairwise correlations between the columns of df.
cm = df.corr()
# Pick the feature rows/columns by label rather than by position, so the
# values always belong to the tick labels taken from `features` below,
# whatever the column order of the CSV is.
cm3 = cm.loc[features, features]

hm = sns.heatmap(cm3,
                 cbar=True,
                 annot=True,   # print the coefficient inside each square
                 square=True,
                 # cmap='Blues',
                 fmt='.2f',
                 yticklabels=features,
                 xticklabels=features)

# Step 2: Training a neural network on 2 dimensions of the data

In [None]:
# Target: the values of the 'group' column.
y = df['group'].values
# Train on two features only. They are selected by name -- age first, then
# speed, to match the axis order used for plotting -- so the result does not
# depend on the column order of the CSV. Add names here to train on more.
# Going through .values keeps X a DataFrame with plain integer column labels.
X = pd.DataFrame(df[['age', 'speed']].values)
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; random_state is fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=21)
# Cell output: shapes of the four resulting pieces.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
### XXX: THERE IS SO MUCH ROOM FOR EXPERIMENT AND MAKING COPIES HERE

from tensorflow.keras.layers import InputLayer, Dense, Dropout, \
                                    BatchNormalization, Activation

# One network input per column of X.
num_features = X.shape[1]
# Only referenced by the Dropout layers in the disabled hidden blocks below.
dropout = 0.6
model = tf.keras.Sequential()

model.add(InputLayer(name='input', input_shape=(num_features,)))

# Optional hidden blocks (Dense -> ReLU -> BatchNormalization -> Dropout).
# All three are currently commented out, so the model consists of the input
# followed directly by the softmax output layer.

# model.add(Dense(500, name='hidden1'))
# model.add(Activation('relu'))
# model.add(BatchNormalization())
# model.add(Dropout(dropout))

# model.add(Dense(500, name='hidden2'))
# model.add(Activation('relu'))
# model.add(BatchNormalization())
# model.add(Dropout(dropout))

# model.add(Dense(500, name='hidden3'))
# model.add(Activation('relu'))
# model.add(BatchNormalization())
# model.add(Dropout(dropout))

# Three softmax units: one probability per class.
model.add(Dense(name='output', units=3, activation='softmax'))

model.summary()

In [None]:
%%time 

# XXX: this cries for a function with some parameters

# Training hyper-parameters; BATCH_SIZE is reused when evaluating the model.
BATCH_SIZE = 32
EPOCHS = 50

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Console logging is switched off (verbose=0); presumably the
# KerasLearningCurve callback reports progress instead.
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=[KerasLearningCurve()],
    verbose=0,
)

In [None]:
# XXX: getting final metrics is very common
# Final loss and accuracy (the metric configured in compile()) on the training split.
train_loss, train_metric = model.evaluate(X_train, y_train, batch_size=BATCH_SIZE)
train_loss, train_metric

In [None]:
# NOTE: despite the `test_` prefix, these are measured on the validation split (X_val, y_val).
test_loss, test_metric = model.evaluate(X_val, y_val, batch_size=BATCH_SIZE)
test_loss, test_metric

In [None]:
# XXX: those plots are happening all the time
# Training vs. validation loss per epoch, drawn on a logarithmic y axis.
plt.yscale('log')
plt.xlabel("epochs")
plt.ylabel("loss")
plt.title('Loss over epochs')

for curve_label, history_key in (('Training', 'loss'), ('Validation', 'val_loss')):
  plt.plot(history.history[history_key], label=curve_label)

# Trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();

In [None]:
# Training vs. validation accuracy per epoch.
plt.ylabel("accuracy")
plt.xlabel("epochs")
plt.title('Accuracy over epochs')

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])

# Label the two curves, as the loss plot does; without a legend they cannot
# be told apart. The trailing semicolon suppresses the echoed return value.
plt.legend(['Training', 'Validation']);

In [None]:
# XXX: those are plausibility checks and should be regression tests on quality of the model
# Class probabilities for one sample; features are ordered [age, max speed], like the columns of X.
model.predict([[48, 100]])

In [None]:
# Index of the most probable class for the sample [age=48, max speed=100].
model.predict([[48, 100]]).argmax()

In [None]:
# Class probabilities for one sample; features are ordered [age, max speed], like the columns of X.
model.predict([[30, 150]])

In [None]:
# Index of the most probable class for the sample [age=30, max speed=150].
model.predict([[30, 150]]).argmax()

In [None]:
# XXX: the version without boundaries is straightforward, but the one with ranges: which ranges make sense and why?
# plot_decision_boundaries(model, X, y, x1_range=(10, 150), x2_range=(50, 250))
plot_decision_boundaries(model, X, y)

In [None]:
# model.save?

In [None]:
# XXX: loading and saving of model are one-liners, but there are different formats and they are hard to remember
# Save the model in the 'h5' format to the file classifier.h5.
model.save('classifier.h5', save_format='h5')

In [None]:
# Save the model in the 'tf' format under ./classifier (listed and archived as a directory in the cells below).
model.save('classifier', save_format='tf')

In [None]:
!ls -l

In [None]:
!ls -l classifier/

In [None]:
!tar czvf classifier.tgz ./classifier

In [None]:
!ls -l