<a href="https://colab.research.google.com/github/DJCordhose/mlops-drift/blob/main/notebooks/2-5-validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validate


In [1]:
import sys
# True when the `google.colab` module is already loaded in this interpreter,
# which is taken as the sign that the notebook runs on Colab
IN_COLAB = 'google.colab' in sys.modules
IN_COLAB

True

In [2]:
def url_for_dataset(relative_path):
  """Return the location of a gzipped CSV of the insurance_prediction datasets.

  relative_path is the dataset name without the '.csv.gz' suffix.
  When IN_COLAB is true the result is a GitHub raw URL, otherwise a
  relative path below '../datasets'.
  """
  base = (
    'https://github.com/DJCordhose/mlops-drift/raw/main/datasets/insurance_prediction'
    if IN_COLAB
    else '../datasets/insurance_prediction'
  )
  return f'{base}/{relative_path}.csv.gz'

In [3]:
# If the import prints red output (messages on stderr), everything is fine
import tensorflow as tf
# show the TensorFlow version in use
tf.__version__

'2.13.0'

In [4]:
# disable GPUs by making none of them visible to TensorFlow;
# this can also give red output, and everything is just fine
tf.config.set_visible_devices([], 'GPU')

In [5]:
# fix TensorFlow's global random seed so repeated runs behave the same
tf.random.set_seed(42)

In [6]:
import numpy as np
np.__version__

'1.23.5'

In [7]:
import matplotlib.pyplot as plt
# render figures inline in the notebook output
%matplotlib inline

import matplotlib as mpl
# notebook-wide plot defaults: figure size plus title and axis-label font sizes
mpl.rcParams['figure.figsize'] = (20, 8)
mpl.rcParams['axes.titlesize'] = 24
mpl.rcParams['axes.labelsize'] = 16

# Step 1: Choosing data set for validation

In [60]:
# False: validate on the reference (original) data
# True: validate on the monthly file selected by `iteration`
use_new_data = False
# how many months after training? (only read when use_new_data is True)
iteration = 36 # final month in dataset, should have the biggest difference
# iteration = 12 # one year later, drift should start to show

In [61]:
import pandas as pd

# new data is one of the monthly files; otherwise the original (old) reference data is used
dataset_name = f'monthly/month-{iteration}' if use_new_data else 'reference'
df = pd.read_csv(url_for_dataset(dataset_name), delimiter=';')


In [62]:
# first ten rows as a quick look at the columns and their values
df.head(10)

Unnamed: 0,training,age,emergency_braking,braking_distance,power,miles,risk,group_name,group
0,0,41.9511,0,40.933328,122.238329,98.509765,3.700073,high,0
1,0,24.368286,1,44.797317,113.765298,46.324178,2.447873,medium,1
2,0,18.314649,1,41.587241,143.427269,76.862968,4.079834,high,0
3,0,51.265254,1,47.266716,111.578133,102.697069,5.087831,high,0
4,0,23.578861,0,42.835319,145.994235,63.690055,3.739299,high,0
5,0,29.506037,1,46.547745,139.820214,66.610999,3.130142,high,0
6,0,21.289385,1,40.666499,134.749599,63.146964,1.774027,medium,1
7,1,47.459483,0,35.16855,142.215391,123.385272,0.084924,low,2
8,1,20.57368,1,41.798826,114.677871,103.853842,-0.033755,low,2
9,0,24.639606,1,40.801391,162.457198,44.292386,2.456763,medium,1


In [63]:
# summary statistics of the numeric columns
df.describe()

Unnamed: 0,training,age,emergency_braking,braking_distance,power,miles,risk,group
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,0.21,35.146663,0.394,39.903294,134.888089,80.965668,2.183617,0.901333
std,0.407444,12.765348,0.488798,4.781834,35.04857,25.496752,2.481164,0.815705
min,0.0,15.949545,0.0,24.368993,38.246253,18.796953,-5.289352,0.0
25%,0.0,25.141921,0.0,36.733165,110.00866,62.666195,0.708085,0.0
50%,0.0,32.714683,0.0,39.867888,131.055019,77.094819,2.386835,1.0
75%,0.0,42.633629,1.0,42.938777,157.319959,95.568682,3.874869,2.0
max,1.0,86.213394,1.0,56.463472,285.750164,211.807417,10.343054,2.0


In [64]:
# feature matrix: every column except the label `group` and the `risk` / `group_name` columns
X = df.drop(columns=['risk', 'group', 'group_name']).values

In [65]:
# labels are the numeric class ids from the `group` column
y = df['group'].to_numpy()
# class ids together with the number of samples per class
np.unique(y, return_counts=True)

(array([0, 1, 2]), array([580, 488, 432]))

In [66]:
from sklearn.model_selection import train_test_split
# FIXME: does not make sense when this is a completely new dataset
# stratified 80/20 split with a fixed seed so the partition is the same on every run
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1200, 6), (300, 6), (1200,), (300,))

# Step 2: Choosing model to validate on
Options:
1. use pre-trained model from github (old model)
1. use whatever you have in this folder or anywhere in your file system
1. download model from training notebook and upload here (when using colab)
1. mount google drive

In [67]:
# !wget https://github.com/DJCordhose/mlops-drift/raw/main/notebooks/classifier.h5

In [68]:
# the model is read from the file '<model_path>.h5', resolved against the working directory
model_path = 'classifier'
model = tf.keras.models.load_model(model_path + '.h5')

In [69]:
# evaluate() yields two values here; the first is discarded, the second is kept as the metric
# (presumably loss and accuracy - confirm against how the model was compiled)
_, train_metric = model.evaluate(X_train, y_train, verbose=0)
train_metric

0.8691666722297668

In [70]:
# same evaluation on the held-out validation split; only the second returned value is kept
# (presumably accuracy - confirm against how the model was compiled)
_, test_metric = model.evaluate(X_val, y_val, verbose=0)
test_metric

0.8833333253860474

# Step 3: Check Invariant Properties of Model

## Accuracy range and lack of overfitting

In [71]:
# 85% is sufficient for our use case
# the message reports the measured value so a failing run is self-explanatory
assert train_metric > .85, f'train metric {train_metric:.4f} is not above the required 0.85'

In [72]:
# the validation split has to reach the same 85% threshold as the training split
assert test_metric > .85, f'validation metric {test_metric:.4f} is not above the required 0.85'

In [25]:
# train and validation scores must stay close; a large gap hints at overfitting
assert abs(train_metric - test_metric) < .05, (
    f'train ({train_metric:.4f}) and validation ({test_metric:.4f}) metrics differ by 0.05 or more'
)

In [26]:
# we cannot be too good; that would be suspicious

assert test_metric < .95, f'validation metric {test_metric:.4f} is suspiciously high (expected below 0.95)'
assert train_metric < .95, f'train metric {train_metric:.4f} is suspiciously high (expected below 0.95)'

## Invariants of the model itself (without using data on it)
Could be
* effective depth of tree (if decision tree has been used as architecture instead of NN)
* distribution of weights
* actual normalization

**We do not use these properties, as we want to treat the model as a black box to make it interchangeable**

# Invariants also useful for monitoring and analysis - not using ground truth (GT)

In [27]:
# predicted class per sample: index of the largest value in each row of the model output
y_pred = np.argmax(model.predict(X), axis=1)
y_pred.shape



(1500,)

## Distribution of Output Class

In [28]:
# distinct predicted class ids and how often each one was predicted
# (a class that is never predicted does not appear in either array)
values, counts = np.unique(y_pred, return_counts=True)
values, counts

(array([0, 1, 2]), array([562, 516, 422]))

In [29]:
# equal distribution around classes expected
tolerance = 0.2
n_classes = 3
expected_count = len(X) / n_classes
lower_bound = int(expected_count * (1 - tolerance))
upper_bound = int(expected_count * (1 + tolerance))
print(f'{lower_bound}-{upper_bound}')

# Count per class id with a fixed length, so a class that is never predicted
# shows up as 0 and fails the check instead of silently dropping out of it.
class_counts = np.bincount(y_pred, minlength=n_classes)
for class_id, count in enumerate(class_counts):
    print(count)
    # both bounds are inclusive; range(lower, upper) would reject count == upper_bound
    assert lower_bound <= count <= upper_bound, (
        f'class {class_id} predicted {count} times, expected {lower_bound}-{upper_bound}'
    )

400-600
562
516
422


## Distribution of confidence

In [30]:
# confidence per sample: the largest value in each row of the model output
y_pred_probas = np.max(model.predict(X), axis=1)
y_pred_probas.shape



(1500,)

In [31]:
# lowest, average and highest confidence over all samples
y_pred_probas.min(), y_pred_probas.mean(), y_pred_probas.max()

(0.4761628, 0.7455376, 0.9996872)

In [32]:
# even the least confident prediction has to stay above 0.4
assert y_pred_probas.min() > .4, f'lowest confidence {y_pred_probas.min():.4f} is not above 0.4'

In [33]:
# on average the model has to be reasonably sure of its predictions
assert y_pred_probas.mean() > 0.7, f'mean confidence {y_pred_probas.mean():.4f} is not above 0.7'

In [34]:
# at least one sample has to be predicted with near certainty
assert y_pred_probas.max() > 0.99, f'highest confidence {y_pred_probas.max():.4f} is not above 0.99'