In [None]:
!pip install nannyml==0.9.1

# Monitor a neural model running in prod (when no ground truth is available)

To run this code, please upload the following files from the GitHub repo to Colab: `predictions_val_set_onnx.csv`, `predictions_test_set_onnx.csv`, and the model config file `config.json`.

## Import packages

In [52]:
import ast

import nannyml as nml
import pandas as pd
from IPython.display import display

## Read reference (before prod) and analysis (from prod) data

In [93]:
# Reference set = predictions collected before production;
# analysis set = predictions coming from production.
df_reference, df_analysis = (
    pd.read_csv(csv_name)
    for csv_name in ("predictions_val_set_onnx.csv", "predictions_test_set_onnx.csv")
)

# Report how many rows each set holds.
print(
    f"len df_reference: {len(df_reference)}",
    f"len df_analysis: {len(df_analysis)}",
    sep="\n",
)

display(df_reference.head())

len df_reference: 933
len df_analysis: 856


Unnamed: 0,uuid,predictions,predicted_probability,predicted_softmax_dist,predicted_class_id,labels,class_id
0,a249415e-9a02-45e8-8c7c-490eb2f366f9,flight,0.998968,"[8.233627340814564e-06, 1.588795021234546e-05,...",13,flight,13
1,49e523d7-6bbb-4611-8138-1e260d529f5b,flight,0.998974,"[8.145506399159785e-06, 1.5459047062904574e-05...",13,flight,13
2,582dc947-94c4-41ee-852c-aea2a59246d1,flight,0.998956,"[7.889273547334597e-06, 1.6907604731386527e-05...",13,flight,13
3,e46a97e0-7316-467c-969e-0c4a0429f3b9,flight,0.998859,"[8.463684935122728e-06, 1.5429834093083628e-05...",13,flight,13
4,609f49b3-27f5-4bce-9a95-ededc2fedff1,airline,0.991165,"[0.0004991275491192937, 0.0007190926698967814,...",5,airline,5


## Modify the dataframes so that they are compatible with NannyML

#### _To do performance estimation, NannyML requires at least two samples per class. Since the test set is very limited for this simulation, I will keep only a subset of the classes._

In [94]:
# Classes kept for the simulation; rows with any other label are dropped from both sets.
supported_labels = ["flight", "airfare", "ground_service", "airline", "abbreviation", "aircraft", "quantity"]
df_reference, df_analysis = (
    frame.loc[frame["labels"].isin(supported_labels)]
    for frame in (df_reference, df_analysis)
)

# Row counts after the filter.
print(
    f"len df_reference: {len(df_reference)}",
    f"len df_analysis: {len(df_analysis)}",
    sep="\n",
)

len df_reference: 889
len df_analysis: 762


#### Read and adjust id2label mappings from model config file

In [95]:
import json

# Load the model config from disk.
config_path = "config.json"
with open(config_path, "r") as fp:
    config = json.loads(fp.read())

# The config stores class ids as string keys; cast them to int and keep
# only the entries whose label is one of the supported classes.
id2label = {
    int(class_id): label
    for class_id, label in config["id2label"].items()
    if label in supported_labels
}
print(id2label)

{0: 'abbreviation', 1: 'aircraft', 3: 'airfare', 5: 'airline', 13: 'flight', 19: 'ground_service', 22: 'quantity'}


In [148]:
def _parse_softmax(raw):
    """Return the softmax distribution stored in one CSV cell as a Python list."""
    # SECURITY: the original code called eval() on text read from a CSV file.
    # ast.literal_eval only accepts Python literals, so a malformed or
    # malicious cell raises ValueError/SyntaxError instead of executing code.
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    # Already-parsed sequences are passed through unchanged.
    return list(raw)


def _to_nunnyml_frame(df, id_columns, id_list, proba_columns, id2new_id, random_state):
    """Build one shuffled, NannyML-ready frame from a raw predictions dataframe."""
    rows = []
    for record in df[id_columns + ["predicted_softmax_dist"]].itertuples(index=False, name=None):
        softmax = _parse_softmax(record[-1])
        # Keep only the probabilities of the retained classes, in id_list order.
        rows.append(list(record[:-1]) + [softmax[id_] for id_ in id_list])
    frame = pd.DataFrame(rows, columns=id_columns + proba_columns)

    # Drop rows whose predicted (or, when present, true) class is not a retained
    # class; otherwise the id remapping below has nothing to map them to.
    keep = frame[id_columns].isin(id_list).all(axis=1)
    # .copy() so the column assignments below do not write into a slice.
    frame = frame.loc[keep].copy()

    # Rescale class ids to 0..n_classes-1.
    for column in id_columns:
        frame[column] = frame[column].map(id2new_id)

    return frame.sample(frac=1.0, random_state=random_state)


def get_data_nunnyml(df_ref, df_an, id2label, random_state=None):
    """Reshape prediction dataframes into the column layout used for NannyML.

    Each class listed in ``id2label`` gets its own ``y_pred_proba_<label>``
    column, taken from the stringified softmax list in
    ``predicted_softmax_dist``, and class ids are renumbered to
    ``0..len(id2label)-1`` following the sorted order of the original ids.

    Args:
        df_ref: Reference data with columns ``predicted_class_id``,
            ``class_id`` and ``predicted_softmax_dist``.
        df_an: Analysis data with columns ``labels``, ``predicted_class_id``
            and ``predicted_softmax_dist``.
        id2label: Mapping from original class id (an index into the softmax
            list) to label name, restricted to the classes to keep.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            row shuffle is reproducible. ``None`` keeps the original
            unseeded behaviour.

    Returns:
        A tuple ``(reference_frame, analysis_frame, y_pred_proba_map)``.
        The reference frame has ``predicted_class_id``, ``class_id`` and the
        probability columns; the analysis frame has ``predicted_class_id``
        and the probability columns; ``y_pred_proba_map`` maps each new class
        id to its probability column name. Both frames are shuffled.

    Raises:
        ValueError, SyntaxError: if a ``predicted_softmax_dist`` cell is not
            a valid Python literal.
    """
    id_list = sorted(id2label.keys())
    labels = [id2label[i] for i in id_list]
    proba_columns = ["y_pred_proba_" + label for label in labels]

    new_id_list = list(range(len(id_list)))
    id2new_id = dict(zip(id_list, new_id_list))

    # Reference data: rows are kept only when both the predicted and the true
    # class are retained classes (previously an unsupported true class raised
    # KeyError during remapping).
    df_nunnyml = _to_nunnyml_frame(
        df_ref, ["predicted_class_id", "class_id"], id_list, proba_columns, id2new_id, random_state
    )

    # Analysis data: restrict to the retained labels first, then transform.
    df_an = df_an[df_an.labels.isin(labels)]
    df_an_nunnyml = _to_nunnyml_frame(
        df_an, ["predicted_class_id"], id_list, proba_columns, id2new_id, random_state
    )

    return df_nunnyml, df_an_nunnyml, dict(zip(new_id_list, proba_columns))


# Build the reference frame, the analysis frame and the
# new-class-id -> probability-column map in one call.
df_nunnyml_reference, df_analysis_reference, y_pred_proba_map = get_data_nunnyml(
    df_ref=df_reference,
    df_an=df_analysis,
    id2label=id2label,
)
print(y_pred_proba_map)
df_nunnyml_reference.head(n=3)

{0: 'y_pred_proba_abbreviation', 1: 'y_pred_proba_aircraft', 2: 'y_pred_proba_airfare', 3: 'y_pred_proba_airline', 4: 'y_pred_proba_flight', 5: 'y_pred_proba_ground_service', 6: 'y_pred_proba_quantity'}


Unnamed: 0,predicted_class_id,class_id,y_pred_proba_abbreviation,y_pred_proba_aircraft,y_pred_proba_airfare,y_pred_proba_airline,y_pred_proba_flight,y_pred_proba_ground_service,y_pred_proba_quantity
240,2,2,0.000493,2.8e-05,0.994674,0.000145,0.000338,0.000187,6.8e-05
312,0,0,0.988258,0.000625,0.000393,0.00125,0.000226,0.001414,0.000259
714,4,4,0.000429,0.000309,0.00087,0.000331,0.702372,0.004845,0.001148


In [149]:
# Preview the first three rows of the transformed analysis frame.
df_analysis_reference.head(n=3)

Unnamed: 0,predicted_class_id,y_pred_proba_abbreviation,y_pred_proba_aircraft,y_pred_proba_airfare,y_pred_proba_airline,y_pred_proba_flight,y_pred_proba_ground_service,y_pred_proba_quantity
632,4,8e-06,1.5e-05,0.000291,5.4e-05,0.998919,4.6e-05,4.8e-05
681,2,0.000467,2.7e-05,0.994803,0.000144,0.000339,0.000175,6.4e-05
679,4,8e-06,1.8e-05,0.000234,5.6e-05,0.998928,4.4e-05,5.7e-05


## Fit the Confidence-Based Performance Estimation (CBPE) estimator

In [None]:
# Number of rows per chunk passed to the estimator.
chunk_size = 50

# Configure CBPE for the multiclass problem and fit it on the reference
# frame, which carries the ground-truth `class_id` column.
estimator = nml.CBPE(
    metrics=["accuracy", "f1"],
    problem_type="classification_multiclass",
    y_true="class_id",
    y_pred="predicted_class_id",
    y_pred_proba=y_pred_proba_map,
    chunk_size=chunk_size,
).fit(df_nunnyml_reference)

## Estimate metrics on analysis data based on confidence (without ground truth)

In [156]:
# Run the fitted estimator on the analysis frame, which has no `class_id` column.
results = estimator.estimate(df_analysis_reference)
# Plot the estimation results and render the figure.
metric_fig = results.plot()
metric_fig.show()

_In reality, I would implement the metrics for each class, including precision and recall [if the number of classes is not high]._