In [1]:
import pickle
import pandas as pd
from utils.anchor_parser import anchor_parser

In [2]:
target = "quality"
dataset_name = "white-wine"
model_name = "rf"

In [3]:
with open('./model/selected-{}.pickle'.format(model_name), 'rb') as f:
    model = pickle.load(f)

In [4]:
from pymongo import MongoClient
client = MongoClient("localhost", 27017)
db = client["pipeline"]

In [5]:
data_meta_collection = db["data"]
datapoint_collection = db["datapoint"]
model_collection = db["model"]
model_scores_collection = db["model_scores"]
anchor_meta_collection = db["anchor_meta"]
anchor_collection = db["anchor"]
shap_meta_collection = db["shap_meta"]
shap_collection = db["shap"]
trustscore_meta_collection = db["trustscore_meta"]
trustscore_collection = db["trustscore"]

In [6]:
data_meta = data_meta_collection.find_one(
            {"type": "test", "name": dataset_name}
)

In [7]:
feature_names = data_meta["features"]
classes = data_meta["classes"]; classes

[3, 4, 5, 6, 7, 8, 9]

In [8]:
index = 0
datapoint = datapoint_collection.find_one({"data_id": data_meta["_id"], "index": index})

In [9]:
datapoint_df = pd.DataFrame([datapoint["values"]], columns=feature_names).drop(columns=target)
datapoint_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,6.8,0.67,0.3,13.0,0.29,22.0,193.0,0.9984,3.08,0.67,9.0


In [10]:
model_pred = model.predict(datapoint_df)
model_pred_proba = model.predict_proba(datapoint_df)

print(model_pred)
print(model_pred_proba)

[5]
[[0.13   0.05   0.4485 0.2815 0.09   0.     0.    ]]


In [12]:
predictions_meta_collection = db["predictions_meta"]

data_id1 = data_meta_collection.find_one({"type": "train"})["_id"]
data_id2 = data_meta_collection.find_one({"type": "test"})["_id"]
model_id = model_collection.find_one({"stage": "selection"})["_id"]

predictions_meta_collection.insert_one({"data_id": data_id1, "model_id": model_id})
predictions_meta_collection.insert_one({"data_id": data_id2, "model_id": model_id})

<pymongo.results.InsertOneResult at 0x1ac3dea7bb0>

In [26]:
predictions_collection = db["predictions"]

pred_meta_id_train = predictions_meta_collection.find_one({"data_id": data_id1})["_id"]
pred_meta_id_test = predictions_meta_collection.find_one({"data_id": data_id2})["_id"]

train_data = datapoint_collection.find({"data_id": data_id1})
test_data = datapoint_collection.find({"data_id": data_id2})

train_df = pd.DataFrame([datapoint["values"] for datapoint in train_data], columns=feature_names).drop(columns=target)
test_df = pd.DataFrame([datapoint["values"] for datapoint in test_data], columns=feature_names).drop(columns=target)

train_preds = model.predict(train_df)
train_preds_probs = model.predict_proba(train_df)
test_preds = model.predict(test_df)
test_preds_probs = model.predict_proba(test_df)

for i, pred in enumerate(train_preds):
    predictions_collection.insert_one({"predictions_meta_id": pred_meta_id_train, "prediction": int(pred), "probs": train_preds_probs[i].tolist(), "index": i})
for i, pred in enumerate(test_preds):
    predictions_collection.insert_one({"predictions_meta_id": pred_meta_id_test, "prediction": int(pred), "probs": test_preds_probs[i].tolist(), "index": i})
    

In [23]:
train_preds_probs[4]

array([0.   , 0.   , 0.01 , 0.055, 0.04 , 0.895, 0.   ])

In [11]:
model_meta = model_collection.find_one({"stage": "selection", "model_name": model_name})
model_scores = model_scores_collection.find_one({"model_id": model_meta["_id"]})
anchor_meta = anchor_meta_collection.find_one({"model_id": model_meta["_id"]})
anchor = anchor_collection.find_one({"anchor_meta_id": anchor_meta["_id"], "index": index})
shap_meta = shap_meta_collection.find_one({"model_id": model_meta["_id"]})
shap = shap_collection.find_one({"shap_meta_id": shap_meta["_id"], "index": index, "class": int(model_pred[0])})
trustscore_meta = trustscore_meta_collection.find_one({"model_id": model_meta["_id"]})
trustscore = trustscore_collection.find_one({"trustscores_meta_id": trustscore_meta["_id"], "index": index})

In [12]:
anchor

{'_id': ObjectId('64a43f6a220cee0c436b43a6'),
 'anchor_meta_id': ObjectId('64a4377c220cee0c436b3de7'),
 'index': 0,
 'anchor': ['alcohol <= 9.47',
  'volatile acidity > 0.33',
  'density > 1.00',
  'pH <= 3.09',
  'free sulfur dioxide <= 23.00'],
 'precision': 0.9572649572649573,
 'coverage': 0.0029}

In [13]:
shap["values"]

[0.0038259148829975858,
 0.07450668060669254,
 -0.023059427963729,
 -0.03176103223827419,
 0.002960605473430394,
 0.0003946065743311089,
 0.00319737255911293,
 0.02351502200338014,
 0.02429852437416859,
 -0.027970350936667113,
 0.10106583028882754]

In [14]:
trustscore

{'_id': ObjectId('64a82d7ae01228df4fc3b355'),
 'trustscores_meta_id': ObjectId('64a82d36e01228df4fc3ad96'),
 'extreme': False,
 'index': 0,
 'score': 0.7505975503856865,
 'neighbour': 6}

In [15]:
dp = datapoint_df.to_dict("records")[0]; dp

{'fixed acidity': 6.8,
 'volatile acidity': 0.67,
 'citric acid': 0.3,
 'residual sugar': 13.0,
 'chlorides': 0.29,
 'free sulfur dioxide': 22.0,
 'total sulfur dioxide': 193.0,
 'density': 0.9984,
 'pH': 3.08,
 'sulphates': 0.67,
 'alcohol': 9.0}

In [16]:
anchor_precision = anchor["precision"]
anchor_coverage = anchor["coverage"]
my_anchors = [anchor_parser(elem) for elem in anchor["anchor"]]; my_anchors

[{'value': 'alcohol', 'upper_bound': 9.47, 'lower_bound': None},
 {'value': 'volatile acidity', 'upper_bound': None, 'lower_bound': 0.33},
 {'value': 'density', 'upper_bound': None, 'lower_bound': 1.0},
 {'value': 'pH', 'upper_bound': 3.09, 'lower_bound': None},
 {'value': 'free sulfur dioxide', 'upper_bound': 23.0, 'lower_bound': None}]

In [17]:
shap_values = dict(zip(feature_names, shap["values"])); shap_values

{'fixed acidity': 0.0038259148829975858,
 'volatile acidity': 0.07450668060669254,
 'citric acid': -0.023059427963729,
 'residual sugar': -0.03176103223827419,
 'chlorides': 0.002960605473430394,
 'free sulfur dioxide': 0.0003946065743311089,
 'total sulfur dioxide': 0.00319737255911293,
 'density': 0.02351502200338014,
 'pH': 0.02429852437416859,
 'sulphates': -0.027970350936667113,
 'alcohol': 0.10106583028882754}

In [18]:
ts_stats = trustscore_meta["statistics"]
ts = trustscore["score"]
ts_neighbour = trustscore["neighbour"]; ts_neighbour
ts_is_outlier = trustscore["extreme"]

In [19]:
ts_stats

{'length': 1470,
 'mean': 1.0082333322175348,
 'std': 0.4133127918717231,
 'median': 0.9726126672315786,
 'min': 0.0609716409633725,
 'max': 6.906542093917446,
 'outliers': 10}

In [20]:
ts

0.7505975503856865

In [21]:
classes

[3, 4, 5, 6, 7, 8, 9]

In [22]:
model_pred_proba[0]

array([0.13  , 0.05  , 0.4485, 0.2815, 0.09  , 0.    , 0.    ])

In [23]:
probas = dict(zip([str(x) for x in classes], model_pred_proba[0]))

In [24]:
explanation_dict = {
    "classes": classes,
    "datapoint": dp,
    "prediction": {
        "class": model_pred[0],
        "probas": probas
    },
    "anchor": {
        "rules": my_anchors,
        "precision": anchor_precision,
        "coverage": anchor_coverage,
    },
    "shap": {
        "values": shap_values
    },
    "trustscore": {
        "statistics": ts_stats,
        "score": ts,
        "closest_prediction": ts_neighbour
    }
}

In [25]:
explanation_dict

{'classes': [3, 4, 5, 6, 7, 8, 9],
 'datapoint': {'fixed acidity': 6.8,
  'volatile acidity': 0.67,
  'citric acid': 0.3,
  'residual sugar': 13.0,
  'chlorides': 0.29,
  'free sulfur dioxide': 22.0,
  'total sulfur dioxide': 193.0,
  'density': 0.9984,
  'pH': 3.08,
  'sulphates': 0.67,
  'alcohol': 9.0},
 'prediction': {'class': 5,
  'probas': {'3': 0.13,
   '4': 0.05,
   '5': 0.4485,
   '6': 0.2815,
   '7': 0.09,
   '8': 0.0,
   '9': 0.0}},
 'anchor': {'rules': [{'value': 'alcohol',
    'upper_bound': 9.47,
    'lower_bound': None},
   {'value': 'volatile acidity', 'upper_bound': None, 'lower_bound': 0.33},
   {'value': 'density', 'upper_bound': None, 'lower_bound': 1.0},
   {'value': 'pH', 'upper_bound': 3.09, 'lower_bound': None},
   {'value': 'free sulfur dioxide', 'upper_bound': 23.0, 'lower_bound': None}],
  'precision': 0.9572649572649573,
  'coverage': 0.0029},
 'shap': {'values': {'fixed acidity': 0.0038259148829975858,
   'volatile acidity': 0.07450668060669254,
   'citric 

In [26]:
from aggregators.single_prediction import get_single_pred_info

In [30]:
{
    "datapoint1": get_single_pred_info(2),
    "datapoint2": get_single_pred_info(0)
}

{'datapoint1': {'classes': [3, 4, 5, 6, 7, 8, 9],
  'train_scores': {'f1': 0.6485565011803691,
   'precision': 0.6785752047320168,
   'recall': 0.6625850340136055},
  'datapoint': {'fixed acidity': 5.7,
   'volatile acidity': 0.21,
   'citric acid': 0.25,
   'residual sugar': 1.1,
   'chlorides': 0.035,
   'free sulfur dioxide': 26.0,
   'total sulfur dioxide': 81.0,
   'density': 0.9902,
   'pH': 3.31,
   'sulphates': 0.52,
   'alcohol': 11.4},
  'prediction': {'class': 6,
   'probas': {'3': 0.0,
    '4': 0.0,
    '5': 0.12,
    '6': 0.58,
    '7': 0.28,
    '8': 0.02,
    '9': 0.0}},
  'anchor': {'rules': [{'value': 'alcohol',
     'upper_bound': None,
     'lower_bound': 10.4},
    {'value': 'free sulfur dioxide', 'upper_bound': None, 'lower_bound': 23.0},
    {'value': 'citric acid', 'upper_bound': 0.27, 'lower_bound': None},
    {'value': 'volatile acidity', 'upper_bound': 0.21, 'lower_bound': None},
    {'value': 'total sulfur dioxide',
     'upper_bound': 109.0,
     'lower_boun