# Explore label inference confidence
Designed to be run on the **staging** dataset, though it should work on any dataset.
The goal is to be able to make an informed decision on what confidence threshold we should use in the staging test of the new Label UI.
We first examine one user's data, then try to generalize our findings to the whole dataset.
Portions of this notebook inspired by/copypasted from `Evaluate sim wrt filtration and different radii - unrolled`. Once one inputs a UUID immediately below, the notebook can be run top to bottom.

## Settings

### Set target user

In [None]:
from uuid import UUID
# Read the UUID interactively to avoid accidentally leaving real UUIDs in the notebook.
# Whitespace picked up by a copy-paste is stripped first, because UUID() raises ValueError on it.
target_user = UUID(input("Enter target UUID: ").strip())
print(target_user)

### Do not truncate dataframes to print

In [None]:
import pandas as pd
# Set both display options to None (intent, per the heading: do not truncate printed dataframes)
for option_name in ("display.max_rows", "display.width"):
    pd.set_option(option_name, None)

### Imports
Substantially copypasted from `Evaluate sim wrt filtration and different radii - unrolled`

In [None]:
# Import everything we could possibly want

import pandas as pd
import numpy as np
import geojson as gj
import sklearn.cluster as sc
import sklearn.metrics.pairwise as smp
import sklearn.metrics as sm

import json
import copy
import itertools

import folium
import branca.element as bre

import matplotlib.pyplot as plt
import matplotlib.colors as pltc
import seaborn as sns

from IPython import display
from uuid import UUID

import bson.json_util as bju
import bson.objectid as boi

import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdtq
import emission.analysis.modelling.tour_model.similarity as eamts

import emission.core.wrapper.entry as ecwe
import emission.core.wrapper.confirmedtrip as ecwct

## Read the data
Substantially copypasted from `Evaluate sim wrt filtration and different radii - unrolled`

In [None]:
all_users = esta.TimeSeries.get_uuid_list()
# Per-user lookups keyed by the entries of all_users: the confirmed-trip dataframe,
# its filter_labeled_trips() result, and the expand_userinputs() result of that
confirmed_trip_df_map, labeled_trip_df_map, expanded_trip_df_map = {}, {}, {}
for u in all_users:
    ts = esta.TimeSeries.get_time_series(u)
    ct_df = ts.get_data_df("analysis/confirmed_trip")
    confirmed_trip_df_map[u] = ct_df
    # We'll use this filtering and expansion for our individual user exploration, but when we do things in aggregate we'll do our own processing directly from confirmed_trip_df_map
    labeled_df = esdtq.filter_labeled_trips(ct_df)
    labeled_trip_df_map[u] = labeled_df
    expanded_trip_df_map[u] = esdtq.expand_userinputs(labeled_df)

## Define our own processing functions
Not particularly pretty, but they get the job done.

In [None]:
# Keys looked up in each trip's "user_input" by label_dict/label_tuple; the list order fixes the tuple order
LABEL_NAMES = ["mode_confirm", "purpose_confirm", "replaced_mode"]
# Placeholder used by process_trips for trips whose "inferred_labels" is empty: no labels, confidence 0.0
EMPTY_INFERENCE = {"labels": {}, "p": 0.0}

# Provides a measure of the difference between two dictionaries, passed in as a single indexable argument (in practice, a Series)
def dict_diff(ds):
    """Return a count of how much the first two dictionaries in ``ds`` differ.

    ``ds`` is any two-element positional container (list, tuple, or a pandas
    Series such as a row handed over by ``DataFrame.apply(..., axis="columns")``).

    If both dictionaries are non-empty, the result is half the size of the
    symmetric difference of their ``(key, value)`` pairs, so a shared key with
    two different values counts as one difference. If either dictionary is
    empty, the result is the size of the larger one.

    Dictionary values must be hashable, since the pairs are placed in sets.
    """
    # A Series indexed by column names treats ds[0] as a label lookup (the
    # positional fallback is deprecated), so go through .iloc when it exists
    # and fall back to plain indexing for lists and tuples.
    positional = getattr(ds, "iloc", ds)
    items0, items1 = positional[0].items(), positional[1].items()
    if len(items0) > 0 and len(items1) > 0:
        return len(set(items0) ^ set(items1)) // 2
    return max(len(items0), len(items1))

def handle_empty(col):
    """Return ``col`` unchanged, unless its length is zero, in which case return a new empty list."""
    if len(col) == 0:
        return []
    return col

def label_dict(e):
    """Map each name in LABEL_NAMES to ``e["user_input"][name]``, or to None when that name is absent."""
    user_input = e["user_input"]
    labels = {}
    for name in LABEL_NAMES:
        labels[name] = user_input[name] if name in user_input else None
    return labels

def label_tuple(e):
    """Return the values of ``e["user_input"]`` for each name in LABEL_NAMES, in that order, as a tuple (None where absent)."""
    user_input = e["user_input"]
    values = []
    for name in LABEL_NAMES:
        values.append(user_input[name] if name in user_input else None)
    return tuple(values)

# Operates in-place on a DataFrame of trips
def process_trips(trips):
    """Add label/inference comparison columns to ``trips`` and sort it, all in place.

    Columns added:
      * all_user_labels_dict / all_user_labels -- the user's labels as a dict / tuple over LABEL_NAMES
      * most_likely_inference_dict / most_likely_inference -- the "labels" of the
        highest-"p" entry of each trip's "inferred_labels", as a dict / tuple of its values
      * mli_confidence -- the "p" of that entry (EMPTY_INFERENCE's 0.0 when there are no entries)
      * correct -- whether all_user_labels equals most_likely_inference
      * mismatches -- dict_diff of the two dict columns

    Afterwards the frame is sorted by mli_confidence, descending. Returns None.
    """
    # If there's no data, add the column names anyway so we can reference them later
    if len(trips.axes[1]) == 0:
        derived_columns = (
            "all_user_labels_dict",
            "all_user_labels",
            "most_likely_inference_dict",
            "most_likely_inference",
            "mli_confidence",
            "correct",
            "mismatches",
        )
        for col_name in derived_columns:
            trips[col_name] = []
        return

    # Pick the highest-"p" candidate for every trip; trips without candidates get the placeholder
    most_likely = []
    for candidates in trips["inferred_labels"]:
        if len(candidates) > 0:
            most_likely.append(max(candidates, key=lambda cand: cand["p"]))
        else:
            most_likely.append(EMPTY_INFERENCE)

    rows = [row for _, row in trips.iterrows()]
    trips["all_user_labels_dict"] = handle_empty([label_dict(row) for row in rows])
    trips["all_user_labels"] = handle_empty([label_tuple(row) for row in rows])
    trips["most_likely_inference_dict"] = handle_empty([inf["labels"] for inf in most_likely])
    trips["most_likely_inference"] = handle_empty([tuple(inf["labels"].values()) for inf in most_likely])
    trips["mli_confidence"] = handle_empty([inf["p"] for inf in most_likely])

    labels_match = trips["all_user_labels"] == trips["most_likely_inference"]
    trips["correct"] = handle_empty(labels_match)

    dict_pairs = trips[["all_user_labels_dict", "most_likely_inference_dict"]]
    trips["mismatches"] = handle_empty(dict_pairs.apply(dict_diff, axis="columns"))

    trips.sort_values("mli_confidence", ascending=False, inplace=True)

# Returns all the trips that either have confidence below 1 or are not correctly inferred
def get_uncertain(trips):
    """Select the rows of ``trips`` where "mli_confidence" is below 1 or "correct" is False."""
    below_certain = trips["mli_confidence"] < 1
    wrongly_inferred = ~trips["correct"]
    return trips[below_certain | wrongly_inferred]

# Returns all the trips for which there exists an inference
def get_predicted(trips):
    """Select the rows of ``trips`` whose "most_likely_inference" is not the empty tuple."""
    no_inference = ()
    has_inference = trips["most_likely_inference"] != no_inference
    return trips[has_inference]

## Explore individual user's data

In [None]:
# process_trips modifies its argument, so work on a copy of the stored dataframe
user_trips = expanded_trip_df_map[target_user].copy()
process_trips(user_trips)
print("Not shown: any trips that the user has not labeled")

uncertain_trips = get_uncertain(user_trips)
n_certain_correct = len(user_trips) - len(uncertain_trips)
print(f"Not shown: {n_certain_correct} correctly inferred trips with mli_confidence=1")

predicted_uncertain = get_predicted(uncertain_trips)
n_no_inference = len(uncertain_trips) - len(predicted_uncertain)
print(f"Not shown: {n_no_inference} trips with no inference")

shown_columns = ["all_user_labels", "most_likely_inference", "mli_confidence", "correct", "mismatches"]
display.display(predicted_uncertain[shown_columns])

What are the probabilities represented in the data?

In [None]:
# Distinct confidence values for this user, in order of first appearance
unique_p = user_trips['mli_confidence'].unique()
print(f"{len(unique_p)} unique probabilities:")
print("p-value: number of trips")
for p in unique_p:
    trips_at_p = user_trips[user_trips['mli_confidence'] == p]
    print(f"{p:.3f}: {len(trips_at_p)}")

Comparing stated confidence to actual chance of being correct:

In [None]:
print("stated p: fraction correct")
for p in unique_p:
    # Narrow to the trips at this confidence once, then count how many of them are marked correct
    p_matches = user_trips[user_trips['mli_confidence'] == p]
    n = p_matches.shape[0]
    n_correct = p_matches[p_matches['correct']].shape[0]
    print(f"{p:.3f}: {(n_correct/n):.3f}")

Presumably the correspondence is this close because the model was trained on this same data.

How about if we break down the label tuples?

In [None]:
print("p: number of trips with 0, 1, 2, 3 label mismatches between actual and predicted")
for p in unique_p:
    p_trips = user_trips[user_trips['mli_confidence'] == p]
    mismatch_col = p_trips['mismatches']
    # Insertion order (0, 1, 2, 3) is what makes the printed list line up with the header
    counts = {}
    for diffs in range(4):
        counts[diffs] = p_trips[mismatch_col == diffs].shape[0]
    print(f"{p:.3f}: {list(counts.values())}")

Besides emphasizing that we should prioritize reducing the share of trips with no inference over improving the accuracy of inferences, where do we go from here?
 * We should see what the probability distribution is for unlabeled trips. We won't be able to compare with ground truth, but that way we can see what the probability distribution _actually_ is without it being influenced by training data
 * We should expand to all users -- this user might not have enough unlabeled trips

## Explore aggregate data
Strategies guiding our methodology are:
   * Split things up by fully labeled, fully unlabeled, partially labeled
   * Calculate probability distribution for each user
   * Calculate probability distribution across all users

### More processing functions

In [None]:
def count_non_none(l):
    """Return how many elements of the iterable ``l`` are not None."""
    return sum(1 for e in l if e is not None)
def get_fully_labeled(trips):
    """Select the rows whose "all_user_labels" has a non-None entry for every name in LABEL_NAMES."""
    label_counts = trips["all_user_labels"].apply(count_non_none)
    return trips[label_counts == len(LABEL_NAMES)]
def get_fully_unlabeled(trips):
    """Select the rows whose "all_user_labels" contains no non-None entries."""
    label_counts = trips["all_user_labels"].apply(count_non_none)
    return trips[label_counts == 0]
def get_partially_labeled(trips):
    """Select the rows whose "all_user_labels" has at least one, but not all, non-None entries."""
    label_counts = trips["all_user_labels"].apply(count_non_none)
    some_labels = label_counts > 0
    not_all_labels = label_counts < len(LABEL_NAMES)
    return trips[some_labels & not_all_labels]
def prob_dist(trips):
    """Return ``{confidence: number of trips}`` over the "mli_confidence" column, keys in descending order."""
    confidences = trips['mli_confidence']
    levels = confidences.unique()
    levels.sort()  # ascending, in place
    # Dictionaries are ordered! Walk the levels backwards so ours comes out sorted descending.
    return {p: trips[confidences == p].shape[0] for p in levels[::-1]}
def pretty_print_dict(d):
    """Print ``d`` one entry per line between braces: key to 3 decimals, the count, and its share of all counts.

    Raises ZeroDivisionError if ``d`` is non-empty but its values sum to zero.
    """
    total = sum(d.values())
    print("{")
    for key, count in d.items():
        share = count / total
        print(f"  {key:4.3f}: {count:<4} ({share:.2%})")
    print("}")

### Split into fully labeled, fully unlabeled, partially labeled

In [None]:
all_trips_dict = {}
user_labeled, user_unlabeled, user_partial = {}, {}, {}
for user in all_users:
    # process_trips works in place, so run it on a copy rather than on the stored confirmed-trip dataframe
    these_trips = confirmed_trip_df_map[user].copy()
    process_trips(these_trips)
    all_trips_dict[user] = these_trips
    user_labeled[user] = get_fully_labeled(these_trips)
    user_unlabeled[user] = get_fully_unlabeled(these_trips)
    user_partial[user] = get_partially_labeled(these_trips)

# Yes, we're splitting up into fully labeled, fully unlabeled, partially labeled twice. Can be optimized if necessary.
all_trips = pd.concat(all_trips_dict.values())  # This is also somewhat slow and not strictly necessary -- circumvent if necessary
all_labeled = get_fully_labeled(all_trips)
all_unlabeled = get_fully_unlabeled(all_trips)
all_partial = get_partially_labeled(all_trips)

for frame in (all_trips, all_labeled, all_unlabeled, all_partial):
    print(frame.shape)
# Sanity check: the three categories must account for every trip exactly once
n_split = len(all_labeled) + len(all_unlabeled) + len(all_partial)
assert n_split == len(all_trips)

### Calculate and display probability distributions

In [None]:
# Per-user distributions: one dict per entry of all_users, in that order, for each category
user_labeled_dist, user_unlabeled_dist, user_partial_dist = [], [], []
per_user_targets = (
    (user_labeled, user_labeled_dist),
    (user_unlabeled, user_unlabeled_dist),
    (user_partial, user_partial_dist),
)
for user in all_users:
    for frames_by_user, dists in per_user_targets:
        dists.append(prob_dist(frames_by_user[user]))

# Distributions over the trips of all users pooled together
all_labeled_dist = prob_dist(all_labeled)
all_unlabeled_dist = prob_dist(all_unlabeled)
all_partial_dist = prob_dist(all_partial)

In [None]:
print(f"Not shown: {all_partial.shape[0]} partially labeled trips")

# Each section is preceded by a blank line and shares the same column legend
sections = (
    ("Probability distribution of all fully labeled:", all_labeled_dist),
    ("Probability distribution of all fully unlabeled:", all_unlabeled_dist),
)
for section_title, section_dist in sections:
    print()
    print(section_title)
    print("Probability: number of trips (percentage of trips)")
    pretty_print_dict(section_dist)

Okay, that's useful data. Let's graph it.

### Probability distribution graph

In [None]:
def bar(labels, a, b, title, figsize):
    """Draw a grouped bar chart comparing two distributions and show it.

    labels  -- sequence of probabilities to place on the x axis (tick text uses 2 decimals)
    a, b    -- dicts mapping probability to trip count; a missing label counts as 0.
               Bars from ``a`` are labelled "Fully labeled", bars from ``b`` "Fully unlabeled".
    title   -- string appended to the fixed chart title
    figsize -- passed through to plt.subplots
    """
    positions = np.arange(len(labels))
    heights_a = [a.get(k, 0) for k in labels]
    heights_b = [b.get(k, 0) for k in labels]

    bar_width = 0.4
    fig, ax = plt.subplots(figsize=figsize)
    # Shift each series half a bar width off the tick so the pair straddles it
    ax.bar(positions - bar_width/2, heights_a, bar_width, label="Fully labeled")
    ax.bar(positions + bar_width/2, heights_b, bar_width, label="Fully unlabeled")

    ax.set_ylabel("Number of trips")
    ax.set_title("Probability distribution of most-likely inferences" + title)
    ax.set_xticks(positions)
    ax.set_xticklabels([f"{n:.2f}" for n in labels])
    ax.legend()

    plt.show()

print(f"Not shown: {all_partial.shape[0]} partially labeled trips")
# Every probability that occurs in either distribution
labels = set(all_labeled_dist) | set(all_unlabeled_dist)
endpoints = {0.0, 1.0}
wide_figure = (20, 10)
bar(sorted(labels), all_labeled_dist, all_unlabeled_dist, ", full range", wide_figure)
bar(sorted(labels - endpoints), all_labeled_dist, all_unlabeled_dist, " excluding 0 and 1", wide_figure)
bar([0.0, 1.0], all_labeled_dist, all_unlabeled_dist, " only 0 and 1", (4,5))

From the graphs, we see that a significant fraction of the unlabeled trips have no inference at all, more so than for labeled trips. There are also more labeled trips with 100% certainty than unlabeled. However, aside from these endpoints, the trend is reversed — unlabeled trips tend to cluster towards the middle and upper end of the probability spectrum, whereas labeled trips are more evenly distributed.