In [1]:
%load_ext autoreload
%autoreload 2

import json
import os
from pathlib import Path

import numpy as np

In [2]:
def get_service_set(schema_path):
    """Collect the names of all services listed in a schema file.

    Args:
        schema_path: Path to a JSON schema file whose top-level entries each
            carry a "service_name" key.

    Returns:
        A set containing the "service_name" value of every entry.
    """
    with open(schema_path, 'r') as schema_file:
        schema_entries = json.load(schema_file)
    # A set comprehension de-duplicates repeated service names.
    return {entry["service_name"] for entry in schema_entries}


def get_in_domain_services(schema_path_1, schema_path_2):
    """Return the services that appear in both schema files.

    Args:
        schema_path_1: Path to the first JSON schema file.
        schema_path_2: Path to the second JSON schema file.

    Returns:
        The set of service names present in both schemas.
    """
    first_services = get_service_set(schema_path_1)
    second_services = get_service_set(schema_path_2)
    return first_services.intersection(second_services)

# Point these at the original train and test schemas; only the split
# directory differs between the two paths.
in_domain_services = get_in_domain_services(
    *(
        os.path.join(Path().resolve().parent, split_directory, "schema.json")
        for split_directory in ("data/raw/sgd/train", "data/raw/sgd/test")
    )
)

In [3]:
# Expected layout of the decode outputs (one sub-directory per schema variant):
# decode/sgd-x/
#   v1/
#     experiment-11-1/model.2560000/
#       *.json
#   v2/
#     experiment-11-1/model.2560000/
#       *.json
#   v3/
#     experiment-11-1/model.2560000/
#       *.json
#   v4/
#     experiment-11-1/model.2560000/
#       *.json
#   v5/
#     experiment-11-1/model.2560000/
#       *.json
# Adjust the variables below to match your own directory structure.
directory = os.path.join(Path().resolve().parent, "decode/sgd-x")
name = "experiment-11-1/model.2560000"
versions = ["v1", "v2", "v3", "v4", "v5"]
# Per-frame metric read from each frame's "metrics" dict
metric = "joint_goal_accuracy"

all_scores, seen_scores, unseen_scores = [], [], []
for version in versions:
    version_directory = os.path.join(directory, version, name)
    all_version_scores, seen_version_scores, unseen_version_scores = [], [], []
    with open(os.path.join(version_directory, "metrics_and_dialogues.json"), "r") as f:
        data = json.load(f)
    for dialogue_record in data.values():
        for turn in dialogue_record["turns"]:
            # Only user turns are scored.
            if turn["speaker"] == "USER":
                for frame in turn["frames"]:
                    # Drop the last character of the service name before
                    # looking it up among the in-domain services.
                    is_seen = frame["service"][:-1] in in_domain_services
                    frame_score = frame["metrics"][metric]
                    if is_seen:
                        seen_version_scores.append(frame_score)
                    else:
                        unseen_version_scores.append(frame_score)
                    all_version_scores.append(frame_score)
    all_scores.append(all_version_scores)
    seen_scores.append(seen_version_scores)
    unseen_scores.append(unseen_version_scores)

# 2D arrays with one row per version and one column per frame
all_scores = np.asarray(all_scores)
seen_scores = np.asarray(seen_scores)
unseen_scores = np.asarray(unseen_scores)

In [4]:
# Mean of the metric over every version and frame, for each group of services.
all_average = all_scores.mean()
seen_average = seen_scores.mean()
unseen_average = unseen_scores.mean()
print(all_average, seen_average, unseen_average, sep="\n")

0.42356559880080674
0.7069315490328452
0.3290010125230066


In [5]:
# Schema sensitivity: for every frame, take the coefficient of variation
# (sample standard deviation / mean) of the metric across schema versions,
# then average over frames.
ss = []
for scores in [all_scores.T, seen_scores.T, unseen_scores.T]:
    # 2D array with dimensions (T, V): one row per frame, one column per version
    mean = np.mean(scores, axis=1, keepdims=True)
    # ddof=1 gives the sample standard deviation (divides by V - 1), so the
    # denominator follows the actual number of versions instead of assuming 5.
    std = np.std(scores, axis=1, ddof=1, keepdims=True)
    # Frames whose score is 0 in every version give 0 / 0 = NaN. That is
    # expected, so silence only the "invalid value" warning and let
    # np.nanmean skip those frames.
    with np.errstate(invalid="ignore"):
        ss.append(np.nanmean(std / mean))

all_ss, seen_ss, unseen_ss = ss
print(all_ss)
print(seen_ss)
print(unseen_ss)

0.7554753093832196
0.5654699684037223
0.8652182144164333


  ss.append(np.nanmean(std / mean))
