In [1]:
import os
from typing import Any, Dict, Optional

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import confusion_matrix

In [2]:
# Sequential color scale used for the heatmaps whose values are plotted on a [0, 1] range.
POSITIVE_COLOR_SCALE = "Purples"

In [3]:
# Class vocabularies: the first CSV column becomes the index (used as the class id);
# the "key" column read later holds the class name.
verbs = pd.read_csv(
    "../data/epic-kitchens-100-annotations/EPIC_100_verb_classes.csv",
    index_col=0,
)
nouns = pd.read_csv(
    "../data/epic-kitchens-100-annotations/EPIC_100_noun_classes.csv",
    index_col=0,
)

# Ground-truth annotations; the `verb_class` and `noun_class` columns are the targets
# every confusion matrix in this notebook is computed against.
labels = pd.read_pickle(
    "../data/epic-kitchens-100-annotations/EPIC_100_validation.pkl"
)

In [4]:
# Peek at the top-level keys stored in one of the prediction pickles.
pd.read_pickle("../audio-gru-cm.pkl").keys()

dict_keys(['verb_output', 'noun_output', 'narration_id'])

In [5]:
def get_top_n_labels(n: Optional[int]):
    """Return the names of the ``n`` most frequent verb and noun classes.

    Frequencies are counted over the ``verb_class`` / ``noun_class`` columns of the
    module-level ``labels`` frame; class names come from the ``key`` column of the
    module-level ``verbs`` / ``nouns`` tables.

    Args:
        n: Number of classes to keep per vocabulary. ``None`` keeps every class,
            in the order of the class tables.

    Returns:
        A ``(verb_names, noun_names)`` pair of Series indexed by class id. When ``n``
        is given, entries are ordered from most to least frequent.

    Raises:
        ValueError: If ``n`` is negative (``head(n)`` would otherwise silently return
            "all but the last ``|n|``" classes).
    """
    if n is None:
        return verbs.loc[:, "key"], nouns.loc[:, "key"]

    if n < 0:
        raise ValueError(f"n must be non-negative or None, got {n}")

    # value_counts() sorts by descending frequency, so head(n) is the top n.
    top_verb_ids = labels.verb_class.value_counts().head(n).index
    top_noun_ids = labels.noun_class.value_counts().head(n).index

    return verbs.loc[top_verb_ids, "key"], nouns.loc[top_noun_ids, "key"]

In [6]:
def get_or_compute_cm(preds):
    """Return the full ``(verb_cm, noun_cm)`` confusion matrices for one model.

    If ``preds`` already holds both ``"verb_cm"`` and ``"noun_cm"`` they are returned
    as-is. Otherwise the predicted class is the argmax over axis 1 of
    ``preds["verb_output"]`` / ``preds["noun_output"]`` and the matrices are computed
    against ``labels.verb_class`` / ``labels.noun_class`` over every class id in
    ``verbs.index`` / ``nouns.index``.

    NOTE(review): predictions are matched to ``labels`` purely by position; the
    ``narration_id`` entry present in the pickles is not used for alignment. Confirm
    that the pickles are stored in the same order as ``labels``.

    Raises:
        AssertionError: If a matrix is not square with one row per known class.
    """
    has_cached_matrices = "verb_cm" in preds and "noun_cm" in preds

    if not has_cached_matrices:
        verb_scores = preds["verb_output"]
        noun_scores = preds["noun_output"]

        predicted_verbs = np.argmax(verb_scores, axis=1)
        predicted_nouns = np.argmax(noun_scores, axis=1)

        matrices = (
            confusion_matrix(labels.verb_class, predicted_verbs, labels=verbs.index),
            confusion_matrix(labels.noun_class, predicted_nouns, labels=nouns.index),
        )
    else:
        matrices = (preds["verb_cm"], preds["noun_cm"])

    # Each matrix must be square and cover the whole vocabulary it belongs to.
    for matrix, vocabulary in zip(matrices, (verbs, nouns)):
        assert matrix.shape[0] == matrix.shape[1] == len(vocabulary)

    verb_cm, noun_cm = matrices
    return verb_cm, noun_cm

In [7]:
# Display names used in figure titles, keyed by the short model identifier that is
# passed as `model` to `get_top_n_classes_confusion_matrix`.
model_names = {
    "sf": "Vision SlowFast",
    "asf": "Audio SlowFast",
    "asf-gru": "Audio SlowFast with GRU"
}

def get_top_n_classes_confusion_matrix(file_path: str, n: int, model: str):
    """Plot and export the top-``n`` verb and noun confusion matrices of one model.

    The full confusion matrices are obtained with ``get_or_compute_cm``, restricted to
    the ``n`` most frequent classes (``get_top_n_labels``) and row-normalised so every
    row (true class) sums to 1. Each heatmap is shown and written as SVG, PNG and PDF
    under ``../res/modality_comparison/cm/<model>/<format>/``.

    Args:
        file_path: Path of the prediction pickle to load.
        n: Number of most frequent classes to keep per vocabulary.
        model: Short model identifier; names the output folder and selects the
            display name from ``model_names`` (the identifier itself is used in the
            title when it has no entry there).

    Returns:
        None. The function prints the model id, shows two figures and writes six files.
    """
    print("Model: {}".format(model))
    preds = pd.read_pickle(file_path)

    verb_cm, noun_cm = get_or_compute_cm(preds)
    topn_verbs, topn_nouns = get_top_n_labels(n)

    # Keep only the rows/columns of the top-n classes (selected by class id).
    verb_cm = pd.DataFrame(verb_cm).loc[topn_verbs.index, topn_verbs.index]
    noun_cm = pd.DataFrame(noun_cm).loc[topn_nouns.index, topn_nouns.index]

    # Row-normalise. `axis=0` is required: a bare `cm / cm.sum(axis=1)` aligns the
    # Series of row sums with the *columns*, dividing column j by the total of row j,
    # which leaves the diagonal intact but corrupts every off-diagonal cell.
    # NOTE(review): the denominator only counts predictions that fall inside the
    # top-n classes, not the full support of the true class - confirm this is intended.
    verb_cm = verb_cm.div(verb_cm.sum(axis=1), axis=0)
    noun_cm = noun_cm.div(noun_cm.sum(axis=1), axis=0)

    output_path = f"../res/modality_comparison/cm/{model}"
    for subfolder in ["pdf", "png", "svg"]:
        os.makedirs(os.path.join(output_path, subfolder), exist_ok=True)

    # Fall back to the raw identifier so an unlisted model does not fail with KeyError.
    display_name = model_names.get(model, model)

    # (matrix, axis tick labels, title, output file stem) for each heatmap, in plot order.
    # NOTE: the verb file stem does not contain n, so runs with different n overwrite
    # the same verb files; the stems are kept unchanged for existing consumers.
    heatmaps = (
        (
            verb_cm,
            topn_verbs,
            f"Top N={n} verbs confusion matrix for {display_name}",
            "verb_confusion_matrix",
        ),
        (
            noun_cm,
            topn_nouns,
            f"Top N={n} noun confusion matrix for {display_name}",
            f"top_{n}_noun_confusion_matrix",
        ),
    )

    for matrix, class_names, title, stem in heatmaps:
        fig = px.imshow(
            matrix,
            labels=dict(x="Predicted", y="True", color="Proportion"),
            x=class_names,
            y=class_names,
            width=1000,
            height=1000,
            color_continuous_scale=POSITIVE_COLOR_SCALE,
            title=title,
            range_color=[0, 1],
        )
        fig.show()

        # The image format is inferred from the file extension.
        fig.write_image(f"{output_path}/svg/{stem}.svg")
        fig.write_image(f"{output_path}/png/{stem}.png", scale=2.0)
        fig.write_image(f"{output_path}/pdf/{stem}.pdf")


# Number of most frequent classes kept in every confusion matrix below.
N = 50

# One (prediction pickle, short model id) pair per model, in plotting order.
for pickle_path, model_key in (
    ("../visu-cm.pkl", "sf"),
    ("../audio-gru-cm.pkl", "asf-gru"),
    ("../audio-cm.pkl", "asf"),
):
    get_top_n_classes_confusion_matrix(pickle_path, n=N, model=model_key)

Model: sf


Model: asf-gru


Model: asf


In [9]:
# Color-bar limits applied to the confusion-matrix difference heatmaps.
RANGE_DIFF = [-0.5, 0.5]
# Three-stop scale for signed differences: red at the low end, white in the middle,
# green at the high end of the color range.
NEGATIVE_COLOR_SCALE = ["red", "white", "green"]
# Number of most frequent classes per vocabulary, passed as `n` to `create_comparison_plots`.
N = 50

def _row_normalized_top_n(cm, class_ids):
    """Restrict a confusion matrix to ``class_ids`` and make every row sum to 1.

    Rows whose restricted sum is zero become all-zero instead of NaN so that the
    downstream differences and ``np.maximum`` calls stay well defined.
    """
    top_cm = pd.DataFrame(cm).loc[class_ids, class_ids]
    # `axis=0` matches the row sums to the row labels. A bare `top_cm / row_sums`
    # aligns the Series with the *columns* and would divide column j by the total
    # of row j, corrupting every off-diagonal cell.
    return top_cm.div(top_cm.sum(axis=1), axis=0).fillna(0)


def _save_figure(fig, output_path, stem):
    """Write ``fig`` to ``<output_path>/<ext>/<stem>.<ext>`` as SVG, PNG (2x scale) and PDF."""
    fig.write_image(f"{output_path}/svg/{stem}.svg")
    fig.write_image(f"{output_path}/png/{stem}.png", scale=2.0)
    fig.write_image(f"{output_path}/pdf/{stem}.pdf")


def _show_accuracy_bars(class_names, label, accuracies, title, width, output_path, stem):
    """Show and save a grouped bar chart of per-class Accuracy@1, one bar per model."""
    frame = pd.DataFrame({label: class_names, **accuracies})
    frame = pd.melt(frame, id_vars=[label], var_name="Model", value_name="Accuracy@1")

    fig = px.bar(
        frame,
        x=label,
        y="Accuracy@1",
        color="Model",  # This differentiates the models
        barmode="group",
        range_y=[0, 1],
        title=title,
        width=width,
        height=700,
    )
    fig.update_layout(yaxis=dict(tickformat=",.2%"))
    fig.show()
    _save_figure(fig, output_path, stem)


def _show_delta_bars(class_names, label, delta, title, width, output_path, stem):
    """Show and save a bar chart of signed per-class accuracy deltas, colored by sign."""
    frame = pd.DataFrame(
        {
            label: class_names,
            "Delta": delta,
            "pos": (delta > 0).astype(int),
        }
    )

    fig = px.bar(
        frame,
        x=label,
        y="Delta",
        barmode="group",
        title=title,
        width=width,
        height=700,
        color="pos",
        color_continuous_scale="RdYlGn",
        text=frame["Delta"].apply(lambda x: ("+" if x > 0 else "") + f"{100*x:1.2f}%"),
    )
    fig.update_layout(yaxis=dict(tickformat=",.2%"))
    fig.update_traces(textposition="outside")
    fig.show()
    _save_figure(fig, output_path, stem)


def _show_heatmap(matrix, class_names, title, color_scale, color_range, output_path, stem):
    """Show and save a 1000x1000 heatmap of ``matrix`` with class names on both axes."""
    fig = px.imshow(
        matrix,
        labels=dict(x="Predicted", y="True", color="Proportion"),
        x=class_names,
        y=class_names,
        title=title,
        width=1000,
        height=1000,
        color_continuous_scale=color_scale,
        range_color=color_range,
    )
    fig.show()
    _save_figure(fig, output_path, stem)


def create_comparison_plots(files: Dict[str, Dict[str, Any]], n: int, width: int = 1500):
    """Compare every vision model with every audio model on the top-``n`` classes.

    For each (vision, audio) pair the function shows and saves, under
    ``../res/modality_comparison/diffs/<vision>_vs_<audio>/<format>/``:

    * per-class Accuracy@1 bar charts (verbs, nouns) for the two models;
    * the same charts with a late-fusion ensemble (mean of the two models' output
      scores), plus the ensemble's delta against the better of the two models;
    * the ensemble confusion matrices and the pairwise confusion-matrix differences
      (audio - vision, ensemble - vision, ensemble - audio).

    It also prints the classes for which the audio model beats the vision model.

    Args:
        files: ``{"audio": {...}, "vision": {...}}`` where each inner dict maps a short
            model id to ``{"file_path": <prediction pickle>, "name": <display name>}``.
            The mapping is not modified.
        n: Number of most frequent classes to keep per vocabulary.
        width: Width in pixels of the bar charts.

    Raises:
        AssertionError: If ``files`` has fewer than two top-level entries.
        KeyError: If the ``"audio"`` or ``"vision"`` entry is missing.
        ValueError: If the two models of a pair have differently shaped outputs.

    NOTE(review): the ensemble adds the two models' outputs element-wise, so both
    pickles must store their samples in the same order - confirm against the exporter.
    """
    # The check counts top-level entries (modalities), as the original code did.
    assert len(files) > 1, "At least two models are required to create a comparison plot"
    topn_verbs, topn_nouns = get_top_n_labels(n)

    # Build a fresh structure instead of writing into the caller's nested dicts,
    # and load every pickle exactly once.
    data: Dict[str, Dict[str, Dict[str, Any]]] = {}
    for modality in ["audio", "vision"]:
        data[modality] = {}
        for model, info in files[modality].items():
            preds = pd.read_pickle(info["file_path"])

            verb_cm, noun_cm = get_or_compute_cm(preds)
            # Keep only the top-n classes and row-normalise.
            # NOTE(review): the denominator only counts predictions that fall inside
            # the top-n classes, not the full support of the true class.
            verb_cm = _row_normalized_top_n(verb_cm, topn_verbs.index)
            noun_cm = _row_normalized_top_n(noun_cm, topn_nouns.index)

            data[modality][model] = {
                **info,
                "preds_verb": np.asarray(preds["verb_output"]),
                "preds_noun": np.asarray(preds["noun_output"]),
                "verb_cm": verb_cm,
                "noun_cm": noun_cm,
            }

            print(f"{verb_cm.shape=}")
            print(f"{noun_cm.shape=}")

    vision_models = list(data["vision"].keys())
    audio_models = list(data["audio"].keys())

    # For each pair of models, create a comparison plot
    for vision_model in vision_models:
        for audio_model in audio_models:
            vision_entry = data["vision"][vision_model]
            audio_entry = data["audio"][audio_model]

            audio_model_name = audio_entry["name"]
            vision_model_name = vision_entry["name"]

            print(f"Comparing {vision_model} and {audio_model}")
            verb_cm_vision = vision_entry["verb_cm"]
            verb_cm_audio = audio_entry["verb_cm"]

            noun_cm_vision = vision_entry["noun_cm"]
            noun_cm_audio = audio_entry["noun_cm"]

            output_path = f"../res/modality_comparison/diffs/{vision_model}_vs_{audio_model}"

            for subfolder in ["svg", "png", "pdf"]:
                os.makedirs(os.path.join(output_path, subfolder), exist_ok=True)

            # The diagonal of a row-normalised confusion matrix is the per-class accuracy.
            top1_verbs_vision = np.diag(verb_cm_vision)
            top1_verbs_audio = np.diag(verb_cm_audio)
            top1_nouns_vision = np.diag(noun_cm_vision)
            top1_nouns_audio = np.diag(noun_cm_audio)

            _show_accuracy_bars(
                topn_verbs,
                "Verb",
                {vision_model_name: top1_verbs_vision, audio_model_name: top1_verbs_audio},
                f"Accuracy@1 for the top {n} verb classes",
                width,
                output_path,
                f"verb_top1_accuracy_n={n}",
            )
            _show_accuracy_bars(
                topn_nouns,
                "Noun",
                {vision_model_name: top1_nouns_vision, audio_model_name: top1_nouns_audio},
                f"Accuracy@1 for the top {n} noun classes",
                width,
                output_path,
                f"noun_top1_accuracy_n={n}",
            )

            # Print the verbs for which audio accuracy is higher
            print("Verbs for which audio accuracy is higher")
            print(topn_verbs[top1_verbs_audio > top1_verbs_vision])

            # Print the nouns for which audio accuracy is higher
            print("Nouns for which audio accuracy is higher")
            print(topn_nouns[top1_nouns_audio > top1_nouns_vision])

            # Element-wise fusion is only meaningful for identically shaped outputs;
            # fail loudly rather than let numpy broadcast mismatched arrays.
            for head in ("preds_verb", "preds_noun"):
                if vision_entry[head].shape != audio_entry[head].shape:
                    raise ValueError(
                        f"Cannot ensemble {vision_model} and {audio_model}: {head} shapes "
                        f"{vision_entry[head].shape} and {audio_entry[head].shape} differ"
                    )

            # Get ensemble performance
            for ensemble_type in ["mean"]:
                ensemble_logits_verb = vision_entry["preds_verb"] + audio_entry["preds_verb"]
                ensemble_logits_noun = vision_entry["preds_noun"] + audio_entry["preds_noun"]
                if ensemble_type == "mean":
                    # Out-of-place division also works when the stored scores are integers.
                    ensemble_logits_verb = ensemble_logits_verb / 2
                    ensemble_logits_noun = ensemble_logits_noun / 2

                ensemble_preds_verb = np.argmax(ensemble_logits_verb, axis=1)
                ensemble_preds_noun = np.argmax(ensemble_logits_noun, axis=1)

                ensemble_verb_cm = _row_normalized_top_n(
                    confusion_matrix(labels.verb_class, ensemble_preds_verb, labels=verbs.index),
                    topn_verbs.index,
                )
                ensemble_noun_cm = _row_normalized_top_n(
                    confusion_matrix(labels.noun_class, ensemble_preds_noun, labels=nouns.index),
                    topn_nouns.index,
                )

                # --- Verb accuracy: both models plus the ensemble ---
                top1_verbs_ensemble = np.diag(ensemble_verb_cm)
                _show_accuracy_bars(
                    topn_verbs,
                    "Verb",
                    {
                        vision_model_name: top1_verbs_vision,
                        audio_model_name: top1_verbs_audio,
                        "Ensemble": top1_verbs_ensemble,
                    },
                    f"Accuracy@1 for the top {n} verb classes with {ensemble_type} ensemble",
                    width,
                    output_path,
                    f"verb_top1_accuracy_n={n}_{ensemble_type}_ensemble",
                )

                # Gain (or loss) of the ensemble over the better single model, per class.
                delta_top_1_verbs = top1_verbs_ensemble - np.maximum(top1_verbs_audio, top1_verbs_vision)
                _show_delta_bars(
                    topn_verbs,
                    "Verb",
                    delta_top_1_verbs,
                    f"Delta in performance for the top {n} verb classes with {ensemble_type} ensemble",
                    width,
                    output_path,
                    f"delta_verb_top1_accuracy_n={n}_{ensemble_type}_ensemble",
                )

                # --- Noun accuracy: both models plus the ensemble ---
                top1_nouns_ensemble = np.diag(ensemble_noun_cm)
                _show_accuracy_bars(
                    topn_nouns,
                    "Noun",
                    {
                        vision_model_name: top1_nouns_vision,
                        audio_model_name: top1_nouns_audio,
                        "Ensemble": top1_nouns_ensemble,
                    },
                    f"Accuracy@1 for the top {n} noun classes with {ensemble_type} ensemble",
                    width,
                    output_path,
                    f"noun_top1_accuracy_n={n}_{ensemble_type}_ensemble",
                )

                delta_top_1_nouns = top1_nouns_ensemble - np.maximum(top1_nouns_audio, top1_nouns_vision)
                _show_delta_bars(
                    topn_nouns,
                    "Noun",
                    delta_top_1_nouns,
                    f"Delta in performance for the top {n} noun classes with {ensemble_type} ensemble",
                    width,
                    output_path,
                    f"delta_noun_top1_accuracy_n={n}_{ensemble_type}_ensemble",
                )

                # --- Ensemble confusion matrices ---
                _show_heatmap(
                    ensemble_verb_cm,
                    topn_verbs,
                    f"Top {n} verbs confusion matrix for late-fusion ensemble",
                    POSITIVE_COLOR_SCALE,
                    [0, 1],
                    output_path,
                    f"verb_cm_n={n}_ensemble",
                )
                _show_heatmap(
                    ensemble_noun_cm,
                    topn_nouns,
                    f"Top {n} nouns confusion matrix for late-fusion ensemble",
                    POSITIVE_COLOR_SCALE,
                    [0, 1],
                    output_path,
                    f"noun_cm_n={n}_ensemble",
                )

                # --- Audio minus vision ---
                _show_heatmap(
                    verb_cm_audio - verb_cm_vision,
                    topn_verbs,
                    f"Diff in confusion matrix for the top {n} verb classes<br>between {audio_model_name} and {vision_model_name}",
                    NEGATIVE_COLOR_SCALE,
                    RANGE_DIFF,
                    output_path,
                    f"delta_verb_cm_n={n}_ensemble",
                )
                _show_heatmap(
                    noun_cm_audio - noun_cm_vision,
                    topn_nouns,
                    f"Difference in confusion matrix for the top {n} noun classes<br>between {audio_model_name} and {vision_model_name}",
                    NEGATIVE_COLOR_SCALE,
                    RANGE_DIFF,
                    output_path,
                    f"delta_noun_cm_n={n}_ensemble",
                )

                # --- Ensemble minus vision ---
                _show_heatmap(
                    ensemble_verb_cm - verb_cm_vision,
                    topn_verbs,
                    f"Difference in confusion matrix for the top {n} verb classes<br>between <b>late-fusion multimodal ensemble ({vision_model_name} + {audio_model_name}) </b> and {vision_model_name}",
                    NEGATIVE_COLOR_SCALE,
                    RANGE_DIFF,
                    output_path,
                    f"ens_v_delta_verb_cm_n={n}_ensemble",
                )
                _show_heatmap(
                    ensemble_noun_cm - noun_cm_vision,
                    topn_nouns,
                    f"Difference in confusion matrix for the top {n} noun classes<br>between <b>late-fusion multimodal ensemble ({vision_model_name} + {audio_model_name})</b> and {vision_model_name}",
                    NEGATIVE_COLOR_SCALE,
                    RANGE_DIFF,
                    output_path,
                    f"ens_v_delta_noun_cm_n={n}_ensemble",
                )

                # --- Ensemble minus audio ---
                _show_heatmap(
                    ensemble_verb_cm - verb_cm_audio,
                    topn_verbs,
                    f"Difference in confusion matrix for the top {n} verb classes<br>between <b>late-fusion multimodal ensemble ({vision_model_name} + {audio_model_name})</b> and {audio_model_name}",
                    NEGATIVE_COLOR_SCALE,
                    RANGE_DIFF,
                    output_path,
                    f"ens_a_delta_verb_cm_n={n}_ensemble",
                )
                _show_heatmap(
                    ensemble_noun_cm - noun_cm_audio,
                    topn_nouns,
                    f"Difference in confusion matrix for the top {n} noun classes<br>between <b>multi-modal ensemble</b> and {audio_model_name}",
                    NEGATIVE_COLOR_SCALE,
                    RANGE_DIFF,
                    output_path,
                    f"ens_a_delta_noun_cm_n={n}_ensemble",
                )


# Prediction pickles to compare, grouped by modality and keyed by short model id.
# "file_path" is the pickle that gets loaded; "name" is the label used in figure
# legends and titles.
a = {
    "audio": {
        "asf": {
            "file_path": "../audio-cm.pkl",
            "name": "AudioSlowFast",
        },
        "asf-gru": {
            "file_path": "../audio-gru-cm.pkl",
            "name": "AudioSlowFastGRU",
        },
    },
    "vision": {
        "sf": {
            "file_path": "../visu-cm.pkl",
            "name": "SlowFast",
        },
    },
}
# Generate every vision-vs-audio comparison figure for the N most frequent classes.
create_comparison_plots(a, n=N)

verb_cm.shape=(50, 50)
noun_cm.shape=(50, 50)
verb_cm.shape=(50, 50)
noun_cm.shape=(50, 50)
verb_cm.shape=(50, 50)
noun_cm.shape=(50, 50)
verb_cm.shape=(50, 50)
noun_cm.shape=(50, 50)
verb_cm.shape=(50, 50)
noun_cm.shape=(50, 50)
verb_cm.shape=(50, 50)
noun_cm.shape=(50, 50)
Comparing sf and asf


Verbs for which audio accuracy is higher
verb_class
8    turn-off
Name: key, dtype: object
Nouns for which audio accuracy is higher
noun_class
30    potato
Name: key, dtype: object


Comparing sf and asf-gru


Verbs for which audio accuracy is higher
verb_class
8       turn-off
20         empty
26          fill
29         scrub
35           eat
45          cook
67    transition
49         crush
Name: key, dtype: object
Nouns for which audio accuracy is higher
noun_class
30    potato
Name: key, dtype: object
