In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import os
from typing import Dict

In [2]:
# Load the EPIC-KITCHENS-100 validation annotations together with the noun and
# verb class tables. Both CSVs use their first column as the index (index_col=0),
# so `nouns.index` / `verbs.index` are what the rest of the notebook uses as class ids.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
labels = pd.read_pickle("../data/epic-kitchens-100-annotations/EPIC_100_validation.pkl")
nouns = pd.read_csv("../data/epic-kitchens-100-annotations/EPIC_100_noun_classes.csv", index_col=0)
verbs = pd.read_csv("../data/epic-kitchens-100-annotations/EPIC_100_verb_classes.csv", index_col=0)

In [3]:
# Inspect which entries one of the prediction pickles provides.
pd.read_pickle("../audio-gru-cm.pkl").keys()

dict_keys(['verb_output', 'noun_output', 'narration_id'])

In [4]:
def get_top_n_labels(n: int):
    """Return the names of the ``n`` most frequent verb and noun classes.

    Frequencies are counted over the ``verb_class`` / ``noun_class`` columns of
    the global ``labels`` frame; names are read from the ``"key"`` column of the
    global ``verbs`` / ``nouns`` tables.

    Args:
        n: How many classes to keep per task.

    Returns:
        A ``(top_verbs, top_nouns)`` tuple of Series. Each Series is indexed by
        class id, ordered from most to least frequent, and holds the class name.
    """

    def _names_of_most_frequent(class_column, class_table):
        # value_counts() sorts by descending frequency, so head(n) keeps the top n ids.
        ranked_ids = class_column.value_counts().head(n).index
        return class_table.loc[ranked_ids, "key"]

    return (
        _names_of_most_frequent(labels.verb_class, verbs),
        _names_of_most_frequent(labels.noun_class, nouns),
    )
# Preview the ten most frequent verb and noun classes.
get_top_n_labels(10)


(verb_class
 0        take
 1         put
 2        wash
 3        open
 5      insert
 4       close
 6     turn-on
 7         cut
 10        mix
 9        pour
 Name: key, dtype: object,
 noun_class
 2        plate
 0          tap
 3     cupboard
 1        spoon
 4        knife
 5          pan
 7         bowl
 13         cup
 8       drawer
 6          lid
 Name: key, dtype: object)

In [5]:
def get_or_compute_cm(preds):
    """Return the full verb and noun confusion matrices for one prediction file.

    If ``preds`` already carries both ``"verb_cm"`` and ``"noun_cm"`` those are
    returned as they are. Otherwise the matrices are computed from the argmax of
    ``preds["verb_output"]`` / ``preds["noun_output"]`` against the global
    ``labels`` frame, with one row/column per entry of ``verbs.index`` /
    ``nouns.index``.

    Args:
        preds: Mapping loaded from a prediction pickle.

    Returns:
        A ``(verb_cm, noun_cm)`` tuple.

    Raises:
        AssertionError: If a matrix is not square with one row per known class.
    """
    if "verb_cm" not in preds or "noun_cm" not in preds:
        # NOTE(review): predictions are compared with `labels` positionally —
        # presumably the rows follow the same order as `labels`; confirm against
        # the "narration_id" entry of the prediction file.
        predicted_verbs = np.argmax(preds["verb_output"], axis=1)
        predicted_nouns = np.argmax(preds["noun_output"], axis=1)

        verb_cm = confusion_matrix(labels.verb_class, predicted_verbs, labels=verbs.index)
        noun_cm = confusion_matrix(labels.noun_class, predicted_nouns, labels=nouns.index)
    else:
        verb_cm, noun_cm = preds["verb_cm"], preds["noun_cm"]

    # Both matrices must be square and cover every known class.
    for matrix, class_table in ((verb_cm, verbs), (noun_cm, nouns)):
        n_rows, n_cols = matrix.shape[0], matrix.shape[1]
        assert n_rows == n_cols == len(class_table)

    return verb_cm, noun_cm

In [6]:
def get_top_n_classes_confusion_matrix(file_path: str, n: int, model: str):
    """Plot and save confusion matrices for the ``n`` most frequent verb and noun classes.

    The full confusion matrices are restricted to the top-``n`` classes and then
    row-normalised, so every row sums to 1 over the retained classes only
    (predictions that fall outside the top ``n`` are not part of the denominator).

    Args:
        file_path: Path of the prediction pickle to load.
        n: Number of most frequent classes to keep per task.
        model: Model identifier; printed and used as the output sub-directory.

    Side effects:
        Prints the model name, shows two figures and writes
        ``{verb,noun}_confusion_matrix.{svg,png,pdf}`` to
        ``../res/modality_comparison/<model>`` (created if missing).
    """
    print("Model: {}".format(model))
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    preds = pd.read_pickle(file_path)

    verb_cm, noun_cm = get_or_compute_cm(preds)
    topn_verbs, topn_nouns = get_top_n_labels(n)

    # Keep only the rows/columns of the top-n classes (selected by class id).
    verb_cm = pd.DataFrame(verb_cm).loc[topn_verbs.index, topn_verbs.index]
    noun_cm = pd.DataFrame(noun_cm).loc[topn_nouns.index, topn_nouns.index]

    # Row-normalise. A plain `cm / cm.sum(axis=1)` aligns the row sums with the
    # *columns*, dividing cell (i, j) by the total of row j; `.div(..., axis=0)`
    # divides every row by its own total.
    verb_cm = verb_cm.div(verb_cm.sum(axis=1), axis=0)
    noun_cm = noun_cm.div(noun_cm.sum(axis=1), axis=0)

    output_path = f"../res/modality_comparison/{model}"
    os.makedirs(output_path, exist_ok=True)

    for task, cm, class_names in (("verb", verb_cm, topn_verbs), ("noun", noun_cm, topn_nouns)):
        fig = px.imshow(
            cm,
            labels=dict(x="Predicted", y="True", color="Proportion"),
            x=class_names,
            y=class_names,
            width=1000,
            height=1000,
            color_continuous_scale="viridis",
            range_color=[0, 1],
        )
        fig.show()

        # Save each confusion matrix as SVG, PNG (2x scale) and PDF.
        stem = f"{output_path}/{task}_confusion_matrix"
        fig.write_image(f"{stem}.svg", format="svg")
        fig.write_image(f"{stem}.png", scale=2.0, format="png")
        fig.write_image(f"{stem}.pdf", format="pdf")


# Top-20 confusion matrices for each of the three prediction files.
get_top_n_classes_confusion_matrix("../visu-cm.pkl", n=20, model="sf")
get_top_n_classes_confusion_matrix("../audio-gru-cm.pkl", n=20, model="asf-gru")
get_top_n_classes_confusion_matrix("../audio-cm.pkl", n=20, model="asf")

Model: sf


Model: asf-gru


Model: asf


In [11]:
from typing import Any


def _save_figure(fig, stem: str) -> None:
    """Write ``fig`` to ``<stem>.svg``, ``<stem>.png`` (2x scale) and ``<stem>.pdf``."""
    fig.write_image(f"{stem}.svg")
    fig.write_image(f"{stem}.png", scale=2.0)
    fig.write_image(f"{stem}.pdf")


def _top_n_row_normalised(cm, class_ids) -> pd.DataFrame:
    """Restrict a confusion matrix to ``class_ids`` and scale each row to sum to 1."""
    top_cm = pd.DataFrame(cm).loc[class_ids, class_ids]
    # A plain `top_cm / top_cm.sum(axis=1)` aligns the row sums with the *columns*
    # (cell (i, j) divided by the total of row j); axis=0 divides row-wise.
    return top_cm.div(top_cm.sum(axis=1), axis=0)


def _accuracy_bar(frame: pd.DataFrame, column: str, title: str, width: int):
    """Grouped bar chart of per-class "Accuracy@1", one bar colour per model."""
    fig = px.bar(
        frame,
        x=column,
        y="Accuracy@1",
        color="Model",  # This differentiates the models
        barmode="group",
        range_y=[0, 1],
        title=title,
        width=width,
        height=700,
    )
    fig.update_layout(yaxis=dict(tickformat=",.2%"))
    return fig


def _delta_bar(frame: pd.DataFrame, column: str, title: str, width: int):
    """Bar chart of per-class "Delta", coloured by sign and labelled with signed percentages."""
    fig = px.bar(
        frame,
        x=column,
        y="Delta",
        barmode="group",
        title=title,
        width=width,
        height=700,
        color="pos",
        color_continuous_scale="RdYlGn",
        # The labels come from the frame being plotted, so verbs and nouns cannot be mixed up.
        text=frame["Delta"].apply(lambda x: ("+" if x > 0 else "") + f"{100*x:1.2f}%"),
    )
    fig.update_layout(yaxis=dict(tickformat=",.2%"))
    fig.update_traces(textposition="outside")
    return fig


def _heatmap(matrix, class_names, title=None, range_color=None):
    """1000x1000 viridis heatmap with class names on both axes."""
    return px.imshow(
        matrix,
        labels=dict(x="Predicted", y="True", color="Proportion"),
        x=class_names,
        y=class_names,
        title=title,
        width=1000,
        height=1000,
        color_continuous_scale="viridis",
        range_color=range_color,
    )


def create_comparison_plots(files: Dict[str, Dict[str, Any]], n: int, width: int = 1500):
    """Compare every vision model with every audio model on the top-``n`` classes.

    For each (vision, audio) pair this shows and saves per-class accuracy bar
    charts, and then, for a "sum" and a "mean" logit ensemble, ensemble accuracy
    charts, ensemble-vs-best-single-model delta charts, ensemble confusion
    matrices and confusion-matrix difference heatmaps.

    Per-class "Accuracy@1" is the diagonal of the confusion matrix after it has
    been restricted to the top-``n`` classes and row-normalised, so predictions
    outside the top ``n`` are not part of the denominator.

    Args:
        files: Mapping with an ``"audio"`` and a ``"vision"`` entry. Each maps a
            model id to a dict with ``"file_path"`` (prediction pickle) and
            ``"name"`` (display name). The mapping is not modified.
        n: Number of most frequent classes to keep per task.
        width: Width in pixels of the bar charts.

    Side effects:
        Prints progress, shows every figure and writes SVG/PNG/PDF files to
        ``../res/modality_comparison/<vision>_vs_<audio>`` (created if missing).
    """
    assert len(files) > 1, "At least two models are required to create a comparison plot"
    topn_verbs, topn_nouns = get_top_n_labels(n)

    tasks = ("verb", "noun")
    top_names = {"verb": topn_verbs, "noun": topn_nouns}
    true_classes = {"verb": labels.verb_class, "noun": labels.noun_class}
    all_class_ids = {"verb": verbs.index, "noun": nouns.index}

    # Load every model exactly once into fresh records, so the caller's `files`
    # mapping is left untouched.
    data = {}
    for modality in ["audio", "vision"]:
        data[modality] = {}
        for model, spec in files[modality].items():
            # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
            preds = pd.read_pickle(spec["file_path"])
            verb_cm, noun_cm = get_or_compute_cm(preds)
            data[modality][model] = {
                "name": spec["name"],
                "logits": {"verb": preds["verb_output"], "noun": preds["noun_output"]},
                "cm": {
                    "verb": _top_n_row_normalised(verb_cm, topn_verbs.index),
                    "noun": _top_n_row_normalised(noun_cm, topn_nouns.index),
                },
            }

    # For each pair of models, create a comparison plot
    for vision_model, vision in data["vision"].items():
        for audio_model, audio in data["audio"].items():
            vision_model_name = vision["name"]
            audio_model_name = audio["name"]

            print(f"Comparing {vision_model} and {audio_model}")

            output_path = f"../res/modality_comparison/{vision_model}_vs_{audio_model}"
            os.makedirs(output_path, exist_ok=True)

            # Per-class accuracy of each single-modality model.
            top1_vision = {task: np.diag(vision["cm"][task]) for task in tasks}
            top1_audio = {task: np.diag(audio["cm"][task]) for task in tasks}

            for task in tasks:
                column = task.capitalize()
                frame = pd.DataFrame(
                    {
                        column: top_names[task],
                        vision_model_name: top1_vision[task],
                        audio_model_name: top1_audio[task],
                    }
                )
                frame = pd.melt(frame, id_vars=[column], var_name="Model", value_name="Accuracy@1")

                fig = _accuracy_bar(frame, column, f"Accuracy@1 for the top {n} {task} classes", width)
                fig.show()
                _save_figure(fig, f"{output_path}/{task}_top1_accuracy_n={n}")

            # Print the verbs for which audio accuracy is higher
            print("Verbs for which audio accuracy is higher")
            print(topn_verbs[top1_audio["verb"] > top1_vision["verb"]])

            # Print the nouns for which audio accuracy is higher
            print("Nouns for which audio accuracy is higher")
            print(topn_nouns[top1_audio["noun"] > top1_vision["noun"]])

            # Ensemble performance. argmax is unaffected by dividing the summed
            # logits by 2, so "sum" and "mean" give the same predictions; both
            # are kept so the same set of output files is produced.
            for ensemble_type in ["sum", "mean"]:
                suffix = f"n={n}_{ensemble_type}_ensemble"

                ensemble_cm = {}
                for task in tasks:
                    ensemble_logits = vision["logits"][task] + audio["logits"][task]
                    if ensemble_type == "mean":
                        ensemble_logits = ensemble_logits / 2
                    ensemble_preds = np.argmax(ensemble_logits, axis=1)
                    full_cm = confusion_matrix(true_classes[task], ensemble_preds, labels=all_class_ids[task])
                    ensemble_cm[task] = _top_n_row_normalised(full_cm, top_names[task].index)

                for task in tasks:
                    column = task.capitalize()
                    top1_ensemble = np.diag(ensemble_cm[task])

                    # Accuracy of both single models next to the ensemble.
                    frame = pd.DataFrame(
                        {
                            column: top_names[task],
                            vision_model_name: top1_vision[task],
                            audio_model_name: top1_audio[task],
                            "Ensemble": top1_ensemble,
                        }
                    )
                    frame = pd.melt(frame, id_vars=[column], var_name="Model", value_name="Accuracy@1")

                    fig = _accuracy_bar(
                        frame,
                        column,
                        f"Accuracy@1 for the top {n} {task} classes with {ensemble_type} ensemble",
                        width,
                    )
                    fig.show()
                    _save_figure(fig, f"{output_path}/{task}_top1_accuracy_{suffix}")

                    # Gain of the ensemble over the better of the two single models.
                    delta_top1 = top1_ensemble - np.maximum(top1_audio[task], top1_vision[task])
                    delta_frame = pd.DataFrame(
                        {
                            column: top_names[task],
                            "Delta": delta_top1,
                            "pos": (delta_top1 > 0).astype(int),
                        }
                    )

                    fig = _delta_bar(
                        delta_frame,
                        column,
                        f"Delta in performance for the top {n} {task} classes with {ensemble_type} ensemble",
                        width,
                    )
                    fig.show()
                    _save_figure(fig, f"{output_path}/delta_{task}_top1_accuracy_{suffix}")

                # Heatmaps as (file prefix, title tail, matrix per task, colour range).
                # The audio-vs-vision difference does not depend on the ensemble
                # type; it is still written once per type to keep the file names.
                heatmaps = [
                    ("", None, dict(ensemble_cm), [0, 1]),
                    (
                        "delta_",
                        f"between {audio_model_name} and {vision_model_name}",
                        {task: audio["cm"][task] - vision["cm"][task] for task in tasks},
                        None,
                    ),
                    (
                        "ens_v_delta_",
                        f"between <b>multi-modal ensemble</b> and {vision_model_name}",
                        {task: ensemble_cm[task] - vision["cm"][task] for task in tasks},
                        None,
                    ),
                    (
                        "ens_a_delta_",
                        f"between <b>multi-modal ensemble</b> and {audio_model_name}",
                        {task: ensemble_cm[task] - audio["cm"][task] for task in tasks},
                        None,
                    ),
                ]
                for prefix, title_tail, matrices, range_color in heatmaps:
                    for task in tasks:
                        title = None
                        if title_tail is not None:
                            title = f"Difference in confusion matrix for the top {n} {task} classes<br>{title_tail}"

                        fig = _heatmap(matrices[task], top_names[task], title=title, range_color=range_color)
                        fig.show()
                        _save_figure(fig, f"{output_path}/{prefix}{task}_confusion_matrix_{suffix}")


# Prediction files to compare, grouped by modality. Each model id maps to the
# path of its prediction pickle ("file_path") and the display name used in plot
# legends and titles ("name").
a = {
    "audio": {
        "asf": {
            "file_path": "../audio-cm.pkl",
            "name": "AudioSlowFast",
        },
        "asf-gru": {
            "file_path": "../audio-gru-cm.pkl",
            "name": "AudioSlowFastGRU",
        },
    },
    "vision": {
        "sf": {
            "file_path": "../visu-cm.pkl",
            "name": "SlowFast",
        },
    },
}
# Compare every vision model with every audio model on the 20 most frequent classes.
create_comparison_plots(a, n=20)

Comparing sf and asf


Verbs for which audio accuracy is higher
verb_class
8    turn-off
Name: key, dtype: object
Nouns for which audio accuracy is higher
Series([], Name: key, dtype: object)


Comparing sf and asf-gru


Verbs for which audio accuracy is higher
verb_class
8    turn-off
Name: key, dtype: object
Nouns for which audio accuracy is higher
Series([], Name: key, dtype: object)


In [54]:
# Display the validation annotations (indexed by narration_id).
labels

Unnamed: 0_level_0,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes
narration_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
P01_11_0,P01,P01_11,00:00:00.560,00:00:00.00,00:00:01.89,1,113,take plate,take,0,plate,2,[plate],[2]
P01_11_1,P01,P01_11,00:00:01.700,00:00:01.56,00:00:02.45,93,147,put down plate,put-down,1,plate,2,[plate],[2]
P01_11_10,P01,P01_11,00:00:48.500,00:00:49.15,00:00:50.95,2949,3057,take paper,take,0,paper,49,[paper],[49]
P01_11_100,P01,P01_11,00:05:27.840,00:05:27.28,00:05:31.97,19636,19918,wash cloth,wash,2,cloth,17,[cloth],[17]
P01_11_101,P01,P01_11,00:05:26.840,00:05:27.37,00:05:29.86,19642,19791,take cloth,take,0,cloth,17,[cloth],[17]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P32_10_5,P32,P32_10,00:01:13.920,00:01:14.36,00:01:17.04,4461,4622,spray degreaser,spray,44,degreaser,22,[degreaser],[22]
P32_10_6,P32,P32_10,00:01:17.729,00:01:18.69,00:01:33.75,4721,5625,wash cooker,wash,2,cooker,46,[cooker],[46]
P32_10_7,P32,P32_10,00:01:40.790,00:01:39.52,00:01:59.64,5971,7178,rinse cooker,rinse,2,cooker,46,[cooker],[46]
P32_10_8,P32,P32_10,00:02:01.070,00:02:00.48,00:02:09.11,7228,7746,rinse cloth,rinse,2,cloth,17,[cloth],[17]


In [56]:
# TODO (from the original note): make a plot with the delta from the ensemble
# to the max of the audio or video cues.
# Count how many validation segments carry the raw verb "turn-off".
# NOTE(review): this counts the `verb` string column, not `verb_class`, so it may
# differ from the number of samples in the "turn-off" class — confirm.
pd.DataFrame(labels.verb.value_counts()).sort_values(by="count", ascending=False).loc["turn-off"]


count    201
Name: turn-off, dtype: int64