# Table 2

Audio-visual video classification results on VGGSounder. We report multi-label classification metrics (subset accuracy, F1- score, Hit accuracy, modality confusion μ) on background music free subset for audio- a(A), visual - v(V), audio-visual - av(AV), audio-only - a(A¬V) and video-only - v(V¬A) inputs. The embedding models CAV-MAE, DeepAVFusion, and Equi-AV were finetuned on the VGGSound training set. We report metrics for k = 1 here and for other k in Appendix D. The closed sourced multi-modal foundation models Gemini and open-sourced models use a zero-shot evaluation protocol and LLM-assisted protocol respectively.

In [1]:
from vggsounder.benchmark import benchmark

main_table = benchmark(
    models_path="../../supplimentary/models-results",
    display_names = {
        "cav-mae": "CAV-MAE",
        "deepavfusion": "DeepAVFusion",
        "equiav": "Equi-AV",
        "avsiam": "AV-Siam",
        "gemini-1.5-flash": "Gemini 1.5 Flash",
        "gemini-1.5-pro": "Gemini 1.5 Pro",
        "gemini-2.0-flash": "Gemini 2.0 Flash",
        "video-llama-2-av": "VideoLLaMA 2",
        "unified-io-2": "Unified-IO 2",
        "pandagpt": "PandaGPT",
        "ola": "OLA",
    },
    metrics=[
        ("accuracy", ["a", "v", "av"]), 
        ("f1",["a", "v", "av", "a only", "v only"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ],
)

  0%|          | 0/11 [00:00<?, ?it/s]



In [2]:
print(main_table.to_latex(float_format="%.2f".__mod__))

\begin{tabular}{lrrrrrrrrrrrrrr}
\toprule
metric & \multicolumn{3}{r}{accuracy} & \multicolumn{5}{r}{f1} & \multicolumn{3}{r}{hit_rate} & \multicolumn{3}{r}{mu} \\
modality & a & v & av & a & v & av & a only & v only & a & v & av & a & v & av \\
\midrule
CAV-MAE & 13.19 & 19.23 & 24.49 & 34.47 & 34.91 & 42.62 & 13.94 & 19.00 & 62.30 & 53.44 & 64.17 & 3.59 & 6.43 & 0.78 \\
DeepAVFusion & 10.19 & 11.10 & 21.53 & 25.31 & 21.29 & 37.35 & 10.37 & 10.55 & 45.77 & 32.61 & 56.27 & 3.74 & 3.93 & 0.17 \\
Equi-AV & 11.60 & 10.52 & 20.00 & 29.39 & 20.42 & 34.69 & 12.55 & 10.65 & 53.12 & 31.26 & 52.24 & 6.97 & 7.13 & 1.38 \\
AV-Siam & 12.79 & 19.75 & 22.83 & 33.30 & 35.41 & 39.43 & 12.90 & 18.21 & 60.18 & 54.20 & 59.36 & 9.36 & 8.80 & 3.58 \\
Gemini 1.5 Flash & 1.78 & 14.44 & 16.44 & 14.49 & 36.98 & 42.52 & 15.61 & 21.61 & 32.73 & 47.36 & 59.10 & 10.22 & 4.25 & 0.77 \\
Gemini 1.5 Pro & 3.05 & 20.86 & 22.53 & 19.26 & 49.73 & 53.74 & 17.73 & 22.90 & 35.03 & 69.23 & 75.42 & 2.09 & 4.85 & 0.57 \\
Gemin

# Table 3
Difference between labels that contain confounding factors such as "background music" and that do not contain.

In [None]:
from vggsounder.benchmark import compute_all_results, get_metric_dataframe, select_metrics
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "cav-mae": "CAV-MAE",
    "deepavfusion": "DeepAVFusion",
    "equiav": "Equi-AV",
    "avsiam": "AV-Siam",
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}
vggsounder = VGGSounder(csv_path=csv_path)

# background_music
vggsounder.set_meta_filters(background_music=True)
background_music_ids = [video.video_id for video in vggsounder]

background_music_only_results = compute_all_results(
    models_path=models_path,
    models_filter=display_names.keys(),
    subset_ids=background_music_ids,
    dataset_path=csv_path
)

# no background music
vggsounder.set_meta_filters(background_music=False)
no_background_music_ids = [video.video_id for video in vggsounder]

no_background_music_results = compute_all_results(
    models_path=models_path,
    models_filter=display_names.keys(),
    subset_ids=no_background_music_ids,
    dataset_path=csv_path
)

# voice over
vggsounder.set_meta_filters(voice_over=True)
voice_over_ids = [video.video_id for video in vggsounder]

voice_over_results = compute_all_results(
    models_path=models_path,
    models_filter=display_names.keys(),
    subset_ids=voice_over_ids,
    dataset_path=csv_path
)

# no voice over
vggsounder.set_meta_filters(voice_over=False)
no_voice_over_ids = [video.video_id for video in vggsounder]

no_voice_over_results = compute_all_results(
    models_path=models_path,
    models_filter=display_names.keys(),
    subset_ids=no_voice_over_ids,
    dataset_path=csv_path
)

# static image
vggsounder.set_meta_filters(static_image=True)
static_image_ids = [video.video_id for video in vggsounder]

static_image_results = compute_all_results(
    models_path=models_path,
    models_filter=display_names.keys(),
    subset_ids=static_image_ids,
    dataset_path=csv_path
)

# no static image
vggsounder.set_meta_filters(static_image=False)
no_static_image_ids = [video.video_id for video in vggsounder]

no_static_image_results = compute_all_results(
    models_path=models_path,
    models_filter=display_names.keys(),
    subset_ids=no_static_image_ids,
    dataset_path=csv_path
)

# neither
vggsounder.set_meta_filters(
    static_image=False, 
    voice_over=False, 
    background_music=False
)
neither_ids = [video.video_id for video in vggsounder]

neither_results = compute_all_results(
    models_path=models_path,
    models_filter=display_names.keys(),
    subset_ids=neither_ids,
    dataset_path=csv_path
)

# main table -- vggsounder + background music
either_of_confounder = list(set(background_music_ids + voice_over_ids + static_image_ids))
main_table_results = compute_all_results(
    models_path=models_path,
    models_filter=display_names.keys(),
    subset_ids=either_of_confounder,
    dataset_path=csv_path
)

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]



  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]



  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]



In [None]:
main_table_results = compute_all_results(
    models_path=models_path,
    models_filter=display_names.keys(),
    dataset_path=csv_path
)

In [13]:
# background music only
background_music_only_df = get_metric_dataframe(background_music_only_results)
background_music_only_table = select_metrics(background_music_only_df, [("f1", ["a", "v", "av"])], display_names=display_names)

# no background music
no_background_music_df = get_metric_dataframe(no_background_music_results)
no_background_music_table = select_metrics(no_background_music_df, [("f1", ["a", "v", "av"])], display_names=display_names)

# voice over only
voice_over_df = get_metric_dataframe(voice_over_results)
voice_over_table = select_metrics(voice_over_df, [("f1", ["a"])], display_names=display_names)

# no voice over
no_voice_over_df = get_metric_dataframe(no_voice_over_results)
no_voice_over_table = select_metrics(no_voice_over_df, [("f1", ["a"])], display_names=display_names)

# static image only
static_image_df = get_metric_dataframe(static_image_results)
static_image_table = select_metrics(static_image_df, [("f1", ["a", "v"])], display_names=display_names)
static_image_accuracy_table = select_metrics(static_image_df, [("accuracy", ["a", "v"])], display_names=display_names)

# no static image
no_static_image_df = get_metric_dataframe(no_static_image_results)
no_static_image_table = select_metrics(no_static_image_df, [("f1", ["a", "v"])], display_names=display_names)
no_static_image_accuracy_table = select_metrics(no_static_image_df, [("accuracy", ["a", "v"])], display_names=display_names)

# neither
neither_df = get_metric_dataframe(neither_results)
neither_table = select_metrics(neither_df, [("f1", ["a", "v", "av"])], display_names=display_names)

# main table -- vggsounder + background music
main_table_df = get_metric_dataframe(main_table_results)
main_table_table = select_metrics(main_table_df, [("f1", ["a", "v", "av"])], display_names=display_names)

In [14]:
import pandas as pd

# final diff table 
diff_tables = [
    background_music_only_table - no_background_music_table,
    voice_over_table - no_voice_over_table,
    static_image_table - no_static_image_table,
    static_image_accuracy_table,
    no_static_image_accuracy_table,
    main_table_table - neither_table,
]

diff_tables = pd.concat(diff_tables, axis=1)
diff_tables

metric,f1,f1,f1,f1,f1,f1,accuracy,accuracy,accuracy,accuracy,f1,f1,f1
modality,a,v,av,a.1,a.2,v.1,a,v,a.1,v.1,a.3,v.2,av.1
CAV-MAE,-3.434171,-3.595847,-4.005153,-8.186111,4.747236,-7.392629,22.132616,19.480519,11.981861,19.210926,-3.135352,-4.706228,-4.651923
DeepAVFusion,-3.650865,-4.861405,-4.272057,-9.051576,4.33495,-4.878302,15.981308,10.958904,9.300612,10.80761,-3.971501,-3.712876,-5.067855
Equi-AV,-4.06621,-2.534618,-2.541157,-7.131573,4.25802,-5.991801,18.996416,10.38961,10.493871,10.447648,-3.044443,-3.582058,-3.326803
AV-Siam,-3.745411,-4.387541,-5.09811,-7.961802,5.22665,-6.851267,22.043011,19.805195,11.570892,19.522003,-3.194301,-4.968066,-5.19969
Gemini 1.5 Flash,-1.173969,-2.392415,-4.173242,17.251452,-5.277933,-7.313747,1.433692,13.474026,1.679303,14.39302,4.574607,-4.073223,-6.145446
Gemini 1.5 Pro,-1.86154,-3.672653,-5.799092,18.163835,-4.899848,-6.86439,2.329749,22.077922,2.869695,20.796662,5.194423,-3.277049,-4.016691
Gemini 2.0 Flash,-0.466296,-1.918562,-3.471032,0.19522,1.962523,-7.108631,3.853047,10.876623,1.530504,12.397572,-0.092163,-3.96711,-4.530007
VideoLLaMA 2,-2.431225,-4.620166,-5.52472,-3.969227,4.224274,-9.516781,18.996416,18.181818,12.038546,19.704097,-1.481078,-4.860841,-5.176603
Unified-IO 2,-6.405868,1.151202,-4.17944,-4.981597,1.881191,-6.18815,17.921147,9.577922,10.876497,11.995448,-3.419909,-0.568699,-4.532109
PandaGPT,-5.97794,-0.925743,-2.745251,3.916475,-3.682674,-4.864189,3.315412,4.87013,2.905123,4.241275,-1.921213,-0.580383,-2.532941


In [15]:
print(diff_tables.to_latex(float_format="%.2f".__mod__))

\begin{tabular}{lrrrrrrrrrrrrr}
\toprule
metric & \multicolumn{6}{r}{f1} & \multicolumn{4}{r}{accuracy} & \multicolumn{3}{r}{f1} \\
modality & a & v & av & a & a & v & a & v & a & v & a & v & av \\
\midrule
CAV-MAE & -3.43 & -3.60 & -4.01 & -8.19 & 4.75 & -7.39 & 22.13 & 19.48 & 11.98 & 19.21 & -3.14 & -4.71 & -4.65 \\
DeepAVFusion & -3.65 & -4.86 & -4.27 & -9.05 & 4.33 & -4.88 & 15.98 & 10.96 & 9.30 & 10.81 & -3.97 & -3.71 & -5.07 \\
Equi-AV & -4.07 & -2.53 & -2.54 & -7.13 & 4.26 & -5.99 & 19.00 & 10.39 & 10.49 & 10.45 & -3.04 & -3.58 & -3.33 \\
AV-Siam & -3.75 & -4.39 & -5.10 & -7.96 & 5.23 & -6.85 & 22.04 & 19.81 & 11.57 & 19.52 & -3.19 & -4.97 & -5.20 \\
Gemini 1.5 Flash & -1.17 & -2.39 & -4.17 & 17.25 & -5.28 & -7.31 & 1.43 & 13.47 & 1.68 & 14.39 & 4.57 & -4.07 & -6.15 \\
Gemini 1.5 Pro & -1.86 & -3.67 & -5.80 & 18.16 & -4.90 & -6.86 & 2.33 & 22.08 & 2.87 & 20.80 & 5.19 & -3.28 & -4.02 \\
Gemini 2.0 Flash & -0.47 & -1.92 & -3.47 & 0.20 & 1.96 & -7.11 & 3.85 & 10.88 & 1.53 & 12.40 

# Figure 9 (Table)
Performance of state-of-the-art models on VGGSound. 

In [14]:
import pickle as pk
from vggsounder.benchmark import benchmark, get_metric_dataframe, select_metrics
from vggsounder.labels import VGGSounder


models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "cav-mae": "CAV-MAE",
    "deepavfusion": "DeepAVFusion",
    "equiav": "Equi-AV",
    "avsiam": "AV-Siam",
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}
vggsounder = VGGSounder(csv_path=csv_path)

vggsound_results = pk.load(open("precomputed_results/vggsound_results.pkl", "rb"))
table_9_df = get_metric_dataframe(vggsound_results, modalities=["a", "v", "av"])
table_9 = select_metrics(
    table_9_df, 
    [
        ("accuracy", ["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ], 
    display_names=display_names
)
table_9
    

metric,accuracy,accuracy,accuracy,mu,mu,mu
modality,a,v,av,a,v,av
CAV-MAE,59.050887,45.565195,65.078337,4.713194,4.836204,0.666839
DeepAVFusion,40.824576,27.243331,53.098895,4.176772,3.172999,0.067367
Equi-AV,46.678752,24.841383,50.07769,6.907937,5.509517,0.977599
AV-Siam,56.907937,47.267901,55.25055,13.174932,8.921404,3.923346
Gemini 1.5 Flash,0.31076,22.122232,23.598343,1.514955,4.169364,0.090638
Gemini 1.5 Pro,1.288359,25.767189,21.306487,1.618542,5.412405,0.239544
Gemini 2.0 Flash,5.703742,20.290043,19.390133,2.499029,4.771462,0.627994
VideoLLaMA 2,27.981354,17.00764,21.461867,11.161466,2.848634,1.417843
Unified-IO 2,32.280202,20.238249,52.395442,4.875049,3.418361,0.874013
PandaGPT,5.198757,7.652467,8.9473,4.506021,4.480124,0.938754


In [15]:
print(table_9.to_latex(float_format="%.2f".__mod__))

\begin{tabular}{lrrrrrr}
\toprule
metric & \multicolumn{3}{r}{accuracy} & \multicolumn{3}{r}{mu} \\
modality & a & v & av & a & v & av \\
\midrule
CAV-MAE & 59.05 & 45.57 & 65.08 & 4.71 & 4.84 & 0.67 \\
DeepAVFusion & 40.82 & 27.24 & 53.10 & 4.18 & 3.17 & 0.07 \\
Equi-AV & 46.68 & 24.84 & 50.08 & 6.91 & 5.51 & 0.98 \\
AV-Siam & 56.91 & 47.27 & 55.25 & 13.17 & 8.92 & 3.92 \\
Gemini 1.5 Flash & 0.31 & 22.12 & 23.60 & 1.51 & 4.17 & 0.09 \\
Gemini 1.5 Pro & 1.29 & 25.77 & 21.31 & 1.62 & 5.41 & 0.24 \\
Gemini 2.0 Flash & 5.70 & 20.29 & 19.39 & 2.50 & 4.77 & 0.63 \\
VideoLLaMA 2 & 27.98 & 17.01 & 21.46 & 11.16 & 2.85 & 1.42 \\
Unified-IO 2 & 32.28 & 20.24 & 52.40 & 4.88 & 3.42 & 0.87 \\
PandaGPT & 5.20 & 7.65 & 8.95 & 4.51 & 4.48 & 0.94 \\
OLA & 10.71 & 8.63 & 14.29 & 7.61 & 4.05 & 0.71 \\
\bottomrule
\end{tabular}



# Table 7
**Audio-visual video classification results on VGGSounder for k ∈ {3, 5, 10}.** The table is vertically grouped by k. Within each block, the four models are compared across the three metrics and input modalities

In [None]:
from vggsounder.benchmark import benchmark
import pandas as pd

# top 3 table
top_3_table = benchmark(
    models_path="../../supplimentary/models-results",
    display_names = {
        "cav-mae": "CAV-MAE",
        "deepavfusion": "DeepAVFusion",
        "equiav": "Equi-AV",
        "avsiam": "AV-Siam",
    },
    metrics=[
        ("accuracy@3", ["a", "v", "av"]),
        ("f1@3", ["a", "v", "av"]),
        ("hit_rate@3", ["a", "v", "av"]),
    ],
)

# top 5 table
top_5_table = benchmark(
    models_path="../../supplimentary/models-results",
    display_names = {
        "cav-mae": "CAV-MAE",
        "deepavfusion": "DeepAVFusion",
        "equiav": "Equi-AV",
        "avsiam": "AV-Siam",
    },
    metrics=[
        ("accuracy@5", ["a", "v", "av"]),
        ("f1@5", ["a", "v", "av"]),
        ("hit_rate@5", ["a", "v", "av"]),
    ],
)

# top 10 table
top_10_table = benchmark(
    models_path="../../supplimentary/models-results",
    display_names = {
        "cav-mae": "CAV-MAE",
        "deepavfusion": "DeepAVFusion",
        "equiav": "Equi-AV",
        "avsiam": "AV-Siam",
    },
    metrics=[
        ("accuracy@10", ["a", "v", "av"]),
        ("f1@10", ["a", "v", "av"]),
        ("hit_rate@10", ["a", "v", "av"]),
    ],
)

# rename columns 
new_cols = [(col[0].split("@")[0], col[1]) for col in top_10_table.columns.to_list()]
top_3_table.columns = pd.MultiIndex.from_tuples(new_cols)
top_5_table.columns = pd.MultiIndex.from_tuples(new_cols)
top_10_table.columns = pd.MultiIndex.from_tuples(new_cols)

# concat to get the final table
table_7 = pd.concat([top_3_table, top_5_table, top_10_table])
table_7

Unnamed: 0_level_0,accuracy,accuracy,accuracy,f1,f1,f1,hit_rate,hit_rate,hit_rate
Unnamed: 0_level_1,a,v,av,a,v,av,a,v,av
CAV-MAE,0.99389,0.561305,1.085689,39.084762,35.584821,42.916995,81.156823,72.136467,82.548228
DeepAVFusion,0.220919,0.082335,0.683009,28.067097,22.4294,37.360258,65.145722,50.855365,74.74738
Equi-AV,0.545825,0.236801,0.34096,33.504541,22.78288,34.057867,73.816701,48.131907,70.776133
AV-Siam,0.830957,0.771794,0.744729,37.361075,37.047305,40.908114,79.307536,73.206455,79.82952
CAV-MAE,0.040733,0.035082,0.026918,35.150633,30.789676,35.998567,87.046843,78.731801,87.644684
DeepAVFusion,0.0,0.0,0.0,24.905467,19.477004,31.060475,72.240632,58.475894,80.379865
Equi-AV,0.008147,0.00877,0.0,30.058199,20.441166,28.622252,80.635438,55.665673,77.092867
AV-Siam,0.016293,0.043852,0.017945,33.126498,31.877686,34.670216,84.814664,79.573759,85.527142
CAV-MAE,0.0,0.0,0.0,25.616253,21.658765,24.361493,91.641548,85.265743,92.005384
DeepAVFusion,0.0,0.0,0.0,18.362784,14.226779,21.062797,80.346673,67.871192,85.703593


# Table 8 
**Audio-visual video classification results on the subset of VGGSounder that is labelled as containing background music**

In [1]:
from vggsounder.benchmark import benchmark
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "cav-mae": "CAV-MAE",
    "deepavfusion": "DeepAVFusion",
    "equiav": "Equi-AV",
    "avsiam": "AV-Siam",
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}
vggsounder = VGGSounder(csv_path=csv_path)

# background_music
vggsounder.set_meta_filters(background_music=True)
background_music_ids = [video.video_id for video in vggsounder]

background_music_only_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("accuracy", ["a", "v", "av"]),
        ("f1",["a", "v", "av", "a only", "v only"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ],
    subset_ids=background_music_ids,
    dataset_path=csv_path
)
background_music_only_table

  0%|          | 0/11 [00:00<?, ?it/s]

metric,accuracy,accuracy,accuracy,f1,f1,f1,f1,f1,hit_rate,hit_rate,hit_rate,mu,mu,mu
modality,a,v,av,a,v,av,a only,v only,a,v,av,a,v,av
CAV-MAE,10.798917,19.172932,23.869565,31.033512,31.315056,38.614477,17.834631,22.264875,55.958023,44.611529,54.043478,4.26087,6.913043,0.869565
DeepAVFusion,8.15331,9.482759,20.700494,21.655066,16.431711,33.076181,12.642369,8.267717,39.02439,23.362069,46.205658,2.514594,3.322856,0.089807
Equi-AV,9.106297,10.108605,19.73913,25.326199,17.88594,32.152842,14.720059,11.516315,45.666892,25.480368,45.0,5.391304,6.086957,1.086957
AV-Siam,10.460393,18.504595,21.347826,29.550361,31.021844,34.327431,16.499815,22.648752,53.283683,44.193818,48.043478,10.173913,8.826087,3.695652
Gemini 1.5 Flash,1.150982,13.909774,14.73913,13.311819,34.589928,38.351693,11.489097,22.099448,30.467163,44.27736,53.478261,11.521739,3.695652,0.782609
Gemini 1.5 Pro,1.895735,20.843776,20.73913,17.40141,46.060449,47.941027,13.681262,27.495908,33.649289,62.406015,67.521739,3.913043,3.956522,0.608696
Gemini 2.0 Flash,1.083277,11.319967,10.434783,11.331916,32.162606,32.98134,9.843158,21.83908,18.618822,39.849624,43.130435,2.304348,4.391304,0.826087
VideoLLaMA 2,11.238998,18.629908,22.695652,36.434847,43.202208,46.826667,23.934567,33.410673,53.859174,43.692565,48.391304,15.26087,5.347826,2.826087
Unified-IO 2,9.106297,13.450292,24.521739,28.903912,29.068656,44.7131,20.731143,22.969188,42.552471,28.822055,52.26087,5.913043,5.869565,1.565217
PandaGPT,1.861882,4.636591,5.782609,12.753553,17.635108,18.103262,8.65557,16.091954,14.962762,15.497076,15.173913,6.869565,5.695652,2.217391


# Table 9 
**Audio-visual video classification results on the subset of VGGSounder that is labelled as not containing background music**

In [9]:
from vggsounder.benchmark import benchmark
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "cav-mae": "CAV-MAE",
    "deepavfusion": "DeepAVFusion",
    "equiav": "Equi-AV",
    "avsiam": "AV-Siam",
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}
vggsounder = VGGSounder(csv_path=csv_path)


# no background music
vggsounder.set_meta_filters(background_music=False)
no_background_music_ids = [video.video_id for video in vggsounder]

no_background_music_only_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("accuracy", ["a", "v", "av"]),
        ("f1",["a", "v", "av", "a only", "v only"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ],
    subset_ids=no_background_music_ids,
    dataset_path=csv_path
)
no_background_music_only_table

  0%|          | 0/11 [00:00<?, ?it/s]



metric,accuracy,accuracy,accuracy,f1,f1,f1,f1,f1,hit_rate,hit_rate,hit_rate,mu,mu,mu
modality,a,v,av,a,v,av,a only,v only,a,v,av,a,v,av
CAV-MAE,13.189409,19.233468,24.486317,34.467682,34.910904,42.619629,13.943105,19.004975,62.297352,53.437993,64.172275,3.589053,6.433378,0.780619
DeepAVFusion,10.187781,11.09688,21.528817,25.305931,21.293116,37.348238,10.366826,10.548303,45.772793,32.613668,56.268713,3.742515,3.929641,0.168413
Equi-AV,11.600815,10.515699,20.0,29.39241,20.420558,34.693999,12.552622,10.646766,53.124236,31.257674,52.238672,6.971736,7.133244,1.381786
AV-Siam,12.790224,19.750921,22.82638,33.295772,35.409385,39.425541,12.897053,18.208955,60.179226,54.201017,59.362943,9.358457,8.802153,3.580081
Gemini 1.5 Flash,1.784114,14.444834,16.437865,14.485788,36.982343,42.524935,15.605109,21.60555,32.733198,47.360112,59.102737,10.21983,4.253028,0.771646
Gemini 1.5 Pro,3.05499,20.85599,22.530283,19.26295,49.733103,53.740119,17.732038,22.898799,35.03055,69.233468,75.423957,2.090624,4.845222,0.574249
Gemini 2.0 Flash,1.849287,12.541659,12.687304,11.798212,34.081168,36.452372,6.185984,18.896834,18.509165,43.825645,47.716465,2.386721,5.428443,0.995962
VideoLLaMA 2,12.863544,19.847395,24.468371,38.866072,47.822374,52.351387,20.340482,28.0837,58.90835,52.01719,59.802602,12.723194,5.464334,2.951996
Unified-IO 2,11.942974,11.559376,25.607896,35.30978,27.917454,48.89254,21.378038,16.534867,54.386965,31.047185,65.105428,8.703454,5.159264,1.785554
PandaGPT,3.193483,4.192247,5.464334,18.731493,18.560851,20.848513,16.816454,14.401176,21.075356,17.014559,18.824585,7.590848,5.89502,2.467474


# Table 10 
**Audio-visual video classification results on the subset of VGGSounder that is labelled as containing static images**

In [None]:
from vggsounder.benchmark import benchmark
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "cav-mae": "CAV-MAE",
    "deepavfusion": "DeepAVFusion",
    "equiav": "Equi-AV",
    "avsiam": "AV-Siam",
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}

# static image
vggsounder = VGGSounder(csv_path=csv_path)
vggsounder.set_meta_filters(static_image=True)
static_image_ids = [video.video_id for video in vggsounder]

static_image_results = compute_all_results(
    models_path=models_path,
    models_filter=display_names.keys(),
    subset_ids=static_image_ids,
    dataset_path=csv_path
)

static_image_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("accuracy", ["a", "v", "av"]),
        ("f1",["a", "v", "av", "a only", "v only"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ],
    subset_ids=static_image_ids,
    dataset_path=csv_path
)
static_image_table

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

metric,accuracy,accuracy,accuracy,f1,f1,f1,f1,f1,hit_rate,hit_rate,hit_rate,mu,mu,mu
modality,a,v,av,a,v,av,a only,v only,a,v,av,a,v,av
CAV-MAE,22.132616,19.480519,27.41652,38.241881,27.209154,37.622378,35.177539,15.028902,61.200717,34.74026,47.275923,4.217926,6.85413,0.351494
DeepAVFusion,15.981308,10.958904,23.20442,28.646441,15.796519,31.501832,26.426426,6.329114,45.88785,20.205479,39.594843,4.235727,3.867403,0.0
Equi-AV,18.996416,10.38961,22.495606,32.586786,14.240305,31.328671,30.14038,9.248555,52.150538,18.181818,39.367311,5.623902,4.217926,0.351494
AV-Siam,22.043011,19.805195,24.253076,37.458007,28.099174,34.265734,33.608588,13.872832,59.946237,35.876623,43.057996,10.720562,7.908612,2.636204
Gemini 1.5 Flash,1.433692,13.474026,15.641476,9.345093,29.512894,33.266533,8.146965,18.63354,20.250896,30.194805,40.59754,10.896309,4.56942,0.878735
Gemini 1.5 Pro,2.329749,22.077922,23.374341,14.315888,42.498469,44.885307,12.521266,24.444444,24.193548,51.298701,57.469244,3.866432,5.448155,0.527241
Gemini 2.0 Flash,3.853047,10.876623,13.884007,13.538111,26.912568,31.636864,12.819439,13.75,19.53405,29.87013,38.31283,1.933216,3.866432,0.878735
VideoLLaMA 2,18.996416,18.181818,23.725835,42.358219,37.899918,43.220339,39.409805,27.536232,56.451613,32.305195,40.59754,15.465729,5.096661,2.636204
Unified-IO 2,17.921147,9.577922,28.471002,35.871743,22.121487,45.950413,33.450527,15.929204,47.491039,18.344156,47.803163,7.381371,3.339192,1.054482
PandaGPT,3.315412,4.87013,5.272408,14.145234,13.714286,15.320911,12.751292,11.009174,14.784946,11.363636,11.775044,8.963093,5.623902,2.108963


# Table 11 
**Audio-visual video classification results on the subset of VGGSounder that is labelled as not containing static images**

In [None]:
from vggsounder.benchmark import benchmark
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "cav-mae": "CAV-MAE",
    "deepavfusion": "DeepAVFusion",
    "equiav": "Equi-AV",
    "avsiam": "AV-Siam",
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}

# no static image
vggsounder = VGGSounder(csv_path=csv_path)
vggsounder.set_meta_filters(static_image=False)
no_static_image_ids = [video.video_id for video in vggsounder]

no_static_image_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("accuracy", ["a", "v", "av"]),
        ("f1",["a", "v", "av", "a only", "v only"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ],
    subset_ids=no_static_image_ids,
    dataset_path=csv_path
)
no_static_image_table


  0%|          | 0/11 [00:00<?, ?it/s]



# Table 12
**Audio-visual video classification results on the subset of VGGSounder that is labelled as containing voice over narrations**

In [None]:
from vggsounder.benchmark import benchmark
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "cav-mae": "CAV-MAE",
    "deepavfusion": "DeepAVFusion",
    "equiav": "Equi-AV",
    "avsiam": "AV-Siam",
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}
# voice over
vggsounder = VGGSounder(csv_path=csv_path)
vggsounder.set_meta_filters(voice_over=True)
voice_over_ids = [video.video_id for video in vggsounder]


voice_over_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("accuracy", ["a", "v", "av"]),
        ("f1",["a", "v", "av", "a only", "v only"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ],
    subset_ids=voice_over_ids,
    dataset_path=csv_path
)
voice_over_table

  0%|          | 0/11 [00:00<?, ?it/s]

metric,accuracy,accuracy,accuracy,f1,f1,f1,f1,f1,hit_rate,hit_rate,hit_rate,mu,mu,mu
modality,a,v,av,a,v,av,a only,v only,a,v,av,a,v,av
CAV-MAE,2.783505,14.343551,17.37841,26.734123,28.356242,35.374682,11.188811,25.503356,51.752577,43.551186,53.618031,4.270463,7.058126,0.711744
DeepAVFusion,2.018056,9.410363,15.332926,16.787821,17.78036,29.134809,6.507984,13.986014,32.501328,27.337701,44.227245,2.687844,3.787416,0.183262
Equi-AV,3.505155,7.750145,14.531435,22.447078,15.439654,28.135394,9.615385,10.738255,43.453608,23.713129,42.645314,7.47331,6.346382,1.067616
AV-Siam,2.731959,13.938693,15.658363,25.695646,28.506873,31.109372,9.55711,26.845638,49.742268,43.782533,47.153025,10.972716,8.956109,3.499407
Gemini 1.5 Flash,5.257732,9.427415,10.854093,29.430582,30.610255,34.008393,27.552537,23.606557,63.71134,40.023135,48.279953,22.182681,4.567023,1.364176
Gemini 1.5 Pro,7.731959,17.698091,16.073547,34.584156,46.218066,50.839695,29.25959,27.76204,68.298969,65.702718,75.504152,4.151839,4.270463,0.948992
Gemini 2.0 Flash,0.721649,7.403123,8.362989,11.873425,27.473394,29.847284,6.104468,21.621622,19.948454,36.37941,40.569395,2.787663,5.812574,1.304864
VideoLLaMA 2,6.340206,16.078658,19.395018,34.976387,42.756259,47.722435,21.740603,36.144578,54.43299,47.599769,54.685647,13.582444,5.39739,2.431791
Unified-IO 2,4.690722,9.080393,18.149466,29.806668,24.800194,40.941704,23.166844,21.649485,48.608247,27.125506,53.440095,9.489917,5.160142,2.016607
PandaGPT,3.659794,4.279931,4.863582,20.963082,18.578152,18.922403,19.073569,17.73399,26.752577,17.466744,18.030842,9.786477,6.524318,3.321471


# Table 13 
**Audio-visual video classification results on the subset of VGGSounder that is labelled as not containing voice over narrations**

In [None]:

from vggsounder.benchmark import benchmark
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "cav-mae": "CAV-MAE",
    "deepavfusion": "DeepAVFusion",
    "equiav": "Equi-AV",
    "avsiam": "AV-Siam",
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}
# no voice over
vggsounder = VGGSounder(csv_path=csv_path)
vggsounder.set_meta_filters(voice_over=False)
no_voice_over_ids = [video.video_id for video in vggsounder]


no_voice_over_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("accuracy", ["a", "v", "av"]),
        ("f1",["a", "v", "av", "a only", "v only"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ],
    subset_ids=no_voice_over_ids,
    dataset_path=csv_path
)
no_voice_over_table


  0%|          | 0/11 [00:00<?, ?it/s]



metric,accuracy,accuracy,accuracy,f1,f1,f1,f1,f1,hit_rate,hit_rate,hit_rate,mu,mu,mu
modality,a,v,av,a,v,av,a only,v only,a,v,av,a,v,av
CAV-MAE,14.177139,19.922102,25.384812,34.920234,35.193322,42.941901,15.668934,18.898343,62.427572,53.103505,63.704397,3.622757,6.437622,0.807892
DeepAVFusion,10.93603,11.017974,22.264586,25.839397,20.894069,37.773859,11.825616,9.546093,46.213547,31.524369,56.029438,3.65313,3.830466,0.150736
Equi-AV,12.228159,10.831193,20.733056,29.578651,20.672232,35.185875,13.786848,10.837438,52.87832,31.192508,52.198316,6.590696,7.041415,1.369164
AV-Siam,13.740688,20.336455,23.564929,33.657448,35.594244,39.703058,14.648526,18.092253,60.170065,53.708461,58.899566,9.286504,8.784761,3.614253
Gemini 1.5 Flash,1.136278,15.057595,16.9062,12.179131,37.427257,42.934296,12.376649,21.453901,27.707126,47.799785,59.554384,8.759248,4.098988,0.688834
Gemini 1.5 Pro,2.114531,21.306041,23.105706,16.420321,49.550972,53.103019,14.289442,23.320463,29.866807,68.384851,73.866825,2.151543,4.753806,0.527256
Gemini 2.0 Flash,1.84363,13.035552,12.86674,11.678204,34.664861,36.733678,7.379859,19.230769,18.323425,44.103754,47.844204,2.313122,5.170508,0.918445
VideoLLaMA 2,13.454737,20.145852,24.849052,38.945614,47.72755,52.062844,21.163674,28.228228,58.439311,50.998591,58.304278,13.096352,5.451144,3.001956
Unified-IO 2,12.371134,12.289716,26.464835,34.788264,28.577418,49.282813,20.837068,17.374517,52.599895,31.167647,64.265669,8.044902,5.29807,1.709329
PandaGPT,2.829408,4.267838,5.612722,17.046607,18.390889,20.649237,13.898358,14.342629,18.887802,16.648711,18.224339,7.13496,5.765796,2.296114


# Table 14
**Audio-visual video classification results on the subset of VGGSounder that is labelled as not containing background music, static images, or voice over narrations**

In [2]:

from vggsounder.benchmark import benchmark
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "cav-mae": "CAV-MAE",
    "deepavfusion": "DeepAVFusion",
    "equiav": "Equi-AV",
    "avsiam": "AV-Siam",
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}

# neither
vggsounder = VGGSounder(csv_path=csv_path)
vggsounder.set_meta_filters(
    static_image=False, 
    voice_over=False, 
    background_music=False
)
neither_ids = [video.video_id for video in vggsounder]

neither_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("accuracy", ["a", "v", "av"]),
        ("f1",["a", "v", "av", "a only", "v only"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ],
    subset_ids=neither_ids,
    dataset_path=csv_path
)
neither_table

  0%|          | 0/11 [00:00<?, ?it/s]



metric,accuracy,accuracy,accuracy,f1,f1,f1,f1,f1,hit_rate,hit_rate,hit_rate,mu,mu,mu
modality,a,v,av,a,v,av,a only,v only,a,v,av,a,v,av
CAV-MAE,13.509615,19.52621,24.89964,34.796956,35.591719,43.214505,11.714798,18.992472,62.865385,54.677419,65.259907,3.520329,6.423057,0.802882
DeepAVFusion,10.556391,11.036367,21.844139,25.857622,21.497213,38.012008,8.982849,10.271903,46.736842,33.035527,57.428081,3.864319,3.896522,0.161013
Equi-AV,11.75,10.725806,20.18528,29.570493,20.971817,35.171427,10.670582,10.770122,53.423077,32.217742,53.113742,6.845085,7.308286,1.420484
AV-Siam,13.019231,20.080645,23.211529,33.58348,36.031366,39.990457,10.817425,17.602779,60.673077,55.352823,60.391148,9.294905,8.862584,3.654143
Gemini 1.5 Flash,1.269231,14.929435,16.901698,12.859051,37.674233,43.458371,14.047582,21.547278,29.317308,48.578629,60.699949,8.79053,4.127638,0.710242
Gemini 1.5 Pro,2.346154,20.856855,22.974781,17.258326,50.008842,53.887217,15.689401,22.2666,31.432692,69.778226,75.440041,1.966032,4.796706,0.545548
Gemini 2.0 Flash,1.846154,13.014113,12.928461,11.733728,34.828354,37.088984,5.885363,18.928571,18.384615,44.858871,48.564076,2.38806,5.362841,0.946989
VideoLLaMA 2,13.019231,20.080645,24.940813,38.867858,48.363374,52.840744,17.781973,27.520609,59.105769,52.65121,60.422028,12.609367,5.373134,2.974781
Unified-IO 2,11.942308,11.764113,25.959856,35.177339,28.2514,49.419084,18.841439,16.013344,54.067308,31.622984,66.073083,8.409676,5.239321,1.749871
PandaGPT,3.0,4.092742,5.434894,18.209753,18.568262,21.082774,15.995575,14.675768,20.336538,17.056452,18.980957,7.174472,5.764282,2.336593


# Table 15 
**Audio-visual video classification results on VGGSound inputs.**

In [4]:
import pickle as pk
from vggsounder.benchmark import benchmark, get_metric_dataframe, select_metrics
from vggsounder.labels import VGGSounder


models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}
vggsounder = VGGSounder(csv_path=csv_path)

vggsound_results = pk.load(open("precomputed_results/vggsound_results.pkl", "rb"))
table_15_df = get_metric_dataframe(vggsound_results, modalities=["a", "v", "av"])
table_15 = select_metrics(
    table_15_df, 
    metrics=[
        ("accuracy", ["a", "v", "av"]),
        ("f1",["a", "v", "av"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ], 
    display_names=display_names
)
table_15
    

metric,accuracy,accuracy,accuracy,f1,f1,f1,hit_rate,hit_rate,hit_rate,mu,mu,mu
modality,a,v,av,a,v,av,a,v,av,a,v,av
Gemini 1.5 Flash,0.31076,22.122232,23.598343,1.708132,33.154648,35.938382,2.978117,31.833484,41.233976,1.514955,4.169364,0.090638
Gemini 1.5 Pro,1.288359,25.767189,21.306487,4.429014,36.409865,35.624527,6.111615,41.719539,45.701152,1.618542,5.412405,0.239544
Gemini 2.0 Flash,5.703742,20.290043,19.390133,9.947751,32.344071,33.913692,9.49113,30.551599,35.310113,2.499029,4.771462,0.627994
VideoLLaMA 2,27.981354,17.00764,21.461867,41.323123,31.46148,36.79918,30.046614,22.72433,27.89719,11.161466,2.848634,1.417843
Unified-IO 2,32.280202,20.238249,52.395442,43.714058,33.844973,64.064593,33.710993,22.840865,54.195261,4.875049,3.418361,0.874013
PandaGPT,5.198757,7.652467,8.9473,12.681473,16.831683,19.547942,8.539428,11.226207,13.297941,4.506021,4.480124,0.938754
OLA,10.714748,8.630066,14.294963,23.334031,17.811464,28.862026,18.056455,10.954292,22.41357,7.613622,4.052829,0.712158


# Table 16 
**Audio-visual video classification results on VGGSound + human annotations**

In [6]:
from vggsounder.benchmark import benchmark


models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/intermediate-tables/0.4_majority_inhouse+mturk_formated.csv"
display_names = {
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}
table_16 = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("accuracy", ["a", "v", "av"]),
        ("f1",["a", "v", "av"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ],
    dataset_path=csv_path
)
table_16

  0%|          | 0/7 [00:00<?, ?it/s]



metric,accuracy,accuracy,accuracy,f1,f1,f1,hit_rate,hit_rate,hit_rate,mu,mu,mu
modality,a,v,av,a,v,av,a,v,av,a,v,av
Gemini 1.5 Flash,1.667871,14.489707,16.418688,14.421158,36.723976,41.927887,32.254252,46.433749,57.588157,10.482071,4.195804,0.796012
Gemini 1.5 Pro,2.856392,21.317773,22.585925,19.165398,49.146085,52.848607,34.716659,67.251377,73.329862,2.425234,4.761196,0.580271
Gemini 2.0 Flash,1.825465,12.829806,12.661806,11.746822,33.977768,36.001974,18.294044,42.71528,46.443982,2.350841,5.341467,0.952239
VideoLLaMA 2,12.706021,19.396927,23.977087,38.374354,47.2596,51.589033,56.937422,50.217454,57.364975,13.041214,5.482815,2.871596
Unified-IO 2,11.412437,11.648304,25.985716,34.165002,28.366349,48.542419,51.441329,30.52334,62.38655,8.153549,5.319149,1.740812
PandaGPT,2.928623,4.269353,5.408421,17.687458,18.554328,20.528199,19.679559,16.526529,18.04047,7.402172,5.810147,2.388037
OLA,13.316698,8.748913,18.442196,45.894388,25.215388,46.356712,55.762033,24.275152,50.022318,14.588603,6.95581,2.343401


In [8]:
print(table_16.to_latex(float_format="%.2f".__mod__))

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
metric & \multicolumn{3}{r}{accuracy} & \multicolumn{3}{r}{f1} & \multicolumn{3}{r}{hit_rate} & \multicolumn{3}{r}{mu} \\
modality & a & v & av & a & v & av & a & v & av & a & v & av \\
\midrule
Gemini 1.5 Flash & 1.67 & 14.49 & 16.42 & 14.42 & 36.72 & 41.93 & 32.25 & 46.43 & 57.59 & 10.48 & 4.20 & 0.80 \\
Gemini 1.5 Pro & 2.86 & 21.32 & 22.59 & 19.17 & 49.15 & 52.85 & 34.72 & 67.25 & 73.33 & 2.43 & 4.76 & 0.58 \\
Gemini 2.0 Flash & 1.83 & 12.83 & 12.66 & 11.75 & 33.98 & 36.00 & 18.29 & 42.72 & 46.44 & 2.35 & 5.34 & 0.95 \\
VideoLLaMA 2 & 12.71 & 19.40 & 23.98 & 38.37 & 47.26 & 51.59 & 56.94 & 50.22 & 57.36 & 13.04 & 5.48 & 2.87 \\
Unified-IO 2 & 11.41 & 11.65 & 25.99 & 34.17 & 28.37 & 48.54 & 51.44 & 30.52 & 62.39 & 8.15 & 5.32 & 1.74 \\
PandaGPT & 2.93 & 4.27 & 5.41 & 17.69 & 18.55 & 20.53 & 19.68 & 16.53 & 18.04 & 7.40 & 5.81 & 2.39 \\
OLA & 13.32 & 8.75 & 18.44 & 45.89 & 25.22 & 46.36 & 55.76 & 24.28 & 50.02 & 14.59 & 6.96 & 2.34 \\
\bottomru

# Table 17
**Audio-visual video classification results on VGGSound + human annotations + automatically added labels**

In [7]:
from vggsounder.benchmark import benchmark


models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}
table_17 = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("accuracy", ["a", "v", "av"]),
        ("f1",["a", "v", "av"]), 
        ("hit_rate",["a", "v", "av"]), 
        ("mu",["a", "v", "av"])
    ],
    dataset_path=csv_path
)
table_17

  0%|          | 0/7 [00:00<?, ?it/s]



metric,accuracy,accuracy,accuracy,f1,f1,f1,hit_rate,hit_rate,hit_rate,mu,mu,mu
modality,a,v,av,a,v,av,a,v,av,a,v,av
Gemini 1.5 Flash,1.661304,14.351986,16.147267,14.26108,36.578915,41.836139,32.29365,46.825167,58.140573,10.442544,4.157679,0.773522
Gemini 1.5 Pro,2.830127,20.853871,22.223875,18.902668,49.121902,52.796393,34.762624,68.04871,74.072146,2.40238,4.693194,0.580141
Gemini 2.0 Flash,1.700703,12.329661,12.301971,11.70419,33.764375,35.871879,18.530435,43.135692,46.931945,2.372629,5.251023,0.966902
VideoLLaMA 2,12.548427,19.636126,24.165117,38.40094,47.106621,51.512206,57.928951,50.57263,57.850502,13.157308,5.444403,2.930457
Unified-IO 2,11.392738,11.887504,25.42209,34.107689,28.101243,48.248638,52.091405,30.661061,62.908144,8.226106,5.280774,1.747862
PandaGPT,2.935189,4.269353,5.51878,17.605054,18.415075,20.427112,19.889684,16.751232,18.200074,7.46746,5.860915,2.424693
OLA,13.027776,8.879385,18.356266,45.549384,24.944677,46.039207,56.201983,24.376631,50.368167,14.830792,6.909632,2.357754


In [9]:
print(table_17.to_latex(float_format="%.2f".__mod__))

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
metric & \multicolumn{3}{r}{accuracy} & \multicolumn{3}{r}{f1} & \multicolumn{3}{r}{hit_rate} & \multicolumn{3}{r}{mu} \\
modality & a & v & av & a & v & av & a & v & av & a & v & av \\
\midrule
Gemini 1.5 Flash & 1.66 & 14.35 & 16.15 & 14.26 & 36.58 & 41.84 & 32.29 & 46.83 & 58.14 & 10.44 & 4.16 & 0.77 \\
Gemini 1.5 Pro & 2.83 & 20.85 & 22.22 & 18.90 & 49.12 & 52.80 & 34.76 & 68.05 & 74.07 & 2.40 & 4.69 & 0.58 \\
Gemini 2.0 Flash & 1.70 & 12.33 & 12.30 & 11.70 & 33.76 & 35.87 & 18.53 & 43.14 & 46.93 & 2.37 & 5.25 & 0.97 \\
VideoLLaMA 2 & 12.55 & 19.64 & 24.17 & 38.40 & 47.11 & 51.51 & 57.93 & 50.57 & 57.85 & 13.16 & 5.44 & 2.93 \\
Unified-IO 2 & 11.39 & 11.89 & 25.42 & 34.11 & 28.10 & 48.25 & 52.09 & 30.66 & 62.91 & 8.23 & 5.28 & 1.75 \\
PandaGPT & 2.94 & 4.27 & 5.52 & 17.61 & 18.42 & 20.43 & 19.89 & 16.75 & 18.20 & 7.47 & 5.86 & 2.42 \\
OLA & 13.03 & 8.88 & 18.36 & 45.55 & 24.94 & 46.04 & 56.20 & 24.38 & 50.37 & 14.83 & 6.91 & 2.36 \\
\bottomru

# Poster Tables

## Voice over

In [2]:

from vggsounder.benchmark import benchmark
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path= "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}

# no voice over
vggsounder = VGGSounder(csv_path=csv_path)
vggsounder.set_meta_filters(voice_over=False)
no_voice_over_ids = [video.video_id for video in vggsounder]


no_voice_over_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("f1",["a", "v", "av"]),
    ],
    subset_ids=no_voice_over_ids,
    dataset_path=csv_path
)
no_voice_over_table

# voice over
vggsounder.set_meta_filters(voice_over=True)
voice_over_ids = [video.video_id for video in vggsounder]


voice_over_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("f1",["a", "v", "av"]),
    ],
    subset_ids=voice_over_ids,
    dataset_path=csv_path
)
voice_over_table


diff_voice_over_table =  no_voice_over_table - voice_over_table
diff_voice_over_table


  0%|          | 0/7 [00:00<?, ?it/s]



  0%|          | 0/7 [00:00<?, ?it/s]

metric,f1,f1,f1
modality,a,v,av
Gemini 1.5 Flash,-19.009456,5.113331,7.978973
Gemini 1.5 Pro,-19.846816,2.064353,0.884122
Gemini 2.0 Flash,-0.379375,6.14411,5.889554
VideoLLaMA 2,3.618252,3.550473,3.725342
Unified-IO 2,2.224669,2.011348,6.038606
PandaGPT,-4.666135,-0.786998,1.632249
OLA,-11.273834,0.733535,-4.249165


In [3]:
print(diff_voice_over_table.to_latex(float_format="%.2f".__mod__))

\begin{tabular}{lrrr}
\toprule
metric & \multicolumn{3}{r}{f1} \\
modality & a & v & av \\
\midrule
Gemini 1.5 Flash & -19.01 & 5.11 & 7.98 \\
Gemini 1.5 Pro & -19.85 & 2.06 & 0.88 \\
Gemini 2.0 Flash & -0.38 & 6.14 & 5.89 \\
VideoLLaMA 2 & 3.62 & 3.55 & 3.73 \\
Unified-IO 2 & 2.22 & 2.01 & 6.04 \\
PandaGPT & -4.67 & -0.79 & 1.63 \\
OLA & -11.27 & 0.73 & -4.25 \\
\bottomrule
\end{tabular}



## Background music

In [2]:
from vggsounder.benchmark import benchmark
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path = "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}

# no background music
vggsounder = VGGSounder(csv_path=csv_path)
vggsounder.set_meta_filters(background_music=False)
no_background_music_ids = [video.video_id for video in vggsounder]

no_background_music_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("f1", ["a", "v", "av"]),
    ],
    subset_ids=no_background_music_ids,
    dataset_path=csv_path
)
no_background_music_table

# with background music
vggsounder = VGGSounder(background_music=True) 
background_music_ids = [video.video_id for video in vggsounder]

background_music_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("f1", ["a", "v", "av"]),
    ],
    subset_ids=background_music_ids,
    dataset_path=csv_path
)
background_music_table

diff_background_music_table = no_background_music_table - background_music_table
diff_background_music_table


  0%|          | 0/7 [00:00<?, ?it/s]



  0%|          | 0/7 [00:00<?, ?it/s]

metric,f1,f1,f1
modality,a,v,av
Gemini 1.5 Flash,1.173969,2.392415,4.173242
Gemini 1.5 Pro,1.86154,3.672653,5.799092
Gemini 2.0 Flash,0.466296,1.918562,3.471032
VideoLLaMA 2,2.431225,4.620166,5.52472
Unified-IO 2,6.405868,-1.151202,4.17944
PandaGPT,5.97794,0.925743,2.745251
OLA,11.837778,-0.625064,2.873214


In [3]:
print(diff_background_music_table.to_latex(float_format="%.2f".__mod__))

\begin{tabular}{lrrr}
\toprule
metric & \multicolumn{3}{r}{f1} \\
modality & a & v & av \\
\midrule
Gemini 1.5 Flash & 1.17 & 2.39 & 4.17 \\
Gemini 1.5 Pro & 1.86 & 3.67 & 5.80 \\
Gemini 2.0 Flash & 0.47 & 1.92 & 3.47 \\
VideoLLaMA 2 & 2.43 & 4.62 & 5.52 \\
Unified-IO 2 & 6.41 & -1.15 & 4.18 \\
PandaGPT & 5.98 & 0.93 & 2.75 \\
OLA & 11.84 & -0.63 & 2.87 \\
\bottomrule
\end{tabular}



## Static Images

In [8]:
from vggsounder.benchmark import benchmark
from vggsounder.labels import VGGSounder

models_path = "../../supplimentary/models-results"
csv_path = "../../supplimentary/data/vggsounder+background-music.csv"
display_names = {
    "gemini-1.5-flash": "Gemini 1.5 Flash",
    "gemini-1.5-pro": "Gemini 1.5 Pro",
    "gemini-2.0-flash": "Gemini 2.0 Flash",
    "video-llama-2-av": "VideoLLaMA 2",
    "unified-io-2": "Unified-IO 2",
    "pandagpt": "PandaGPT",
    "ola": "OLA",
}

# static images only
vggsounder = VGGSounder(csv_path=csv_path)
vggsounder.set_meta_filters(static_image=True)
static_image_ids = [video.video_id for video in vggsounder]

static_image_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("f1", ["a", "v", "av"]),
    ],
    subset_ids=static_image_ids,
    dataset_path=csv_path
)
static_image_table

# no static images
vggsounder.set_meta_filters(static_image=False)
no_static_image_ids = [video.video_id for video in vggsounder]

no_static_image_table = benchmark(
    models_path=models_path,
    display_names=display_names,
    metrics=[
        ("f1", ["a", "v", "av"]),
    ],
    subset_ids=no_static_image_ids,
    dataset_path=csv_path
)
no_static_image_table

diff_static_image_table = no_static_image_table - static_image_table
diff_static_image_table



  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]



metric,f1,f1,f1
modality,a,v,av
Gemini 1.5 Flash,5.277933,7.313747,8.867581
Gemini 1.5 Pro,4.899848,6.86439,8.184127
Gemini 2.0 Flash,-1.962523,7.108631,4.389414
VideoLLaMA 2,-4.224274,9.516781,8.564778
Unified-IO 2,-1.881191,6.18815,2.378221
PandaGPT,3.682674,4.864189,5.275184
OLA,8.235859,5.401295,7.388457


In [9]:
print(diff_static_image_table.to_latex(float_format="%.2f".__mod__))

\begin{tabular}{lrrr}
\toprule
metric & \multicolumn{3}{r}{f1} \\
modality & a & v & av \\
\midrule
Gemini 1.5 Flash & 5.28 & 7.31 & 8.87 \\
Gemini 1.5 Pro & 4.90 & 6.86 & 8.18 \\
Gemini 2.0 Flash & -1.96 & 7.11 & 4.39 \\
VideoLLaMA 2 & -4.22 & 9.52 & 8.56 \\
Unified-IO 2 & -1.88 & 6.19 & 2.38 \\
PandaGPT & 3.68 & 4.86 & 5.28 \\
OLA & 8.24 & 5.40 & 7.39 \\
\bottomrule
\end{tabular}

