In [1]:
import torch
import torch.nn as nn
import pandas as pd
import pathlib
import os, sys
from typing import List

# Make the project root importable so the `src.*` imports that follow resolve.
# NOTE: os.path.dirname("./notebooks") evaluates to ".", so `parentUrl` is the
# parent of the *current working directory* (presumably the notebook is run
# from inside the `notebooks/` folder — confirm if the kernel starts elsewhere).
currentUrl = os.path.dirname("./notebooks")
parentUrl = os.path.abspath(os.path.join(currentUrl, os.pardir))
# Guard the append so re-running this cell does not accumulate duplicate entries.
if parentUrl not in sys.path:
    sys.path.append(parentUrl)

from src.models.MultiModalFusion import MultiModalFusion
from src.trainer.MultiModalFusionTrainer import MultiModalFusionTrainer
from src.utils.Retrieval import FetchSimilar
import yaml
from IPython.display import Audio, display, display_jpeg, Image
from IPython.core.display import HTML
from PIL import Image

# Load the experiment configuration; safe_load only builds plain Python objects.
# The encoding is pinned so the read does not depend on the platform default.
with open('../configs/MultiModalFusion.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs on CUDA-less machines.
# NOTE(review): assumes FetchSimilar accepts any torch device string — confirm.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Retrieval helper built from a trained checkpoint and the test-set media folders.
fetcher = FetchSimilar(
    chkpt_path="../logs/MultiModalFusion/uel3r2lo/checkpoints/epoch=95-val_loss=3.12-val_mean_similarity=0.37.ckpt",
    image_path="../datasets/flickr8k_audio_test/images/",
    audio_path="../datasets/flickr8k_audio_test/wavs/",
    device=DEVICE,
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of Wav2Vec2ConformerModel were not initialized from the model checkpoint at facebook/wav2vec2-conformer-rope-large-960h-ft and are newly initialized: ['wav2vec2_conformer.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2_conformer.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return F.conv1d(input, weight, bias, self.stride,


In [2]:
# Caption table used by the lookup helpers further down; the comma is
# read_csv's default separator, so it is not spelled out.
captions = pd.read_csv("../datasets/flickr8k_audio_test/captions.txt")
# Preview: every caption row recorded for one sample image.
captions[captions["image"] == "1000268201_693b08cb0e.jpg"]

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [32]:
def extract_captions(file:str|pathlib.PosixPath) -> str:
    if type(file) == str:
        file = pathlib.Path(file)
    if file.suffix == ".wav":
        stem_split = file.stem.split('_')
        filename, speaker = "_".join(stem_split[:-1]) + ".jpg", int(stem_split[-1])
        return captions.loc[captions['image'] == filename].iloc[speaker].caption
    else:
        return " / ".join([caption for caption in captions.loc[captions['image'] == file.name].caption])

In [39]:
def render_top_k(query_path:str|pathlib.Path, top_k:dict, query_class:str|None=None) -> None:
    """Display a query item followed by an HTML table of its retrieved matches.

    Args:
        query_path: Path of the query file. A ``.wav`` suffix renders an audio
            player; anything else is rendered as an image.
        top_k: Mapping of result key to a result dict. Each key must contain a
            ``#`` with the result modality after it (``"image"`` renders an
            image, anything else an audio player). Each value's entries are
            read positionally as (path, class, embedding, score).
        query_class: Optional title shown above the query; defaults to the
            query path.

    Returns:
        None. The HTML is emitted through IPython's ``display``.
    """
    # Local import keeps the function self-contained within its cell.
    from html import escape

    query_path = pathlib.Path(query_path)
    # Everything interpolated into markup is escaped so quotes, '<' or '&' in
    # captions or paths cannot break attributes or table cells.
    query_src = escape(str(query_path))
    if query_path.suffix == ".wav":
        query_html = f'<audio controls src="{query_src}" style="display:block; margin:0 auto;"></audio>'
    else:
        query_html = f'<img src="{query_src}" alt="query image" style="max-width:300px; height:auto; display:block; margin:0 auto;">'

    rows = []
    for key, hit in top_k.items():
        # Relies on the insertion order of each result dict; class and
        # embedding are not needed for rendering.
        path, _cls, _embed, score = hit.values()
        rows.append({
            "path": escape(str(path)),
            "caption": escape(str(extract_captions(path))),
            "score": score,
            "modality": key.split("#")[1],
        })

    def render_row(idx, row):
        """Build one <tr> for a ranked result (rank shown 1-based)."""
        if row["modality"] == "image":
            media_html = f'<img src="{row["path"]}" alt="{row["caption"]}" style="width:100px;height:auto;">'
        else:
            media_html = f'<audio controls src="{row["path"]}" style="width:200px;"></audio>'
        return f'<tr><td>{idx+1}</td><td>{media_html}</td><td>{row["caption"]}</td><td>{row["score"]:.3f}</td></tr>'

    table_header = """
    <table border="1" style="border-collapse:collapse; text-align:center; margin:auto;">
        <tr>
            <th>Rank</th>
            <th>Display</th>
            <th>Caption</th>
            <th>Similarity Score</th>
        </tr>
    """
    table_body = "".join(render_row(idx, row) for idx, row in enumerate(rows))
    table_html = f"{table_header}{table_body}</table>"

    title = escape(query_class if query_class is not None else str(query_path))

    # Query media on top, ranked results underneath.
    full_html = f"""
    <div style="text-align:center; margin-bottom:20px;">
        <b>{title}</b>
        {query_html}
    </div>
    {table_html}
    """

    display(HTML(full_html))

In [40]:
# Audio query -> image results: fetch the 10 best image matches for one clip.
query_path = "../datasets/flickr8k_audio_test/wavs/1977827746_4e13d7e19f_3.wav"
caption = extract_captions(query_path)  # caption of the query clip, used as the title
top_k, query_info = fetcher.top_k(path=query_path, modality="image", k=10)
render_top_k(query_path, top_k, query_class=caption)

Rank,Display,Caption,Similarity Score
1,,"A young boy and girl sit in a wagon eating corndogs . / Two kids eat corndogs outside . / Two kids sit in a plastic tub outdoors , holding corndogs . / Two toddlers are eating corndogs in a small wagon or bin . / two young children eat corndogs .",0.183
2,,"A big brown dog runs with a stick in his mouth , and a big black down runs behind him . / A black dog and a brown dog with a stick in his mouth are running in the field . / A brown dog runs with a stick in its mouth and a black dog follows . / A dog running with a stick in its mouth . / Two dogs are running , one of them has a stick in its mouth .",0.142
3,,a girl in colorful leggings and a white shirt sits next to similarly dressed girl in a small cubicle . / Two girls crouch in a small stall . / Two women in a bathroom stall . / Two young woman dressed in white shirts sit in a small space . / Two young women sitting against a concrete wall .,0.139
4,,a man dragging between two horses holding on to the horse on his right / A man falls off his horse on a racetrack . / A person is kneeling between two horses / A rodeo contestent being drug between two horses . / Cowboy in blue and red pants getting bucked off a horse .,0.129
5,,A baby plays with a young boys face . / A baby touches the mans face while he is lying down . / A boy who seems ill is being touched in the face by a toddler . / A little baby holds the head of his older brother / An infant sitting on a cot reaching over to touch the face of an older boy .,0.128
6,,There are women in St Patrick 's Day costumes / Three women celebrate St Patrick 's day by dressing up in green and white on a busy street . / Three women dressed in green celebrating St Patricks Day . / Three women dressed up in green and shamrocks . / Three women wearing plenty of green and shamrocks .,0.125
7,,A brown dog is soaked and is walking out of the water . / a dog walks out of the water . / a long brown haired dog walking through a river . / A very wet brown dog is emerging from the water . / A wet dog walks out of the water .,0.124
8,,A girl and woman jump off of their swings . / Two females jump off of swings . / Two girls jump out of swings . / Two girls riding on swings . / Two girls swing over a red patterned surface .,0.12
9,,A man in a uniform stands next to a girl with purple hair holding a cup . / A military uniformed man with a purple accented hair female standing on concrete . / A smiling man in naval officers uniform stands next to a woman with purple hair . / A woman with purple hair and a man in military regalia . / Man in uniform stands near woman with purple hair .,0.112
10,,"A couple with a small child is standing on a busy street . / A family is standing in front of a silver car in a metro area . / A family waits to cross a busy street . / The man in blue , woman in white , and child in blue stand before a silver car . / Two adults and a child wait to cross a street .",0.093


In [41]:
# Audio query -> audio results: fetch the 10 best audio matches for one clip.
query_path = "../datasets/flickr8k_audio_test/wavs/1977827746_4e13d7e19f_2.wav"
caption = extract_captions(query_path)  # caption of the query clip, used as the title
top_k, query_info = fetcher.top_k(path=query_path, modality="audio", k=10)
render_top_k(query_path, top_k, query_class=caption)

Rank,Display,Caption,Similarity Score
1,,"Two kids sit in a plastic tub outdoors , holding corndogs .",1.0
2,,Two kids eat corndogs outside .,0.458
3,,A rodeo contestent being drug between two horses .,0.376
4,,Two young girls sumo wrestle while others watch .,0.346
5,,Two kids in bathing suits hugging .,0.345
6,,Girl sticks tongue out while hula hooping with two hoops,0.342
7,,Two toddlers are eating corndogs in a small wagon or bin .,0.333
8,,"Two bikers pose for a picture , one of which is wearing a skull mask .",0.329
9,,a young girl uses two hula hoops .,0.313
10,,two young children eat corndogs .,0.312


In [42]:
# Image query -> audio results: fetch the 10 best audio matches for one image.
query_path = "../datasets/flickr8k_audio_test/images/2936590102_25036069a6.jpg"
caption = extract_captions(query_path)  # all captions of the query image, joined, used as the title
top_k, query_info = fetcher.top_k(path=query_path, modality="audio", k=10)
render_top_k(query_path, top_k, query_class=caption)

Rank,Display,Caption,Similarity Score
1,,White greyhound racing as dog number 1 .,0.363
2,,A white greyhound dog is wearing a red jacket with the number one printed on it .,0.354
3,,A large greyhound dog races wearing a red shirt .,0.303
4,,A greyhound with a red shirt and blue muzzle on running a race .,0.247
5,,A rodeo contestent being drug between two horses .,0.21
6,,a dog pounces into the water .,0.204
7,,A woman in a race turning a corner .,0.2
8,,A tan and white dog runs through water .,0.199
9,,A man with a red helmet and numbers on his arm and leg is riding a red racing bike .,0.198
10,,The dog has red straps in its back .,0.188


In [43]:
# Image query -> image results: fetch the 10 best image matches for one image.
query_path = "../datasets/flickr8k_audio_test/images/2936590102_25036069a6.jpg"
caption = extract_captions(query_path)  # all captions of the query image, joined, used as the title
top_k, query_info = fetcher.top_k(path=query_path, modality="image", k=10)
render_top_k(query_path, top_k, query_class=caption)

Rank,Display,Caption,Similarity Score
1,,A Greyhound dog is wearing a red jersey with the number one on it and is mid run . / A greyhound with a red shirt and blue muzzle on running a race . / A large greyhound dog races wearing a red shirt . / A white greyhound dog is wearing a red jacket with the number one printed on it . / White greyhound racing as dog number 1 .,1.0
2,,Two white dogs are running on the grass . / Two white dogs are running through the grass . / Two white dogs running in a field / two white dogs running through the grass / Two white dogs with cutoff tails running in green grass .,0.434
3,,a man dragging between two horses holding on to the horse on his right / A man falls off his horse on a racetrack . / A person is kneeling between two horses / A rodeo contestent being drug between two horses . / Cowboy in blue and red pants getting bucked off a horse .,0.413
4,,"A race car moves down the road as two people watch from a distance . / A race car , numbered 104 is rounding a bend as two people watch and take pictures . / A silver and blue rally car is passing two spectators who are standing at the roadside . / Blue and silver car going around curve being watched by people standing in grass / Silver and blue car marked 104 raises dust on road as two background people watch .",0.365
5,,a brown and white dog jumps on the sidewalk . / A little white dog in running on the sidewalk . / A white dog with brown ears is running on the sidewalk . / Little brown and white dog running on the sidewalk . / The dog is running .,0.346
6,,"a dog jumps onto the sidewalk . / A small dog runs from the dirt onto the street / A tan , white , and black dog runs towards a sidewalk . / Dog leaps out . / The brown and white dog is standing on its hind legs beside the path .",0.317
7,,"A big brown dog runs with a stick in his mouth , and a big black down runs behind him . / A black dog and a brown dog with a stick in his mouth are running in the field . / A brown dog runs with a stick in its mouth and a black dog follows . / A dog running with a stick in its mouth . / Two dogs are running , one of them has a stick in its mouth .",0.307
8,,a cyclist / A man riding on a red bicycle . / a man wearing a white and red helmet riding his red bike down the street . / A man with a red helmet and numbers on his arm and leg is riding a red racing bike . / A man with a red helmet is riding on a red bicycle .,0.303
9,,A black dog carrying an object out of the water . / A black dog carrying some seaweed out of the water . / A black dog is coming out of the ocean water on the beach with something in its mouth . / A black dog with a stick in its mouth is walking out of the water onto a beach . / A dog retrieves a branch from on a beach .,0.303
10,,A grey colored dog walks in wet sand at a beach . / A grey dog plays in the sand at the ocean . / Light brown dog running towards something at the beach . / The brown dog is standing on the sandy beach . / The large grey colored dog is jumping on the beach .,0.286
