In [None]:
import pickle
from transformers import BartTokenizer


tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

vocab_pkl_path = "../data/vocab_info.pkl"
with open(vocab_pkl_path, "rb") as f:
    data = pickle.load(f)  # Load the data from the file

# Print the loaded data
vocabs = data["vocabulary"]
print("Total vocab size:", len(vocabs))


Total vocab size: 4360


In [74]:
unique_tokens = set()

word_to_token_id = {word: tokenizer.encode(word, add_special_tokens=False) for word in vocabs}

# Print the mapping
for word, token_ids in word_to_token_id.items():
    print(f"'{word}' -> {token_ids}")
    for token_id in token_ids:
        unique_tokens.add(token_id)
unique_tokens = sorted(unique_tokens)
print("Total unique tokens:", len(unique_tokens))
print("Unique tokens:", unique_tokens)


'its' -> [2629]
'after' -> [10669]
'sweet' -> [32588]
'bicycle' -> [428, 39943]
'weeds' -> [1694, 12080]
'thud' -> [212, 1906]
'contents' -> [10800, 4189]
'tuned' -> [24641, 196]
'thundering' -> [212, 5087, 154]
'spectators' -> [25594, 3629]
'shoes' -> [1193, 8013]
'factory' -> [506, 27670]
'releases' -> [241, 14931]
'vibrates' -> [705, 11804, 1626]
'slashing' -> [9996, 8141]
'swells' -> [4184, 17150]
'coaster' -> [876, 8831]
'lid' -> [462, 808]
'seashore' -> [1090, 1671, 1688]
'trapped' -> [9738, 5686]
'doorknob' -> [5016, 9657, 33212]
'facet' -> [506, 26799]
'upcoming' -> [658, 7936]
'discernible' -> [18909, 3281, 4748]
'today' -> [34375]
'whips' -> [605, 7903]
'stacked' -> [620, 10074]
'whimpers' -> [11613, 11850, 268]
'boxing' -> [35687]
'putting' -> [9179, 2577]
'groans' -> [15821, 1253]
'mountain' -> [17336, 1851]
'converse' -> [3865, 15189]
'prongs' -> [4862, 22321]
'my' -> [4783]
'river' -> [28199]
'leave' -> [38457]
'tumbled' -> [90, 10434]
'active' -> [12228]
'desired' -> [10

In [75]:
cus_tok_to_bart_tok = {0:0,
                       1:1,
                       2:2,
                       3:3,
                       4:50260}
for id in range(len(unique_tokens)):
    cus_tok_to_bart_tok[id+5] = unique_tokens[id]

bart_tok_to_cus_tok = {v: k for k, v in cus_tok_to_bart_tok.items()}
print("Custom vocab size:", len(cus_tok_to_bart_tok.keys()))


#saving the mapping in pkl file

custom_mapping_info = "../exp_settings/custom_mapping.pkl"
with open(custom_mapping_info, "wb") as store:
    pickle.dump({
        "cus_tok_to_bart_tok": cus_tok_to_bart_tok,
        "bart_tok_to_cus_tok": bart_tok_to_cus_tok}, store)
print("Save vocabulary info to", custom_mapping_info)

Custom vocab size: 3026
Save vocabulary info to ../exp_settings/custom_mapping.pkl


In [76]:
import pickle
vocab_pkl_path = "../exp_settings/custom_mapping.pkl"
with open(vocab_pkl_path, "rb") as f:
    data = pickle.load(f)  # Load the data from the file

# Print the loaded data
print(data.keys())
# vocabs = data["vocabulary"]
# print("Total vocab size:", len(vocabs))

dict_keys(['cus_tok_to_bart_tok', 'bart_tok_to_cus_tok'])


In [77]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch


tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

original_vocab_size = len(tokenizer)
print(f"Original BART Vocab Size: {original_vocab_size}")  # Should be ~50,265


Original BART Vocab Size: 50265


In [78]:
custom_vocab_map = data["cus_tok_to_bart_tok"]  # Define your mapping
new_vocab_size = len(custom_vocab_map)



original_embeddings = model.model.shared.weight  # Shape: (50265, hidden_dim)

# Extract only the relevant embeddings
new_embedding_matrix = torch.zeros((new_vocab_size, original_embeddings.shape[1]))

for new_id, bart_id in custom_vocab_map.items():
    new_embedding_matrix[new_id] = original_embeddings[bart_id]

print(f"New Embedding Matrix Shape: {new_embedding_matrix.shape}")


# # Create new vocab (reverse the mapping)
# new_vocab = {str(new_id): tokenizer.convert_ids_to_tokens(bart_id) for new_id, bart_id in custom_vocab_map.items()}

# # Save the new vocab as a file
# import json
# new_vocab_file = "../exp_settings/custom_bart_vocab.json"
# with open(new_vocab_file, "w") as f:
#     json.dump(new_vocab, f)

# # Load new tokenizer using PreTrainedTokenizerFast
# from transformers import PreTrainedTokenizerFast

# custom_tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
# custom_tokenizer.add_tokens(list(new_vocab.values()))  # Add only 3000 tokens

# print(f"Custom Tokenizer Vocab Size: {len(custom_tokenizer)}")  # Should be 3000


New Embedding Matrix Shape: torch.Size([3026, 768])


In [79]:
## Collating all the captions from dev, val, eval set into a single file all_captions.csv for custom token mapping

import pandas as pd 
dict = {}
splits = ["development", "validation", "evaluation"]
all_captions_file = "../data/all_captions.txt"
with open(all_captions_file, 'w', encoding='utf-8') as f:
    
    for split in splits:
        filepath = f"../data/{split}_captions.csv"
        df = pd.read_csv(filepath)
        for index, row in df.iterrows():
            f.write(row[2] + '\n') 
           

  f.write(row[2] + '\n')
  f.write(row[2] + '\n')
  f.write(row[2] + '\n')


In [80]:
# Create custom mapping to BART tokens

from collections import defaultdict
import json

def process_dataset_and_create_mapping(dataset_path):
    """
    Process dataset and create mapping to BART tokens.
    Handles both space-prefixed and non-prefixed versions of tokens.
    
    Args:
        dataset_path: Path to dataset.txt file
    
    Returns:
        token_mapping: Dict mapping custom ids to BART token ids
        token_freq: Dict containing frequency of each token
    """
    # Load BART tokenizer
    bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
    
    # Read dataset
    with open(dataset_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()
    
    # Clean sentences
    sentences = [s.strip() for s in sentences if s.strip()]
    
    # Dictionary to store token frequencies (both with and without Ġ)
    token_freq = defaultdict(int)
    
    # Set to store all unique BART tokens found in dataset
    unique_bart_tokens = set()
    
    # Process each sentence
    for sentence in sentences:
        # Get BART tokens for the sentence
        bart_tokens = bart_tokenizer.tokenize(sentence)
        
        # Update token frequencies and unique tokens
        for token in bart_tokens:
            token_freq[token] += 1
            unique_bart_tokens.add(token)
            
            # If token starts with Ġ, also add version without it
            if token.startswith('Ġ'):
                base_token = token[1:]  # Remove Ġ
                token_freq[base_token] += 1
                unique_bart_tokens.add(base_token)
    
    # Create mapping
    token_mapping = {}
    current_id = 0
    
    # First, add special tokens
    for special_token in [bart_tokenizer.bos_token, bart_tokenizer.eos_token, 
                         bart_tokenizer.pad_token, bart_tokenizer.unk_token]:
        token_mapping[current_id] = bart_tokenizer.convert_tokens_to_ids([special_token])[0]
        current_id += 1
    
    # Then add all other tokens
    for token in unique_bart_tokens:
        bart_id = bart_tokenizer.convert_tokens_to_ids([token])[0]
        if bart_id != bart_tokenizer.unk_token_id:  # Only add if it's a valid BART token
            token_mapping[current_id] = bart_id
            current_id += 1
    
    return token_mapping



token_mapping = process_dataset_and_create_mapping('../data/all_captions.txt')

with open("../exp_settings/custom_bart_mapping.json", 'w', encoding='utf-8') as f:
    json.dump(token_mapping, f, ensure_ascii=False, indent=2)

In [81]:
## Creating a custom tokenizer 

from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import json
import os
from pathlib import Path
import shutil
from huggingface_hub import hf_hub_download

def reduce_bart_tokenizer(original_tokenizer, token_mapping, output_path):
    """
    Creates a new tokenizer with reduced vocabulary based on the mapping.
    
    Args:
        original_tokenizer: Original BART tokenizer
        token_mapping: Dict mapping new token ids to original BART token ids
        output_path: Path to save the modified tokenizer files
    """
    # Create output directory if it doesn't exist
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    
    # Create a new vocabulary with only the mapped tokens
    new_vocab = {}
    reverse_mapping = {v: k for k, v in token_mapping.items()}
    
    for original_id in token_mapping.values():
        token = original_tokenizer.convert_ids_to_tokens([original_id])[0]
        new_id = reverse_mapping[original_id]
        new_vocab[token] = new_id
    
    # Save vocabulary file
    vocab_file = output_path / "vocab.json"
    with open(vocab_file, 'w', encoding='utf-8') as f:
        json.dump(new_vocab, f, ensure_ascii=False)
    
    # Get and save merges file
    try:
        merges_file = hf_hub_download(
            repo_id="facebook/bart-base",
            filename="merges.txt"
        )
        new_merges_file = output_path / "merges.txt"
        shutil.copy(merges_file, new_merges_file)
    except Exception as e:
        print(f"Error downloading merges file: {e}")
        print("Attempting to get merges from tokenizer cache...")
        cache_dir = Path(original_tokenizer.name_or_path)
        if cache_dir.exists():
            cached_merges = list(cache_dir.glob("**/merges.txt"))
            if cached_merges:
                shutil.copy(str(cached_merges[0]), output_path / "merges.txt")
            else:
                raise Exception("Could not find merges.txt in cache")
    
    # Create a new tokenizer instance with the saved files
    reduced_tokenizer = BartTokenizer(
        vocab_file=str(vocab_file),
        merges_file=str(output_path / "merges.txt"),
        bos_token=original_tokenizer.bos_token,
        eos_token=original_tokenizer.eos_token,
        pad_token=original_tokenizer.pad_token,
        unk_token=original_tokenizer.unk_token,
        mask_token=original_tokenizer.mask_token,
        sep_token=original_tokenizer.sep_token,
        cls_token=original_tokenizer.cls_token
    )
    
    # Save the tokenizer configuration
    reduced_tokenizer.save_pretrained(output_path)
    
    return reduced_tokenizer

def main():
    # Load original BART tokenizer
    bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

    # Open and read the JSON file
    with open('../exp_settings/custom_bart_mapping.json', 'r', encoding='utf-8') as file:
        mapping = json.load(file)
        # Convert string keys to integers if necessary
        mapping = {int(k): int(v) for k, v in mapping.items()}

    # Create reduced tokenizer
    tokenizer = reduce_bart_tokenizer(bart_tokenizer, mapping, "../custom_BART_config")
    return tokenizer

if __name__ == "__main__":
    tokenizer = main()

In [82]:
import string

voacb_size = tokenizer.vocab_size
print(voacb_size)

ou_e = "a shubham is turning a map over and over."
ou_e = ou_e.translate(str.maketrans('', '', string.punctuation))
ou_e = ou_e.lower()
print(ou_e)
tok_e = tokenizer(ou_e)
print(tok_e)
output = tokenizer.decode(tok_e['input_ids']).replace('</s>', '').replace('<s>', '')
print(output)

5965
a shubham is turning a map over and over
{'input_ids': [0, 381, 3128, 3, 3, 1071, 3213, 249, 2251, 2294, 3784, 2294, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
a sh<unk><unk> is turning a map over and over


In [83]:
#load custom bart tokenizer
from transformers import BartTokenizer
tkr = BartTokenizer.from_pretrained("../custom_BART_config")

In [84]:
## Change embedding matrix for custom BART

import torch
from transformers import BartForConditionalGeneration
import os
import json

def create_and_save_custom_bart_embeddings(
    mapping, 
    save_path,
    bart_model_name="facebook/bart-base"
):
    """
    Create custom embedding matrix and save it for future use.
    
    Args:
        number_to_bart_id_mapping (dict): Dictionary mapping custom numbers to BART token IDs
        save_path (str): Path to save the embeddings and mapping
        bart_model_name (str): Name of the pre-trained BART model
    """
    # Load pre-trained BART model
    bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)
    
    # Get BART's embedding matrix
    bart_embeddings = bart_model.get_input_embeddings().weight.data
    
    # Create new embedding matrix
    custom_vocab_size = len(mapping)
    embedding_dim = bart_embeddings.shape[1]
    custom_embeddings = torch.zeros((custom_vocab_size, embedding_dim))
    
    # Transfer embeddings in the specified order
    for custom_num, bart_id in mapping.items():
        custom_idx = int(custom_num)
        custom_embeddings[custom_idx] = bart_embeddings[bart_id]
    
    # Save embeddings and mapping
    os.makedirs(save_path, exist_ok=True)
    torch.save(custom_embeddings, os.path.join(save_path, 'custom_bart_embeddings.pt'))

def load_bart_with_custom_embeddings(save_path, bart_model_name="facebook/bart-base", freeze_embeddings=True):
    """
    Load BART model with previously saved custom embeddings.
    
    Args:
        save_path (str): Path where embeddings and mapping were saved
        bart_model_name (str): Name of the pre-trained BART model
        
    Returns:
        model: BART model with custom embeddings
    """
    # Load the model
    model = BartForConditionalGeneration.from_pretrained(bart_model_name)
    
    # Load custom embeddings
    custom_embeddings = torch.load(os.path.join(save_path, 'custom_bart_embeddings.pt'))
    
    # Update both encoder and decoder embeddings
    model.model.encoder.embed_tokens.weight.data = custom_embeddings
    model.model.decoder.embed_tokens.weight.data = custom_embeddings
    
    # If using shared embeddings, this line is also necessary
    if model.config.tie_word_embeddings:
        model.model.shared.weight.data = custom_embeddings

    if freeze_embeddings:
        # Freeze encoder embeddings
        model.model.encoder.embed_tokens.weight.requires_grad = False
        
        # Freeze decoder embeddings
        model.model.decoder.embed_tokens.weight.requires_grad = False
        
        # Freeze shared embeddings if they exist
        if model.config.tie_word_embeddings:
            model.model.shared.weight.requires_grad = False
    
    return model



# First time: Create and save embeddings
save_path = "../exp_settings/"
with open('../exp_settings/custom_bart_mapping.json', 'r', encoding='utf-8') as file:
    mapping = json.load(file)


create_and_save_custom_bart_embeddings(mapping, save_path)

# Later: Load model with custom embeddings
# model = load_bart_with_custom_embeddings(save_path)

In [1]:
# Load and process audioset class mappings to extract embeddings from BART-base tokenizer
import pandas as pd
import re

def clean_element(element):
    # Extract substring before a comma or an open bracket
    return re.split(r',|\(', element)[0].strip()

csv_path = "../audioset_classes_embeddings/class_labels_indices.csv"
df = pd.read_csv(csv_path)
classes = df["display_name"]

processed_classes = []
for ele in classes:
    processed_classes.append(clean_element(ele))



In [4]:
from transformers import BartTokenizer, BartModel
import torch 

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartModel.from_pretrained("facebook/bart-base")

classes_tokens = []
for cls in processed_classes:
    token_ids = tokenizer.encode(cls, add_special_tokens=False)
    classes_tokens.append(token_ids)


In [3]:
import pickle

token_embeddings = model.shared.weight
class_embeddings = {}
max_length_embedding = 0
for idx, class_token in enumerate(classes_tokens):
    class_embeddings[idx] = token_embeddings[class_token]
    max_length_embedding = max(max_length_embedding, len(class_token))

print("Max token sequence", max_length_embedding )

# Save dictionary as a .pkl file
with open("../audioset_classes_embeddings/classes_embeddings.pkl", "wb") as pkl_file:
    pickle.dump(class_embeddings, pkl_file)

Max token sequence 6


In [2]:
# Using ConvNext model to extract top-5 classes out of 527 for each file in each split and saving their embeddings in a .pkl file
import sys
sys.path.append("../")
from convnext.convnext import convnext_tiny
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
audio_enc = convnext_tiny(pretrained=False, strict=False, drop_path_rate=0.0, after_stem_dim=[252, 56], use_speed_perturb=False)
state_dict = torch.load("../convnext/convnext_tiny_471mAP.pth", map_location=device)
try:
    audio_enc.load_state_dict(state_dict['model'])
    print("Model loaded successfully")
except:
    print("Model loading failed")

audio_enc.to(device)
audio_enc.eval()


  state_dict = torch.load("../convnext/convnext_tiny_471mAP.pth", map_location=device)


Model loaded successfully


ConvNeXt(
  (spectrogram_extractor): Spectrogram(
    (stft): STFT(
      (conv_real): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
      (conv_imag): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
    )
  )
  (logmel_extractor): LogmelFilterBank()
  (spec_augmenter): SpecAugmentation(
    (time_dropper): DropStripes()
    (freq_dropper): DropStripes()
  )
  (bn0): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (downsample_layers): ModuleList(
    (0): Sequential(
      (0): Conv2d(1, 96, kernel_size=(4, 4), stride=(4, 4), padding=(4, 0))
      (1): LayerNorm()
    )
    (1): Sequential(
      (0): LayerNorm()
      (1): Conv2d(96, 192, kernel_size=(2, 2), stride=(2, 2))
    )
    (2): Sequential(
      (0): LayerNorm()
      (1): Conv2d(192, 384, kernel_size=(2, 2), stride=(2, 2))
    )
    (3): Sequential(
      (0): LayerNorm()
      (1): Conv2d(384, 768, kernel_size=(2, 2), stride=(2, 2))
    )
  )
  (stages): Mo

In [None]:
import h5py
import pickle
from tqdm import tqdm

with open("../audioset_classes_embeddings/classes_embeddings.pkl", "rb") as f:
    class_embeddings = pickle.load(f)

with torch.no_grad():
    for split in ["evaluation"]:   #evaluation  #validation #development
        top_5_classes_embed = dict()

        audio_path = f"../data_32k_224_mels/{split}_audio_logmels.hdf5"
        with h5py.File(audio_path, "r") as h5_file:
            keys = list(h5_file.keys())
            for key in tqdm(keys, desc="Processing audio files"):
                audio = h5_file[key][()]
                audio = torch.tensor(audio).unsqueeze(0).to(device)
                output = audio_enc(audio)
                clipwise_output = output["clipwise_output"]

                top_values, top_indices = torch.topk(clipwise_output, k=30)
                top_indices = top_indices.squeeze()

                top_5_embds = [class_embeddings[idx.item()] for idx in top_indices]
                top_5_embds = torch.cat(top_5_embds, dim=0)

                top_5_classes_embed[key] = top_5_embds
            
            with open(f"../audioset_classes_embeddings/{split}_top30_classes_embeddings.pkl", "wb") as pkl_file:
                pickle.dump(top_5_classes_embed, pkl_file)
            



            
            

Processing audio files: 100%|██████████| 1045/1045 [00:25<00:00, 41.07it/s]


In [2]:
import sys
sys.path.append("../")
from data_loader import get_dataset
from transformers import BartTokenizer
from utils.file_io import load_yaml_file
from pathlib import Path

settings = load_yaml_file(Path("../exp_settings/dcb.yaml"))
tokenizer = BartTokenizer.from_pretrained(settings['lm']['tokenizer'], use_fast=True)

data_eval, _ = get_dataset('evaluation', settings, tokenizer)

print('Loaded evaluation dataset.')
input = data_eval[700]

  from .autonotebook import tqdm as notebook_tqdm


Loaded evaluation dataset.


  return torch.load(io.BytesIO(b))


In [3]:
print(input)

{'audio_features': tensor([[-41.8279, -43.8636, -37.6985,  ..., -39.4410, -37.5987, -38.8433],
        [-38.6436, -43.7310, -37.5658,  ..., -32.3968, -34.8853, -35.6601],
        [-33.0148, -40.2031, -34.0380,  ..., -31.4097, -34.3736, -34.6979],
        ...,
        [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0

In [4]:
input['keyword_embeddings'].shape

torch.Size([30, 768])

In [1]:
import torch
zero_padding = torch.zeros(30, 768)
zero_padding.shape

torch.Size([30, 768])

In [None]:
import random

def process_file(filename):
    ground_truth = []
    predictions = []

    with open(filename, 'r') as file:
        lines = file.readlines()

    # Process in chunks of 8 lines
    for i in range(0, len(lines), 8):
        chunk = lines[i:i+8]
        if len(chunk) < 8:
            continue  # Skip incomplete chunks

        # Remove first and second lines
        chunk = chunk[2:]

        # Select a random line from lines 3 to 7 (which are now chunk[0:5])
        gt = [random.choice(chunk[:5]).strip()]
        pred = chunk[5].strip().replace("Pred: ","")  # Line 8 (which is now at index 5)
        
        ground_truth.append(gt)
        predictions.append(pred)

    return ground_truth, predictions
    

# Example usage
filename = "../outputs/exp_013_custom_bart_custom_vocab_out/generated_captions_beam.txt"  # Change this to your file
ground_truth, predictions = process_file(filename)
# process_file(filename)

# print("Selected Lines:", selected)
# print("Last Lines:", last)


: 

In [2]:
print(ground_truth)
print(predictions)

['a machine is running at a constant rate', 'a man is speaking while a radio is speaking', 'a radio is being played on a radio', 'an object is being moved around in a room', 'a metal object is being moved around in a room', 'a group of people are talking in the background', 'a man speaks and then a man speaks again', 'a man is speaking while a man speaks', 'the wind is blowing at a steady pace', 'a metal object is being moved around in a container', 'a bell is ringing while people are talking in the background', 'a person is walking through a bag of paper', 'the wind is blowing while the wind blows in the background', 'a person is walking on a surface at a steady pace', 'a machine is running at a constant speed', 'a vehicle is driving by while people are talking in the background', 'crickets are chirping loudly in the background', 'a synthesizer is playing a synthesizer in the background', 'a radio is being turned on and then it is turned off', 'a person is using a piece of wood', 'a p

In [1]:
import sys
sys.path.append("../")
from eval_metrics import evaluate_metrics_from_lists

preds = ["A frog is jumping"]
gt = [["A frog jumps"], ["A frog jumps"], ["A frog jumps"], ["A frog jumps"], ["A frog jumps"]]
# metrics, per_file_metrics = evaluate_metrics_from_lists(predictions, ground_truth)  
metrics, per_file_metrics = evaluate_metrics_from_lists(preds, gt)  

# print(metrics)
# print(per_file_metrics)
# 0.059 0.061 0.046 0.048 0.048

loading annotations into memory...
0:00:00.000467
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 3 tokens at 32.74 tokens per second.
PTBTokenizer tokenized 4 tokens at 49.06 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 4, 'reflen': 3, 'guess': [4, 3, 2, 1], 'correct': [2, 1, 0, 0]}
ratio: 1.3333333328888892
Bleu_1: 0.500
Bleu_2: 0.408
Bleu_3: 0.000
Bleu_4: 0.000
computing METEOR score...
METEOR: 0.403
computing Rouge score...
ROUGE_L: 0.587
computing CIDEr score...
CIDEr: 0.000
computing SPICE score...


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.5 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.1 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.7 sec].
Loading classif

SPICE evaluation took: 5.500 s
SPICE: 1.000
computing SPIDEr score...
SPIDEr: 0.500


In [1]:
import torch
import os
import sys
beats_path = os.path.abspath("../")  
sys.path.append(beats_path)

from beats.BEATs import BEATs, BEATsConfig

# load the pre-trained checkpoints
checkpoint = torch.load('../beats/BEATs_iter3.pt')

cfg = BEATsConfig(checkpoint['cfg'])
BEATs_model = BEATs(cfg)
BEATs_model.load_state_dict(checkpoint['model'])
BEATs_model.eval()


device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
BEATs_model.to(device)
# extract the the audio representation
audio_input_16khz = torch.randn(3, 10000).to(device)
padding_mask = torch.zeros(3, 10000).bool().to(device)

# representation = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0]
representation = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)

  checkpoint = torch.load('../beats/BEATs_iter3.pt')
  WeightNorm.apply(module, name, dim)


Before patch embedding shape:  torch.Size([3, 1, 61, 128])
After patch embedding shape:  torch.Size([3, 512, 3, 8])


In [3]:
representation[0].shape, representation[1].shape, representation[2], representation[3]

(torch.Size([3, 24, 768]), torch.Size([3, 24]), 3, 8)

In [None]:
import sys
sys.path.insert(1,"../")
from data_loader_beats import AACDataset, default_data_collator
from transformers import BartTokenizer
import yaml
from torch.utils.data import DataLoader
from pathlib import Path


with open("../exp_settings/dcb.yaml", "r") as file:
    settings = yaml.safe_load(file)

data_dir = settings['data']['root_dir']
data_dir = Path(data_dir)
caption_dir = settings['data']['caption_dir']
caption_dir = Path(caption_dir)

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base", use_fast=True)

beats_dataset = AACDataset(settings, data_dir, caption_dir, 'evaluation', tokenizer)
beats_dataloader = DataLoader(beats_dataset, batch_size=2, shuffle=True, collate_fn=default_data_collator)


from data_loader import AACDataset, default_data_collator
convnext_dataset = AACDataset(settings, data_dir, caption_dir, 'evaluation', tokenizer)
convnext_dataloader = DataLoader(convnext_dataset, batch_size=2, shuffle=True, collate_fn=default_data_collator)


In [2]:
c_it = iter(convnext_dataloader)
b_it = iter(beats_dataloader)

print(next(c_it))
print(next(b_it))

{'audio_features': tensor([[-0.0008, -0.0010, -0.0005,  ...,  0.0003, -0.0011, -0.0006],
        [ 0.0338,  0.0538,  0.0495,  ...,  0.0000,  0.0000,  0.0000]]), 'attention_mask': tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ...,  True,  True,  True]]), 'decoder_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'file_name': ['Water_Lapping_River.wav', '1122thrum.wav'], 'labels': tensor([[    0,   102,  3539, 19143,    63,  7886,    25,    24,    16,  2468,
            66,     9,     5,   514,    30,    10,  5651,  9438,     2,     1,
        

In [26]:
from tqdm import tqdm
l = []

with torch.no_grad():
    for idx, batch in tqdm(enumerate(dataloader)):
        representation = BEATs_model.extract_features(batch['audio_features'].to(device), batch['attention_mask'].to(device))

349it [01:03,  5.49it/s]


KeyboardInterrupt: 

In [5]:
it = iter(dataloader)
data = next(it)
print(data['audio_features'].shape)
print(data['attention_mask'].shape)

torch.Size([2, 449385])
torch.Size([2, 449385])


In [3]:
!pip install tiktoken
import tiktoken

# Load tokenizer for gpt-3.5-turbo
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Example multilingual text
text = "Bonjour! 你好! नमस्ते! Hello!"

# Tokenize
tokens = enc.encode(text)
print(tokens, len(tokens))

# Decode
decoded = enc.decode(tokens)
print(decoded)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[82681, 0, 220, 57668, 53901, 0, 15272, 101, 88344, 79468, 31584, 97, 35470, 0, 22691, 0] 16
Bonjour! 你好! नमस्ते! Hello!


In [15]:
# Load json and give the metrics
import json
filename = "/home/akhil/shubham/dcase-2023-baseline/outputs/exp_019_beats_as_ft_model_finetune_custom_bart_custom_vocab_out/metrics_coco_beam.json"
# Load JSON file
with open(filename, 'r') as file:
    data = json.load(file)

# Now `data` is a Python dict (or list, depending on the JSON structure)
# print(data)
for metric in data.keys():
    print(metric, data[metric]['score'])

bleu_1 0.5942677548988798
bleu_2 0.4041782016978456
bleu_3 0.274992113493107
bleu_4 0.17724958734160204
meteor 0.17671923646536505
rouge_l 0.3892016210670046
cider 0.42100996803087415
spice 0.12091237446803768
spider 0.2709611712494559
spider_fl 0.2551646382463473


In [None]:
dict_keys(['bleu_1', 'bleu_2', 'bleu_3', 'bleu_4', 'meteor', 'rouge_l', 'cider', 'spice', 'spider', 'spider_fl'])