In [1]:
import torch
from transformers import ASTFeatureExtractor, ASTModel
import torchaudio
import os

# -------------------------------------------------------------------------- #
# ------------------------------ Model Loading ----------------------------- #
# -------------------------------------------------------------------------- #

# Load the feature extractor and model
feature_extractor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")


def load_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    return waveform, sample_rate


def get_embeddings(file_path):
    # Load audio
    waveform, sample_rate = load_audio(file_path)
    
    # Resample to 16 kHz if not already
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    
    # Preprocess to match model input
    inputs = feature_extractor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    
    # Move model and inputs to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()  # Averaging across time dimension
        
    return embeddings.cpu()  # Move embeddings back to CPU if needed


folder_path = "/kaggle/input/testingembeddings/TestingEmbedding"

# Loop through the audio files and extract embeddings
embeddings_dict = {}
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    if file_path.endswith('.mp3') or file_path.endswith('.wav'):  # Check audio format
        embeddings = get_embeddings(file_path)
        embeddings_dict[file_name] = embeddings


print(embeddings_dict)

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

{'House (3).mp3': tensor([[ 0.7692, -0.7077,  0.5604,  ..., -0.2571, -0.4534,  0.5017],
        [ 0.7342, -0.6028,  0.6345,  ..., -0.2790, -0.4677,  0.6714]]), 'DeepHouse1 (1).mp3': tensor([[ 0.9456,  0.3324,  0.1903,  ..., -0.2377, -0.9160,  1.0630],
        [ 0.9730,  0.2654,  0.2404,  ..., -0.3548, -0.9983,  0.9740]]), 'House (1).mp3': tensor([[ 1.5623, -0.1902, -0.3728,  ..., -0.4106, -0.1305,  0.9027],
        [ 1.4673, -0.1913, -0.1964,  ..., -0.5215, -0.1411,  0.8944]]), 'AfroHouse1.mp3': tensor([[ 0.6066, -0.2492, -0.4886,  ..., -0.5231, -0.0573,  0.4633],
        [ 0.5589, -0.3098, -0.3416,  ..., -0.5105, -0.1697,  0.6014]]), 'AfroHouse2.mp3': tensor([[ 0.9118, -0.0214,  0.4618,  ..., -0.9112, -0.6064,  0.6468],
        [ 0.9019, -0.0609,  0.3931,  ..., -0.9399, -0.6030,  0.6325]]), 'House (2).mp3': tensor([[ 0.7855, -0.2513,  0.4842,  ..., -0.5188, -0.4897,  1.0515],
        [ 0.7929, -0.2414,  0.4718,  ..., -0.4597, -0.5172,  0.9927]]), 'MelodicHouse (3).mp3': tensor([[ 0.92

In [3]:
import torch

def find_most_similar_audio(embeddings_dict):
    # Convert dictionary of embeddings to lists for easy indexing
    file_names = list(embeddings_dict.keys())
    
    # Average embeddings along the first dimension to get shape [768] for each file
    embeddings = [embedding.mean(dim=0) for embedding in embeddings_dict.values()]
    
    # Store most similar files and similarity scores
    most_similar_files = {}
    
    # Loop through each file to find the most similar one
    for i in range(len(embeddings)):
        current_embedding = embeddings[i]  # Select the current embedding
        max_similarity = -1  # Initialize to minimum similarity
        most_similar_file = None
        
        # Compute similarity with all other files
        for j in range(len(embeddings)):
            if i != j:  # Skip comparison with itself
                # Compute cosine similarity for embeddings with shape [768]
                similarity = torch.cosine_similarity(current_embedding, embeddings[j], dim=0).item()
                
                # Update most similar file if similarity is higher
                if similarity > max_similarity:
                    max_similarity = similarity
                    most_similar_file = file_names[j]
        
        # Store result for the current file
        most_similar_files[file_names[i]] = (most_similar_file, max_similarity)
    
    # Display results
    for file, (similar_file, similarity) in most_similar_files.items():
        print(f"The most similar file to '{file}' is '{similar_file}' with a similarity score of {similarity:.4f}")

# Usage
find_most_similar_audio(embeddings_dict)

The most similar file to 'House (3).mp3' is 'MelodicHouse (1).mp3' with a similarity score of 0.9357
The most similar file to 'DeepHouse1 (1).mp3' is 'House (2).mp3' with a similarity score of 0.9402
The most similar file to 'House (1).mp3' is 'AfroHouse1.mp3' with a similarity score of 0.8859
The most similar file to 'AfroHouse1.mp3' is 'MelodicHouse (2).mp3' with a similarity score of 0.8984
The most similar file to 'AfroHouse2.mp3' is 'AfroHouse3.mp3' with a similarity score of 0.9989
The most similar file to 'House (2).mp3' is 'DeepHouse1 (3).mp3' with a similarity score of 0.9541
The most similar file to 'MelodicHouse (3).mp3' is 'AfroHouse3.mp3' with a similarity score of 0.9187
The most similar file to 'DeepHouse1 (3).mp3' is 'House (2).mp3' with a similarity score of 0.9541
The most similar file to 'DeepHouse1 (2).mp3' is 'AfroHouse1.mp3' with a similarity score of 0.8624
The most similar file to 'MelodicHouse (2).mp3' is 'AfroHouse1.mp3' with a similarity score of 0.8984
The m

{'House (3).mp3': tensor([[ 0.7692, -0.7077,  0.5604,  ..., -0.2571, -0.4534,  0.5017],
        [ 0.7342, -0.6028,  0.6345,  ..., -0.2790, -0.4677,  0.6714]]), 'DeepHouse1 (1).mp3': tensor([[ 0.9456,  0.3324,  0.1903,  ..., -0.2377, -0.9160,  1.0630],
        [ 0.9730,  0.2654,  0.2404,  ..., -0.3548, -0.9983,  0.9740]]), 'House (1).mp3': tensor([[ 1.5623, -0.1902, -0.3728,  ..., -0.4106, -0.1305,  0.9027],
        [ 1.4673, -0.1913, -0.1964,  ..., -0.5215, -0.1411,  0.8944]]), 'AfroHouse1.mp3': tensor([[ 0.6066, -0.2492, -0.4886,  ..., -0.5231, -0.0573,  0.4633],
        [ 0.5589, -0.3098, -0.3416,  ..., -0.5105, -0.1697,  0.6014]]), 'AfroHouse2.mp3': tensor([[ 0.9118, -0.0214,  0.4618,  ..., -0.9112, -0.6064,  0.6468],
        [ 0.9019, -0.0609,  0.3931,  ..., -0.9399, -0.6030,  0.6325]]), 'House (2).mp3': tensor([[ 0.7855, -0.2513,  0.4842,  ..., -0.5188, -0.4897,  1.0515],
        [ 0.7929, -0.2414,  0.4718,  ..., -0.4597, -0.5172,  0.9927]]), 'MelodicHouse (3).mp3': tensor([[ 0.92