In [1]:
from datasets import load_dataset

# Load the "train" split of the "ashraq/esc50" dataset from the Hugging Face
# Hub (downloaded on first use, as the progress output below shows).
data = load_dataset("ashraq/esc50", split="train")
# Bare expression: the notebook displays the dataset summary (features, row count).
data

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 345/345 [00:00<00:00, 279kB/s]
Repo card metadata block was not found. Setting CardData to empty.
Downloading metadata: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.61k/1.61k [00:00<00:00, 4.44MB/s]
Downloading data files:   0%|                                                                                                                                        | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|                                                                                                                                         | 0.00/387M [00:00<?, ?B/s][A
Downloading data:   1%|█▍                                                                                                       

Dataset({
    features: ['filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take', 'audio'],
    num_rows: 2000
})

In [2]:
# Grab the whole "audio" column. Per the output below, each entry is a dict
# with 'path', 'array' (the waveform) and 'sampling_rate' keys.
# NOTE: `audios` is rebound to a NumPy array of waveforms in the next cell.
audios = data["audio"]
# Show only the first three entries to keep the cell output short.
audios[:3]

[{'path': None,
  'array': array([0., 0., 0., ..., 0., 0., 0.]),
  'sampling_rate': 44100},
 {'path': None,
  'array': array([-0.01184082, -0.10336304, -0.14141846, ...,  0.06985474,
          0.04049683,  0.00274658]),
  'sampling_rate': 44100},
 {'path': None,
  'array': array([-0.00695801, -0.01251221, -0.01126099, ...,  0.215271  ,
         -0.00875854, -0.28903198]),
  'sampling_rate': 44100}]

In [3]:
import numpy as np

# Keep just the raw waveform of every clip (dropping path / sampling-rate
# info) and gather the waveforms into a single NumPy array.
audios = np.array(
    [clip["array"] for clip in data["audio"]]
)

In [5]:
from panns_inference import AudioTagging

# Load the default model and request the GPU ('cuda').
# checkpoint_path=None leaves the checkpoint choice to the library; the log
# below shows it fetching Cnn14_mAP=0.431.pth into ~/panns_data.
# NOTE(review): the recorded output of this cell ends with "Using CPU.", so the
# 'cuda' request was apparently not honoured on that machine — confirm a GPU is
# available if inference speed matters.
model = AudioTagging(checkpoint_path=None, device='cuda')

--2023-08-03 05:05:46--  http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 2607:f8b0:4006:809::2010, 2607:f8b0:4006:821::2010, 2607:f8b0:4006:806::2010, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|2607:f8b0:4006:809::2010|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14675 (14K) [application/octet-stream]
Saving to: '/Users/sayalidalvi/panns_data/class_labels_indices.csv'

     0K .......... ....                                       100%  545K=0.03s

2023-08-03 05:05:46 (545 KB/s) - '/Users/sayalidalvi/panns_data/class_labels_indices.csv' saved [14675/14675]

--2023-08-03 05:05:46--  https://zenodo.org/record/3987831/files/Cnn14_mAP%3D0.431.pth?download=1
Resolving zenodo.org (zenodo.org)... 

Checkpoint path: /Users/sayalidalvi/panns_data/Cnn14_mAP=0.431.pth


188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 327428481 (312M) [application/octet-stream]
Saving to: '/Users/sayalidalvi/panns_data/Cnn14_mAP=0.431.pth'

     0K .......... .......... .......... .......... ..........  0%  393K 13m33s
    50K .......... .......... .......... .......... ..........  0%  401K 13m25s
   100K .......... .......... .......... .......... ..........  0%  396K 13m26s
   150K .......... .......... .......... .......... ..........  0% 87.0M 10m5s
   200K .......... .......... .......... .......... ..........  0%  396K 10m45s
   250K .......... .......... .......... .......... ..........  0%  396K 11m12s
   300K .......... .......... .......... .......... ..........  0%  402K 11m30s
   350K .......... .......... .......... .......... ..........  0%  396K 11m44s
   400K .......... .......... .......... .......... ..........  0%  119M 10m26s
   450K .......... .......... .

Using CPU.


In [8]:
import os

import pinecone

index_name = "audio-search-demo"

# SECURITY: never commit credentials to a notebook. The API key is read from
# the environment instead of being hard-coded; any key that was previously
# pasted into this cell should be treated as leaked and revoked.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )

pinecone.init(
    api_key=api_key,
    # the environment can be overridden; the default matches the original demo
    environment=os.environ.get("PINECONE_ENVIRONMENT", "asia-southeast1-gcp-free"),
)

# check if the audio-search index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        # must match the width of the embeddings stored in the index
        # (the query embedding later in this notebook has shape (1, 2048))
        dimension=2048,
        metric="cosine"
    )

# connect to audio-search index we created
index = pinecone.Index(index_name)

In [9]:
from tqdm.auto import tqdm

# embed and upload the clips in chunks of 64
batch_size = 64

for start in tqdm(range(0, len(audios), batch_size)):
    # the last chunk may be shorter than batch_size, so clamp the end offset
    stop = min(start + batch_size, len(audios))
    # run the model on this slice; only the embeddings (second output) are kept
    _, emb = model.inference(audios[start:stop])
    # pair every embedding with its dataset row number (as a string ID)
    records = [
        (str(row), vector)
        for row, vector in zip(range(start, stop), emb.tolist())
    ]
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=records)

# check that we have all vectors in index
index.describe_index_stats()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [03:34<00:00,  6.69s/it]


{'dimension': 2048,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2000}},
 'total_vector_count': 2000}

In [10]:
from IPython.display import Audio, display

# we set an audio number to select from the dataset
audio_num = 400
# look the row up once and reuse it instead of indexing the dataset twice
query_row = data[audio_num]
# get the audio data of the audio number
query_audio = query_row["audio"]["array"]
# get the category of the audio number
category = query_row["category"]
# print the category and play the audio
print("Query Audio:", category)
# play at the clip's own sampling rate rather than a hard-coded 44100
Audio(query_audio, rate=query_row["audio"]["sampling_rate"])

Query Audio: car_horn


In [11]:
# add a leading batch axis so the single clip has shape (1, n_samples).
# np.atleast_2d leaves an already-2D array unchanged, so re-running this cell
# does not keep stacking extra dimensions the way `query_audio[None, :]` would.
query_audio = np.atleast_2d(query_audio)
# get the embeddings for the audio from the model (first output is unused)
_, xq = model.inference(query_audio)
xq.shape

(1, 2048)

In [12]:
# Query the Pinecone index with the query-audio embedding and ask for the
# three closest vectors. In the recorded output the top hit is the query clip
# itself (id '400', score 1.0), since its embedding was upserted earlier.
results = index.query(xq.tolist(), top_k=3)
# Bare expression: display the raw response (ids and similarity scores).
results

{'matches': [{'id': '400', 'score': 1.0, 'values': []},
             {'id': '1667', 'score': 0.842124581, 'values': []},
             {'id': '1666', 'score': 0.831768692, 'values': []}],
 'namespace': ''}

In [13]:
# play the top 3 similar audios
for match in results["matches"]:
    # select the audio entry from the dataset using the id as an index
    clip = data[int(match["id"])]["audio"]
    # play each clip at its own sampling rate instead of assuming 44100
    display(Audio(clip["array"], rate=clip["sampling_rate"]))

In [14]:
def find_similar_audios(id, top_k=5):
    """Play a dataset clip followed by its nearest neighbours in the index.

    Args:
        id: Integer row index into ``data``. Its string form is used as the
            vector ID when querying the Pinecone index.
        top_k: Number of matches to request from the index (default 5,
            which is what this function always used before).

    Returns:
        None. The clips are rendered as notebook audio widgets.

    Note:
        The query is issued by vector ID, so the query clip itself is
        presumably among the returned matches (the embedding-based query
        earlier in this notebook returned the query clip as its top hit).
    """
    print("Query Audio:")
    # select the audio entry from the dataset using the id as an index
    query_clip = data[id]["audio"]
    # play the query audio at its own sampling rate (not a hard-coded 44100)
    display(Audio(query_clip["array"], rate=query_clip["sampling_rate"]))
    # query pinecone index with the query audio id
    result = index.query(id=str(id), top_k=top_k)
    print("Result:")
    # play the returned similar audios
    for match in result["matches"]:
        clip = data[int(match["id"])]["audio"]
        display(Audio(clip["array"], rate=clip["sampling_rate"]))

In [15]:
# Run the similarity demo for the clip at dataset index 1642.
find_similar_audios(1642)

Query Audio:


Result:
