In [None]:
!pip install librosa
!pip install datasets
!pip install soundfile
!pip install transformers
!pip install huggingface_hub

In [49]:
# Standard Library Imports
import warnings

# audio files
import librosa
from IPython.display import Audio as IPythonAudio

# PyTorch
import torch

## Hugging Face Transformers
from transformers import pipeline
from transformers.utils import logging

# Datasets to load
from datasets import load_dataset, load_from_disk, Audio

# Hugging Face Hub
from huggingface_hub import login

## Google Colab
from google.colab import userdata

# Configuration
logging.set_verbosity_error()
warnings.filterwarnings('ignore')

### Loading audio dataset

In [None]:
# This dataset is a collection of different sounds of 5 seconds
dataset = load_dataset("ashraq/esc50",
                       split="train[0:10]")

In [4]:
# review sample data observation
audio_sample = dataset[0:5]
audio_sample

{'filename': ['1-100032-A-0.wav',
  '1-100038-A-14.wav',
  '1-100210-A-36.wav',
  '1-100210-B-36.wav',
  '1-101296-A-19.wav'],
 'fold': [1, 1, 1, 1, 1],
 'target': [0, 14, 36, 36, 19],
 'category': ['dog',
  'chirping_birds',
  'vacuum_cleaner',
  'vacuum_cleaner',
  'thunderstorm'],
 'esc10': [True, False, False, False, False],
 'src_file': [100032, 100038, 100210, 100210, 101296],
 'take': ['A', 'A', 'A', 'B', 'A'],
 'audio': [{'path': None,
   'array': array([0., 0., 0., ..., 0., 0., 0.]),
   'sampling_rate': 44100},
  {'path': None,
   'array': array([-0.01184082, -0.10336304, -0.14141846, ...,  0.06985474,
           0.04049683,  0.00274658]),
   'sampling_rate': 44100},
  {'path': None,
   'array': array([-0.00695801, -0.01251221, -0.01126099, ...,  0.215271  ,
          -0.00875854, -0.28903198]),
   'sampling_rate': 44100},
  {'path': None,
   'array': array([0.53897095, 0.39627075, 0.26739502, ..., 0.09729004, 0.11227417,
          0.07983398]),
   'sampling_rate': 44100},
  {

In [12]:
# Print the keys of the dictionary
# print(audio_sample.keys())

# Print values for each key to explore the structure
# for key in audio_sample.keys():
#     print(f"Key: {key}")
#     print(f"Value: {audio_sample[key]}")

# Display audio files
for audio_info in audio_sample['audio']:
        audio_array = audio_info['array']
        sampling_rate = audio_info['sampling_rate']
#        print(f"Sampling Rate: {sampling_rate}")
        display(IPythonAudio(audio_array, rate=sampling_rate))

Output hidden; open in https://colab.research.google.com to view.

In [None]:
zero_shot_classifier = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused")

In [14]:
# Model Sampling Rate
zero_shot_classifier.feature_extractor.sampling_rate

48000

In [21]:
# Input Sampling Rate
audio_sample['audio'][0]['sampling_rate']

44100

### Set the correct sampling rate for the input and the model.

In [23]:
dataset = dataset.cast_column(
    "audio",
     Audio(sampling_rate=48_000))

In [26]:
dataset['audio'][0]['sampling_rate']

48000

In [59]:
# setup candidate labels
candidate_labels = ["Sound of a child crying",
                    "Sound of vacuum cleaner",
                    "Sound of a bird singing",
                    "Sound of an airplane"]

zero_shot_classifier(audio_sample["audio"][0]["array"],
                     candidate_labels=candidate_labels)


[{'score': 0.6457511782646179, 'label': 'Sound of a bird singing'},
 {'score': 0.15903766453266144, 'label': 'Sound of vacuum cleaner'},
 {'score': 0.1563412994146347, 'label': 'Sound of an airplane'},
 {'score': 0.038869865238666534, 'label': 'Sound of a child crying'}]