# **CLAP implementation**
Please refer to the respective sections in the book for further details.


## **Step 1: Installing Libraries & Data Loading**

In [None]:
# Mount Google Drive so files stored there are reachable from the Colab runtime.
import os

from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the notebook folder on the mounted Drive so that relative paths
# (such as the audio file loaded later in this notebook) resolve against it.
NOTEBOOK_DIR = "/content/drive/My Drive/Colab Notebooks"
os.chdir(NOTEBOOK_DIR)

In [None]:
!pip install transformers -U

Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.37.2


In [None]:
import librosa
import torch
from transformers import AutoProcessor, ClapModel

In [None]:
# Load the clip resampled to 48 kHz; the same rate is passed to the processor
# later in this notebook. The second return value (the sampling rate) is discarded.
wav_file_path = 'cow_moo.wav'
loaded_audio, _ = librosa.load(path=wav_file_path, sr=48_000)

## **Step 2: Model Inference**

In [None]:
# The model and its processor are both created from the same pretrained
# checkpoint identifier, so keep that identifier in one place.
clap_checkpoint = "laion/clap-htsat-unfused"
clap_model = ClapModel.from_pretrained(clap_checkpoint)
clap_processor = AutoProcessor.from_pretrained(clap_checkpoint)

In [None]:
# Candidate text descriptions; the audio clip is scored against each of them.
audio_descriptions = [
    "Sound of a cow",
    "Sound of a human",
    "Sound of a cat",
]

In [None]:
# Prepare the text descriptions and the audio clip together in one processor
# call; return_tensors="pt" requests PyTorch tensors, and the sampling rate
# matches the one used when the audio was loaded.
processed_inputs = clap_processor(
    text=audio_descriptions,
    audios=loaded_audio,
    sampling_rate=48000,
    return_tensors="pt",
    padding=True,
)

In [None]:
# Inference only: run the forward pass with autograd disabled so no computation
# graph is recorded or kept in memory for gradients that are never used.
with torch.no_grad():
    model_predictions = clap_model(**processed_inputs)

# Audio-to-text similarity logits; softmax over the last dimension normalizes
# them into probabilities across the text descriptions.
similarity_scores = model_predictions.logits_per_audio
probability_scores = similarity_scores.softmax(dim=-1)

In [None]:
# Take the first (here, the only) audio clip's row of scores and convert it to
# a NumPy array for printing.
probabilities = probability_scores[0].detach().numpy()
for audio_description, probability in zip(audio_descriptions, probabilities):
    # Report each description next to its probability as a whole-number percentage.
    print(f"{audio_description} - {probability*100:.0f}%")

Sound of a cow - 99%
Sound of a human - 1%
Sound of a cat - 0%
