#### Speech Command Recognition: An Example Audio Application

In [None]:
import torch
# import torchaudio
import soundfile as sf
import librosa
import librosa.display

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

import numpy as np
import matplotlib.pyplot as plt

from IPython.display import Audio

In [None]:
# Path to the example audio clip; later cells refer to it by this name.
audio_path = 'yes.mp3'

In [None]:
# Read the audio clip directly with soundfile. The path comes from the
# `audio_path` variable defined above, so the file name lives in one place.
waveform, sample_rate = sf.read(audio_path)
print("Loaded waveform shape:", waveform.shape)

In [None]:
# Play the clip inline. The path is passed as `filename` so IPython embeds the
# encoded file as-is; `rate` is only needed when raw sample arrays are passed
# through `data`, so it is omitted here.
Audio(filename=audio_path)

In [None]:
# Pretrained Wav2Vec 2.0 checkpoint: the processor prepares raw audio for the
# model and decodes predicted token ids back into text.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

# eval() returns the module itself, so loading the weights and switching the
# model to evaluation mode are chained into a single statement.
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h").eval()

# Helper: read an audio file with soundfile and normalise it for the model
def load_audio_with_soundfile(audio_path, target_sr=16000):
    """Read an audio file and return a mono signal at ``target_sr``.

    Args:
        audio_path: File to read; handed unchanged to ``sf.read``.
        target_sr: Sample rate the returned signal should have
            (defaults to 16000, the rate used for Wav2Vec2 in this notebook).

    Returns:
        A ``(samples, sample_rate)`` tuple. ``sample_rate`` is the file's own
        rate when it already equals ``target_sr``, otherwise ``target_sr``.
    """
    samples, native_sr = sf.read(audio_path)

    # Multi-channel audio arrives as a 2-D array; it is transposed before
    # being handed to librosa.to_mono for down-mixing.
    if samples.ndim > 1:
        samples = librosa.to_mono(samples.T)

    # Nothing more to do when the file is already at the requested rate.
    if native_sr == target_sr:
        return samples, native_sr

    resampled = librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr)
    return resampled, target_sr

# Visualisation helper: draw the Mel spectrogram of a signal
def plot_mel_spectrogram(waveform, sr):
    """Show a dB-scaled Mel spectrogram (128 Mel bands) of ``waveform``.

    Args:
        waveform: Audio samples to analyse.
        sr: Sample rate of ``waveform``; used for both the Mel transform and
            the time axis of the plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Mel power spectrogram, then decibels relative to its maximum value.
    power_spec = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128)
    spec_db = librosa.power_to_db(power_spec, ref=np.max)

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')

    # Axis labels and title for the spectrogram axes.
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Mel)')
    plt.title('Mel Spectrogram')
    plt.show()

# Entry point: plot an audio file's Mel spectrogram and transcribe its speech
def transcribe_and_visualize(audio_path):
    """Plot the Mel spectrogram of an audio file and return its transcription.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The text decoded by the module-level ``processor`` from the
        module-level ``model``'s predictions for the audio.
    """
    # Load as mono at 16 kHz, the rate passed on to the processor below.
    signal, rate = load_audio_with_soundfile(audio_path, target_sr=16000)
    plot_mel_spectrogram(signal, rate)

    # Turn the raw samples into the tensor input the model consumes.
    features = processor(signal, sampling_rate=rate, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        scores = model(features.input_values).logits

    # Take the highest-scoring token at every position, then decode the
    # first (and only) sequence in the batch into text.
    best_ids = scores.argmax(dim=-1)
    return processor.decode(best_ids[0])

In [None]:
# Run the pipeline on an example speech-command clip: the call below shows the
# clip's Mel spectrogram and returns the transcription, which is then printed.
audio_path = 'yes.mp3'
transcription = transcribe_and_visualize(audio_path)
print(f"Transcription: {transcription}")