### Audio-to-Text (Speech Recognition) with a Pretrained Transformer Model ###

In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

from IPython.display import Audio

In [None]:
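# Uncomment to install the `soundfile` package if it is not already available
# (it is selected as the torchaudio audio backend in the next cell):
# !pip install soundfile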

In [None]:
# Select the "soundfile" I/O backend when this torchaudio build still exposes
# the global backend switch. ``set_audio_backend`` is deprecated in
# torchaudio 2.x (backends are chosen by a dispatcher instead), so guard the
# call rather than fail the cell on releases where it is missing.
if hasattr(torchaudio, "set_audio_backend"):
    torchaudio.set_audio_backend("soundfile")

In [None]:
# The tokenizer and the CTC model must come from the same checkpoint, so the
# identifier is spelled once and shared by both loaders.
_CHECKPOINT = "facebook/wav2vec2-base-960h"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)


In [None]:
def preprocess_audio(file_path, target_sample_rate: int = 16000) -> torch.Tensor:
    """Load an audio file and resample it to ``target_sample_rate``.

    Args:
        file_path: Audio source, passed unchanged to ``torchaudio.load``.
        target_sample_rate: Desired sample rate in Hz. Defaults to 16000,
            the rate this function previously hard-coded.

    Returns:
        The waveform tensor returned by ``torchaudio.load``, resampled when
        the file's native rate differs from ``target_sample_rate``. The
        channel layout is left as loaded (no down-mixing is done here).
    """
    waveform, sample_rate = torchaudio.load(file_path)

    # Resampling is skipped entirely when the file is already at the target
    # rate, so such input is returned exactly as loaded.
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
        waveform = resampler(waveform)

    return waveform


In [None]:
def transcribe(file_path):
    """Transcribe the speech in an audio file to text.

    The file is loaded and resampled by ``preprocess_audio``, fed through the
    module-level ``tokenizer`` and ``model``, and the most likely token at
    each output frame is decoded into a string.

    Args:
        file_path: Audio source, forwarded to ``preprocess_audio``.

    Returns:
        The decoded transcription produced by ``tokenizer.decode``.
    """
    waveform = preprocess_audio(file_path)

    # ``squeeze()`` below only collapses to a 1-D signal when there is a
    # single channel. For multi-channel (e.g. stereo) files, average the
    # channels first so the model receives one mono utterance instead of a
    # 2-D array.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0)

    # Convert the raw samples into the model's input tensor.
    input_values = tokenizer(waveform.squeeze().numpy(), return_tensors="pt").input_values

    # Inference only: disable autograd so no computation graph is retained,
    # which keeps memory usage down on long recordings.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: take the highest-scoring token id per frame and let
    # the tokenizer turn the id sequence of the first (only) batch item into
    # text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.decode(predicted_ids[0])

    return transcription


In [None]:
# Audio clip used for the demo; "Conference.wav" is an alternative sample.
file_path = "fables_01_01_aesop.mp3"  # alternative: "Conference.wav"
# Load the clip at its native sample rate. This copy is only used for the
# playback widget below; transcribe() loads the file again from file_path.
waveform, sample_rate = torchaudio.load(file_path)


# Build an IPython audio player for the loaded clip.
Audio(data=waveform, rate=sample_rate)


In [None]:
# Run the full pipeline (load, resample, model forward pass, decode) on the
# sample clip and show the resulting text.
transcription = transcribe(file_path)
print(transcription)