# Libraries

In [21]:
import tensorflow as tf
import torchaudio
import librosa
from transformers import Wav2Vec2Processor, TFWav2Vec2ForCTC

In [16]:
def load_audio(file_path):
    """Read an audio file from disk with torchaudio.

    Args:
        file_path: Location of the audio file, passed straight to
            ``torchaudio.load``.

    Returns:
        A ``(waveform, sample_rate)`` tuple holding the two values produced by
        ``torchaudio.load``.
    """
    audio, rate = torchaudio.load(file_path)
    return (audio, rate)


In [22]:
def preprocess_audio(waveform, sample_rate, target_sr=16000):
    """Convert a loaded waveform into a mono TensorFlow tensor at ``target_sr``.

    Args:
        waveform: Object exposing ``.numpy()`` (for example the tensor returned
            by ``load_audio``). If the resulting array has more than one
            dimension it is downmixed with ``librosa.to_mono``.
        sample_rate: Sampling rate of ``waveform`` in Hz.
        target_sr: Sampling rate to resample to, in Hz. Defaults to 16000,
            which is the rate ``speech_to_text`` hands to the processor.

    Returns:
        The result of ``tf.convert_to_tensor`` applied to the mono, resampled
        samples.
    """
    samples = waveform.numpy()

    # Downmix before resampling so that only a single channel has to be
    # resampled; resampling every channel first and averaging afterwards does
    # the same job at a multiple of the cost for multi-channel input.
    if samples.ndim > 1:
        samples = librosa.to_mono(samples)

    samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=target_sr)

    return tf.convert_to_tensor(samples)

In [23]:
# (processor, model) pairs already loaded in this kernel, keyed by checkpoint
# name. Re-running this cell resets the cache.
_WAV2VEC2_CACHE = {}


def _load_wav2vec2(model_name):
    """Return the cached (processor, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _WAV2VEC2_CACHE:
        _WAV2VEC2_CACHE[model_name] = (
            Wav2Vec2Processor.from_pretrained(model_name),
            TFWav2Vec2ForCTC.from_pretrained(model_name),
        )
    return _WAV2VEC2_CACHE[model_name]


def speech_to_text(waveform, model_name="facebook/wav2vec2-base-960h", sampling_rate=16000):
    """Transcribe a single waveform with a Wav2Vec2 CTC checkpoint.

    The processor and model are loaded once per checkpoint name and reused on
    later calls instead of being re-created for every transcription.

    Args:
        waveform: Audio samples to transcribe (for example the tensor returned
            by ``preprocess_audio``).
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the processor and the model.
        sampling_rate: Sampling rate of ``waveform`` in Hz, forwarded to the
            processor. Defaults to 16000.

    Returns:
        The first entry of ``processor.batch_decode`` for the predicted ids,
        i.e. the transcription of ``waveform``.
    """
    processor, model = _load_wav2vec2(model_name)

    inputs = processor(waveform, sampling_rate=sampling_rate, return_tensors="tf", padding=True)
    logits = model(inputs.input_values).logits

    # Pick the highest-scoring token id along the last axis of the logits and
    # let the processor turn the ids back into text.
    predicted_ids = tf.argmax(logits, axis=-1)

    # Only one waveform was submitted, so only the first decoded entry is used.
    return processor.batch_decode(predicted_ids)[0]

In [24]:
def process_audio_file(file_path):
    """Run the full pipeline on one audio file: load, preprocess, transcribe.

    Args:
        file_path: Location of the audio file to transcribe.

    Returns:
        Whatever ``speech_to_text`` returns for the preprocessed audio.
    """
    raw_waveform, native_rate = load_audio(file_path)
    model_ready = preprocess_audio(raw_waveform, native_rate)
    return speech_to_text(model_ready)

In [31]:
# Transcribe one recording end to end and print the recognised text.
file_path = "/content/Recording (7).m4a"  # absolute path to the input recording
transcription = process_audio_file(file_path)
print(f"Transcription: {transcription}")


TFWav2Vec2ForCTC has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tune this model, you need a GPU or a TPU
All PyTorch model weights were used when initializing TFWav2Vec2ForCTC.

Some weights or buffers of the TF 2.0 model TFWav2Vec2ForCTC were not initialized from the PyTorch model and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Transcription: THIS IS AGAM PATEL SPEAKING
