In [None]:
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import numpy as np

# The model and its processor are loaded from the same pretrained
# checkpoint, so the identifier is spelled out only once.
_CHECKPOINT = "facebook/wav2vec2-large-960h"
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)

# Read one audio file from disk at the 16 kHz rate used throughout this script
def load_and_preprocess_audio(audio_file):
    """Load ``audio_file`` with librosa, resampled to 16 kHz.

    Args:
        audio_file: Path (or other source accepted by ``librosa.load``)
            of the recording to read.

    Returns:
        The audio samples, i.e. the first element of the pair returned by
        ``librosa.load``. The sampling rate that librosa reports alongside
        them is discarded because it is fixed by the ``sr`` argument.
    """
    waveform, _ = librosa.load(audio_file, sr=16000)
    return waveform

# Turn a collection of audio files into one padded model-input batch
def prepare_batch(audio_files):
    """Load every file and run the module-level ``processor`` over them.

    Args:
        audio_files: Iterable of audio file paths, each passed to
            ``load_and_preprocess_audio``.

    Returns:
        The ``input_values`` entry of the processor output, requested as
        PyTorch tensors (``return_tensors="pt"``) with padding enabled.
    """
    waveforms = list(map(load_and_preprocess_audio, audio_files))

    # The sampling rate given here matches the 16 kHz used when loading.
    encoded = processor(
        waveforms,
        sampling_rate=16000,
        padding=True,
        return_tensors="pt",
    )

    # NOTE(review): only input_values is handed back; anything else the
    # processor may produce (e.g. an attention mask) is dropped -- confirm
    # that this is appropriate for the checkpoint in use.
    return encoded.input_values

# Function to perform batch transcription
def transcribe_batch(audio_files):
    """Transcribe several audio files with one batched forward pass.

    Args:
        audio_files: Iterable of audio file paths, forwarded to
            ``prepare_batch``.

    Returns:
        A list holding one decoded transcription per input file, in input
        order. An empty input yields an empty list.
    """
    # Materialise once so one-shot iterables work and emptiness can be checked.
    audio_files = list(audio_files)
    if not audio_files:
        # Nothing to transcribe; do not hand the processor an empty batch.
        return []

    # Prepare the batch
    input_values = prepare_batch(audio_files)

    # Keep the inputs on the same device as the model so the call still works
    # after the model has been moved (a no-op when both are already together).
    input_values = input_values.to(model.device)

    # Run batch through model; gradients are not needed for inference
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: take the highest-scoring token id at every position
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the IDs to text using the processor
    return processor.batch_decode(predicted_ids)

# Example inputs -- swap in real recording paths before running.
audio_files = [
    "Speaker26_000.wav",
    "Speaker26_001.wav",
    "Speaker26_002.wav",
]

# Transcribe all of the files together as one batch.
batch_transcriptions = transcribe_batch(audio_files)

# Report each transcription, numbered from 1 in input order.
for idx, transcription in enumerate(batch_transcriptions):
    print(f"Transcription for call {idx + 1}: {transcription}")


Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

Transcription for call 1: SECTION ZERO OF ESOP'S FABLES A NEW REVISED VERSION BY ESOP THIS LABER BOX RECORDING IS IN THE PUBLIC DOMAIN PREFACE THE FOLLOWING ARE SUM OF ESOP'S BEST LOVED FABLES THE GOOSE WITH THE GOLDEN EGGS A CERTAIN MAN HAD THE GOOD FORTUNE TO POSSESS A GOOSE THAT LAID HIM A GOLDEN EGG EVERY DAY BUT DISSATISFIED WITH SO SLOW AN INCOME AND THINKING TO SEIZE THE WHOLE TREASURE AT ONCE HE KILLED THE GOOSE AND CUTTING HER OPEN FOUND HER JUST WHAT ANY OTHER GOOSE WOULD BE MUCH ONCE MORE AND LOSES ALL THE TOWN MOUSE AND THE COUNTRY MOUSE A COUNTRY MOUSE INVITED A TOWN MOUSE AND INTIMATE FRIEND TO PAY HIM A VISIT AND PARTAKE OF HIS COUNTRY FARE AS THEY WERE ON THE BARE PLOUGED LANDS EATING THEIR WHEAT STA
Transcription for call 2: UCKS AND ROOTS PULLED UP FROM THE HEDGEROW THE TOWN MOUSE SAID TO HIS FRIEND YOU LIVE HERE THE LIFE OF THE ANTS WHILE IN MY HOUSE IS THE HORN OF PLENTY I AM SURROUNDED WITH EVERY LUXURY AND IF YOU WILL COME WITH ME AS I MUCH WISH YOU WOULD YOU SHAL