In [23]:
import os

In [24]:
import librosa

In [25]:
import torch

In [26]:
import torchaudio


In [27]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [28]:
from jiwer import cer

In [29]:
# Function to load LibriSpeech data from local directory
def load_librispeech_data(main_directory, folder_names=("dev_clean", "test-clean", "test-other")):
    """Collect paired audio and transcription files for each split folder.

    For every folder in ``folder_names`` (looked up directly under
    ``main_directory``), each ``*.wav`` file that has a sibling ``.lab`` file
    with the same base name is recorded together with the stripped text of
    that ``.lab`` file. ``.wav`` files without a ``.lab`` sibling are skipped.

    Args:
        main_directory: Path of the directory that contains the split folders.
        folder_names: Names of the split folders to scan. Defaults to the
            three folder names this notebook evaluates.

    Returns:
        dict mapping each folder name to
        ``{"audio_files": [...], "transcriptions": [...]}``; the two lists are
        index-aligned and ordered by file name. A folder that is missing or
        has no usable pairs maps to two empty lists (a message is printed).
    """
    data = {}
    for folder_name in folder_names:
        folder_path = os.path.join(main_directory, folder_name)
        audio_files = []
        transcriptions = []

        if os.path.isdir(folder_path):
            # Sorted so the pairing order does not depend on the filesystem's listing order
            for file_name in sorted(os.listdir(folder_path)):
                if not file_name.endswith(".wav"):
                    continue
                audio_path = os.path.join(folder_path, file_name)
                # Swap only the extension: str.replace would also rewrite any
                # ".wav" that happens to appear in a directory name of the path.
                transcription_path = os.path.splitext(audio_path)[0] + ".lab"
                if not os.path.exists(transcription_path):
                    continue
                with open(transcription_path, "r", encoding="utf-8") as f:
                    transcription = f.read().strip()
                audio_files.append(audio_path)
                transcriptions.append(transcription)
        else:
            # A missing split is reported and treated like an empty one instead of crashing
            print(f"Directory not found: {folder_path}")

        if not audio_files:
            print(f"No audio files found in directory: {folder_name}")
        data[folder_name] = {"audio_files": audio_files, "transcriptions": transcriptions}
    return data

In [30]:
# Transcribe audio using Wav2Vec2 model
def transcribe_audio(waveform, sample_rate):
    """Return the decoded text for a single waveform.

    Relies on the notebook-level ``processor`` and ``model`` objects, which
    must exist before this function is called.

    Args:
        waveform: Audio samples handed to ``processor`` as-is.
        sample_rate: Sampling rate of ``waveform``; forwarded to ``processor``.

    Returns:
        The first string produced by ``processor.batch_decode`` for the
        per-frame arg-max token ids.
    """
    features = processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )
    # Inference only, so no gradient tracking is needed for the forward pass
    with torch.no_grad():
        output = model(features.input_values)
    # Greedy choice: most likely token id at every position along the last axis
    best_token_ids = output.logits.argmax(dim=-1)
    return processor.batch_decode(best_token_ids)[0]

In [31]:
# Load audio files and transcriptions from LibriSpeech dataset
# The dataset root can be overridden with the LIBRISPEECH_DIR environment
# variable; when it is unset the original machine-specific path is used.
main_directory = os.environ.get(
    "LIBRISPEECH_DIR",
    "C:/Users/tanya/OneDrive/Desktop/pytrch/myvenv/Librispeech",
)
librispeech_data = load_librispeech_data(main_directory)


In [32]:
# Fetch the pre-trained Wav2Vec2 CTC checkpoint and the processor that goes with it
model_name = "facebook/wav2vec2-large-960h"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

In [33]:
# Calculate CER for each directory
# NOTE: the value reported per directory is the mean of per-file CERs (every
# file weighted equally), not a corpus-level CER pooled over all characters.
cer_results = {}

# Load every file at the rate the feature extractor is configured for. Files
# already at that rate are read unchanged; others are resampled by librosa
# rather than being passed to the processor at a mismatching rate.
target_sample_rate = processor.feature_extractor.sampling_rate

for dir_name, data in librispeech_data.items():
    audio_files = data["audio_files"]
    transcriptions = data["transcriptions"]

    if not audio_files:
        print(f"No audio files found in directory: {dir_name}")
        cer_results[dir_name] = None
        continue

    total_cer = 0
    for audio_path, reference in zip(audio_files, transcriptions):
        waveform, sample_rate = librosa.load(audio_path, sr=target_sample_rate)
        transcription = transcribe_audio(waveform, sample_rate)

        total_cer += cer(reference, transcription)

    # audio_files is guaranteed non-empty here: empty directories were skipped above
    average_cer = total_cer / len(audio_files)
    cer_results[dir_name] = average_cer
    print(f"CER for {dir_name}: {average_cer}")

print(cer_results)

CER for dev_clean: 0.000480581759616936
CER for test-clean: 0.0015161220170302819
CER for test-other: 0.0020595485028878967
{'dev_clean': 0.000480581759616936, 'test-clean': 0.0015161220170302819, 'test-other': 0.0020595485028878967}
