In [44]:
import os
import librosa
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from jiwer import cer
# Load paired audio / transcript files of the NPTEL dataset from a local directory
def load_NPTEL_data(main_directory, folder_names=("vil",), encoding="utf-8"):
    """Collect matching ``.wav`` / ``.txt`` pairs from sub-folders of ``main_directory``.

    A ``.wav`` file is kept only when a ``.txt`` file with the same stem exists
    next to it; the stripped content of that text file is its transcription.

    Args:
        main_directory: Directory that contains the dataset sub-folders.
        folder_names: Names of the sub-folders to scan. Defaults to ``("vil",)``.
        encoding: Text encoding used to read the transcript files.

    Returns:
        dict mapping each folder name to
        ``{"audiofiles": [path, ...], "transcriptions": [text, ...]}``, where
        both lists have the same length and are ordered by file name.

    Raises:
        OSError: if a sub-folder cannot be listed (propagated from
            ``os.listdir``), e.g. when it does not exist.
    """
    data = {}
    for folder_name in folder_names:
        folder_path = os.path.join(main_directory, folder_name)
        audiofiles = []
        transcriptions = []

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # pair order identical across runs and platforms.
        for file_name in sorted(os.listdir(folder_path)):
            stem, extension = os.path.splitext(file_name)
            if extension.lower() != ".wav":
                continue
            audio_path = os.path.join(folder_path, file_name)
            # Swap only the final extension. A plain str.replace() on the full
            # path would also rewrite ".wav" inside directory names or stems.
            transcription_path = os.path.join(folder_path, stem + ".txt")
            if not os.path.isfile(transcription_path):
                continue
            # Explicit encoding: the platform default differs between systems.
            with open(transcription_path, "r", encoding=encoding) as f:
                transcription = f.read().strip()
            audiofiles.append(audio_path)
            transcriptions.append(transcription)
        if not audiofiles:
            print(f"No audio files found in directory: {folder_name}")
        data[folder_name] = {"audiofiles": audiofiles, "transcriptions": transcriptions}
    return data
# Transcribe a single waveform using the module-level Wav2Vec2 processor and model
def transcribe_audio(waveform, sample_rate):
    """Return the greedy CTC transcription of one audio waveform.

    Relies on the module-level ``processor`` and ``model`` objects, which must
    be defined before this function is called.

    Args:
        waveform: Audio samples, as returned by ``librosa.load``.
        sample_rate: Sampling rate of ``waveform`` in Hz.

    Returns:
        The decoded text of the first (only) item in the batch.
    """
    # The feature extractor only accepts audio at the rate it was configured
    # with, so resample here instead of letting the processor reject the input.
    target_rate = processor.feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_rate)
        sample_rate = target_rate

    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    # Greedy decoding: pick the highest-scoring token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]
# Load audio files and transcriptions from the NPTEL dataset.
# The NPTEL_DATA_DIR environment variable, when set, overrides the default
# location below so the notebook can run on machines with a different layout.
main_directory = os.environ.get(
    "NPTEL_DATA_DIR",
    "C:/Users/tanya/OneDrive/Desktop/pytrch/New folder/nptel-pure",
)
NPTEL_data = load_NPTEL_data(main_directory)
# Load the pre-trained Wav2Vec2 processor and CTC model
model_name = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)



Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

In [45]:
# Calculate the average per-utterance CER for each directory
cer_results = {}

# Have librosa resample every file to the rate the feature extractor was
# configured with; passing a file's native rate (sr=None) only works when the
# recordings already happen to use that rate.
target_sample_rate = processor.feature_extractor.sampling_rate

for dir_name, data in NPTEL_data.items():
    audiofiles = data["audiofiles"]
    transcriptions = data["transcriptions"]

    if not audiofiles:
        print(f"No audio files found in directory: {dir_name}")
        cer_results[dir_name] = None
        continue

    total_cer = 0
    scored_count = 0

    for audio_path, reference in zip(audiofiles, transcriptions):
        if not reference:
            # CER divides by the reference length, so an empty reference has no
            # defined score (jiwer raises on it). Skip it rather than abort the run.
            print(f"Skipping {audio_path}: empty reference transcription")
            continue

        waveform, sample_rate = librosa.load(audio_path, sr=target_sample_rate)
        transcription = transcribe_audio(waveform, sample_rate)

        # NOTE(review): reference and hypothesis are compared as-is; confirm
        # both use the same casing/punctuation or the CER will be inflated.
        total_cer += cer(reference, transcription)
        scored_count += 1

    # Average over the utterances that were actually scored.
    average_cer = total_cer / scored_count if scored_count else None
    cer_results[dir_name] = average_cer
    print(f"CER for {dir_name}: {average_cer}")

print(cer_results)

 

CER for vil: 0.2688981703466041
{'vil': 0.2688981703466041}
