In [None]:
# Install necessary packages
!pip install datasets transformers openai-whisper jiwer
!pip install soundfile
!pip install librosa
!pip install evaluate

from datasets import load_dataset
import numpy as np
import whisper
import jiwer
import evaluate
import torch

# Load the LibriSpeech dataset (using a small subset for demonstration)
dataset = load_dataset("librispeech_asr", "clean", split="test[:1%]", trust_remote_code=True)

# Load the pre-trained Whisper model
model = whisper.load_model("base")

# Function to transcribe audio using the Whisper model
def transcribe(batch):
    audio = batch["audio"]
    # Whisper model expects 16kHz audio
    audio_array = torch.tensor(audio["array"], dtype=torch.float32)
    result = model.transcribe(audio_array, fp16=False)
    batch["transcription"] = result["text"]
    return batch

# Apply transcription to the dataset
dataset = dataset.map(transcribe)

# Load the CER metric using evaluate
cer_metric = evaluate.load("cer", trust_remote_code=True)

# Compute CER
transcriptions = dataset["transcription"]
references = dataset["text"]
cer = cer_metric.compute(predictions=transcriptions, references=references)

print(f"CER: {cer:.4f}")


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
   ---------------------------------------- 0.0/84.1 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/84.1 kB ? eta -:--:--
   --------- ------------------------------ 20.5/84.1 kB 330.3 kB/s eta 0:00:01
   ------------------------ --------------- 51.2/84.1 kB 440.4 kB/s eta 0:00:01
   ---------------------------------------- 84.1/84.1 kB 527.8 kB/s eta 0:00:00
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2



Map: 100%|██████████| 26/26 [01:18<00:00,  3.03s/ examples]
Downloading builder script: 100%|██████████| 5.60k/5.60k [00:00<00:00, 5.54MB/s]


CER: 0.8285


In [4]:
pip install datasets


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194

In [2]:
pip install openai-whisper


Collecting openai-whisper
  Downloading openai-whisper-20231117.tar.gz (798 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->openai-whisper)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->openai-whisper)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.

In [3]:
pip install torch



In [4]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [10]:
pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.9.3


In [27]:
import os
import tarfile
import librosa
import whisper
import torch
from datasets import Dataset, load_metric
from tqdm import tqdm

# Function to extract the tar.gz file with error handling
def extract_tar(tar_file, extracted_folder):
    if not os.path.exists(extracted_folder):
        os.makedirs(extracted_folder)
        with tarfile.open(tar_file, "r:gz") as tar:
            for member in tqdm(tar.getmembers(), desc="Extracting files"):
                if member.name.endswith(".wav"):
                    continue  # Skip extracting audio files
                try:
                    tar.extract(member, path=extracted_folder)
                except tarfile.TarError:
                    print(f"Failed to extract {member.name}. Removing corrupted file.")
                    os.remove(os.path.join(extracted_folder, member.name))

# Extract the tar.gz file
tar_file = "nptel-pure-set.tar.gz"
extracted_folder = "nptel_pure_set"

extract_tar(tar_file, extracted_folder)

# Function to load transcriptions from the extracted files
def load_data(extracted_folder):
    data = {"text": []}
    audio_paths = []
    missing_files = []
    for root, dirs, files in os.walk(extracted_folder):
        for file in files:
            if file.endswith(".txt"):
                text_path = os.path.join(root, file)
                audio_path = os.path.splitext(text_path)[0] + ".wav"
                if os.path.exists(audio_path):  # Check if corresponding audio file exists
                    try:
                        with open(text_path, "r") as f:
                            transcription = f.read().strip()
                        data["text"].append(transcription)
                        audio_paths.append(audio_path)
                    except Exception as e:
                        print(f"Error reading {text_path}: {e}")
                else:
                    print(f"Audio file not found for {text_path}")
                    missing_files.append(text_path)
    return data, audio_paths, missing_files

# Load the data
data, audio_paths, missing_files = load_data(extracted_folder)

# Check loaded data and audio paths
print("Audio paths:")
print(audio_paths)
print("Data:")
print(data)

# Print missing files for further inspection
if missing_files:
    print("\nMissing audio files:")
    for missing_file in missing_files:
        print(missing_file)

# Load the pre-trained Whisper model
try:
    model = whisper.load_model("base")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    model = None

# Function to transcribe audio using the Whisper model with enhanced error handling and debugging
def transcribe_audio(audio_path):
    if model is None:
        print("Whisper model not loaded correctly. Skipping transcription.")
        return None

    try:
        audio, sr = librosa.load(audio_path, sr=16000)
        # Whisper model expects 16kHz audio
        audio_array = torch.tensor(audio, dtype=torch.float32)
        result = model.transcribe(audio_array.numpy(), fp16=False)
        if "text" in result:
            return result["text"]
        else:
            print(f"No transcription result for {audio_path}")
            return None
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return None

# Transcribe all audio files
transcriptions = []
failed_transcriptions = []
for audio_path in tqdm(audio_paths, desc="Transcribing audio"):
    transcription = transcribe_audio(audio_path)
    if transcription:  # Only add non-empty transcriptions
        transcriptions.append(transcription)
    else:
        failed_transcriptions.append(audio_path)

# Add transcriptions to the dataset if successful
if transcriptions:
    data["transcription"] = transcriptions
else:
    print("No transcriptions were successfully generated. Check logs for details.")

# Convert data to a Hugging Face dataset
dataset = Dataset.from_dict(data)

# Ensure references and predictions are not empty
if "text" in dataset.column_names:
    references = dataset["text"]
else:
    references = []

if "transcription" in dataset.column_names:
    predictions = dataset["transcription"]
else:
    predictions = []

print(f"Number of references: {len(references)}")
print(f"Number of predictions: {len(predictions)}")

if references and predictions:
    # Load the CER metric using evaluate
    cer_metric = load_metric("cer")

    # Compute CER
    cer = cer_metric.compute(predictions=predictions, references=references)
    print(f"CER: {cer:.4f}")
else:
    print("Either references or predictions list is empty. Cannot compute CER.")

# Print failed transcriptions for debugging
if failed_transcriptions:
    print("\nFailed to transcribe the following audio files:")
    for audio_path in failed_transcriptions:
        print(audio_path)


Audio file not found for nptel_pure_set/nptel-pure/corrected_txt/000a17c8ff1a9353b35fe2ceda654796af4e5f83807692ced2c8164f.txt
Audio file not found for nptel_pure_set/nptel-pure/corrected_txt/00023ef1cd73a67ad1c3a271858d3b3cdd847ce6bc2359c225d61175.txt
Audio file not found for nptel_pure_set/nptel-pure/corrected_txt/00054485e7592f089160e4cf66741e61e1be88668b4a7c9169c7f35d.txt
Audio file not found for nptel_pure_set/nptel-pure/corrected_txt/000495bb6870672d71b5073ec00bb34396a25834c46ef6f35fc6e272.txt
Audio file not found for nptel_pure_set/nptel-pure/corrected_txt/00062eed7cb5d3c4556b36016b6fb4ef0c204c52783b4b2dfee0500f.txt
Audio file not found for nptel_pure_set/nptel-pure/corrected_txt/00073f4070e0e71cbb1ed0e55b828a428c3840ac1f10287e87909aad.txt
Audio file not found for nptel_pure_set/nptel-pure/corrected_txt/0000eedda73b90b0097ebe60a5953291bfd003ea54f7f69bcd0d4b94.txt
Audio file not found for nptel_pure_set/nptel-pure/corrected_txt/00062955956c5b6d01e0f00c2a62b43ccc4fb0bd03b1a310f0426

Transcribing audio: 0it [00:00, ?it/s]

No transcriptions were successfully generated. Check logs for details.
Number of references: 0
Number of predictions: 0
Either references or predictions list is empty. Cannot compute CER.





In [61]:
import os
import tarfile
import librosa
import whisper
import torch
from datasets import Dataset, load_metric
from tqdm import tqdm

# Function to extract the tar.gz file with error handling
def extract_tar(tar_file, extracted_folder):
    if not os.path.exists(extracted_folder):
        os.makedirs(extracted_folder)
        with tarfile.open(tar_file, "r:gz") as tar:
            for member in tqdm(tar.getmembers(), desc="Extracting files"):
                if member.name.endswith(".wav"):
                    tar.extract(member, path=extracted_folder)
                else:
                    continue  # Skip extracting non-audio files

# Extract the tar.gz file
tar_file = "nptel-pure-set.tar.gz"
extracted_folder = "/content/nptel_pure_set"

extract_tar(tar_file, extracted_folder)

# Function to load transcriptions from the extracted files
def load_data(extracted_folder):
    data = {"text": [], "metadata": [], "transcription": []}
    audio_paths = []
    missing_files = []

    # Path to the wav folder
    wav_folder = os.path.join(extracted_folder, "nptel-pure", "wav")

    if os.path.exists(wav_folder):
        for root, _, files in os.walk(wav_folder):
            for file in files:
                if file.endswith(".wav"):
                    audio_path = os.path.join(root, file)
                    audio_paths.append(audio_path)
                    # Assuming corresponding corrected_txt file exists in the same structure
                    text_path = os.path.splitext(audio_path)[0] + ".txt"
                    metadata_path = os.path.splitext(audio_path)[0] + ".json"
                    if os.path.exists(text_path):
                        try:
                            with open(text_path, "r") as f:
                                transcription = f.read().strip()
                            data["text"].append(transcription)
                        except Exception as e:
                            print(f"Error reading {text_path}: {e}")
                    else:
                        print(f"Text file not found for {audio_path}")
                        missing_files.append(text_path)

                    if os.path.exists(metadata_path):
                        try:
                            with open(metadata_path, "r") as f:
                                metadata = f.read().strip()
                            data["metadata"].append(metadata)
                        except Exception as e:
                            print(f"Error reading {metadata_path}: {e}")
                    else:
                        print(f"Metadata file not found for {audio_path}")
                        missing_files.append(metadata_path)

    return data, audio_paths, missing_files

# Load the data
data, audio_paths, missing_files = load_data(extracted_folder)

# Check loaded data and audio paths
print("Audio paths:")
print(audio_paths)
print("Data:")
print(data)

# Print missing files for further inspection
if missing_files:
    print("\nMissing text/metadata files:")
    for missing_file in missing_files:
        print(missing_file)

# Load the pre-trained Whisper model with error handling
try:
    model = whisper.load_model("base")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    model = None

# Function to transcribe audio using the Whisper model with enhanced error handling
def transcribe_audio(audio_path):
    if model is None:
        print("Whisper model not loaded correctly. Skipping transcription.")
        return None

    try:
        audio, sr = librosa.load(audio_path, sr=16000)
        # Whisper model expects 16kHz audio
        audio_tensor = torch.tensor(audio, dtype=torch.float32)
        result = model.transcribe(audio_tensor.numpy(), fp16=False)
        if "text" in result:
            return result["text"]
        else:
            print(f"No transcription result for {audio_path}")
            return None
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return None

# Transcribe all audio files
transcriptions = []
failed_transcriptions = []
for audio_path in tqdm(audio_paths, desc="Transcribing audio"):
    transcription = transcribe_audio(audio_path)
    if transcription:
        transcriptions.append(transcription)
    else:
        failed_transcriptions.append(audio_path)

# Add transcriptions to the dataset if successful
if transcriptions:
    data["transcription"] = transcriptions
else:
    print("No transcriptions were successfully generated. Check logs for details.")

# Convert data to a Hugging Face dataset
dataset = Dataset.from_dict(data)

# Ensure references and predictions are not empty
if "text" in dataset.column_names and "transcription" in dataset.column_names:
    references = dataset["text"]
    predictions = dataset["transcription"]

    # Compute CER
    cer_metric = load_metric("cer")
    cer = cer_metric.compute(predictions=predictions, references=references)
    print(f"CER: {cer:.4f}")

    # Print failed transcriptions for debugging
    if failed_transcriptions:
        print("\nFailed to transcribe the following audio files:")
        for audio_path in failed_transcriptions:
            print(audio_path)
else:
    print("Either references or predictions list is empty. Cannot compute CER.")


Text file not found for /content/nptel_pure_set/nptel-pure/wav/00000682f31904acc560fa359512e7bdd487b11efe36145a56874e30.wav
Metadata file not found for /content/nptel_pure_set/nptel-pure/wav/00000682f31904acc560fa359512e7bdd487b11efe36145a56874e30.wav
Text file not found for /content/nptel_pure_set/nptel-pure/wav/0000a7be825f70cbe4c49acf9a8b7804d05a8a4701a2b42a343a694e.wav
Metadata file not found for /content/nptel_pure_set/nptel-pure/wav/0000a7be825f70cbe4c49acf9a8b7804d05a8a4701a2b42a343a694e.wav
Text file not found for /content/nptel_pure_set/nptel-pure/wav/00008c385446154c62236f9420099d05f6fb9ad79e0953dff3b9ca69.wav
Metadata file not found for /content/nptel_pure_set/nptel-pure/wav/00008c385446154c62236f9420099d05f6fb9ad79e0953dff3b9ca69.wav
Text file not found for /content/nptel_pure_set/nptel-pure/wav/0000724068ed76075c94a0306aa98bbdfb73d31e87da10fb91f9b1fa.wav
Metadata file not found for /content/nptel_pure_set/nptel-pure/wav/0000724068ed76075c94a0306aa98bbdfb73d31e87da10fb91f9b

Transcribing audio:  16%|█▌        | 16/102 [01:50<09:54,  6.91s/it]


KeyboardInterrupt: 

In [33]:
cat nptel-test.tar.gz.part*>nptel-test.tar.gz

cat: 'nptel-test.tar.gz.part*': No such file or directory


In [32]:
cat nptel-train.tar.gz.part*>nptel-train.tar.gz

cat: 'nptel-train.tar.gz.part*': No such file or directory


In [31]:
cat nptel-valid.tar.gz.part* > nptel-valid.tar.gz


cat: 'nptel-valid.tar.gz.part*': No such file or directory


In [71]:
import os
import tarfile
import json
import librosa
import whisper
import torch
from datasets import Dataset, load_metric
from tqdm import tqdm

# Function to extract the tar.gz file with error handling
def extract_tar(tar_file, extracted_folder):
    if not os.path.exists(extracted_folder):
        os.makedirs(extracted_folder)
        with tarfile.open(tar_file, "r:gz") as tar:
            for member in tqdm(tar.getmembers(), desc="Extracting files"):
                if member.name.endswith(".wav"):
                    tar.extract(member, path=extracted_folder)
                else:
                    continue  # Skip extracting non-audio files

# Extract the tar.gz file
tar_file = "nptel-pure-set.tar.gz"
extracted_folder = "/content/nptel_pure_set"

extract_tar(tar_file, extracted_folder)

# Function to load transcriptions from the extracted files
def load_data(extracted_folder):
    data = {"text": [], "metadata": [], "transcription": []}
    audio_paths = []
    missing_files = []

    # Path to the wav folder
    wav_folder = os.path.join(extracted_folder, "nptel-pure", "wav")
    if os.path.exists(wav_folder):
        for root, _, files in os.walk(wav_folder):
            for file in files:
                if file.endswith(".wav"):
                    audio_path = os.path.join(root, file)
                    audio_paths.append(audio_path)

                    # Assuming corresponding text files exist in corrected_txt, original_txt, or metadata folders
                    filename = os.path.splitext(file)[0]
                    text_path_corrected = os.path.join(extracted_folder, "nptel-pure", "corrected_txt", f"{filename}.txt")
                    text_path_original = os.path.join(extracted_folder, "nptel-pure", "original_txt", f"{filename}.txt")
                    metadata_path = os.path.join(extracted_folder, "nptel-pure", "metadata", f"{filename}.json")

                    if os.path.exists(text_path_corrected):
                        try:
                            with open(text_path_corrected, "r") as f:
                                transcription = f.read().strip()
                            data["text"].append(transcription)
                        except Exception as e:
                            print(f"Error reading {text_path_corrected}: {e}")
                            missing_files.append(text_path_corrected)
                    elif os.path.exists(text_path_original):
                        try:
                            with open(text_path_original, "r") as f:
                                transcription = f.read().strip()
                            data["text"].append(transcription)
                        except Exception as e:
                            print(f"Error reading {text_path_original}: {e}")
                            missing_files.append(text_path_original)
                    if os.path.exists(metadata_path):
                        try:
                            with open(metadata_path, "r") as f:
                                metadata = json.load(f)
                            data["metadata"].append(metadata)
                        except Exception as e:
                            print(f"Error reading {metadata_path}: {e}")
                            missing_files.append(metadata_path)
                    else:
                        print(f"Text or metadata file not found for {audio_path}")
                        missing_files.append(text_path_corrected)

    return data, audio_paths, missing_files

# Load the data
data, audio_paths, missing_files = load_data(extracted_folder)

# Check loaded data and audio paths
print("Audio paths:")
print(audio_paths)
print("Data:")
print(data)

# Print missing files for further inspection
if missing_files:
    print("\nMissing text/metadata files:")
    for missing_file in missing_files:
        print(missing_file)

# Load the pre-trained Whisper model with error handling
try:
    model = whisper.load_model("base")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    model = None

# Function to transcribe audio using the Whisper model with enhanced error handling
def transcribe_audio(audio_path):
    if model is None:
        print("Whisper model not loaded correctly. Skipping transcription.")
        return None

    try:
        audio, sr = librosa.load(audio_path, sr=16000)
        # Whisper model expects 16kHz audio
        audio_tensor = torch.tensor(audio, dtype=torch.float32)
        result = model.transcribe(audio_tensor.numpy(), fp16=False)
        if "text" in result:
            return result["text"]
        else:
            print(f"No transcription result for {audio_path}")
            return None
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return None

# Transcribe all audio files
transcriptions = []
failed_transcriptions = []
for audio_path in tqdm(audio_paths, desc="Transcribing audio"):
    transcription = transcribe_audio(audio_path)
    if transcription:
        transcriptions.append(transcription)
    else:
        failed_transcriptions.append(audio_path)

# Check consistency and synchronize text and transcription lists
if len(data["text"]) != len(transcriptions):
    print("Warning: Number of transcriptions does not match number of texts.")
    print(f"Number of texts: {len(data['text'])}")
    print(f"Number of transcriptions: {len(transcriptions)}")
    # Adjust lengths to match, truncate longer list if needed
    min_length = min(len(data["text"]), len(transcriptions))
    data["text"] = data["text"][:min_length]
    transcriptions = transcriptions[:min_length]

# Add transcriptions to the data if successful
data["transcription"] = transcriptions

# Ensure metadata length is 101
if len(data["metadata"]) != 101:
    data["metadata"] = data["metadata"][:101]

# Convert data to a Hugging Face dataset
dataset = Dataset.from_dict(data)

# Ensure references and predictions are not empty before computing CER
if "text" in dataset.column_names and "transcription" in dataset.column_names:
    references = dataset["text"]
    predictions = dataset["transcription"]

    if not references or not predictions:
        print("Either references or predictions list is empty. Cannot compute CER.")
    else:
        # Compute CER
        cer_metric = load_metric("cer")
        cer = cer_metric.compute(predictions=predictions, references=references)
        print(f"CER: {cer:.4f}")

        # Print failed transcriptions for debugging
        if failed_transcriptions:
            print("\nFailed to transcribe the following audio files:")
            for audio_path in failed_transcriptions:
                print(audio_path)
else:
    print("Either references or predictions list is empty. Cannot compute CER.")


Audio paths:
['/content/nptel_pure_set/nptel-pure/wav/00000682f31904acc560fa359512e7bdd487b11efe36145a56874e30.wav', '/content/nptel_pure_set/nptel-pure/wav/0000a7be825f70cbe4c49acf9a8b7804d05a8a4701a2b42a343a694e.wav', '/content/nptel_pure_set/nptel-pure/wav/00008c385446154c62236f9420099d05f6fb9ad79e0953dff3b9ca69.wav', '/content/nptel_pure_set/nptel-pure/wav/0000724068ed76075c94a0306aa98bbdfb73d31e87da10fb91f9b1fa.wav', '/content/nptel_pure_set/nptel-pure/wav/00005b044247d9f9346e4c26cbfefa17ab81637b9bf7659d0e4631f8.wav', '/content/nptel_pure_set/nptel-pure/wav/0000003b8fd9bc22877135b42b04c49d4860312b001be688723ecc5d.wav', '/content/nptel_pure_set/nptel-pure/wav/000043dcf2f0dd86460ce0f00e41877741e1b6726df4c99b46190d19.wav', '/content/nptel_pure_set/nptel-pure/wav/0000fb088b71114cabb450234fc926abd1d79fdee305e5a1b1d16f36.wav', '/content/nptel_pure_set/nptel-pure/wav/000020db7e99a1de454a92346534c843f7f712fd7d3160a7d575af4a.wav', '/content/nptel_pure_set/nptel-pure/wav/00000da4b2da194ac31

Transcribing audio: 100%|██████████| 102/102 [11:43<00:00,  6.90s/it]


Number of texts: 102
Number of transcriptions: 101


ValueError: one or more references are empty strings

In [72]:
import os
import tarfile
import json
import librosa
import whisper
import torch
from datasets import Dataset, load_metric
from tqdm import tqdm

# Function to extract the tar.gz file with error handling
def extract_tar(tar_file, extracted_folder):
    if not os.path.exists(extracted_folder):
        os.makedirs(extracted_folder)
        with tarfile.open(tar_file, "r:gz") as tar:
            for member in tqdm(tar.getmembers(), desc="Extracting files"):
                if member.name.endswith(".wav"):
                    tar.extract(member, path=extracted_folder)
                else:
                    continue  # Skip extracting non-audio files

# Extract the tar.gz file
tar_file = "nptel-pure-set.tar.gz"
extracted_folder = "/content/nptel_pure_set"

extract_tar(tar_file, extracted_folder)

# Function to load transcriptions from the extracted files
def load_data(extracted_folder):
    data = {"text": [], "metadata": [], "transcription": []}
    audio_paths = []
    missing_files = []

    # Path to the wav folder
    wav_folder = os.path.join(extracted_folder, "nptel-pure", "wav")
    if os.path.exists(wav_folder):
        for root, _, files in os.walk(wav_folder):
            for file in files:
                if file.endswith(".wav"):
                    audio_path = os.path.join(root, file)
                    audio_paths.append(audio_path)

                    # Assuming corresponding text files exist in corrected_txt, original_txt, or metadata folders
                    filename = os.path.splitext(file)[0]
                    text_path_corrected = os.path.join(extracted_folder, "nptel-pure", "corrected_txt", f"{filename}.txt")
                    text_path_original = os.path.join(extracted_folder, "nptel-pure", "original_txt", f"{filename}.txt")
                    metadata_path = os.path.join(extracted_folder, "nptel-pure", "metadata", f"{filename}.json")

                    if os.path.exists(text_path_corrected):
                        try:
                            with open(text_path_corrected, "r") as f:
                                transcription = f.read().strip()
                            data["text"].append(transcription)
                        except Exception as e:
                            print(f"Error reading {text_path_corrected}: {e}")
                            missing_files.append(text_path_corrected)
                    elif os.path.exists(text_path_original):
                        try:
                            with open(text_path_original, "r") as f:
                                transcription = f.read().strip()
                            data["text"].append(transcription)
                        except Exception as e:
                            print(f"Error reading {text_path_original}: {e}")
                            missing_files.append(text_path_original)
                    if os.path.exists(metadata_path):
                        try:
                            with open(metadata_path, "r") as f:
                                metadata = json.load(f)
                            data["metadata"].append(metadata)
                        except Exception as e:
                            print(f"Error reading {metadata_path}: {e}")
                            missing_files.append(metadata_path)
                    else:
                        print(f"Text or metadata file not found for {audio_path}")
                        missing_files.append(text_path_corrected)

    return data, audio_paths, missing_files

# Load the data
data, audio_paths, missing_files = load_data(extracted_folder)

# Check loaded data and audio paths
print("Audio paths:")
print(audio_paths)
print("Data:")
print(data)

# Print missing files for further inspection
if missing_files:
    print("\nMissing text/metadata files:")
    for missing_file in missing_files:
        print(missing_file)

# Load the pre-trained Whisper model with error handling
try:
    model = whisper.load_model("base")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    model = None

# Function to transcribe audio using the Whisper model with enhanced error handling
def transcribe_audio(audio_path):
    if model is None:
        print("Whisper model not loaded correctly. Skipping transcription.")
        return None

    try:
        audio, sr = librosa.load(audio_path, sr=16000)
        # Whisper model expects 16kHz audio
        audio_tensor = torch.tensor(audio, dtype=torch.float32)
        result = model.transcribe(audio_tensor.numpy(), fp16=False)
        if "text" in result:
            return result["text"]
        else:
            print(f"No transcription result for {audio_path}")
            return None
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return None

# Transcribe all audio files
transcriptions = []
failed_transcriptions = []
for audio_path in tqdm(audio_paths, desc="Transcribing audio"):
    transcription = transcribe_audio(audio_path)
    if transcription:
        transcriptions.append(transcription)
    else:
        failed_transcriptions.append(audio_path)

# Check consistency and synchronize text and transcription lists
if len(data["text"]) != len(transcriptions):
    print("Warning: Number of transcriptions does not match number of texts.")
    print(f"Number of texts: {len(data['text'])}")
    print(f"Number of transcriptions: {len(transcriptions)}")
    # Adjust lengths to match, truncate longer list if needed
    min_length = min(len(data["text"]), len(transcriptions))
    data["text"] = data["text"][:min_length]
    transcriptions = transcriptions[:min_length]

# Add transcriptions to the data if successful
data["transcription"] = transcriptions

# Ensure metadata length is 101
if len(data["metadata"]) != 101:
    data["metadata"] = data["metadata"][:101]

# Convert data to a Hugging Face dataset
dataset = Dataset.from_dict(data)

# Filter out empty references and their corresponding predictions
references = dataset["text"]
predictions = dataset["transcription"]

filtered_references = []
filtered_predictions = []

for ref, pred in zip(references, predictions):
    if ref:
        filtered_references.append(ref)
        filtered_predictions.append(pred)

# Ensure references and predictions are not empty before computing CER
if not filtered_references or not filtered_predictions:
    print("Either filtered references or predictions list is empty. Cannot compute CER.")
else:
    # Compute CER
    cer_metric = load_metric("cer")
    cer = cer_metric.compute(predictions=filtered_predictions, references=filtered_references)
    print(f"CER: {cer:.4f}")

    # Print failed transcriptions for debugging
    if failed_transcriptions:
        print("\nFailed to transcribe the following audio files:")
        for audio_path in failed_transcriptions:
            print(audio_path)


Audio paths:
['/content/nptel_pure_set/nptel-pure/wav/00000682f31904acc560fa359512e7bdd487b11efe36145a56874e30.wav', '/content/nptel_pure_set/nptel-pure/wav/0000a7be825f70cbe4c49acf9a8b7804d05a8a4701a2b42a343a694e.wav', '/content/nptel_pure_set/nptel-pure/wav/00008c385446154c62236f9420099d05f6fb9ad79e0953dff3b9ca69.wav', '/content/nptel_pure_set/nptel-pure/wav/0000724068ed76075c94a0306aa98bbdfb73d31e87da10fb91f9b1fa.wav', '/content/nptel_pure_set/nptel-pure/wav/00005b044247d9f9346e4c26cbfefa17ab81637b9bf7659d0e4631f8.wav', '/content/nptel_pure_set/nptel-pure/wav/0000003b8fd9bc22877135b42b04c49d4860312b001be688723ecc5d.wav', '/content/nptel_pure_set/nptel-pure/wav/000043dcf2f0dd86460ce0f00e41877741e1b6726df4c99b46190d19.wav', '/content/nptel_pure_set/nptel-pure/wav/0000fb088b71114cabb450234fc926abd1d79fdee305e5a1b1d16f36.wav', '/content/nptel_pure_set/nptel-pure/wav/000020db7e99a1de454a92346534c843f7f712fd7d3160a7d575af4a.wav', '/content/nptel_pure_set/nptel-pure/wav/00000da4b2da194ac31

Transcribing audio: 100%|██████████| 102/102 [12:08<00:00,  7.14s/it]


Number of texts: 102
Number of transcriptions: 101
CER: 1.1709

Failed to transcribe the following audio files:
/content/nptel_pure_set/nptel-pure/wav/0000a7be825f70cbe4c49acf9a8b7804d05a8a4701a2b42a343a694e.wav
