In [1]:
#import libraries
import os
import pandas as pd
import torchaudio
import torch
from torch.utils.data import Dataset as TorchDataset, DataLoader
from transformers import WhisperProcessor, WhisperForConditionalGeneration, TrainingArguments, AdamW, get_scheduler
from datasets import Dataset
import torchaudio.transforms as T
from tqdm.auto import tqdm

#Load text files 
def load_dataset(root_folder):
    text_files = {'train': [], 'test': []}
    
    train_folder = os.path.join(root_folder, 'TRAIN')  # data/TRAIN
    test_folder = os.path.join(root_folder, 'TEST')    # data/TEST
    
    for folder in [train_folder, test_folder]:
        subset = 'train' if 'TRAIN' in folder else 'test'
        for DR in os.listdir(folder):
            DR_folder = os.path.join(folder, DR)  # ex: data/TRAIN/DR1
            if os.path.isdir(DR_folder):
                for text in os.listdir(DR_folder):
                    text_folder = os.path.join(DR_folder, text)  # ex: data/TRAIN/DR1/FCJF0
                    if os.path.isdir(text_folder):
                        for file in os.listdir(text_folder):
                            if file.endswith('.TXT'):
                                file_path = os.path.join(text_folder, file)  # ex: data/TRAIN/DR1/FCJF0/SA1.TXT
                                text_files[subset].append(file_path)
                                
    return text_files

# Load transcription data
def load_transcription_data(txt_files):
    data = []
    for file_path in txt_files:
        with open(file_path, 'r') as file:
            transcription = file.read().strip()
        data.append({
            'audio_path': file_path.replace('.TXT', '.WAV'),  # Replaces .TXT with .WAV to get the audio path
            'transcription': transcription
        })
    return pd.DataFrame(data) #Returns a DataFrame with audio_path and transcription


root_folder = "/kaggle/input/timitdataset/Dataset/data"
Text_files = load_dataset(root_folder)

# Load and preprocess data
train_txt_files = [path for path in Text_files['train'] if path.endswith('.TXT')]  #creates a list of file paths  that end with .TXT.
test_txt_files = [path for path in Text_files['test'] if path.endswith('.TXT')]

train_data = load_transcription_data(train_txt_files)
test_data = load_transcription_data(test_txt_files)

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)


# Whisper processor and model
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

class WhisperDataset(TorchDataset):
    def __init__(self, data, processor, max_audio_length=3000, max_label_length=128):
        self.data = data.to_dict('records')    #Converts the DataFrame to a list of dictionaries, each dictionary represents an audio-transcription pair.
        self.processor = processor      # Stores the Whisper processor for feature extraction and tokenization.
        self.max_audio_length = max_audio_length  #Maximum length for audio input (samples)
        self.max_label_length = max_label_length  #Maximum length for labels (transcriptions)
        self.audio_transform = T.Resample(orig_freq=16000, new_freq=16000)  # Resample if needed

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]   #Retrieves the sample at index 
        audio, _ = torchaudio.load(item['audio_path'])
        
        if audio.size(0) > 1:  # Ensure single channel audio
            audio = audio.mean(dim=0, keepdim=True)
        
        audio = self.audio_transform(audio)
        audio = audio.squeeze(0)  # Remove channel dimension

        # Use processor for encoding
        inputs = self.processor(audio.numpy(), return_tensors="pt", sampling_rate=16000)

        # Check for available keys and update accordingly
        if 'input_features' not in inputs:
            raise KeyError("Key 'input_features' not found in processor output")

        # Get the input values for the model (Extracts and reshapes the input features)
        input_features = inputs['input_features'].squeeze(0)

        # Tokenize transcription
        labels = self.processor.tokenizer(item['transcription'], return_tensors="pt").input_ids.squeeze(0)
        
        # Pad or truncate labels to max_label_length
        if labels.size(0) < self.max_label_length:
            padding = self.max_label_length - labels.size(0)
            labels = torch.cat([labels, torch.tensor([self.processor.tokenizer.pad_token_id] * padding)])
        elif labels.size(0) > self.max_label_length:
            labels = labels[:self.max_label_length]

        return {
            'input_features': input_features,
            'labels': labels
        }

    
# Create DataLoaders
train_loader = DataLoader(
    WhisperDataset(train_data, processor),
    batch_size=4,
    shuffle=True,
    num_workers=4  # Adjust based on CPU cores
)

eval_loader = DataLoader(
    WhisperDataset(test_data, processor),
    batch_size=4,
    num_workers=4  # Adjust based on your CPU cores
)


# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir="./results",
    evaluation_strategy="epoch", #evaluation will be done at the end of each epoch.
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to=["none"],  # This should disable wandb
)


# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5) #AdamW: Optimizer for weight decay and learning rate by adjusting the weights based on the gradients computed during backpropagation
num_training_steps = training_args.num_train_epochs * len(train_loader)
lr_scheduler = get_scheduler(  #adjusts the learning rate according to a predefined schedule
    name="linear",   #linearly increases the learning rate during the warmup phase and then decreases it.
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
progress_bar = tqdm(range(num_training_steps)) #track the number of training steps.

model.train()


for epoch in range(training_args.num_train_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()  #Clears the gradients to avoid accumulation.
        progress_bar.update(1)

        
# Evaluation loop
model.eval()

eval_loss = 0
for batch in eval_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        eval_loss += outputs.loss.item()

eval_loss = eval_loss / len(eval_loader)
print(f"Evaluation loss: {eval_loss}")


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]



  0%|          | 0/3465 [00:00<?, ?it/s]

2024-08-05 02:13:29.443538: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 02:13:29.443547: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 02:13:29.443530: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 02:13:29.443545: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 02:13:29.443624: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory 

Evaluation loss: 0.0451078276948205


In [2]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.9.5


In [9]:
def transcribe_audio(audio_path, model, processor, device):
    # Load and preprocess audio
    audio, sample_rate = torchaudio.load(audio_path)
#     print(f"Loaded audio shape: {audio.shape}, Sample rate: {sample_rate}")
    
    if audio.size(0) > 1:  # Ensure single channel audio
        audio = audio.mean(dim=0, keepdim=True)
#     print(f"Audio shape after ensuring single channel: {audio.shape}")
    
    # Resample if needed (only if original sample rate is different from 16000)
    if sample_rate != 16000:
        audio = T.Resample(orig_freq=sample_rate, new_freq=16000)(audio)
    audio = audio.squeeze(0)
#     print(f"Audio shape after resampling: {audio.shape}")

    # Use processor for encoding
    inputs = processor(audio.numpy(), return_tensors="pt", sampling_rate=16000)
#     print(f"Inputs shape: {inputs['input_features'].shape}")
    input_features = inputs['input_features'].to(device)

    # Model inference
    with torch.no_grad():
        generated_ids = model.generate(input_features)
#         print(f"Generated IDs: {generated_ids}")
    
    #Decodes the generated IDs into a human-readable transcription and skips any special tokens.
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription

unseen_audio_path = "/kaggle/input/english-audio/OSR_us_000_0014_8k.wav"
model.eval()
transcription = transcribe_audio(unseen_audio_path, model, processor, device)
print(f"Transcription: {transcription}")


Transcription: 0 74650 A king ruled the state in the early days, the ship was torn apart on the sharp reef. The sun is kept him home the third week. The wide road shimmered in the hot sun. The lazy cow lay in the cool grass. This is square stone over the fence. The rope will bind the seven books at once. Hop over the fence and punger, friendly gang left the drugstore. Mesh wire keeps chicks inside.


# Word Error Rate (WER)
common metric used to evaluate the performance of speech recognition systems. It measures the differences between the reference (ground truth) transcriptions and the predicted transcriptions.

In [13]:
from jiwer import wer
from tqdm.auto import tqdm

def evaluate_wer(model, processor, test_loader, device):
    model.eval()
    predictions = []
    references = []

    for batch in tqdm(test_loader, desc="Evaluating", leave=False):
        audio_paths = batch['audio_path']
        transcriptions = batch['transcription']

        for audio_path, transcription in zip(audio_paths, transcriptions):
            predicted_transcription = transcribe_audio(audio_path, model, processor, device)
            predictions.append(predicted_transcription)
            references.append(transcription)
    
    error = wer(references, predictions)
    return error

# Create a new DataLoader for evaluation with file paths and transcriptions
class WhisperEvalDataset(TorchDataset):
    def __init__(self, data):
        self.data = data.to_dict('records')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            'audio_path': item['audio_path'],
            'transcription': item['transcription']
        }

eval_loader = DataLoader(
    WhisperEvalDataset(test_data),
    batch_size=1,  
    shuffle=False,
    num_workers=4
)

# Calculate WER on the test dataset
error = evaluate_wer(model, processor, eval_loader, device)
print(f"Word Error Rate (WER): {error:.4f}")


  self.pid = os.fork()


Evaluating:   0%|          | 0/1680 [00:00<?, ?it/s]

Word Error Rate (WER): 0.1760
