In [2]:
!pip install -q transformers accelerate datasets torchaudio librosa


In [3]:
import os
import pandas as pd
import numpy as np
from datasets import Dataset


In [4]:
# Read the per-clip metadata table and preview it.
metadata_path = "../data/final_metadata.csv"
df = pd.read_csv(metadata_path)

print(f"Total samples: {len(df)}")
df.head()


Total samples: 4829


Unnamed: 0,file_path,duration,language,speaker_id,transcript,quality_flag
0,../data/common_voice_hindi/processed_clips\com...,4.068,hi,0f018a99663f33afbb7d38aee281fb1afcfd07f9e7acd0...,हमने उसका जन्मदिन मनाया।,good
1,../data/common_voice_hindi/processed_clips\com...,7.524,hi,0f018a99663f33afbb7d38aee281fb1afcfd07f9e7acd0...,"साउथ दिल्ली नगर निगम सख्त, शॉपिंग मॉल के बाहर ...",good
2,../data/common_voice_hindi/processed_clips\com...,6.084,hi,0f018a99663f33afbb7d38aee281fb1afcfd07f9e7acd0...,उत्तर कोरिया ने अमेरिका को दी हमले की धमकी,good
3,../data/common_voice_hindi/processed_clips\com...,5.184,hi,0f018a99663f33afbb7d38aee281fb1afcfd07f9e7acd0...,अगले कमरे में अनेक रोमन मूर्तियाँ हैं।,good
4,../data/common_voice_hindi/processed_clips\com...,4.716,hi,0f018a99663f33afbb7d38aee281fb1afcfd07f9e7acd0...,तुम ने टॉम को कहाँ भेज दिया?,good


In [5]:
from transformers import WhisperProcessor

# Whisper-tiny processor configured for the "transcribe" task; no fixed
# language is passed (language=None).
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", task="transcribe", language=None)

print("Whisper processor loaded")


Whisper processor loaded


In [6]:
# Wrap the metadata frame in a Hugging Face Dataset so it can be mapped over.
dataset = Dataset.from_pandas(df=df)

print(dataset)


Dataset({
    features: ['file_path', 'duration', 'language', 'speaker_id', 'transcript', 'quality_flag'],
    num_rows: 4829
})


In [7]:
import librosa
import torch

In [8]:
def prepare_example(example):
    """Turn one metadata row into Whisper model inputs.

    The audio file named by ``example["file_path"]`` is loaded at 16 kHz and
    passed through the Whisper feature extractor; ``example["transcript"]``
    is tokenized into label ids.

    Args:
        example: Mapping with at least the keys ``"file_path"`` (path to an
            audio file readable by ``librosa.load``) and ``"transcript"``
            (the reference text).

    Returns:
        dict: ``{"input_features": ..., "labels": ...}`` where
        ``input_features`` is the first entry of the feature extractor's
        ``input_features`` and ``labels`` is the tokenizer's ``input_ids``.
    """
    # Imports stay inside the function so it remains self-contained when it
    # is executed in a worker process.
    import librosa
    from transformers import WhisperProcessor

    sample_rate = 16000

    # Load the processor lazily, once per process, instead of once per
    # example: from_pretrained() re-reads the tokenizer/feature-extractor
    # files on every call. The cache lives in this function's own globals so
    # it does not depend on any other name existing in the worker.
    namespace = globals()
    processor = namespace.get("_WHISPER_PROCESSOR")
    if processor is None:
        processor = WhisperProcessor.from_pretrained(
            "openai/whisper-tiny",
            language=None,
            task="transcribe"
        )
        namespace["_WHISPER_PROCESSOR"] = processor

    # 1. Load audio (librosa resamples to the requested rate)
    audio, _ = librosa.load(example["file_path"], sr=sample_rate)

    # 2. Audio -> Whisper features; [0] unwraps the single-item batch
    input_features = processor.feature_extractor(
        audio,
        sampling_rate=sample_rate
    ).input_features[0]

    # 3. Tokenize transcript
    labels = processor.tokenizer(
        example["transcript"],
        truncation=True
    ).input_ids

    return {
        "input_features": input_features,
        "labels": labels
    }


In [None]:
# Run prepare_example over every row. All original metadata columns are
# dropped, so only the keys returned by prepare_example remain.
processed_dataset = dataset.map(
    prepare_example,
    num_proc=1,
    remove_columns=dataset.column_names,
)

print(processed_dataset)



Map (num_proc=1):   0%|          | 0/4829 [00:00<?, ? examples/s]

In [None]:
# Persist the mapped dataset so later notebooks can load it from disk.
processed_dataset.save_to_disk(dataset_path="../data/processed_whisper_dataset")
