In [1]:
import pandas as pd

df = pd.read_parquet("data/train-00000-of-00026.parquet")
print(df.head())


         id                                              audio  \
0  00000001  {'bytes': b'RIFF$\xa6\x0e\x00WAVEfmt \x10\x00\...   
1  00000002  {'bytes': b'RIFFz\x0f\r\x00WAVEfmt \x10\x00\x0...   
2  00000003  {'bytes': b'RIFFz\xc7\x07\x00WAVEfmt \x10\x00\...   
3  00000004  {'bytes': b'RIFFz_\x0e\x00WAVEfmt \x10\x00\x00...   
4  00000005  {'bytes': b'RIFF$\xa6\x0e\x00WAVEfmt \x10\x00\...   

                                                text  
0  katta obyomda zapchas sotishadi, katta obyomda...  
1  lider ekan, oʻzini ishini ustasi ekan, yoki bo...  
2  Ana chiroyli ko'rinasiz. Onsoroq. Ham chiroyli...  
3  bitta joyingiz bedavo kasalga chalingan bo'lsa...  
4  ikkalasi ham, nu faqat meniki emas, onasini, m...  


In [6]:
import os
import json
import soundfile as sf
import pandas as pd
import json

In [7]:
import io

output_dir = "audio_files"
os.makedirs(output_dir, exist_ok=True)

json_list = []

for idx, row in df.iloc[:100].iterrows():
    audio_bytes = row["audio"]["bytes"]
    audio_array, sampling_rate = sf.read(io.BytesIO(audio_bytes))
    audio_path = os.path.join(output_dir, f"audio_{idx}.flac")
    sf.write(audio_path, audio_array, sampling_rate)
    json_list.append({
        "audio_path": audio_path,
        "text": row["text"]
    })

with open("metadata.json", "w", encoding="utf-8") as f:
    for entry in json_list:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print("✅ Done! Saved first 100 audio files in:", output_dir)


✅ Done! Saved first 100 audio files in: audio_files


In [None]:
# Load data into Hugging Face Dataset using metadata.json
from datasets import Dataset, Audio
import json

# Read metadata.json file
data_list = []
with open("metadata.json", "r", encoding="utf-8") as f:
    for line in f:
        data_list.append(json.loads(line.strip()))

print(f"Loaded {len(data_list)} samples")
print("Sample:", data_list[0])

# Create Hugging Face dataset
dataset = Dataset.from_list(data_list)

# Cast audio column to Audio feature for automatic audio loading
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=None))

print("\nDataset info:")
print(dataset)
print("\nDataset features:")
print(dataset.features)

  from .autonotebook import tqdm as notebook_tqdm


Loaded 100 samples
Sample: {'audio_path': 'audio_files/audio_0.flac', 'text': "katta obyomda zapchas sotishadi, katta obyomda. Minusga o'tirishadi. Hozir konkurensiya nerealniy-da. Foydasi-chi, ba'zi paytda minusga ishlashadi yo foydaga ishlashadi? I menga aytyapti-da, hozircha bozorda ishlaydigan vaqt emas deyapti-da. Hozir qayerdadir Toshkentda servis qilish kerak deyapti-da. Servis podxod qilgin-da, zo'r muomala qilib qo'y, zo'r servis qilib qo'ygin-da, qimmat qilib qo'yib qo'y narxini-da. Odamlar mana shunga qiziqyapti-da. Qimmat bo'lsin, lekin kerak bo'lsa uyimdan olib ketsin-u, uyimga olib kelib bersin, tushundingmi?"}

Dataset info:
Dataset({
    features: ['audio_path', 'text'],
    num_rows: 100
})

Dataset features:
{'audio_path': Audio(sampling_rate=None, decode=True, stream_index=None), 'text': Value('string')}


In [None]:
import soundfile as sf
import io

# Replace audio_path with actual audio bytes
for entry in data_list:
    audio_path = entry["audio_path"]
    with open(audio_path, "rb") as f:
        audio_bytes = f.read()
    entry["audio"] = {"bytes": audio_bytes}
    del entry["audio_path"]

# Create dataset with audio bytes
dataset_with_audio = Dataset.from_list(data_list)
print(dataset_with_audio)


In [2]:
# Alternative approach: Create dataset without auto-decoding audio
from datasets import Dataset, Audio, Features, Value
import json

# Read metadata.json file
data_list = []
with open("metadata.json", "r", encoding="utf-8") as f:
    for line in f:
        data_list.append(json.loads(line.strip()))

# Option 1: Simple dataset without Audio feature (just paths)
simple_dataset = Dataset.from_list(data_list)
print("Simple dataset:")
print(simple_dataset)

# Option 2: Dataset with Audio feature but decode=False (safer)
features = Features({
    "audio_path": Audio(decode=False),  # Don't auto-decode
    "text": Value("string")
})
audio_dataset = Dataset.from_list(data_list, features=features)
print("\nAudio dataset (no auto-decode):")
print(audio_dataset)

# Access sample data
sample = simple_dataset[0]
print(f"\nSample text: {sample['text'][:100]}...")
print(f"Audio path: {sample['audio_path']}")

# Split dataset (example: 90% train, 10% test)
train_test_split = simple_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

print(f"\nTrain samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

Simple dataset:
Dataset({
    features: ['audio_path', 'text'],
    num_rows: 100
})

Audio dataset (no auto-decode):
Dataset({
    features: ['audio_path', 'text'],
    num_rows: 100
})

Sample text: katta obyomda zapchas sotishadi, katta obyomda. Minusga o'tirishadi. Hozir konkurensiya nerealniy-da...
Audio path: audio_files/audio_0.flac

Train samples: 90
Test samples: 10


In [8]:
from IPython.display import Audio, display

import soundfile as sf

sample = audio_dataset[0]
# audio_array, sampling_rate = sf.read(sample["audio_path"])
# display(Audio(audio_array, rate=sampling_rate))
# print("Transcript:", sample["text"])

In [9]:
sample

{'audio_path': {'bytes': None, 'path': 'audio_files/audio_0.flac'},
 'text': "katta obyomda zapchas sotishadi, katta obyomda. Minusga o'tirishadi. Hozir konkurensiya nerealniy-da. Foydasi-chi, ba'zi paytda minusga ishlashadi yo foydaga ishlashadi? I menga aytyapti-da, hozircha bozorda ishlaydigan vaqt emas deyapti-da. Hozir qayerdadir Toshkentda servis qilish kerak deyapti-da. Servis podxod qilgin-da, zo'r muomala qilib qo'y, zo'r servis qilib qo'ygin-da, qimmat qilib qo'yib qo'y narxini-da. Odamlar mana shunga qiziqyapti-da. Qimmat bo'lsin, lekin kerak bo'lsa uyimdan olib ketsin-u, uyimga olib kelib bersin, tushundingmi?"}

In [10]:
from datasets import load_dataset, Audio

# Load jsonl file
dataset = load_dataset("json", data_files="metadata.json", split="train")

# Cast audio column so HF can decode audio
dataset = dataset.cast_column("audio_path", Audio())

print(dataset[0])

Generating train split: 100 examples [00:00, 6200.28 examples/s]

{'audio_path': <datasets.features._torchcodec.AudioDecoder object at 0x7c360f42df40>, 'text': "katta obyomda zapchas sotishadi, katta obyomda. Minusga o'tirishadi. Hozir konkurensiya nerealniy-da. Foydasi-chi, ba'zi paytda minusga ishlashadi yo foydaga ishlashadi? I menga aytyapti-da, hozircha bozorda ishlaydigan vaqt emas deyapti-da. Hozir qayerdadir Toshkentda servis qilish kerak deyapti-da. Servis podxod qilgin-da, zo'r muomala qilib qo'y, zo'r servis qilib qo'ygin-da, qimmat qilib qo'yib qo'y narxini-da. Odamlar mana shunga qiziqyapti-da. Qimmat bo'lsin, lekin kerak bo'lsa uyimdan olib ketsin-u, uyimga olib kelib bersin, tushundingmi?"}





In [18]:
from IPython.display import Audio, display

In [20]:
example = dataset[0]

In [22]:
display(Audio(example["audio_path"]["array"], rate=example["audio_path"]["sampling_rate"]))

In [21]:
display(Audio(example["audio_path"]["array"], rate=example["audio_path"]["sampling_rate"]))

In [11]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Example encoding
example = dataset[0]
inputs = processor(example["audio_path"]["array"], sampling_rate=example["audio_path"]["sampling_rate"], text=example["text"], return_tensors="pt")
print(inputs)

Fetching 1 files: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]


{'input_features': tensor([[[-0.6400, -0.6400, -0.6400,  ..., -0.6400, -0.6400, -0.6400],
         [-0.6400, -0.6400, -0.6400,  ..., -0.6400, -0.6400, -0.6400],
         [-0.6400, -0.6400, -0.6400,  ..., -0.6400, -0.6400, -0.6400],
         ...,
         [-0.6400, -0.6400, -0.6400,  ..., -0.6400, -0.6400, -0.6400],
         [-0.6400, -0.6400, -0.6400,  ..., -0.6400, -0.6400, -0.6400],
         [-0.6400, -0.6400, -0.6400,  ..., -0.6400, -0.6400, -0.6400]]]), 'labels': tensor([[50258, 50363,    74, 18405,  1111,    88,   298,  2675, 14223, 41299,
           262,   310,   742,  5688,    11, 16536,  1328,  1111,    88,   298,
          2675,    13,  2829,   301,  3680,   277,   380,   347,   742,  5688,
            13,  3631,    89,   347, 21428,   540,  3695, 23622,   297, 46036,
          3722,    88,    12,  2675,    13,   479,   939,    67,  8483,    12,
          8036,    11,  4773,     6,  3992,  1689,    83,  2675,  3175,  3680,
           307, 22950,  1299,  5688,  5290,   726,  66

In [14]:
# Generate prediction
predicted_ids = model.generate(inputs.input_features)

# Decode the prediction to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(transcription)

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 Qot təbəyə umudur, zəbkə suç ya da qot təbəyə umudur. Mizir qətətə çədə. Azıq kənkurincinin yeri yəni de, fəudəcə, bəzbət de, minüski işlətəyətə, yəni fəudəcə işlətə. İmən getə, o da xozarca bozar da işlətiyon vaxtmaz da, o da xozar qattıdır, toşqiyonun serizq rışqəli, seriz patxutqı qəndə, zor mamlaqqıqıqıqıqıqıqıqıqıqəndə, qınmad qobqıqıq, nəqanə de. Odaq məşingə qızıq qoadə de, qınmad bəsən, ləki kəri bəsə ümlə opqə etsin, ümgü opqə bilirsiniz,


In [None]:
example = dataset[0]
inputs = processor(
    example["audio_path"]["array"],
    sampling_rate=example["audio_path"]["sampling_rate"],
    text=example["text"],
    return_tensors="pt"
)
