In [None]:
!pip install datasets hf-xet

In [None]:
from datasets import load_dataset, DownloadConfig
import os
from tqdm import tqdm
import concurrent.futures
import time
import random
import pandas as pd
import zipfile
import io
from huggingface_hub import hf_hub_download, HfFolder

# Authenticate with the Hugging Face Hub to avoid anonymous rate limits.
# Either run `huggingface-cli login` in a terminal before running this script,
# or export a token (from huggingface.co/settings/tokens) as the HF_TOKEN
# environment variable. The token is read from the environment instead of
# being pasted into the source, so it cannot leak through a shared notebook.
from huggingface_hub import login

HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    login(HF_TOKEN)
else:
    # No explicit token: skip login() rather than failing on a placeholder.
    # NOTE(review): downloads then presumably rely on credentials cached by
    # `huggingface-cli login`, or run anonymously - confirm for this dataset.
    print("HF_TOKEN is not set; skipping explicit Hugging Face login.")

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Hub dataset repository and the parquet metadata file inside it.
DATASET_NAME = "coild/dhravani"
PARQUET_FILE = "te/te.parquet"
# Language code used to filter the metadata rows.
LANGUAGE = "te"

# Optional local copies (only written when SAVE_LOCAL is True).
SAVE_LOCAL = False
OUTPUT_DIR = "datasets/te/audio"
METADATA_CSV = "datasets/te/metadata.csv"

# ZIP archive output (only written when SAVE_ZIP is True).
SAVE_ZIP = True
ZIP_PATH = "coild_dhravani_te.zip"

# Download concurrency and retry policy.
MAX_WORKERS = 8
MAX_RETRIES = 3
RETRY_BASE_DELAY = 5

# Create the audio directory and the metadata CSV's parent directory up front
# when local saving is enabled.
if SAVE_LOCAL:
    for directory in (OUTPUT_DIR, os.path.dirname(METADATA_CSV)):
        os.makedirs(directory, exist_ok=True)

# Wall-clock reference for the execution-time report at the end of the script.
start_time = time.time()

# 1. Fetch the parquet metadata file from the Hub and load it with pandas
print("Downloading and loading parquet metadata file...")
parquet_local = hf_hub_download(repo_id=DATASET_NAME, filename=PARQUET_FILE, repo_type="dataset")
df = pd.read_parquet(parquet_local)

# 2. Keep only the rows whose `language` column equals LANGUAGE
print("Filtering Telugu examples from metadata...")
language_mask = df.language == LANGUAGE
# .copy() detaches the filtered frame from `df`
df_filt = df.loc[language_mask].copy()
print(f"Found {len(df_filt)} Telugu examples")

# One dict per successfully downloaded clip is appended here later on
results = []

# 3. Download the audio files; their bytes are kept in memory and optionally
#    also written to disk
print(f"Downloading {len(df_filt)} audio files into memory...")

def download_audio_bytes(src_path):
    """Download one file from the dataset repo and return its raw bytes.

    The file is fetched with ``hf_hub_download`` from ``DATASET_NAME`` and the
    path it returns is read back in binary mode. Any exception triggers a
    retry, up to ``MAX_RETRIES`` attempts in total, sleeping
    ``RETRY_BASE_DELAY * 2**attempt`` seconds plus up to one second of random
    jitter between attempts.

    Args:
        src_path: Path of the file inside the dataset repository.

    Returns:
        The file content as ``bytes``, or ``None`` when every attempt failed.
    """
    final_attempt = MAX_RETRIES - 1
    for attempt in range(MAX_RETRIES):
        try:
            cached_file = hf_hub_download(repo_id=DATASET_NAME, filename=src_path, repo_type="dataset")
            with open(cached_file, 'rb') as handle:
                return handle.read()
        except Exception as e:
            # Out of attempts: report and give up on this file.
            if attempt >= final_attempt:
                print(f"Failed to fetch {src_path} after {MAX_RETRIES} attempts.")
                return None
            # Exponential backoff with jitter before the next attempt.
            delay = RETRY_BASE_DELAY * (2 ** attempt) + random.uniform(0, 1)
            print(f"Error fetching {src_path}: {e}. Retrying in {delay:.2f}s...")
            time.sleep(delay)
    return None

# Fan the downloads out over a thread pool and collect results as they finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Maps each submitted future to the (audio path, transcription) of its row.
    futures = dict()
    for _, row in df_filt.iterrows():
        src = row["audio_path"]
        pending = executor.submit(download_audio_bytes, src)
        futures[pending] = (src, row["transcription"])

    finished = concurrent.futures.as_completed(futures)
    for future in tqdm(finished, total=len(futures), desc="Downloading audio bytes"):
        src_path, transcription = futures[future]
        audio_bytes = future.result()
        # Skip files that could not be fetched (None) or came back empty.
        if not audio_bytes:
            continue

        filename = os.path.basename(src_path)
        record = {"filename": filename, "audio_bytes": audio_bytes, "transcription": transcription}
        results.append(record)

        if not SAVE_LOCAL:
            continue
        # Also write the clip under OUTPUT_DIR when local saving is enabled.
        dst = os.path.join(OUTPUT_DIR, filename)
        with open(dst, 'wb') as out_file:
            out_file.write(audio_bytes)

# 4. Build metadata DataFrame
# One row per downloaded clip: the clip's file name and its transcription.
# The columns are named explicitly so the frame - and every CSV written from
# it - keeps its header even when `results` is empty (e.g. all downloads
# failed); without this an empty list yields a frame with no columns at all.
metadata_df = pd.DataFrame(
    [{"audio_path": r["filename"], "transcription": r["transcription"]} for r in results],
    columns=["audio_path", "transcription"],
)
if SAVE_LOCAL:
    metadata_df.to_csv(METADATA_CSV, index=False)
    print(f"✅ Saved metadata locally to {METADATA_CSV}")

# 5. Write a ZIP archive to ZIP_PATH on disk, built from the audio bytes held
#    in memory plus the metadata CSV
if SAVE_ZIP:
    print("Creating in-memory ZIP archive...")
    with zipfile.ZipFile(ZIP_PATH, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
        # Audio clips go under the archive's audio/ folder
        for record in results:
            member_name = os.path.join('audio', record['filename'])
            zf.writestr(member_name, record['audio_bytes'])
        # Metadata CSV sits at the archive root; to_csv() without a target
        # returns the CSV text directly
        csv_text = metadata_df.to_csv(index=False)
        zf.writestr('metadata.csv', csv_text)
    print(f"✅ Created ZIP archive at {ZIP_PATH}")

# Report how many clips were collected and the total wall-clock time
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"✅ Finished! Processed {len(results)} files.")
print(f"⏱️ Execution time: {elapsed_seconds:.2f} seconds")
