In [3]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp312-cp312-win_amd64.whl.metadata (10 kB)
Downloading sentencepiece-0.2.1-cp312-cp312-win_amd64.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 1.1/1.1 MB 16.9 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.1


In [27]:
import json
import sys
import tempfile
from pathlib import Path
from typing import Dict, List
import sentencepiece as spm
import yaml

In [29]:
PROJECT_ROOT = Path("C:/Users/anami/Desktop/ML")
PROCESSED_DATA_ROOT = Path("C:/Users/anami/Desktop/ML/processed_data")
SAMPLE_RATE = 16000

In [31]:
ASR_DATA_ROOT = PROCESSED_DATA_ROOT / "common_voice_16khz"
CONFIG_DIR = PROJECT_ROOT / "fairseq" / "configs"
MANIFEST_DIR = ASR_DATA_ROOT / "manifests"
SPM_DIR = ASR_DATA_ROOT / "spm"

In [33]:
DEV_MANIFEST = PROCESSED_DATA_ROOT / "dev_manifest.jsonl"
MANIFEST_SOURCES: Dict[str, Path] = {
    "train": PROCESSED_DATA_ROOT / "train_manifest.jsonl",
    "dev": DEV_MANIFEST if DEV_MANIFEST.exists() else PROCESSED_DATA_ROOT / "validation_manifest.jsonl",
    "test": PROCESSED_DATA_ROOT / "test_manifest.jsonl",
}

In [39]:
def load_manifest_entries(manifest_path: Path) -> List[dict]:
    """Loads entries from a JSONL manifest."""
    entries: List[dict] = []
    with manifest_path.open("r", encoding="utf-8") as f_in:
        for line_no, line in enumerate(f_in, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError as exc:
                print(f"[WARNING] Skipping malformed line {line_no} in {manifest_path.name}: {exc}")
    return entries

In [37]:
def setup_directories():
    print("--- 1. Setting up directories and paths ---")
    for path in (CONFIG_DIR, MANIFEST_DIR, SPM_DIR):
        path.mkdir(parents=True, exist_ok=True)

    print(f"Project Root: {PROJECT_ROOT}")
    print(f"Processed Data Root: {PROCESSED_DATA_ROOT}")
    print(f"ASR Data (Common Voice) Root: {ASR_DATA_ROOT}")
    print(f"Manifest Directory: {MANIFEST_DIR}")
    print(f"SentencePiece Directory: {SPM_DIR}")
    print(f"Configuration Directory: {CONFIG_DIR}\n")

    if not ASR_DATA_ROOT.exists():
        print(f"[ERROR] ASR data directory not found at: {ASR_DATA_ROOT}")
        print("Please double-check the 'PROCESSED_DATA_ROOT' path in the script.")
        sys.exit(1)

In [41]:
def resolve_audio_path(audio_filepath: str) -> Path:
    audio_path = Path(audio_filepath)
    if not audio_path.is_absolute():
        audio_path = (PROCESSED_DATA_ROOT / audio_path).resolve(strict=False)
    else:
        audio_path = audio_path.resolve(strict=False)

    try:
        audio_rel = audio_path.relative_to(ASR_DATA_ROOT.resolve(strict=False))
    except ValueError as exc:
        raise ValueError(f"Audio file {audio_path} is not inside {ASR_DATA_ROOT}") from exc
    return audio_rel

In [43]:
def prepare_fairseq_manifests() -> List[str]:
    """Converts JSONL manifests into Fairseq TSV manifests and returns train transcripts."""
    print("--- 2. Building Fairseq TSV manifests ---")
    train_transcripts: List[str] = []

    for split, manifest_path in MANIFEST_SOURCES.items():
        if not manifest_path.exists():
            print(f"[ERROR] Required manifest not found: {manifest_path}")
            sys.exit(1)

        entries = load_manifest_entries(manifest_path)
        if not entries:
            print(f"[ERROR] Manifest {manifest_path} is empty.")
            sys.exit(1)

        rows = []
        skipped = 0
        for entry in entries:
            text = entry.get("text")
            audio_fp = entry.get("audio_filepath")
            if not text or not audio_fp:
                skipped += 1
                continue

            try:
                audio_rel = resolve_audio_path(audio_fp)
            except ValueError as exc:
                print(f"[WARNING] {exc}")
                skipped += 1
                continue

            n_frames = entry.get("num_frames") or entry.get("n_frames")
            if n_frames is None:
                duration = entry.get("duration")
                if duration is None:
                    skipped += 1
                    continue
                n_frames = int(round(duration * SAMPLE_RATE))

            speaker = entry.get("speaker") or entry.get("client_id") or "unknown"
            utt_id = entry.get("id") or audio_rel.stem
            rows.append((utt_id, audio_rel.as_posix(), n_frames, text, speaker))

            if split == "train":
                train_transcripts.append(text)

        if not rows:
            print(f"[ERROR] No usable rows found in {manifest_path}.")
            sys.exit(1)

        output_path = MANIFEST_DIR / f"{split}.tsv"
        with output_path.open("w", encoding="utf-8") as f_out:
            f_out.write("id\taudio\tn_frames\ttgt_text\tspeaker\n")
            for utt_id, audio_rel, n_frames, text, speaker in rows:
                f_out.write(f"{utt_id}\t{audio_rel}\t{n_frames}\t{text}\t{speaker}\n")

        print(f"[OK] Wrote {len(rows)} rows to {output_path} ({skipped} skipped).")

    if not train_transcripts:
        print("[ERROR] No transcripts collected from the training manifest.")
        sys.exit(1)

    print()
    return train_transcripts


In [45]:
def train_tokenizer_and_build_dict(transcripts: List[str], model_prefix: Path, dict_path: Path, vocab_size: int):
    print("--- 3. Generating Tokenizer and Dictionary ---")
    model_prefix.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.NamedTemporaryFile("w", encoding="utf-8", delete=False) as tmp:
        for line in transcripts:
            tmp.write(f"{line}\n")
        tmp_path = Path(tmp.name)

    spm_command = (
        f"--input={tmp_path} --model_prefix={model_prefix} "
        f"--vocab_size={vocab_size} --character_coverage=1.0 "
        f"--model_type=unigram --pad_id=3 --pad_piece=<pad> "
        f"--unk_id=0 --bos_id=1 --eos_id=2"
    )
    spm.SentencePieceTrainer.train(spm_command)
    model_file = model_prefix.with_suffix(".model")
    tmp_path.unlink(missing_ok=True)
    print(f"SentencePiece model saved to: {model_file}")

    sp = spm.SentencePieceProcessor()
    sp.load(str(model_file))

    with dict_path.open("w", encoding="utf-8") as f_out:
        for i in range(sp.get_piece_size()):
            f_out.write(f"{sp.id_to_piece(i)} 1\n")
    print(f"Fairseq dictionary saved to: {dict_path}\n")

In [47]:
def generate_fairseq_configs():
    """Generates the main training and dataset-specific YAML files for Fairseq."""
    print("--- 4. Generating Fairseq YAML Configuration Files ---")

    asr_pretrain_config = {
        "defaults": ["_self_"],
        "common": {"fp16": True, "log_format": "json", "log_interval": 100, "seed": 1337},
        "task": {
            "_name": "speech_to_text",
            "data": str(MANIFEST_DIR),
            "config_yaml": "config_asr.yaml",
            "max_source_positions": 3000,
            "max_target_positions": 1024,
        },
        "model": {
            "_name": "s2t_conformer",
            "input_feat_per_channel": 80,
            "encoder_layers": 12,
            "encoder_embed_dim": 512,
            "encoder_ffn_embed_dim": 2048,
            "encoder_attention_heads": 8,
            "decoder_layers": 6,
            "decoder_embed_dim": 512,
            "decoder_ffn_embed_dim": 2048,
            "decoder_attention_heads": 8,
            "dropout": 0.15,
        },
        "criterion": {"_name": "label_smoothed_cross_entropy", "label_smoothing": 0.1, "report_accuracy": True},
        "optimizer": {"_name": "adam", "adam_betas": "(0.9, 0.98)", "lr": [0.002]},
        "lr_scheduler": {"_name": "inverse_sqrt", "warmup_updates": 10000},
        "checkpoint": {
            "save_dir": str(PROJECT_ROOT / "fairseq" / "checkpoints" / "asr_pretrain"),
            "best_checkpoint_metric": "wer",
            "patience": 10,
            "save_interval_updates": 500,
            "keep_interval_updates": 5,
        },
        "dataset": {
            "num_workers": 4,
            "max_tokens": 32000,
            "batch_size": None,
            "valid_subset": "dev",
            "skip_invalid_size_inputs_valid_test": True,
        },
        "eval": {"eval_wer": True, "eval_wer_config": {"beam": 5, "max_len_a": 0, "max_len_b": 200}},
    }
    main_config_path = CONFIG_DIR / "asr_pretrain.yaml"
    with main_config_path.open("w", encoding="utf-8") as f_out:
        yaml.dump(asr_pretrain_config, f_out, sort_keys=False, indent=2)
    print(f"Main training config written to: {main_config_path}")

    dataset_config = {
        "sample_rate": SAMPLE_RATE,
        "input_channels": 1,
        "input_feat_per_channel": 80,
        "use_audio_input": True,
        "standardize_audio": False,
        "audio_root": "",
        "vocab_filename": "dict.txt",
        "bpe_tokenizer": {
            "bpe": "sentencepiece",
            "sentencepiece_model": str(Path("..") / "spm" / "mr_asr.model"),
        },
        "transforms": {
            "_train": ["utterance_cmvn", "specaugment"],
            "valid": ["utterance_cmvn"],
            "test": ["utterance_cmvn"],
        },
        "specaugment": {
            "time_wrap_W": 0,
            "freq_mask_N": 1,
            "freq_mask_F": 27,
            "time_mask_N": 1,
            "time_mask_T": 100,
            "time_mask_p": 1.0,
        },
    }
    dataset_config_path = MANIFEST_DIR / "config_asr.yaml"
    with dataset_config_path.open("w", encoding="utf-8") as f_out:
        yaml.dump(dataset_config, f_out, sort_keys=False, indent=2)
    print(f"Dataset config written to: {dataset_config_path}\n")

In [56]:
def main():
    """Main execution function to run the setup workflow."""
    setup_directories()

    # Load transcripts from the manifest file
    train_manifest_path = ASR_DATA_ROOT / "manifests" / "train.tsv"
    try:
        df = pd.read_csv(train_manifest_path, sep="\t", header=None, skiprows=1)
        df.columns = ['id', 'audio', 'n_frames', 'tgt_text', 'speaker']
        marathi_transcripts = df["tgt_text"].dropna().tolist()
        print(f"Successfully loaded {len(marathi_transcripts)} training transcripts.")
    except Exception as e:
        print(f"[ERROR] Could not process manifest file at {train_manifest_path}: {e}")
        sys.exit(1)

    # Generate tokenizer and dictionary
    train_tokenizer_and_build_dict(
        transcripts=marathi_transcripts,
        model_prefix=ASR_DATA_ROOT / "spm" / "mr_asr",
        dict_path=ASR_DATA_ROOT / "dict.txt",
        vocab_size=4000
    )

    # Generate Fairseq configuration files
    generate_fairseq_configs()

    # Print the final command for the user
    training_command = (
        f"fairseq-hydra-train \\\n"
        f"  --config-dir {CONFIG_DIR.as_posix()} \\\n"
        f"  --config-name asr_pretrain"
    )
    
    print("--- 4. Ready for Training ---")
    print("All configuration files have been generated successfully.")
    print("\nTo launch the training job, run the following command in your terminal:")
    print(" (Ensure your virtual environment is active and you are in the project root directory)")
    print("\n" + "#" * 70)
    print(training_command)
    print("#" * 70)
    print("\nTo perform a quick test of your configuration, add the --dry-run flag.")


if __name__ == "__main__":
    # To run this script, open a terminal, navigate to your project directory 
    # (C:/Users/anami/Desktop/ML), activate your environment, and then run:
    # python run_pretraining_setup.py
    main()


--- 1. Setting up directories and paths ---
Project Root: C:\Users\anami\Desktop\ML
Processed Data Root: C:\Users\anami\Desktop\ML\processed_data
ASR Data (Common Voice) Root: C:\Users\anami\Desktop\ML\processed_data\common_voice_16khz
Manifest Directory: C:\Users\anami\Desktop\ML\processed_data\common_voice_16khz\manifests
SentencePiece Directory: C:\Users\anami\Desktop\ML\processed_data\common_voice_16khz\spm
Configuration Directory: C:\Users\anami\Desktop\ML\fairseq\configs

Successfully loaded 1330 training transcripts.
--- 3. Generating Tokenizer and Dictionary ---
SentencePiece model saved to: C:\Users\anami\Desktop\ML\processed_data\common_voice_16khz\spm\mr_asr.model
Fairseq dictionary saved to: C:\Users\anami\Desktop\ML\processed_data\common_voice_16khz\dict.txt

--- 4. Generating Fairseq YAML Configuration Files ---
Main training config written to: C:\Users\anami\Desktop\ML\fairseq\configs\asr_pretrain.yaml
Dataset config written to: C:\Users\anami\Desktop\ML\processed_data\c