In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from, and copy
# results to, paths under /content/drive/MyDrive.
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Print the GPU status for this runtime; the training cell requests accelerator="gpu".
!nvidia-smi

In [None]:
# Install Cython, then install/upgrade NeMo with its ASR extras.
# NOTE(review): neither package is version-pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was developed against.
!pip install Cython
!pip install -U "nemo_toolkit[asr]"

In [None]:
# Replace the runtime's NumPy with 1.26.4 and print the version that gets imported.
# NOTE(review): if NumPy was already imported in this kernel, the previously loaded
# version stays in memory until the runtime is restarted - confirm the printed value.
!pip uninstall -y numpy
!pip install numpy==1.26.4
import numpy as np
print("NumPy version:", np.__version__)  # Should output 1.26.4

In [None]:
import os
import json
from pathlib import Path
import soundfile as sf
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import numpy as np  # Import numpy for manual splitting

def get_entry(item, asr_dir):
    """Build one manifest record for a raw dataset item.

    Args:
        item: Mapping with an ``"audio"`` entry (joined onto ``asr_dir`` with ``/``)
            and a ``"transcript"`` string.
        asr_dir: Base directory the audio path is resolved against.

    Returns:
        Dict with ``audio_filepath`` (str), ``duration`` (float, read via
        ``sf.info``) and ``text`` (the transcript with surrounding whitespace
        stripped).
    """
    audio_path = str(asr_dir / item["audio"])
    # Only the file's metadata is queried here; the duration is coerced to float.
    duration = float(sf.info(audio_path).duration)
    return dict(
        audio_filepath=audio_path,
        duration=duration,
        text=item["transcript"].strip(),
    )

def manual_train_test_split(data, test_size=0.1, random_state=42):
    """
    Shuffle ``data`` and split it into train and test lists.

    Args:
        data: Sequence of items to split (e.g., list of dictionaries). Items are
            returned as-is; they are never copied or converted.
        test_size: Fraction of data to use as test set, within [0, 1]
            (default: 0.1). The test count is ``int(test_size * len(data))``,
            i.e. rounded down.
        random_state: Seed for reproducibility (default: 42). With ``None`` the
            permutation is drawn from NumPy's global random state.
    Returns:
        train_data: List of items for training.
        test_data: List of items for testing.
    Raises:
        ValueError: If ``test_size`` lies outside [0, 1].
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    n_samples = len(data)

    # A private RandomState produces the same permutation as seeding the global
    # generator with the same value, but leaves the global state untouched for
    # any other code that relies on it.
    if random_state is None:
        permutation = np.random.permutation(n_samples)
    else:
        permutation = np.random.RandomState(random_state).permutation(n_samples)
    indices = permutation.tolist()

    # Calculate split point
    test_size_count = int(test_size * n_samples)

    # Index the original sequence instead of going through np.array(data):
    # the array round-trip turns tuples into lists and coerces mixed types.
    test_data = [data[i] for i in indices[:test_size_count]]
    train_data = [data[i] for i in indices[test_size_count:]]

    return train_data, test_data

def build_manifests(asr_dir=None, out_dir=None):
    """Create the full, train and eval JSONL manifests from ``asr.jsonl``.

    Every non-blank line of ``<asr_dir>/asr.jsonl`` is parsed as JSON and turned
    into a manifest record by ``get_entry`` (run in a thread pool). The records
    are split 90/10 with ``manual_train_test_split`` (seed 42) and written, one
    JSON object per line, to ``full_manifest.jsonl``, ``train_manifest.jsonl``
    and ``eval_manifest.jsonl``.

    Args:
        asr_dir: Directory holding ``asr.jsonl`` and the audio files. Defaults to
            the dataset location on the mounted Google Drive.
        out_dir: Directory the three manifests are written to; created if
            missing. Defaults to the current working directory.

    Returns:
        Tuple ``(train_manifest_path, eval_manifest_path)`` of ``Path`` objects.
    """
    if asr_dir is None:
        ASR_DIR = Path("/content/drive/MyDrive/kaggle_datasets/til-asr/asr")
    else:
        ASR_DIR = Path(asr_dir)
    if out_dir is None:
        OUT_DIR = Path()
    else:
        OUT_DIR = Path(out_dir)
        OUT_DIR.mkdir(parents=True, exist_ok=True)

    IN_MANIFEST = ASR_DIR / "asr.jsonl"
    TRAIN_MAN = OUT_DIR / "train_manifest.jsonl"
    FULL_MAN = OUT_DIR / "full_manifest.jsonl"
    EVAL_MAN = OUT_DIR / "eval_manifest.jsonl"

    # Read as UTF-8 explicitly so transcripts decode the same on every platform.
    with open(IN_MANIFEST, "r", encoding="utf-8") as fin:
        raw_items = [json.loads(line) for line in fin if line.strip()]

    # executor.map preserves input order, so entries line up with raw_items.
    with ThreadPoolExecutor() as executor:
        entries = list(
            tqdm(
                executor.map(lambda x: get_entry(x, ASR_DIR), raw_items),
                total=len(raw_items),
                desc="Processing audio metadata",
            )
        )

    train, val = manual_train_test_split(entries, test_size=0.1, random_state=42)

    for path, data in [(FULL_MAN, entries), (TRAIN_MAN, train), (EVAL_MAN, val)]:
        # ensure_ascii=False emits raw non-ASCII characters, so the file must be
        # written as UTF-8 (the evaluation cell reads these manifests as UTF-8).
        with open(path, "w", encoding="utf-8") as fout:
            for e in data:
                fout.write(json.dumps(e, ensure_ascii=False) + "\n")

    print(f"Wrote {len(train)} train, {len(val)} eval samples and {len(entries)} into full.")
    return TRAIN_MAN, EVAL_MAN


def freeze_layers(model, num_frozen_layers=8):
    """Turn off gradients for the lower part of the model, in place.

    Sets ``requires_grad = False`` on every parameter of:
      * ``model.encoder.pre_encode``,
      * the first ``num_frozen_layers`` entries of ``model.encoder.layers``,
      * ``model.decoder.prediction["embed"]``.
    All other parameters are left untouched.

    Args:
        model: Object exposing the attributes listed above; each frozen module
            must provide ``parameters()``.
        num_frozen_layers: How many leading encoder layers to freeze
            (default: 8). Values <= 0 freeze no encoder layer.

    Returns:
        None.
    """
    def _freeze(module):
        # Disable gradient tracking for every parameter the module yields.
        for p in module.parameters():
            p.requires_grad = False

    _freeze(model.encoder.pre_encode)
    for i, layer in enumerate(model.encoder.layers):
        if i >= num_frozen_layers:
            break
        _freeze(layer)
    _freeze(model.decoder.prediction["embed"])

In [None]:
# Build the manifests and keep the train/eval paths as notebook globals;
# the training cell's main() reads TRAIN_MAN and EVAL_MAN directly.
TRAIN_MAN, EVAL_MAN = build_manifests()

In [None]:
from numba import cuda
import nemo.collections.asr as nemo_asr
import lightning.pytorch as pl
from lightning.pytorch.loggers import TensorBoardLogger
from nemo.utils import logging
import torch
from pytorch_lightning.loggers import CSVLogger  # Import CSVLogger here

def main():
    """Fine-tune ``nvidia/parakeet-tdt-0.6b-v2`` and save it as ``parakeet_finetuned.nemo``.

    Reads the notebook globals ``TRAIN_MAN`` and ``EVAL_MAN`` for the manifest
    paths, freezes the lower layers via ``freeze_layers``, trains for 8 epochs on
    one GPU (batch size 4, gradients accumulated over 4 batches, clipped at 1.0)
    and logs metrics as CSV under ``/content/logs/parakeet_finetune``.

    Returns:
        Tuple ``(trainer, csv_logger)`` so the caller can inspect logged metrics.
    """
    # Take the logger from the same Lightning namespace as `pl.Trainer`
    # (`lightning.pytorch`). The standalone `pytorch_lightning` package is a
    # separate distribution and should not be mixed with it.
    from lightning.pytorch.loggers import CSVLogger

    # NOTE(review): numba is already imported at the top of this cell, so these
    # NUMBA_* variables may be read too late to take effect - confirm. The
    # pynvjitlink switch is therefore also set on numba.config directly below.
    os.environ["NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY"] = "0"  # Disable for CUDA >= 12.0
    os.environ["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "1"  # Enable for CUDA >= 12.0
    os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"

    from numba import config
    config.CUDA_ENABLE_PYNVJITLINK = 1
    print("Pynvjit link enabled? ", config.CUDA_ENABLE_PYNVJITLINK)

    # Load on CPU first; the Trainer moves the model to the GPU when fitting.
    model = nemo_asr.models.ASRModel.from_pretrained(
        "nvidia/parakeet-tdt-0.6b-v2",
        map_location="cpu"
    )
    # Presumably enables activation checkpointing in the encoder - TODO confirm.
    model.encoder.checkpointing = True
    freeze_layers(model)

    sr = 16000
    labels = model.joint.vocabulary

    train_cfg = {
        "manifest_filepath": str(TRAIN_MAN),
        "sample_rate": sr,
        "labels": labels,
        "batch_size": 4,
        "shuffle": True,
        "num_workers": 1,
        "pin_memory": False
    }
    # Validation reuses the training settings, minus shuffling, on the eval manifest.
    val_cfg = dict(train_cfg, shuffle=False, manifest_filepath=str(EVAL_MAN))

    model.setup_training_data(train_data_config=train_cfg)
    model.setup_validation_data(val_data_config=val_cfg)

    # Log metrics to CSV rather than TensorBoard.
    csv_logger = CSVLogger(
        save_dir="/content/logs",
        name="parakeet_finetune"
    )

    torch.cuda.empty_cache()

    trainer = pl.Trainer(
        accelerator="gpu",
        devices=1,
        strategy="auto",
        precision=32,
        max_epochs=8,
        gradient_clip_val=1.0,
        accumulate_grad_batches=4,
        logger=csv_logger,
        log_every_n_steps=10
    )

    trainer.fit(model)
    model.save_to("parakeet_finetuned.nemo")

    # Return trainer for accessing metrics
    return trainer, csv_logger

# Inside a notebook kernel __name__ is "__main__", so training starts as soon as
# this cell runs. `logger` receives the CSVLogger returned by main().
if __name__ == "__main__":
    trainer, logger = main()

In [None]:
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torch
import logging
import nemo.collections.asr as nemo_asr
from jiwer import Compose, ToLowerCase, SubstituteRegexes, RemovePunctuation, ReduceToListOfListOfWords, wer

def main(is_eval, batch_size=2):
    """Transcribe a manifest with the fine-tuned model and report ``1 - WER``.

    NOTE: this definition reuses the name ``main`` of the training entry point;
    once this cell has run, ``main`` refers to evaluation.

    Args:
        is_eval: ``True`` scores ``eval_manifest.jsonl`` and writes
            ``eval_results.csv``; ``False`` scores ``full_manifest.jsonl`` and
            writes ``full_results.csv``.
        batch_size: Number of audio files passed to ``model.transcribe`` per
            call (default: 2).

    Raises:
        ValueError: If the selected manifest contains no entries.
    """
    if is_eval:
        split_name, results_csv = "eval", "eval_results.csv"
    else:
        split_name, results_csv = "full", "full_results.csv"
    manifest = Path(f"{split_name}_manifest.jsonl")

    # Read the manifest before loading the model so an empty or missing file
    # fails fast instead of after the checkpoint has been restored.
    with open(manifest, encoding="utf-8") as f:
        instances = [json.loads(line) for line in f if line.strip()]
    if not instances:
        raise ValueError(f"{manifest} contains no entries to evaluate")

    audio_paths = [item["audio_filepath"] for item in instances]
    refs = [item["text"] for item in instances]
    # Fall back to the row index when a manifest entry has no "key" field.
    keys = [item.get("key", idx) for idx, item in enumerate(instances)]

    # Load fine-tuned model onto GPU
    model = nemo_asr.models.ASRModel.restore_from("parakeet_finetuned.nemo", map_location="cuda:0").cuda().eval()

    # Transcribe in batches of `batch_size`
    hyps = []
    with torch.no_grad():
        for i in tqdm(range(0, len(audio_paths), batch_size), desc="Inference on GPU"):
            # The slice width must follow batch_size, otherwise files are skipped
            # or transcribed twice whenever batch_size != 2.
            batch = audio_paths[i:i + batch_size]
            out_batch = model.transcribe(batch, batch_size=batch_size)
            # Results may be plain strings or objects carrying a .text attribute.
            hyps.extend([o.text if hasattr(o, "text") else o for o in out_batch])

    # Define normalization and WER transformation pipeline
    transform = Compose([
        ToLowerCase(),
        SubstituteRegexes({"-": " "}),
        RemovePunctuation(),
        ReduceToListOfListOfWords()
    ])

    # NOTE(review): newer jiwer releases call the first keyword
    # `reference_transform`; confirm `truth_transform` is still accepted by the
    # installed version.
    score = 1 - wer(refs, hyps, truth_transform=transform, hypothesis_transform=transform)
    print(f"\n✅ 1 - WER on {split_name} set: {score:.3f}\n")

    # Print first 10 mismatches
    print("🔍 First 10 mismatches:")
    mismatch_count = 0
    for k, r, h in zip(keys, refs, hyps):
        if transform(r) != transform(h):
            print(f"- {k}\n    REF: {r}\n    HYP: {h}")
            mismatch_count += 1
            if mismatch_count >= 10:
                break

    # Save full results to CSV
    pd.DataFrame({"key": keys, "ref": refs, "hyp": hyps}).to_csv(results_csv, index=False)

# Score the held-out eval manifest first, then the full manifest (which also
# contains the samples used for training).
if __name__ == "__main__":
    main(True)
    main(False)

In [None]:
# prompt: save parakeet_finetuned.nemo, eval_results.csv and full_results.csv on g drive

!cp parakeet_finetuned.nemo /content/drive/MyDrive/ASRM3/
!cp eval_results.csv /content/drive/MyDrive/ASRM3/
!cp full_results.csv /content/drive/MyDrive/ASRM3/

In [None]:
# 1) Install spaCy and matplotlib, then download the small English spaCy model
#    (run once per runtime; the model is loaded further down with spacy.load)
!pip install spacy matplotlib
!python -m spacy download en_core_web_sm

# 2) Imports
import json
from collections import Counter
from pathlib import Path
import spacy
import matplotlib.pyplot as plt

# 3) Load spaCy model
nlp = spacy.load("en_core_web_sm")

# 4) Read manifest and collect proper nouns
FULL_MAN = Path("full_manifest.jsonl")  # adjust path if needed
propn_counts = Counter()

with FULL_MAN.open("r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline) so json.loads never sees "".
    texts = [json.loads(line).get("text", "") for line in f if line.strip()]

# POS-tag the transcripts and count PROPN tokens. nlp.pipe streams the texts
# through the pipeline in batches instead of calling nlp() once per line.
for doc in nlp.pipe(texts):
    # Normalize each token to title-case so case variants share one counter key.
    propn_counts.update(
        token.text.strip().title() for token in doc if token.pos_ == "PROPN"
    )

# 5) Plot the most frequent proper nouns (at most top_n of them)
top_n = 50
most_common = propn_counts.most_common(top_n)

if not most_common:
    # zip(*[]) yields nothing, so unpacking into names/freqs would raise ValueError.
    print("No proper nouns found in FULL_MAN; nothing to plot.")
else:
    names, freqs = zip(*most_common)

    # Grow the figure with the number of bars so the labels stay legible.
    plt.figure(figsize=(12, max(6, 0.3 * len(names))))
    bars = plt.barh(names, freqs, color="skyblue")
    plt.gca().invert_yaxis()  # highest on top
    # Use the actual bar count: there may be fewer than top_n distinct nouns.
    plt.title(f"Top {len(names)} Proper Nouns in FULL_MAN")
    plt.xlabel("Frequency")
    plt.tight_layout()
    plt.show()