<a href="https://colab.research.google.com/github/BarakatPay/stt_pashto/blob/main/STT_Pashto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# -----------------------------------------------------------
# 📓 Cell 1 – Install / Import dependencies
# -----------------------------------------------------------
!pip install -q --upgrade transformers datasets soundfile evaluate tqdm
!pip install jiwer

import os, json, re, torch, soundfile as sf
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import evaluate


Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.13.0


In [3]:
# -----------------------------------------------------------
# 📓 Cell 2 – Mount Drive & set paths
# -----------------------------------------------------------
from google.colab import drive
drive.mount('/content/drive')

# Adjust these to where you’ve placed your test.json, audio files, and model folder
JSON_PATH = "/content/drive/MyDrive/STT_Pashto/test.json"
AUDIO_DIR = "/content/drive/MyDrive/STT_Pashto/audios"
MODEL_DIR = "/content/drive/MyDrive/STT_Pashto/Model"

assert os.path.exists(JSON_PATH), f"JSON not found → {JSON_PATH}"
assert os.path.isdir(AUDIO_DIR),  f"Audio folder not found → {AUDIO_DIR}"
assert os.path.isdir(MODEL_DIR),  f"Model folder not found → {MODEL_DIR}"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# -----------------------------------------------------------
# 📓 Cell 3 – Load processor & model
# -----------------------------------------------------------
processor = Wav2Vec2Processor.from_pretrained(MODEL_DIR)
model     = Wav2Vec2ForCTC.from_pretrained(MODEL_DIR)
device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

print("✓ model loaded on", device)


✓ model loaded on cuda


In [5]:
# -----------------------------------------------------------
# 📓 Cell 4 – Load test JSON → Hugging Face Dataset
# -----------------------------------------------------------
with open(JSON_PATH, "r", encoding="utf-8") as f:
    test_list = json.load(f)

test_ds = Dataset.from_list(test_list)  # expects columns: id, file, sentence, …
print("Rows in test set →", len(test_ds))


Rows in test set → 21


In [6]:
# -----------------------------------------------------------
# 📓 Cell 5 – (Option) Pashto text normaliser
# -----------------------------------------------------------
def normalise(text: str) -> str:
    text = re.sub(r"[,?.!\-;:\"'%�—…–()]", "", text.lower())
    tokens = [w for w in text.split() if not re.search(r"[a-z0-9]", w)]
    return " ".join(tokens)

test_ds = test_ds.map(lambda ex: {"sentence_norm": normalise(ex["sentence"])})


Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [7]:
# -----------------------------------------------------------
# 📓 Cell 6 – Audio loader helper
# -----------------------------------------------------------
def load_wav(path: str, target_sr: int = 16_000):
    audio, sr = sf.read(path)
    if audio.ndim > 1:  # stereo → mono
        audio = audio[:, 0]
    if sr != target_sr:
        raise ValueError(f"Sample-rate {sr} ≠ {target_sr} (please resample)")
    return audio

def file_to_input_values(wav_path: str):
    wav = load_wav(wav_path)
    return processor(wav, sampling_rate=16_000, return_tensors="pt").input_values[0]


In [8]:
# -----------------------------------------------------------
# 📓 Cell 7 – Batch inference
# -----------------------------------------------------------
def predict_batch(batch):
    wav_paths = [os.path.join(AUDIO_DIR, fn) for fn in batch["file"]]
    inputs    = [file_to_input_values(p) for p in wav_paths]

    # pad & stack
    inputs_pad = processor.pad({"input_values": inputs},
                               padding=True, return_tensors="pt").to(device)

    with torch.no_grad():
        logits = model(inputs_pad.input_values).logits
    pred_ids  = torch.argmax(logits, dim=-1)

    batch["prediction"] = processor.batch_decode(pred_ids, skip_special_tokens=True)
    return batch

predicted = test_ds.map(predict_batch, batched=True, batch_size=8)




Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [13]:
# -----------------------------------------------------------
# 📓 Cell 8 – Compute WER / CER
# -----------------------------------------------------------
wer = evaluate.load("wer")
cer = evaluate.load("cer")

refs = predicted["sentence_norm"]
hyps = predicted["prediction"]

scores = {
    "WER": wer.compute(predictions=hyps, references=refs),
    "CER": cer.compute(predictions=hyps, references=refs)
}

print("📊 Evaluation scores:", scores)


Downloading builder script: 0.00B [00:00, ?B/s]

📊 Evaluation scores: {'WER': 0.31189083820662766, 'CER': 0.09425190194420964}


In [14]:
# -----------------------------------------------------------
# 📓 Cell 9 – View / save predictions
# -----------------------------------------------------------
df = pd.DataFrame({
    "id":           predicted["id"],
    "file":         predicted["file"],
    "reference":    predicted["sentence_norm"],
    "hypothesis":   predicted["prediction"]
})
df["same"] = df.reference.str.strip() == df.hypothesis.str.strip()

# Inspect mismatches
display(df[df.same == False].head(20))

# (Optional) Save CSV back to Drive
CSV_OUT = os.path.join(MODEL_DIR, "test_predictions.csv")
df.to_csv(CSV_OUT, index=False, encoding="utf-8")
print("✓ predictions saved to", CSV_OUT)


Unnamed: 0,id,file,reference,hypothesis,same
0,41,common_voice_ps_867624441144210149510537625194...,انسان د خدای ج تر ټولو غوره مخلوق دی، چې دغه غ...,انسان د ختی ج تر ټولو غوره مخلوق دی چې دغه غور...,False
1,42,common_voice_ps_112081786120046985781292310308...,هغه څه چې د افکارو د څرګندولو ډېره ښه وسیله کې...,هغه څه چې د افکارو د څرګندولو ډېره ښه وسیله کې...,False
2,43,common_voice_ps_773846969122979182352897775343...,نو ویلی شو، چې ژبه یوه ټولنیزه، اکتسابي او ثقا...,نو ویلی شو، چې ژبه یوه ټول نیزه اکتصابي او صقا...,False
3,44,common_voice_ps_773467273126716576115421998424...,ټولنیزه یانې څوک چې په هره ټولنه کې اوسېږي، د ...,ټولنیزه یان ې څوک چې په هره ټولنه کې اوسېږي د ...,False
4,45,common_voice_ps_40207995567583246555733317145.wav,اکتسابي یانې ژبه کسبېږي او زده کېږي، ژبه له او...,اکتصابي ینې ژبه کې سپېږي اوزده کېږي ژبه له اول...,False
5,46,common_voice_ps_210715119141424292118319111274...,ثقافتي یانې دود او دستور رسم او رواج هم پر ژبه...,صقافتي یانې دود او دستور رسم او رواج هم پر ژبه...,False
6,47,common_voice_ps_308484645140210818711207847084...,ژبه د پوهیدو او را پوهولو وسیله ده، چې انسان د...,ژبه د پوهېدو او را پوهولو وسیله ده چې انسان د ...,False
8,49,common_voice_ps_97155046649860492966265030249.wav,پښتو ژبه له لرغونو اريایي ژبو څخه يوه خپلواکه ...,پښتو ژبه له لرغونو اریایي ژبو څخه یوه خپلواکه ...,False
9,50,common_voice_ps_137450322584400871290840524550...,دا ژبه د پښتو، پختو، پوختو، په هندي کې د پټاني...,دا ژبه د پښتو پختو، پوختو په هندي کې د پټاني ا...,False
10,51,common_voice_ps_185485579215492298477286615685...,د هندو اروپایي ژبو کورنۍ په څلورو لویو برخو وې...,د هندو اروپایي ژبو کورنۍ په څلورو لویو برخوېشل...,False


✓ predictions saved to /content/drive/MyDrive/STT_Pashto/Model/test_predictions.csv
