In [2]:
# Cell 1 - imports & helpers
from pathlib import Path
import shutil
import unicodedata
import re
import csv
from typing import List, Tuple


In [2]:
# Cell 2 - scan authors and show subfolders
def scan_authors(root: str) -> List[Tuple[str, List[str]]]:
    """List every author folder under ``root`` with its sub-folder names.

    Args:
        root: Directory whose immediate sub-directories are author folders.

    Returns:
        One ``(author_name, subfolder_names)`` tuple per author directory,
        ordered by author path; ``subfolder_names`` is sorted and contains
        only directories (plain files are ignored at both levels).
    """
    base = Path(root)
    author_dirs = sorted(entry for entry in base.iterdir() if entry.is_dir())
    return [
        (
            author_dir.name,
            sorted(sub.name for sub in author_dir.iterdir() if sub.is_dir()),
        )
        for author_dir in author_dirs
    ]

# Usage: print each author folder followed by its language sub-folders
root = Path("dataset")   # adjust path if needed
author_summary = scan_authors(root)
for author, dirs in author_summary:
    print(author, "->", dirs)


ahmad-faraz -> ['en', 'hi', 'ur']
akbar-allahabadi -> ['en', 'hi', 'ur']
allama-iqbal -> ['en', 'hi', 'ur']
altaf-hussain-hali -> ['en', 'hi', 'ur']
ameer-khusrau -> ['en', 'hi', 'ur']
bahadur-shah-zafar -> ['en', 'hi', 'ur']
dagh-dehlvi -> ['en', 'hi', 'ur']
fahmida-riaz -> ['en', 'hi', 'ur']
faiz-ahmad-faiz -> ['en', 'hi', 'ur']
firaq-gorakhpuri -> ['en', 'hi', 'ur']
gulzar -> ['en', 'hi', 'ur']
habib-jalib -> ['en', 'hi', 'ur']
jaan-nisar-akhtar -> ['en', 'hi', 'ur']
jaun-eliya -> ['en', 'hi', 'ur']
javed-akhtar -> ['en', 'hi', 'ur']
jigar-moradabadi -> ['en', 'hi', 'ur']
kaifi-azmi -> ['en', 'hi', 'ur']
meer-anees -> ['en', 'hi', 'ur']
meer-taqi-meer -> ['en', 'hi', 'ur']
mirza-ghalib -> ['en', 'hi', 'ur']
mohsin-naqvi -> ['en', 'hi', 'ur']
naji-shakir -> ['en', 'hi', 'ur']
naseer-turabi -> ['en', 'hi', 'ur']
nazm-tabatabai -> ['en', 'hi', 'ur']
nida-fazli -> ['en', 'hi', 'ur']
noon-meem-rashid -> ['en', 'hi', 'ur']
parveen-shakir -> ['en', 'hi', 'ur']
sahir-ludhianvi -> ['en', 'hi

In [4]:
# Cell 3 - restructure: move non-ur/en to a _removed folder (dry-run first)
def restructure_dataset(root: str, removed_dir_name="_removed", dry_run=True):
    """Move every author sub-folder other than ``ur``/``en`` into a quarantine folder.

    For each author directory directly under ``root``, every sub-directory whose
    name (case-insensitively) is not ``ur`` or ``en`` is relocated to
    ``root/<removed_dir_name>/<author>/<subfolder>``. Plain files are left alone.

    Args:
        root: Dataset directory containing one folder per author.
        removed_dir_name: Name of the quarantine folder created inside ``root``.
        dry_run: When True (default) nothing is moved; the planned moves are
            only returned.

    Returns:
        List of ``(source_path, destination_path)`` string tuples, one per
        directory that was (or, in a dry run, would be) moved.
    """
    root = Path(root)
    removed_root = root / removed_dir_name
    actions = []   # collect actions for display

    # The quarantine folder lives inside ``root``; it must not be treated as an
    # author, otherwise a second run would move its contents into
    # ``<removed>/<removed>/...``.
    author_dirs = sorted(
        p for p in root.iterdir()
        if p.is_dir() and p.name != removed_dir_name
    )

    for author in author_dirs:
        for child in sorted(author.iterdir()):
            if not child.is_dir():
                # keep files as-is; only directories are handled here
                continue
            if child.name.lower() in ("ur", "en"):
                continue
            # this dir should be moved
            dest = removed_root / author.name / child.name
            actions.append((str(child), str(dest)))
            if not dry_run:
                dest.parent.mkdir(parents=True, exist_ok=True)
                # NOTE: shutil.move places the source *inside* dest when dest
                # already exists as a directory.
                shutil.move(str(child), str(dest))

    return actions

# Dry-run: only LIST what would be moved; nothing on disk changes.
# (This call previously passed dry_run=False, which moved the folders while
# still printing the "DRY RUN" banner.) After reviewing the list, perform the
# real move in a separate, explicit call with dry_run=False.
actions = restructure_dataset(root="dataset", dry_run=True)
print("== DRY RUN: these directories WOULD be moved to dataset/_removed/author_name/ … ==")
for src, dst in actions:
    print(src, "->", dst)
print(f"\nTotal to move: {len(actions)}")


== DRY RUN: these directories WOULD be moved to dataset/_removed/author_name/ … ==
dataset\ahmad-faraz\hi -> dataset\_removed\ahmad-faraz\hi
dataset\akbar-allahabadi\hi -> dataset\_removed\akbar-allahabadi\hi
dataset\allama-iqbal\hi -> dataset\_removed\allama-iqbal\hi
dataset\altaf-hussain-hali\hi -> dataset\_removed\altaf-hussain-hali\hi
dataset\ameer-khusrau\hi -> dataset\_removed\ameer-khusrau\hi
dataset\bahadur-shah-zafar\hi -> dataset\_removed\bahadur-shah-zafar\hi
dataset\dagh-dehlvi\hi -> dataset\_removed\dagh-dehlvi\hi
dataset\fahmida-riaz\hi -> dataset\_removed\fahmida-riaz\hi
dataset\faiz-ahmad-faiz\hi -> dataset\_removed\faiz-ahmad-faiz\hi
dataset\firaq-gorakhpuri\hi -> dataset\_removed\firaq-gorakhpuri\hi
dataset\gulzar\hi -> dataset\_removed\gulzar\hi
dataset\habib-jalib\hi -> dataset\_removed\habib-jalib\hi
dataset\jaan-nisar-akhtar\hi -> dataset\_removed\jaan-nisar-akhtar\hi
dataset\jaun-eliya\hi -> dataset\_removed\jaun-eliya\hi
dataset\javed-akhtar\hi -> dataset\_remov

In [5]:
# Cell 4 - normalization functions
def normalize_urdu(s: str) -> str:
    """Normalize one Urdu string to a single clean line.

    Applies Unicode NFKC normalization, converts CR / CRLF line endings to LF,
    then collapses every whitespace run (including newlines) to one space and
    strips the ends.
    """
    composed = unicodedata.normalize("NFKC", s)
    unix_newlines = re.sub(r'\r\n?', '\n', composed)   # CRLF / CR -> LF
    single_spaced = re.sub(r'\s+', ' ', unix_newlines)  # whitespace run -> one space
    return single_spaced.strip()

def normalize_roman(s: str) -> str:
    """Normalize one Roman-Urdu string to a single lowercase line.

    Applies Unicode NFKC normalization, lowercases the text, then collapses
    every whitespace run to one space and strips the ends.
    """
    lowered = unicodedata.normalize("NFKC", s).lower()   # roman lowercasing
    return re.sub(r'\s+', ' ', lowered).strip()


In [6]:
# Cell 5 - create parallel corpus
def create_parallel_dataset(root: str,
                            out_dir_name: str = "parallel",
                            align_mode: str = "strict",   # "strict" or "merge"
                            dry_run: bool = True):
    """
    Build an Urdu -> Roman parallel corpus from ``<author>/ur`` and ``<author>/en`` folders.

    Every file in an author's ``ur`` folder is paired with the file of the same
    name in that author's ``en`` folder. Author folders lacking either sub-folder
    and the ``_removed`` folder are skipped. Urdu text goes through
    ``normalize_urdu`` and Roman text through ``normalize_roman``.

    root: dataset directory containing one folder per author
    out_dir_name: name of the output folder created inside ``root``
    align_mode:
      - "strict": only add pairs when the number of non-empty lines matches exactly
      - "merge": if line counts differ, treat the whole file pair as a single example
        (entire file -> entire file), provided BOTH files contain text; a pair with an
        empty side is recorded as a mismatch instead of producing an empty example
      Any other value behaves like "strict".
    dry_run: if True, don't write outputs; return statistics and sample entries

    Returns:
      - dry run: dict with "pairs_count", "mismatches_count", "pairs_sample" and
        "mismatches_sample" (first 6 entries each)
      - otherwise: dict with "pairs_written", "mismatches_written" and "out_dir",
        after writing src.txt, tgt.txt, metadata.csv and mismatches.csv
    """
    root = Path(root)
    out = root / out_dir_name
    pairs = []     # list of tuples: (src_line, tgt_line, author, filename)
    mismatches = []  # (author, filename, ur_lines, en_lines)

    for author in sorted([p for p in root.iterdir() if p.is_dir() and p.name != "_removed"]):
        ur_dir = author / "ur"
        en_dir = author / "en"
        if not ur_dir.exists() or not en_dir.exists():
            continue
        for ur_file in sorted(ur_dir.glob("*")):
            # only consider files (txt or otherwise)
            if not ur_file.is_file():
                continue
            en_file = en_dir / ur_file.name
            if not en_file.exists():
                mismatches.append((author.name, ur_file.name, "no_en_file", "0"))
                continue

            ur_text = ur_file.read_text(encoding="utf-8", errors="replace").strip()
            en_text = en_file.read_text(encoding="utf-8", errors="replace").strip()
            ur_lines = [ln.strip() for ln in ur_text.splitlines() if ln.strip()]
            en_lines = [ln.strip() for ln in en_text.splitlines() if ln.strip()]

            if ur_lines and len(ur_lines) == len(en_lines):
                # line-for-line alignment
                for u, e in zip(ur_lines, en_lines):
                    pairs.append((normalize_urdu(u), normalize_roman(e), author.name, ur_file.name))
            elif align_mode == "merge" and ur_lines and en_lines:
                # treat whole file as a single pair; both sides are known to be
                # non-empty, so no blank example reaches src.txt / tgt.txt
                pairs.append((normalize_urdu(ur_text), normalize_roman(en_text), author.name, ur_file.name))
            else:
                # record mismatch for manual inspection
                mismatches.append((author.name, ur_file.name, len(ur_lines), len(en_lines)))

    # Dry-run: return counts and small samples
    if dry_run:
        return {
            "pairs_count": len(pairs),
            "mismatches_count": len(mismatches),
            "pairs_sample": pairs[:6],
            "mismatches_sample": mismatches[:6]
        }

    # Write outputs
    out.mkdir(parents=True, exist_ok=True)
    src_path = out / "src.txt"
    tgt_path = out / "tgt.txt"
    meta_path = out / "metadata.csv"

    with src_path.open("w", encoding="utf-8") as sf, tgt_path.open("w", encoding="utf-8") as tf, meta_path.open("w", newline='', encoding="utf-8") as mf:
        writer = csv.writer(mf)
        writer.writerow(["src_text", "tgt_text", "author", "filename"])
        for u, e, author_name, fname in pairs:
            sf.write(u.replace("\n", " ") + "\n")   # keep single-line per example
            tf.write(e.replace("\n", " ") + "\n")
            writer.writerow([u, e, author_name, fname])

    # write mismatches for review
    with (out / "mismatches.csv").open("w", newline='', encoding="utf-8") as mm:
        w = csv.writer(mm)
        w.writerow(["author", "filename", "ur_lines", "en_lines"])
        for rec in mismatches:
            w.writerow(rec)

    return {
        "pairs_written": len(pairs),
        "mismatches_written": len(mismatches),
        "out_dir": str(out)
    }

# Dry-run example (strict alignment): compute statistics only, write nothing
stats = create_parallel_dataset(root="dataset", align_mode="strict", dry_run=True)
print(
    "DRY RUN (strict) -> pairs:", stats["pairs_count"],
    "mismatches:", stats["mismatches_count"],
)
print("Sample pairs:", stats["pairs_sample"])
print("Sample mismatches:", stats["mismatches_sample"])


DRY RUN (strict) -> pairs: 20856 mismatches: 12
Sample pairs: [('آنکھ سے دور نہ ہو دل سے اتر جائے گا', 'aañkh se duur na ho dil se utar jā.egā', 'ahmad-faraz', 'aankh-se-duur-na-ho-dil-se-utar-jaaegaa-ahmad-faraz-ghazals'), ('وقت کا کیا ہے گزرتا ہے گزر جائے گا', 'vaqt kā kyā hai guzartā hai guzar jā.egā', 'ahmad-faraz', 'aankh-se-duur-na-ho-dil-se-utar-jaaegaa-ahmad-faraz-ghazals'), ('اتنا مانوس نہ ہو خلوت غم سے اپنی', 'itnā mānūs na ho ḳhalvat-e-ġham se apnī', 'ahmad-faraz', 'aankh-se-duur-na-ho-dil-se-utar-jaaegaa-ahmad-faraz-ghazals'), ('تو کبھی خود کو بھی دیکھے گا تو ڈر جائے گا', 'tū kabhī ḳhud ko bhī dekhegā to dar jā.egā', 'ahmad-faraz', 'aankh-se-duur-na-ho-dil-se-utar-jaaegaa-ahmad-faraz-ghazals'), ('ڈوبتے ڈوبتے کشتی کو اچھالا دے دوں', 'dūbte dūbte kashtī ko uchhālā de duuñ', 'ahmad-faraz', 'aankh-se-duur-na-ho-dil-se-utar-jaaegaa-ahmad-faraz-ghazals'), ('میں نہیں کوئی تو ساحل پہ اتر جائے گا', 'maiñ nahīñ koī to sāhil pe utar jā.egā', 'ahmad-faraz', 'aankh-se-duur-na-ho-dil-se-

In [7]:
# Cell 6 - create for real: this call is live (dry_run=False) and writes
# src.txt, tgt.txt, metadata.csv and mismatches.csv under dataset/parallel
result = create_parallel_dataset(root="dataset", align_mode="strict", dry_run=False)
print(result)


{'pairs_written': 20856, 'mismatches_written': 12, 'out_dir': 'dataset\\parallel'}


In [11]:
from pathlib import Path
from pprint import pprint

# List the generated corpus folder and preview the first lines of src.txt
p = Path("dataset/parallel")
if not p.exists():
    print("No parallel dir yet. Run create_parallel_dataset with dry_run=False.")
else:
    pprint(sorted(p.iterdir()))

    src_file = p / "src.txt"
    if not src_file.exists():
        print("src.txt not found in dataset/parallel")
    else:
        print("\nFirst 3 lines (src.txt):")
        print(*src_file.read_text(encoding='utf-8').splitlines()[:3], sep="\n")


[WindowsPath('dataset/parallel/metadata.csv'),
 WindowsPath('dataset/parallel/mismatches.csv'),
 WindowsPath('dataset/parallel/src.txt'),
 WindowsPath('dataset/parallel/tgt.txt')]

First 3 lines (src.txt):
آنکھ سے دور نہ ہو دل سے اتر جائے گا
وقت کا کیا ہے گزرتا ہے گزر جائے گا
اتنا مانوس نہ ہو خلوت غم سے اپنی


In [12]:
# List the generated corpus folder and preview the first lines of tgt.txt.
# The labels and the not-found message previously said "src.txt" although this
# cell reads tgt.txt; the variable is renamed to match as well.
p = Path("dataset/parallel")
if p.exists():
    pprint(sorted(p.iterdir()))

    tgt_preview_file = p / "tgt.txt"
    if tgt_preview_file.exists():
        print("\nFirst 3 lines (tgt.txt):")
        print("\n".join(tgt_preview_file.read_text(encoding='utf-8').splitlines()[:3]))
    else:
        print("tgt.txt not found in dataset/parallel")
else:
    print("No parallel dir yet. Run create_parallel_dataset with dry_run=False.")


[WindowsPath('dataset/parallel/metadata.csv'),
 WindowsPath('dataset/parallel/mismatches.csv'),
 WindowsPath('dataset/parallel/src.txt'),
 WindowsPath('dataset/parallel/tgt.txt')]

First 3 lines (src.txt):
aañkh se duur na ho dil se utar jā.egā
vaqt kā kyā hai guzartā hai guzar jā.egā
itnā mānūs na ho ḳhalvat-e-ġham se apnī


In [15]:

# Character inventory of the Roman Urdu target file
tgt_file = Path("dataset/parallel/tgt.txt")

if not tgt_file.exists():
    print("tgt.txt not found at", tgt_file)
else:
    text = tgt_file.read_text(encoding="utf-8", errors="ignore")

    # Every distinct character, in code-point order
    unique_chars = sorted(set(text))

    print("Total unique characters:", len(unique_chars))
    print("Characters:")
    for ch in unique_chars:
        # invisible characters get a readable label; everything else prints as-is
        print({"\n": "\\n (newline)", "\t": "\\t (tab)", " ": "␣ (space)"}.get(ch, ch))


Total unique characters: 43
Characters:
\n (newline)
␣ (space)
!
'
,
-
.
?
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
ñ
ā
ē
ġ
ī
ū
۔
ḍ
ḳ


In [None]:
from pathlib import Path

# Input: Roman Urdu target file; output: its diacritic-free copy.
# NOTE(review): create_parallel_dataset in this notebook writes to
# <root>/parallel by default, and no visible cell creates dataset/combined —
# confirm these paths point at the intended folder.
tgt_file = Path("dataset/combined/tgt.txt")
out_file = Path("dataset/combined/tgt_normalized.txt")

# Character substitutions for the Roman Urdu text: diacritic letters map to
# their plain ASCII letter; both full stops and the ZWNJ are deleted.
replacements = {
    "ñ": "n",
    "ā": "a",
    "ē": "e",
    "ġ": "g",
    "ī": "i",
    "ū": "u",
    "ḍ": "d",
    "ḳ": "k",
    ".": "",
    "۔": "",
    "\u200c": "",
}

def normalize_text(text: str) -> str:
    """Return ``text`` with every key of ``replacements`` swapped for its value.

    Substitutions are applied one after another in the dictionary's order.
    """
    cleaned = text
    for old_char, new_char in replacements.items():
        if old_char in cleaned:
            cleaned = cleaned.replace(old_char, new_char)
    return cleaned

if not tgt_file.exists():
    print("❌ tgt.txt not found at", tgt_file)
else:
    # Read the whole file, normalize it in one pass, and save the result
    text = tgt_file.read_text(encoding="utf-8", errors="ignore")
    normalized = normalize_text(text)
    out_file.write_text(normalized, encoding="utf-8")

    print(f"✅ Normalized file written to: {out_file}")
    print("Sample before → after:\n")
    # Show the first five lines next to their normalized form
    for i, line in enumerate(text.splitlines()[:5]):
        print("ORIG:", line)
        print("NORM:", normalize_text(line))
        print()


✅ Normalized file written to: dataset\parallel\tgt_normalized.txt
Sample before → after:

ORIG: aañkh se duur na ho dil se utar jā.egā
NORM: aankh se duur na ho dil se utar jaega

ORIG: vaqt kā kyā hai guzartā hai guzar jā.egā
NORM: vaqt ka kya hai guzarta hai guzar jaega

ORIG: itnā mānūs na ho ḳhalvat-e-ġham se apnī
NORM: itna manus na ho khalvat-e-gham se apni

ORIG: tū kabhī ḳhud ko bhī dekhegā to dar jā.egā
NORM: tu kabhi khud ko bhi dekhega to dar jaega

ORIG: dūbte dūbte kashtī ko uchhālā de duuñ
NORM: dubte dubte kashti ko uchhala de duun



In [3]:

# Character inventory of the normalized Roman Urdu target file
tgt_file = Path("dataset/parallel/tgt_normalized.txt")

if not tgt_file.exists():
    print("tgt.txt not found at", tgt_file)
else:
    text = tgt_file.read_text(encoding="utf-8", errors="ignore")

    # Every distinct character, in code-point order
    unique_chars = sorted(set(text))

    print("Total unique characters:", len(unique_chars))
    print("Characters:")
    for ch in unique_chars:
        # invisible characters get a readable label; everything else prints as-is
        print({"\n": "\\n (newline)", "\t": "\\t (tab)", " ": "␣ (space)"}.get(ch, ch))


tgt.txt not found at dataset\parallel\tgt_normalized.txt


In [24]:

# Vocabulary of whatever file `tgt_file` currently points to (set by an earlier cell)
if not tgt_file.exists():
    print("tgt.txt not found at", tgt_file)
else:
    text = tgt_file.read_text(encoding="utf-8", errors="ignore")

    # Whitespace-separated tokens, then the sorted set of distinct tokens
    words = text.split()
    unique_words = sorted(set(words))

    print("Total unique words:", len(unique_words))
    print("Words:")

    # One word per line
    for word in unique_words:
        print(word)
    

Total unique words: 16801
Words:
'
''aap
''nazm''
'akbar'
'akbar'-e-mast-o-be-khabar
'ali'
'alvi'
'anis'
'asad'
'asad'-e-fitna-intizar
'asad'-e-khasta-jan
'asad'-ullah
'attar'
'azurda'
'dagh'
'dagh'-e-siyah-ru
'faiz'
'faraz'
'farhad'
'firaq'
'firaq'-e-sukhan-ara
'firaq'-e-vatan-avara
'ghalib'
'ghalib'-e-ashufta-nava
'ghalib'-e-ashufta-sar
'ghalib'-e-bad-khu
'ghalib'-e-khasta
'ghalib'-e-shorida
'ghalib'-e-vahshi
'ghazali'
'haidar'
'hali'
'hali'-e-ranjur
'iqbal'
'iqbal'-e-be-nava
'jalib'
'jaun'
'jaun-elia'
'jigar'
'josh'
'kabira'
'kaifi'
'khusrav'
'majruh'
'mamnun'
'mansur'
'mir'
'mir'-e-jigar-sokhta
'mir'-ji
'mir'-o-'ghalib'
'mira'
'mohsin'
'momin'
'mushafi'
'naji'
'nasir'
'nayyar'
'nazm'
'nazm'-e-mubtala
'rashid'
'razi'
'rumi'
'sahbai'
'sahir'
'sanai'
'shefta'
'vahshat'
'vali'
'vasim'
'yagana'
'yaqub'
'yusuf'
'zafar'
'zauq'
aa
aab
aabju
aabla
aable
aabru
aad
aada
aadam
aadat
aade
aadh
aadha
aadhi
aadi
aadmi
aae
aaega
aaegi
aaen
aafat
aag
aagah
aage
aah
aahan
aahat
aahen
aahon
aahu
aai


In [4]:

# Character inventory of the source-side file (src.txt).
# The variable keeps the name `tgt_file` because the word-inventory cell that
# follows reads `tgt_file`; only the comment and the not-found message, which
# wrongly referred to the Roman Urdu target / tgt.txt, are corrected.
tgt_file = Path("dataset/combined/src.txt")

if tgt_file.exists():
    text = tgt_file.read_text(encoding="utf-8", errors="ignore")

    # Collect all unique characters
    unique_chars = sorted(set(text))

    print("Total unique characters:", len(unique_chars))
    print("Characters:")
    for ch in unique_chars:
        # invisible characters get a readable label; everything else prints as-is
        if ch == "\n":
            print("\\n (newline)")
        elif ch == "\t":
            print("\\t (tab)")
        elif ch == " ":
            print("␣ (space)")
        elif ch == "\u200c":
            print("\\u200c (ZWNJ)")
        else:
            print(ch)
else:
    print("src.txt not found at", tgt_file)


Total unique characters: 57
Characters:
\n (newline)
␣ (space)
!
'
،
ؔ
؟
ء
آ
أ
ؤ
ئ
ا
ب
ت
ث
ج
ح
خ
د
ذ
ر
ز
س
ش
ص
ض
ط
ظ
ع
غ
ف
ق
ل
م
ن
و
ً
ٔ
ٰ
ٹ
پ
چ
ڈ
ڑ
ژ
ک
گ
ں
ھ
ہ
ۂ
ۃ
ی
ے
ۓ
\u200c (ZWNJ)


In [5]:
from pathlib import Path

# Input: the source-side file src.txt (held in a variable named `tgt_file`);
# output: its copy with the characters listed in `replacements` removed.
tgt_file = Path("dataset/combined/src.txt")
out_file = Path("dataset/combined/src_normalized.txt")

# Character substitutions for the source text: only the ZWNJ is deleted
replacements = {
    "\u200c": "",
}

def normalize_text(text: str) -> str:
    """Return ``text`` with every key of ``replacements`` swapped for its value.

    Substitutions are applied one after another in the dictionary's order.
    """
    cleaned = text
    for old_char, new_char in replacements.items():
        if old_char in cleaned:
            cleaned = cleaned.replace(old_char, new_char)
    return cleaned

if tgt_file.exists():
    # Read
    text = tgt_file.read_text(encoding="utf-8", errors="ignore")

    # Normalize
    normalized = normalize_text(text)

    # Save
    out_file.write_text(normalized, encoding="utf-8")

    print(f"✅ Normalized file written to: {out_file}")
    print("Sample before → after:\n")
    # Show the first five lines next to their normalized form
    for line in text.splitlines()[:5]:
        print("ORIG:", line)
        print("NORM:", normalize_text(line))
        print()
else:
    # This cell reads src.txt; the message previously named tgt.txt
    print("❌ src.txt not found at", tgt_file)


✅ Normalized file written to: dataset\combined\src_normalized.txt
Sample before → after:

ORIG: آنکھ سے دور نہ ہو دل سے اتر جائے گا
NORM: آنکھ سے دور نہ ہو دل سے اتر جائے گا

ORIG: وقت کا کیا ہے گزرتا ہے گزر جائے گا
NORM: وقت کا کیا ہے گزرتا ہے گزر جائے گا

ORIG: اتنا مانوس نہ ہو خلوت غم سے اپنی
NORM: اتنا مانوس نہ ہو خلوت غم سے اپنی

ORIG: تو کبھی خود کو بھی دیکھے گا تو ڈر جائے گا
NORM: تو کبھی خود کو بھی دیکھے گا تو ڈر جائے گا

ORIG: ڈوبتے ڈوبتے کشتی کو اچھالا دے دوں
NORM: ڈوبتے ڈوبتے کشتی کو اچھالا دے دوں



In [26]:

# Vocabulary of whatever file `tgt_file` currently points to (set by an earlier cell)
if not tgt_file.exists():
    print("tgt.txt not found at", tgt_file)
else:
    text = tgt_file.read_text(encoding="utf-8", errors="ignore")

    # Whitespace-separated tokens, then the sorted set of distinct tokens
    words = text.split()
    unique_words = sorted(set(words))

    print("Total unique words:", len(unique_words))
    print("Words:")

    # One word per line
    for word in unique_words:
        print(word)
    

Total unique words: 10345
Words:
'
''آپ
ؔحضرت
آ
آؤ
آؤں
آئنا
آئندہ
آئنوں
آئنہ
آئنۂ
آئنے
آئی
آئین
آئینا
آئینوں
آئینہ
آئینۂ
آئینے
آئیو
آئیں
آئیے
آئے
آب
آباد
آبادی
آبداری
آبرو
آبروئے
آبستنیٔ
آبشار
آبشاروں
آبلوں
آبلہ
آبلۂ
آبلے
آبگینوں
آبگینہ
آبی
آتا
آتش
آتشکدہ
آتشیں
آتی
آتیں
آتے
آثار
آج
آخر
آخر،
آخرت
آخرش
آخری
آداب
آدرش
آدم
آدمی
آدمیت
آدمیوں
آدھ
آدھا
آدھی
آذر
آرا
آرائش
آرائشیں
آرائی
آرائیاں
آراستہ
آرام
آرایش
آرزدگئ
آرزو
آرزوؤں
آرزوئیں
آرزوئے
آرسی
آرمیدگی
آری
آرے
آزاد
آزادئ
آزادوں
آزادگی
آزادہ
آزادی
آزار
آزاری
آزرانہ
آزردگاں
آزردگی
آزردہ
آزردہؔ
آزما
آزماؤں
آزمائش
آزمائشوں
آزمائی
آزمائے
آزماتے
آزمانا
آزمانے
آزمودہ
آس
آسا
آسائش
آسان
آسانی
آساں
آستان
آستانہ
آستانۂ
آستانی
آستانے
آستاں
آستین
آستینوں
آستیں
آسرا
آسرے
آسمان
آسمانوں
آسمانی
آسماں
آسودگی
آسودہ
آسیا
آسیب
آشا
آشام
آشامی
آشتی
آشفتگاں
آشفتگی
آشفتہ
آشنا
آشناؤں
آشنائی
آشنائیاں
آشنائے
آشناۓ
آشوب
آشوبی
آشکار
آشکارا
آشیان
آشیانہ
آشیانی
آشیانے
آشیاں
آغاز
آغوش
آغوشی
آغوشیاں
آفاق
آفت
آفتاب
آفتوں
آفرینش
آفریں
آفس
آقا
آلا
آلات
آلود
آلودہ
آلودۂ
آمادہ