### Dataset


In [3]:
# Basic corpus statistics & script coverage (choose split)
from pathlib import Path
from collections import Counter
import unicodedata, math

SPLIT = "train"  # "train", "valid", or "test"
corpus_path = Path('..') / f'/content/hicm_corpus.{SPLIT}.txt'
#/content/hicm_corpus.train.txt
assert corpus_path.exists(), f"Corpus file not found for split '{SPLIT}'."

sample_lines = 250000  # adjust for speed vs accuracy
char_counter = Counter()
line_count = 0

# Unicode block ranges
DEV_START, DEV_END = 0x0900, 0x097F
LAT_START, LAT_END = 0x0041, 0x007A  # rough (includes uppercase/lowercase subset)

with corpus_path.open('r', encoding='utf-8') as f:
    for line in f:
        line_count += 1
        for ch in line.rstrip('\n'):
            char_counter[ch] += 1
        if line_count >= sample_lines:
            break

cats = Counter()
for ch, freq in char_counter.items():
    cp = ord(ch)
    if DEV_START <= cp <= DEV_END:
        cats['Devanagari'] += freq
    elif 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
        cats['Latin'] += freq
    elif ch.isdigit():
        cats['Digits'] += freq
    elif ch.isspace():
        cats['Whitespace'] += freq
    else:
        cats['Other'] += freq

total_chars = sum(cats.values())
print(f"Split: {SPLIT}")
print(f"Lines sampled: {line_count:,}")
print(f"Unique chars in sample: {len(char_counter):,}")
print("Character category distribution (approx):")
for k, v in cats.items():
    print(f"  {k:12s}: {v:10d}  ({v/total_chars*100:5.2f}%)")

coverage = total_chars / sum(char_counter.values())
print(f"Character coverage observed (sampling fraction): {coverage:0.4f}")

# Show top 20 Devanagari chars and top 20 Latin chars
print('\nTop 20 Devanagari chars:')
dev_top = [(c, char_counter[c]) for c in char_counter if DEV_START <= ord(c) <= DEV_END]
for c, freq in sorted(dev_top, key=lambda x: x[1], reverse=True)[:20]:
    print(f"  {c} : {freq}")

print('\nTop 20 Latin chars:')
lat_top = [(c, char_counter[c]) for c in char_counter if ('A' <= c <= 'Z' or 'a' <= c <= 'z')]
for c, freq in sorted(lat_top, key=lambda x: x[1], reverse=True)[:20]:
    print(f"  {c} : {freq}")

Split: train
Lines sampled: 250,000
Unique chars in sample: 256
Character category distribution (approx):
  Devanagari  :    5661498  (52.55%)
  Whitespace  :    1768861  (16.42%)
  Latin       :    3291223  (30.55%)
  Digits      :      46988  ( 0.44%)
  Other       :       5783  ( 0.05%)
Character coverage observed (sampling fraction): 1.0000

Top 20 Devanagari chars:
  ा : 471759
  क : 441787
  े : 384211
  र : 340468
  ह : 261269
  ् : 247805
  न : 232096
  स : 222320
  ि : 219077
  ं : 212237
  ी : 211774
  त : 196960
  म : 173584
  ो : 151878
  य : 147182
  प : 137621
  ल : 130523
  ै : 110126
  व : 105705
  । : 93437

Top 20 Latin chars:
  e : 390334
  a : 288520
  i : 265743
  t : 250821
  r : 237903
  n : 236442
  s : 233903
  o : 217772
  l : 157297
  c : 138138
  d : 126591
  m : 104110
  u : 103160
  p : 99256
  h : 79936
  g : 75503
  y : 52465
  f : 52339
  b : 49796
  v : 39678


In [None]:
pip install sentencepiece



In [5]:
# Train SentencePiece Unigram tokenizer
import sentencepiece as spm
from pathlib import Path

corpus_path = Path('..') / '/content/hicm_corpus.train.txt'
assert corpus_path.exists(), "Train corpus file missing. Build it first."

VOCAB_SIZE = 32000  # change to e.g. 38000 or 40000 if needed
MODEL_PREFIX = f"hicm_unigram_{VOCAB_SIZE}"
MODEL_TYPE = 'unigram'
CHAR_COVERAGE = 1.0
NORMALIZATION = 'nfkc'

model_file = Path(f"{MODEL_PREFIX}.model")
vocab_file = Path(f"{MODEL_PREFIX}.vocab")

if model_file.exists():
    print(f"Model {model_file} already exists. Delete it to retrain.")
else:
    cmd = (
        f"--input={corpus_path} "
        f"--model_prefix={MODEL_PREFIX} "
        f"--model_type={MODEL_TYPE} "
        f"--vocab_size={VOCAB_SIZE} "
        f"--character_coverage={CHAR_COVERAGE} "
        f"--normalization_rule_name={NORMALIZATION} "
        f"--unk_id=0 --unk_piece=<unk> "
        f"--bos_id=1 --bos_piece=<bos> "
        f"--eos_id=2 --eos_piece=<eos> "
        f"--pad_id=3 --pad_piece=<pad> "
        f"--input_sentence_size=8000000 --shuffle_input_sentence=true "
        f"--max_sentence_length=2048 "
    )
    print("Training command:\n", cmd)
    spm.SentencePieceTrainer.Train(cmd)
    print("Training complete.")
    print("Generated files:", model_file, vocab_file)

# Quick sanity check
sp = spm.SentencePieceProcessor()
sp.load(str(model_file))
print("Loaded model with vocab size:", sp.get_piece_size())
print("Sample pieces:", [sp.id_to_piece(i) for i in range(10)])

Training command:
 --input=/content/hicm_corpus.train.txt --model_prefix=hicm_unigram_32000 --model_type=unigram --vocab_size=32000 --character_coverage=1.0 --normalization_rule_name=nfkc --unk_id=0 --unk_piece=<unk> --bos_id=1 --bos_piece=<bos> --eos_id=2 --eos_piece=<eos> --pad_id=3 --pad_piece=<pad> --input_sentence_size=8000000 --shuffle_input_sentence=true --max_sentence_length=2048 
Training complete.
Generated files: hicm_unigram_32000.model hicm_unigram_32000.vocab
Loaded model with vocab size: 32000
Sample pieces: ['<unk>', '<bos>', '<eos>', '<pad>', '\r', '▁के', '।', '▁में', '▁है', '▁की']


In [11]:
# Advanced analysis: segmentation stats & script ratios
from pathlib import Path
import sentencepiece as spm
from collections import Counter
import random

VOCAB_SIZE = 32000
MODEL_PREFIX = f"hicm_unigram_{VOCAB_SIZE}"
model_file = Path(f"{MODEL_PREFIX}.model")
corpus_path = Path('..') / '/content/hicm_corpus.train.txt'

sp = spm.SentencePieceProcessor(); sp.load(str(model_file))

sample_limit = 5000
lengths = []
script_piece_counter = Counter()
DEV_START, DEV_END = 0x0900, 0x097F

with corpus_path.open('r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        line = line.strip()
        if not line: continue
        pieces = sp.encode(line, out_type=str)
        lengths.append(len(pieces))
        for p in pieces:
            # classify piece by first char heuristic
            if p.startswith('<'): # special
                script_piece_counter['SPECIAL'] += 1
            else:
                ch = p[0]
                cp = ord(ch)
                if DEV_START <= cp <= DEV_END:
                    script_piece_counter['DEV'] += 1
                elif ('A' <= ch <= 'Z') or ('a' <= ch <= 'z'):
                    script_piece_counter['LAT'] += 1
                elif ch.isdigit():
                    script_piece_counter['DIGIT'] += 1
                else:
                    script_piece_counter['OTHER'] += 1
        if i+1 >= sample_limit:
            break

avg_len = sum(lengths)/len(lengths)
print(f"Sampled sentences: {len(lengths)}")
print(f"Avg pieces per sentence: {avg_len:0.2f}")
print("Piece script distribution:")
for k,v in script_piece_counter.items():
    print(f"  {k:7s}: {v:7d} ({v/sum(script_piece_counter.values())*100:5.2f}%)")

# OOV simulation: replace 0.5% of chars with a rare symbol and observe unk rate
random.seed(13)
rare_char = '¤'  # seldom used
modified_lines = []
with corpus_path.open('r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 1000: break
        chars = list(line.strip())
        for idx, ch in enumerate(chars):
            if random.random() < 0.005:
                chars[idx] = rare_char
        modified_lines.append(''.join(chars))

unk_id = sp.piece_to_id('<unk>')
unk_tokens = 0
total_tokens = 0
for line in modified_lines:
    ids = sp.encode(line, out_type=int)
    total_tokens += len(ids)
    unk_tokens += sum(1 for i in ids if i == unk_id)

# Out of
print(f"\nOOV simulation on 1000 lines: unk token rate = {unk_tokens/total_tokens*100:5.3f}%")
print("Done.")

Sampled sentences: 5000
Avg pieces per sentence: 9.62
Piece script distribution:
  OTHER  :   40137 (83.44%)
  DEV    :    5426 (11.28%)
  LAT    :    2456 ( 5.11%)
  DIGIT  :      83 ( 0.17%)

OOV simulation on 1000 lines: unk token rate = 0.000%
Done.


In [12]:
# Load & test tokenizer on full file (returns list of token lists)
import sentencepiece as spm
from pathlib import Path

VOCAB_SIZE = 32000
MODEL_PREFIX = f"hicm_unigram_{VOCAB_SIZE}"
model_file = Path(f"{MODEL_PREFIX}.model")
assert model_file.exists(), "Model file missing. Train tokenizer first."

sp = spm.SentencePieceProcessor()
sp.load(str(model_file))

# ---- NEW FUNCTION: tokenize a file line-by-line ----
def tokenize_file(filepath: str):
    filepath = Path(filepath)
    assert filepath.exists(), f"Input file not found: {filepath}"

    tokenized_lines = []

    with filepath.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                tokenized_lines.append([])   # Empty line → empty token list
            else:
                tokenized_lines.append(sp.encode(line, out_type=str))

    return {
        "num_lines": len(tokenized_lines),
        "tokens_per_line": tokenized_lines
    }

# ---- USAGE ----
input_file = "hicm_corpus.train.txt"   # <-- replace with your file path
result = tokenize_file(input_file)

result


{'num_lines': 4200000,
 'tokens_per_line': [['▁दूसरे',
   '▁दिन',
   '▁मैंने',
   '▁झूल',
   'े',
   '▁पर',
   '▁खेल',
   'ती',
   '▁हुई',
   '▁a',
   '▁छोटी',
   '▁सी',
   '▁प्यार',
   'ी',
   'प',
   '्या',
   'री',
   '▁girl',
   '▁देखी',
   '।'],
  ['▁झारखंड', '▁उच्च', '▁court', '▁की', '▁website', '▁देखें'],
  ['▁st', 'y', 'li', 'st', '▁प्र', 'साधक'],
  ['▁display', '▁s', '▁पर', '▁रु', 'ट', '▁login', '▁वर्जित', '▁है'],
  ['▁हर',
   '▁चीज',
   '▁की',
   '▁relevan',
   'ce',
   '▁का',
   '▁final',
   '▁measure',
   '▁life',
   '▁ही',
   '▁हो',
   '▁सकता',
   '▁है',
   '।'],
  ['▁women', '▁empowerment', '▁समिति'],
  ['▁आप',
   '▁a',
   '▁खिला',
   'डी',
   '▁कि',
   '▁कीमत',
   '▁कैसे',
   '▁fix',
   '▁करते',
   '▁हैं'],
  ['▁कृपया',
   '▁इस',
   '▁action',
   '▁के',
   '▁समर्थन',
   '▁में',
   '▁उद्देश्य',
   '▁और',
   '▁reasons',
   '▁को',
   '▁प्रस्तुत',
   '▁करें',
   '।'],
  ['▁इमारत',
   'ी',
   '▁timber',
   '▁furniture',
   '▁तथा',
   '▁fuel',
   '▁के',
   '▁लिए',
   '▁forest'

# TEST


In [14]:
# Basic corpus statistics & script coverage (choose split)
from pathlib import Path
from collections import Counter
import unicodedata, math

SPLIT = "test"  # "train", "valid", or "test"
corpus_path = Path('..') / f'/content/hicm_corpus.{SPLIT}.txt'
assert corpus_path.exists(), f"Corpus file not found for split '{SPLIT}'."

sample_lines = 250000  # adjust for speed vs accuracy
char_counter = Counter()
line_count = 0

# Unicode block ranges
DEV_START, DEV_END = 0x0900, 0x097F
LAT_START, LAT_END = 0x0041, 0x007A  # rough (includes uppercase/lowercase subset)

with corpus_path.open('r', encoding='utf-8') as f:
    for line in f:
        line_count += 1
        for ch in line.rstrip('\n'):
            char_counter[ch] += 1
        if line_count >= sample_lines:
            break

cats = Counter()
for ch, freq in char_counter.items():
    cp = ord(ch)
    if DEV_START <= cp <= DEV_END:
        cats['Devanagari'] += freq
    elif 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
        cats['Latin'] += freq
    elif ch.isdigit():
        cats['Digits'] += freq
    elif ch.isspace():
        cats['Whitespace'] += freq
    else:
        cats['Other'] += freq

total_chars = sum(cats.values())
print(f"Split: {SPLIT}")
print(f"Lines sampled: {line_count:,}")
print(f"Unique chars in sample: {len(char_counter):,}")
print("Character category distribution (approx):")
for k, v in cats.items():
    print(f"  {k:12s}: {v:10d}  ({v/total_chars*100:5.2f}%)")

coverage = total_chars / sum(char_counter.values())
print(f"Character coverage observed (sampling fraction): {coverage:0.4f}")

# Show top 20 Devanagari chars and top 20 Latin chars
print('\nTop 20 Devanagari chars:')
dev_top = [(c, char_counter[c]) for c in char_counter if DEV_START <= ord(c) <= DEV_END]
for c, freq in sorted(dev_top, key=lambda x: x[1], reverse=True)[:20]:
    print(f"  {c} : {freq}")

print('\nTop 20 Latin chars:')
lat_top = [(c, char_counter[c]) for c in char_counter if ('A' <= c <= 'Z' or 'a' <= c <= 'z')]
for c, freq in sorted(lat_top, key=lambda x: x[1], reverse=True)[:20]:
    print(f"  {c} : {freq}")

Split: test
Lines sampled: 2,507
Unique chars in sample: 155
Character category distribution (approx):
  Devanagari  :     120315  (38.75%)
  Whitespace  :      54716  (17.62%)
  Latin       :     129685  (41.76%)
  Other       :       3862  ( 1.24%)
  Digits      :       1934  ( 0.62%)
Character coverage observed (sampling fraction): 1.0000

Top 20 Devanagari chars:
  े : 12598
  क : 12415
  ा : 9735
  ह : 6676
  र : 6404
  न : 5619
  ी : 4899
  ं : 4602
  स : 4456
  ि : 4362
  म : 3599
  त : 3232
  ो : 3050
  ल : 2827
  प : 2728
  ् : 2686
  ै : 2502
  य : 2361
  । : 2217
  ए : 1906

Top 20 Latin chars:
  e : 15836
  a : 10557
  r : 10325
  i : 10139
  n : 9373
  t : 9340
  o : 8794
  s : 8204
  l : 5915
  c : 5803
  d : 4411
  p : 4066
  m : 4028
  u : 3509
  g : 2735
  h : 2700
  y : 2150
  f : 1927
  b : 1524
  v : 1375


# VAL

In [15]:
# Basic corpus statistics & script coverage (choose split)
from pathlib import Path
from collections import Counter
import unicodedata, math

SPLIT = "valid"  # "train", "valid", or "test"
corpus_path = Path('..') / f'/content/hicm_corpus.{SPLIT}.txt'
assert corpus_path.exists(), f"Corpus file not found for split '{SPLIT}'."

sample_lines = 250000  # adjust for speed vs accuracy
char_counter = Counter()
line_count = 0

# Unicode block ranges
DEV_START, DEV_END = 0x0900, 0x097F
LAT_START, LAT_END = 0x0041, 0x007A  # rough (includes uppercase/lowercase subset)

with corpus_path.open('r', encoding='utf-8') as f:
    for line in f:
        line_count += 1
        for ch in line.rstrip('\n'):
            char_counter[ch] += 1
        if line_count >= sample_lines:
            break

cats = Counter()
for ch, freq in char_counter.items():
    cp = ord(ch)
    if DEV_START <= cp <= DEV_END:
        cats['Devanagari'] += freq
    elif 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
        cats['Latin'] += freq
    elif ch.isdigit():
        cats['Digits'] += freq
    elif ch.isspace():
        cats['Whitespace'] += freq
    else:
        cats['Other'] += freq

total_chars = sum(cats.values())
print(f"Split: {SPLIT}")
print(f"Lines sampled: {line_count:,}")
print(f"Unique chars in sample: {len(char_counter):,}")
print("Character category distribution (approx):")
for k, v in cats.items():
    print(f"  {k:12s}: {v:10d}  ({v/total_chars*100:5.2f}%)")

coverage = total_chars / sum(char_counter.values())
print(f"Character coverage observed (sampling fraction): {coverage:0.4f}")

# Show top 20 Devanagari chars and top 20 Latin chars
print('\nTop 20 Devanagari chars:')
dev_top = [(c, char_counter[c]) for c in char_counter if DEV_START <= ord(c) <= DEV_END]
for c, freq in sorted(dev_top, key=lambda x: x[1], reverse=True)[:20]:
    print(f"  {c} : {freq}")

print('\nTop 20 Latin chars:')
lat_top = [(c, char_counter[c]) for c in char_counter if ('A' <= c <= 'Z' or 'a' <= c <= 'z')]
for c, freq in sorted(lat_top, key=lambda x: x[1], reverse=True)[:20]:
    print(f"  {c} : {freq}")

Split: valid
Lines sampled: 280
Unique chars in sample: 127
Character category distribution (approx):
  Latin       :       7172  (39.70%)
  Whitespace  :       3129  (17.32%)
  Devanagari  :       7648  (42.33%)
  Other       :         54  ( 0.30%)
  Digits      :         64  ( 0.35%)
Character coverage observed (sampling fraction): 1.0000

Top 20 Devanagari chars:
  े : 710
  क : 635
  ा : 629
  ह : 474
  र : 394
  ी : 393
  न : 335
  ं : 326
  स : 280
  । : 272
  ो : 240
  म : 227
  ि : 188
  ल : 181
  प : 174
  त : 169
  य : 159
  ै : 154
  ग : 148
  ् : 135

Top 20 Latin chars:
  e : 852
  i : 602
  t : 580
  a : 555
  r : 533
  n : 524
  o : 480
  s : 454
  c : 313
  l : 287
  d : 284
  m : 250
  p : 207
  u : 204
  h : 166
  y : 154
  g : 149
  f : 149
  v : 85
  b : 79


In [16]:
# Tokenize validation and test splits and write tokenized output files
from pathlib import Path

# Reuse tokenize_file and loaded SentencePiece processor `sp`
assert 'tokenize_file' in globals(), "tokenize_file() not defined. Run previous cell."

def write_tokenized(result_dict, split_name: str):
    out_file = Path(f"hicm_tokenized.{split_name}.txt")
    with out_file.open('w', encoding='utf-8') as out:
        for tokens in result_dict['tokens_per_line']:
            out.write(' '.join(tokens) + '\n')
    print(f"Wrote {out_file} ({result_dict['num_lines']} lines).")

results = {}
for split in ["valid", "test"]:
    split_path = Path('..') / f"/content/hicm_corpus.{split}.txt"
    if not split_path.exists():
        print(f"Skipping {split}; corpus file missing: {split_path}")
        continue
    res = tokenize_file(split_path)
    results[split] = res
    print(f"{split}: {res['num_lines']} lines tokenized.")
    write_tokenized(res, split)

results

valid: 280 lines tokenized.
Wrote hicm_tokenized.valid.txt (280 lines).
test: 2507 lines tokenized.
Wrote hicm_tokenized.test.txt (2507 lines).


{'valid': {'num_lines': 280,
  'tokens_per_line': [['▁headmaster',
    '▁संध्या',
    '▁मे',
    'ड',
    'पल्ली',
    'वार',
    '▁के',
    '▁encourage',
    '▁करने',
    '▁पर',
    '▁teachers',
    '▁and',
    '▁students',
    '▁ने',
    '▁sand',
    '▁से',
    '▁fort',
    '▁का',
    '▁construction',
    '▁किया',
    '।'],
   ['▁मन',
    'पा',
    '▁teacher',
    '▁union',
    '▁के',
    '▁president',
    '▁राज',
    'ेश',
    '▁ग',
    'वर',
    'े',
    '▁ने',
    '▁school',
    '▁को',
    '▁gift',
    '▁देकर',
    '▁सराहना',
    '▁की',
    '।'],
   ['▁fort',
    '▁का',
    '▁testing',
    '▁रमेश',
    '▁सात',
    'पु',
    'ते',
    '▁ने',
    '▁किया',
    '।'],
   ['▁fort',
    '▁construction',
    '▁में',
    '▁नि',
    'खिल',
    '▁का',
    'वल',
    'े',
    ',',
    '▁दर्शन',
    '▁गे',
    'ड़े',
    'कर',
    ',',
    '▁सा',
    'हिल',
    '▁मे',
    'श्',
    'राम',
    '▁इन',
    '▁students',
    '▁ने',
    '▁सह',
    'भाग',
    '▁लिया',
    '।'],
   ['▁co',
    'rp',
  