In [5]:
!pip install datasets --quiet

import re
from datasets import load_dataset
from collections import Counter
from tqdm import tqdm

# 1. Load streaming dataset
dataset = load_dataset(
    "ai4bharat/IndicCorpV2",
    "indiccorp_v2",
    split="hin_Deva",
    streaming=True
)

# 2. Sentence tokenizer for Hindi
def sentence_tokenizer(text):
    sentence_endings = re.compile(r'(?<=[।!?\.])\s+')
    return [s.strip() for s in sentence_endings.split(text.strip()) if s.strip()]

# 3. Word tokenizer (handles Hindi, English, numbers, URLs, punctuation)
def word_tokenizer(sentence):
    token_pattern = re.compile(r"""
        (https?://[^\s]+) |                # URLs
        ([\w\.-]+@[\w\.-]+) |               # Emails
        (\d{1,2}[/-]\d{1,2}[/-]\d{2,4}) |   # Dates
        (\d+\.\d+) |                        # Decimal numbers
        ([\u0900-\u097F]+) |                # Hindi words
        ([a-zA-Z]+) |                       # English words
        (\d+) |                             # Numbers
        ([^\s\w\u0900-\u097F])              # Punctuation / symbols
    """, re.VERBOSE | re.UNICODE)

    return [m.group() for m in token_pattern.finditer(sentence)]

# 4. Stats accumulators
sentence_count = 0
word_count = 0
char_count = 0
vocab_counter = Counter()
MAX_DOCS = 1000

# 5. Process and save tokenized sentences
with open("tokenized_sentences.txt", "w", encoding="utf-8") as f_out:
    for i, example in tqdm(enumerate(dataset), total=MAX_DOCS):
        if i >= MAX_DOCS:
            break

        text = example["text"]
        sentences = sentence_tokenizer(text)
        sentence_count += len(sentences)

        for s in sentences:
            tokens = word_tokenizer(s)
            word_count += len(tokens)
            char_count += sum(len(tok) for tok in tokens)
            vocab_counter.update(tokens)
            f_out.write(" ".join(tokens) + "\n")

# 6. Summary statistics
unique_tokens = len(vocab_counter)
ttr = unique_tokens / word_count if word_count > 0 else 0
avg_sent_len = word_count / sentence_count if sentence_count > 0 else 0
avg_word_len = char_count / word_count if word_count > 0 else 0

print("===== Corpus Statistics =====")
print(f"Total Documents Processed : {MAX_DOCS}")
print(f"Total Sentences           : {sentence_count:,}")
print(f"Total Words               : {word_count:,}")
print(f"Total Characters          : {char_count:,}")
print(f"Unique Tokens (Vocab Size): {unique_tokens:,}")
print(f"Average Sentence Length   : {avg_sent_len:.2f} words")
print(f"Average Word Length       : {avg_word_len:.2f} chars")
print(f"Type/Token Ratio (TTR)    : {ttr:.4f}")


100%|██████████| 1000/1000 [00:01<00:00, 977.96it/s]

===== Corpus Statistics =====
Total Documents Processed : 1000
Total Sentences           : 1,717
Total Words               : 33,156
Total Characters          : 126,918
Unique Tokens (Vocab Size): 6,976
Average Sentence Length   : 19.31 words
Average Word Length       : 3.83 chars
Type/Token Ratio (TTR)    : 0.2104



