In [9]:
!pip install datasets --quiet

import re
from datasets import load_dataset
from collections import Counter
from tqdm import tqdm

# 1. Load streaming dataset
dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split="hin_Deva", streaming=True)

# 2. Tokenizers
def sentence_tokenizer(text):
    sentence_endings = re.compile(r'(?<=[।.!?])\s+')
    return [s.strip() for s in sentence_endings.split(text.strip()) if s.strip()]

def word_tokenizer(sentence):
    token_pattern = re.compile(r"""
        (https?://[^\s]+) |
        ([\w\.-]+@[\w\.-]+) |
        (\d{1,2}[/-]\d{1,2}[/-]\d{2,4}) |
        (\d+\.\d+) |
        (\w+([-']\w+)*) |
        ([^\w\s])
    """, re.VERBOSE | re.UNICODE)

    return [m.group() for m in token_pattern.finditer(sentence)]

# 3. Stats accumulators
sentence_count = 0
word_count = 0
char_count = 0
vocab_counter = Counter()
MAX_DOCS = 1000

# 4. Save tokenized sentences
with open("tokenized_sentences.txt", "w", encoding="utf-8") as f_out:
    for i, example in tqdm(enumerate(dataset), total=MAX_DOCS):
        if i >= MAX_DOCS:
            break

        text = example["text"]
        sentences = sentence_tokenizer(text)
        sentence_count += len(sentences)

        for s in sentences:
            tokens = word_tokenizer(s)
            word_count += len(tokens)
            char_count += sum(len(tok) for tok in tokens)
            vocab_counter.update(tokens)

            f_out.write(" ".join(tokens) + "\n")


100%|██████████| 1000/1000 [00:00<00:00, 1164.25it/s]


In [8]:
unique_tokens = len(vocab_counter)
ttr = unique_tokens / word_count if word_count else 0
avg_sent_len = word_count / sentence_count if sentence_count else 0
avg_word_len = char_count / word_count if word_count else 0

print(f"Total Sentences: {sentence_count}")
print(f"Total Words: {word_count}")
print(f"Total Characters: {char_count}")
print(f"Average Sentence Length (words): {avg_sent_len:.2f}")
print(f"Average Word Length (chars): {avg_word_len:.2f}")
print(f"Type/Token Ratio (TTR): {ttr:.4f}")


Total Sentences: 1717
Total Words: 106070
Total Characters: 126919
Average Sentence Length (words): 61.78
Average Word Length (chars): 1.20
Type/Token Ratio (TTR): 0.0180
