In [1]:
# ===============================
# NLP Assignment 1 - Comparison
# Tokenized vs Non-Tokenized Stats
# ===============================

from datasets import load_dataset
import re
import json
import os
from collections import Counter
import psutil, gc

# -------------------- Memory Helpers --------------------
def get_memory_usage():
    try:
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1024 / 1024
    except:
        return 0

def clear_memory():
    gc.collect()

# -------------------- Sentence Splitter --------------------
def sentence_split(paragraph):
    sentence_endings = ['.', '?', '!', '।']
    sentences = []
    current = ''
    for char in paragraph:
        current += char
        if char in sentence_endings:
            if current.strip():
                sentences.append(current.strip())
                current = ''
    if current.strip():
        sentences.append(current.strip())
    return sentences

# -------------------- Word Tokenizer --------------------
def word_tokenize(sentence):
    pattern = r'''
        (https?://[^\s]+) |
        (www\.[^\s]+) |
        (\w+@\w+\.\w+) |
        (\d{1,2}/\d{1,2}/\d{2,4}) |
        (\d+\.\d+) |
        ([\u0900-\u097F]+) |
        ([a-zA-Z0-9_-]+) |
        ([^\s])
    '''
    tokens = re.findall(pattern, sentence, re.VERBOSE)
    flat_tokens = [token for group in tokens for token in group if token]
    return flat_tokens

# -------------------- Statistics Collector --------------------
class CorpusStatistics:
    def __init__(self, use_tokenizer=True):
        self.total_sentences = 0
        self.total_words = 0
        self.total_characters = 0
        self.sentence_lengths = []
        self.word_lengths = []
        self.vocabulary = Counter()
        self.processed_documents = 0
        self.use_tokenizer = use_tokenizer

    def process_document(self, text: str):
        if not text or not text.strip():
            return
        text = text.strip()
        if len(text) > 50000:
            return  # skip too long

        sentences = sentence_split(text)
        doc_word_count = 0

        for sentence in sentences:
            if not sentence.strip():
                continue

            if self.use_tokenizer:
                words = word_tokenize(sentence)
            else:
                words = sentence.split()  # simple whitespace split

            if words:
                self.total_sentences += 1
                self.total_words += len(words)
                self.total_characters += len(sentence)
                self.sentence_lengths.append(len(words))
                self.vocabulary.update(words)
                for word in words:
                    self.word_lengths.append(len(word))
                doc_word_count += len(words)

        if doc_word_count > 0:
            self.processed_documents += 1

    def compute_statistics(self):
        if self.total_sentences == 0:
            return {}
        avg_sentence_length = sum(self.sentence_lengths) / len(self.sentence_lengths)
        avg_word_length = sum(self.word_lengths) / len(self.word_lengths) if self.word_lengths else 0
        unique_tokens = len(self.vocabulary)
        total_tokens = self.total_words
        ttr = unique_tokens / total_tokens if total_tokens > 0 else 0

        return {
            'sentences': self.total_sentences,
            'words': self.total_words,
            'characters': self.total_characters,
            'avg_sentence_length': round(avg_sentence_length, 2),
            'avg_word_length': round(avg_word_length, 2),
            'ttr': round(ttr, 4),
            'vocab_size': unique_tokens,
            'documents': self.processed_documents
        }

# -------------------- Main --------------------
def main():
    print("📘 Loading Hindi dataset (streaming)...")
    hindi_dataset = load_dataset(
        "text",
        data_files="https://huggingface.co/datasets/ai4bharat/IndicCorpV2/resolve/main/data/hi-1.txt",
        split="train",
        streaming=True
    )

    # Two corpus statistics collectors
    tokenized_stats = CorpusStatistics(use_tokenizer=True)
    raw_stats = CorpusStatistics(use_tokenizer=False)

    max_docs = 5000   # limit for quick test in Colab
    processed = 0

    for i, example in enumerate(hindi_dataset):
        if 'text' not in example or not example['text'].strip():
            continue
        text = example['text']

        tokenized_stats.process_document(text)
        raw_stats.process_document(text)
        processed += 1

        if processed % 1000 == 0:
            print(f"Processed {processed} docs... mem={get_memory_usage():.1f}MB")

        if processed >= max_docs:
            break

    print("\n✅ Processing complete!")
    tok_res = tokenized_stats.compute_statistics()
    raw_res = raw_stats.compute_statistics()

    # Save JSON comparison
    os.makedirs("hindi_corpus_output", exist_ok=True)
    with open("hindi_corpus_output/statistics_comparison.json", "w", encoding="utf-8") as f:
        json.dump({"tokenized": tok_res, "raw": raw_res}, f, indent=2, ensure_ascii=False)

    # Print side-by-side comparison
    print("\n📊 FINAL COMPARISON (on", processed, "docs)\n")
    print(f"{'Metric':<25} {'Tokenized':<15} {'Non-Tokenized':<15}")
    print("-"*55)
    for k in tok_res.keys():
        print(f"{k:<25} {tok_res[k]:<15} {raw_res[k]:<15}")

    print("\n📁 Results saved to hindi_corpus_output/statistics_comparison.json")

if __name__ == "__main__":
    main()


📘 Loading Hindi dataset (streaming)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Processed 1000 docs... mem=720.5MB
Processed 2000 docs... mem=723.1MB
Processed 3000 docs... mem=727.5MB
Processed 4000 docs... mem=728.8MB
Processed 5000 docs... mem=731.9MB

✅ Processing complete!

📊 FINAL COMPARISON (on 5000 docs)

Metric                    Tokenized       Non-Tokenized  
-------------------------------------------------------
sentences                 17461           17461          
words                     304777          286699         
characters                1441349         1441349        
avg_sentence_length       17.45           16.42          
avg_word_length           3.84            4.09           
ttr                       0.0893          0.1111         
vocab_size                27217           31847          
documents                 5000            5000           

📁 Results saved to hindi_corpus_output/statistics_comparison.json
