In [3]:
%pip install datasets --quiet
%pip install tqdm --quiet

Note: you may need to restart the kernel to use updated packages.




Note: you may need to restart the kernel to use updated packages.


In [4]:

import re
from datasets import load_dataset
from collections import Counter
from tqdm import tqdm

# ------------------------------
# 1. Load streaming dataset (Hindi - Devanagari)
# ------------------------------
dataset = load_dataset(
    "ai4bharat/IndicCorpV2",
    "indiccorp_v2",
    split="hin_Deva",
    streaming=True
)

In [3]:
# ------------------------------
# 2. Sentence tokenizer for Hindi
# ------------------------------
def sentence_tokenizer(text):
    # This regex splits on common sentence-ending punctuation, handling cases with quotes or brackets before the punctuation.
    # It also handles the Devanagari full stop (।)
    sentence_endings = re.compile(r'(?<=[।\.!?])\s+(?=["\'“‘()\[\]{}])?', re.UNICODE)
    sentences = sentence_endings.split(text.strip())
    # Further split by newline characters which often indicate sentence breaks in text data
    split_by_newline = []
    for sent in sentences:
        split_by_newline.extend(sent.split('\n'))
    return [s.strip() for s in split_by_newline if s.strip()]

In [None]:
# ------------------------------
# 3. Word tokenizer (Hindi, English, numbers, URLs, emails, dates, punctuation)
# ------------------------------
def word_tokenizer(sentence):
    # Updated regex to capture various token types including punctuation, URLs, emails, and dates.
    token_pattern = re.compile(r"""
        ^(?:https?:\/\/|www\.)\w+(org|in|com)$ | # URLs,Non-capturing group
        [a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,} | # Email addresses
        \d{1,2}[/-]\d{1,2}[/-]\d{2,4} | # Dates (simple format dd/mm/yyyy or dd-mm-yyyy)
        \d+(\.\d+)? |                 # Numbers (including decimals)
        [\u0900-\u097F]+ |            # Hindi words 
        [a-zA-Z]+ |                   # English words
        [^\s\w]                       # Punctuation (any non-whitespace, non-word character)
    """, re.VERBOSE | re.UNICODE)
    return [m.group() for m in token_pattern.finditer(sentence)]

In [5]:
# ------------------------------
# 4. Stats accumulators
# ------------------------------
sentence_count = 0
word_count = 0
char_count = 0
vocab_counter = Counter()
MAX_SENTENCES = 1_000_000  # target number of sentences

# ------------------------------
# 5. Process and save tokenized sentences and word tokens
# ------------------------------
with open("sentence_tokenized.txt", "w", encoding="utf-8") as f_sent, \
     open("word_tokenized.txt", "w", encoding="utf-8") as f_word:

    for example in tqdm(dataset, total=None):  # streaming, total unknown
        text = example["text"]
        sentences = sentence_tokenizer(text)

        for s in sentences:
            tokens = word_tokenizer(s)
            if not tokens:
                continue

            # Write files
            f_sent.write(" ".join(tokens) + "\n") # original sentence
            f_word.write(" ".join(tokens) + "\n")  # tokenized sentence

            # Update stats
            sentence_count += 1
            word_count += len(tokens)
            char_count += sum(len(tok) for tok in tokens)
            vocab_counter.update(tokens)

            # Stop after MAX_SENTENCES
            if sentence_count >= MAX_SENTENCES:
                break
        if sentence_count >= MAX_SENTENCES:
            break

610280it [03:54, 2607.80it/s] 


In [8]:
# ------------------------------
# 5a. Print all URLs and emails from sentence_tokenized.txt
# ------------------------------
url_email_pattern = re.compile(
    r'(?:https?://\S+|www\.\S+)|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
    re.UNICODE
)

with open("sentence_tokenized.txt", "r", encoding="utf-8") as f:
    for line in f:
        matches = url_email_pattern.findall(line)
        # The regex returns tuples, so flatten and filter empty strings
        for match in matches:
            if isinstance(match, tuple):
                for item in match:
                    if item:
                        print(item)
            elif match:
                print(match)

mktc.kunjpura@gmail.com
indrimkt@yahoo.in
market.karnal@gmail.com
cgepidemic@gmail.com
hello@sarjak.org
prakashhindustani@gmail.com
contact@yourdomain.com
ssparasharji@gmail.com
dayanand.pandey@yahoo.com
admin@uaeembassy.in
shambhukant.sinha@inext.co.inPATNA
hotgirl13@aol.com
recruitment@rpcau.ac.in
support@puonline.co.in
amphoharyana@gmail.com
editor@mponlinenews.com
neepco.apprenticeship20@gmail.com
opkaashyap@gmail.com
cybercelljabalpur@gmail.com
ncov2019@gov.in
cmat@nta.ac.in
pibfactcheck@gmail.com
snti.recruit@tatasteel.com
sarokarzindagi@gmail.com
sarokarzindagi@gmail.com
swargoshthi@gmail.com
radioplaybackindia@live.com
swargoshthi@gmail.com
radioplaybackindia@live.com
swargoshthi@gmail.com
dhumkudiyaa@gmail.com
ignou@nta.ac.in
himalayannews1@gmail.com
khima.puri7@gmail.com
info@furballstory.com
waqas@kavtech.net
shantinairkerla@gmail.com
babywater@getresponse.com
marilynp@nctc.netAuthor
report.phising@sbi.co.in
lucknow@inext.co.in
vibratentertainment.in@gmail.com
ask.life@nw18.

In [6]:
# ------------------------------
# 6. Summary statistics
# ------------------------------
unique_tokens = len(vocab_counter)
ttr = unique_tokens / word_count if word_count > 0 else 0
avg_sent_len = word_count / sentence_count if sentence_count > 0 else 0
avg_word_len = char_count / word_count if word_count > 0 else 0

print("===== Corpus Statistics =====")
print(f"Total Sentences Processed   : {sentence_count:,}")
print(f"Total Words                 : {word_count:,}")
print(f"Total Characters            : {char_count:,}")
print(f"Unique Tokens (Vocab Size)  : {unique_tokens:,}")
print(f"Average Sentence Length     : {avg_sent_len:.2f} words")
print(f"Average Word Length         : {avg_word_len:.2f} chars")
print(f"Type/Token Ratio (TTR)      : {ttr:.4f}")

===== Corpus Statistics =====
Total Sentences Processed   : 1,000,000
Total Words                 : 18,801,825
Total Characters            : 72,369,343
Unique Tokens (Vocab Size)  : 299,473
Average Sentence Length     : 18.80 words
Average Word Length         : 3.85 chars
Type/Token Ratio (TTR)      : 0.0159
