In [2]:
# Step 1: Install and Import Required Libraries
# Install datasets if not already installed
!pip install datasets --quiet

In [1]:
# Import required libraries
from datasets import load_dataset
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 2: Load the Gujarati Dataset (Streaming Mode)
# Load the Gujarati dataset in streaming mode
dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split="guj_Gujr", streaming=True)

# Extract first 100 paragraphs
data_list = []
for i, item in enumerate(dataset):
    data_list.append(item['text'])  # Extract the text field
    if i >= 99:
        break

# ✅ Confirm loading
print(f"✅ Loaded {len(data_list)} paragraphs.")
print("🔹 Sample paragraph:\n", data_list[0][:300])

✅ Loaded 100 paragraphs.
🔹 Sample paragraph:
 આ વીડિયો જુઓ: ઊંઝા માર્કેટયાર્ડ આજથી 25 જુલાઈ સુધી બંધ


In [3]:
def sentence_tokenizer(text):
    """
    Splits a paragraph into sentences using Gujarati and standard punctuation.
    """
    sentence_endings = re.compile(r'(?<=[।!?\.])\s+')
    return sentence_endings.split(text.strip())

def word_tokenizer(sentence):
    """
    Tokenizes a sentence into:
    - Emails
    - URLs
    - Numbers (decimals included)
    - Dates (dd/mm/yyyy)
    - Words
    - Punctuation
    """
    pattern = re.compile(
        r"""(
            (?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4})         |  # Dates
            (?:[\w\.-]+@[\w\.-]+\.\w+)               |  # Email addresses
            (?:https?://[^\s]+)                      |  # URLs
            (?:\d+\.\d+|\d+)                         |  # Numbers and decimals
            [\u0A80-\u0AFF]+                         |  #gujarati words
            (?:\w+)                                  |  # Words
            (?:[^\w\s])                                 # Punctuation
        )""", re.VERBOSE)
    return pattern.findall(sentence)

sample_sentence = "મારું ઈમેલ example123@mail.com છે. તારીખ: 23/07/2023 છે. https://abc.com પર જુઓ!"
print("🔹 Tokenized sample:", word_tokenizer(sample_sentence))

🔹 Tokenized sample: ['મારું', 'ઈમેલ', 'example123@mail.com', 'છે', '.', 'તારીખ', ':', '23/07/2023', 'છે', '.', 'https://abc.com', 'પર', 'જુઓ', '!']


In [4]:
 # Step 4: Tokenize and Save to File
# Prepare lists to collect tokens and sentences
sentences_all = []
words_all = []

# Save tokenized output to file
with open("tokenized_output.txt", "w", encoding="utf-8") as f:
    for para in data_list:
        sentences = sentence_tokenizer(para)
        for sent in sentences:
            tokens = word_tokenizer(sent)
            sentences_all.append(sent)
            words_all.extend(tokens)
            
            f.write("Sentence: " + sent.strip() + "\n")
            f.write("Tokens: " + " ".join(tokens) + "\n\n")

print("✅ Tokenization complete. Output saved to 'tokenized_output.txt'")

✅ Tokenization complete. Output saved to 'tokenized_output.txt'


In [5]:
 # Step 5: Compute Corpus Statistics# Prepare lists to collect tokens and sentences
# Compute statistics
total_sentences = len(sentences_all)
total_words = len(words_all)
total_characters = sum(len(word) for word in words_all)
average_sentence_length = total_words / total_sentences if total_sentences else 0
average_word_length = total_characters / total_words if total_words else 0
unique_words = set(words_all)
type_token_ratio = len(unique_words) / total_words if total_words else 0

# Show results
print("\n📊 Corpus Statistics:")
print(f"Total Sentences: {total_sentences}")
print(f"Total Words: {total_words}")
print(f"Total Characters: {total_characters}")
print(f"Average Sentence Length: {average_sentence_length:.2f} words/sentence")
print(f"Average Word Length: {average_word_length:.2f} characters/word")
print(f"Type/Token Ratio (TTR): {type_token_ratio:.4f}")


📊 Corpus Statistics:
Total Sentences: 188
Total Words: 2355
Total Characters: 10179
Average Sentence Length: 12.53 words/sentence
Average Word Length: 4.32 characters/word
Type/Token Ratio (TTR): 0.5524


In [6]:
# step 6 (Optional): Save Stats to File
with open("corpus_statistics.txt", "w", encoding="utf-8") as f:
    f.write("📊 Corpus Statistics:\n")
    f.write(f"Total Sentences: {total_sentences}\n")
    f.write(f"Total Words: {total_words}\n")
    f.write(f"Total Characters: {total_characters}\n")
    f.write(f"Average Sentence Length: {average_sentence_length:.2f}\n")
    f.write(f"Average Word Length: {average_word_length:.2f}\n")
    f.write(f"Type/Token Ratio (TTR): {type_token_ratio:.4f}\n")

print("✅ Statistics saved to 'corpus_statistics.txt'")

✅ Statistics saved to 'corpus_statistics.txt'


In [None]:
# read about regular expressions 