In [1]:
# Step 1: Install & Import Libraries
# Install datasets library (if not installed)
!pip install datasets --quiet

In [2]:
# Import required libraries
from datasets import load_dataset
import re

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#  # Step 2: Load OSCAR-2301 Dataset for a Language
# language_code = "en"  # Change this to your language: 'hi', 'gu', 'fr', etc.

# dataset = load_dataset("oscar-corpus/OSCAR-2301", language_code, split="train", streaming=True)

# # Extract first 1000 paragraphs
# data_list = []
# for i, item in enumerate(dataset):
#     data_list.append(item['text'])
#     if i >= 999:
#         break

# # ✅ Confirm loading
# print(f"✅ Loaded {len(data_list)} paragraphs from OSCAR-2301 ({language_code})")
# print("🔹 Sample paragraph:\n", data_list[0][:300])

from datasets import load_dataset

# Load the public OSCAR corpus (English, deduplicated) in streaming mode.
# NOTE(review): the recorded run of this cell failed with
# "RuntimeError: Dataset scripts are no longer supported, but found oscar.py".
# Presumably an older `datasets` release or a script-free copy of the corpus
# is required for this call to succeed -- TODO confirm.
dataset = load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)

# Number of paragraphs taken from the head of the stream.
SAMPLE_SIZE = 1000

# zip() stops as soon as range() is exhausted, so at most SAMPLE_SIZE records
# are pulled from the stream.
data_list = [item["text"] for _, item in zip(range(SAMPLE_SIZE), dataset)]

print(f"✅ Loaded {len(data_list)} paragraphs from public OSCAR (English)")
if data_list:
    print("🔹 Sample paragraph:\n", data_list[0][:300])
else:
    # Avoid an IndexError on data_list[0] when the stream yielded nothing.
    print("No paragraphs were loaded; nothing to preview.")

RuntimeError: Dataset scripts are no longer supported, but found oscar.py

In [None]:
# step 3  Sentence tokenizer using ., ?, !, etc.
def sentence_tokenizer(text):
    """Split ``text`` into sentences.

    A sentence boundary is any run of whitespace that directly follows one of
    the terminators ``.``, ``!``, ``?`` or the Devanagari danda (U+0964).
    The terminator stays attached to the sentence it ends.

    Args:
        text: The paragraph to split. Leading/trailing whitespace is ignored.

    Returns:
        A list of non-empty sentence strings; ``[]`` when ``text`` is empty or
        contains only whitespace.
    """
    # The danda is written as an escape so the pattern cannot be corrupted by
    # an encoding round-trip of this file (a mis-decoded danda would otherwise
    # put unrelated characters such as "à" into the character class).
    sentence_endings = re.compile(r'(?<=[\u0964!?.])\s+')
    # split() on an empty string yields [''], so drop empty pieces.
    return [piece for piece in sentence_endings.split(text.strip()) if piece]

# Word tokenizer
def word_tokenizer(sentence):
    """Break ``sentence`` into a flat list of token strings.

    The alternatives are tried in this order at each position: dates
    (``d/m/y`` or ``d-m-y``), e-mail addresses, http(s) URLs, numbers
    (with an optional decimal part), runs of word characters, and finally
    any single non-word, non-space character. Whitespace is never emitted.

    Args:
        sentence: The text to tokenize.

    Returns:
        The matched tokens in order of appearance.
    """
    token_re = re.compile(
        r"""(
            (?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4})         |  # Dates
            (?:[\w\.-]+@[\w\.-]+\.\w+)               |  # Emails
            (?:https?://[^\s]+)                      |  # URLs
            (?:\d+\.\d+|\d+)                         |  # Numbers
            (?:\w+)                                  |  # Words
            (?:[^\w\s])                                 # Punctuation
        )""", re.X)
    # The pattern has a single capturing group wrapping every alternative,
    # so group(1) is the whole token.
    return [match.group(1) for match in token_re.finditer(sentence)]

In [None]:
# Step 4: Tokenize every paragraph, collect the sentence and word lists,
# and write a human-readable dump of each sentence with its tokens.
sentences_all = []
words_all = []

with open("oscar_tokenized_output.txt", "w", encoding="utf-8") as f:
    for para in data_list:
        for sent in sentence_tokenizer(para):
            sent = sent.strip()
            # A blank paragraph would otherwise be counted (and written) as
            # an empty sentence.
            if not sent:
                continue

            tokens = word_tokenizer(sent)
            sentences_all.append(sent)
            words_all.extend(tokens)

            f.write("Sentence: " + sent + "\n")
            f.write("Tokens: " + " ".join(tokens) + "\n\n")

print("✅ Tokenization complete. Output saved to 'oscar_tokenized_output.txt'")

In [None]:
# Step 5: Compute Corpus Statistics
# Note: `words_all` holds every token produced by word_tokenizer, which also
# emits punctuation marks, so the "word" figures below include punctuation.

# Total counts
total_sentences = len(sentences_all)
total_words = len(words_all)
# Characters are counted inside tokens only; whitespace is not part of any token.
total_characters = sum(len(word) for word in words_all)

# Averages (0 when the corpus is empty, to avoid dividing by zero)
average_sentence_length = total_words / total_sentences if total_sentences else 0
average_word_length = total_characters / total_words if total_words else 0

# Type/Token Ratio: distinct tokens over all tokens (case-sensitive)
unique_words = set(words_all)
type_token_ratio = len(unique_words) / total_words if total_words else 0

# Show results
print("\n📊 Corpus Statistics:")
print(f"Total Sentences: {total_sentences}")
print(f"Total Words: {total_words}")
print(f"Total Characters: {total_characters}")
print(f"Average Sentence Length: {average_sentence_length:.2f}")
print(f"Average Word Length: {average_word_length:.2f}")
print(f"Type/Token Ratio (TTR): {type_token_ratio:.4f}")

In [None]:
# Step 6: Save Stats to File
# Writes the same figures that the statistics cell prints, one per line,
# as UTF-8 text.
with open("oscar_corpus_statistics.txt", "w", encoding="utf-8") as f:
    f.write("📊 OSCAR Corpus Statistics:\n")
    f.write(f"Total Sentences: {total_sentences}\n")
    f.write(f"Total Words: {total_words}\n")
    f.write(f"Total Characters: {total_characters}\n")
    f.write(f"Average Sentence Length: {average_sentence_length:.2f}\n")
    f.write(f"Average Word Length: {average_word_length:.2f}\n")
    f.write(f"Type/Token Ratio (TTR): {type_token_ratio:.4f}\n")

print("✅ Statistics saved to 'oscar_corpus_statistics.txt'")