<a href="https://colab.research.google.com/github/Archit-175/NLP-LAB/blob/main/NLP_Assigment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task
Using the Gujarati subset of the "ai4bharat/IndicCorpV2" dataset, \
create a sentence tokenizer and a word tokenizer. \
The sentence tokenizer should split each paragraph into sentences. \
The word tokenizer should then split each sentence into words, and it must be able to tokenize punctuation, URLs, numbers (including decimals), email addresses, and dates. \
After tokenizing the data, save the tokenized output into one or more files.

## Load the dataset for Gujarati

### Subtask:
Load the Gujarati subset of the `ai4bharat/IndicCorpV2` dataset.


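**Reasoning**:
Fetch the first rows of the Gujarati (`guj_Gujr`) split from the Hugging Face datasets-server API with `requests`, and load them into a pandas DataFrame named `df`, dropping rows whose text is empty.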



In [None]:
import requests
import pandas as pd
import numpy as np

# Preview endpoint of the Hugging Face datasets-server for the Gujarati split.
url = "https://datasets-server.huggingface.co/first-rows?dataset=ai4bharat%2FIndicCorpV2&config=indiccorp_v2&split=guj_Gujr"
# A timeout keeps the cell from hanging forever if the endpoint is unresponsive.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    data = response.json()
    # Each entry of data['rows'] wraps the actual record under the 'row' key.
    df = pd.DataFrame([row['row'] for row in data['rows']])
    # Treat empty strings as missing and drop those rows. The results are
    # assigned back instead of using inplace=True on df['text']: that chained
    # in-place call acts on a temporary and leaves df untouched once pandas
    # Copy-on-Write is active.
    df['text'] = df['text'].replace('', np.nan)
    df = df.dropna(subset=['text'])
    print(df.head())
else:
    print(f"Failed to download data. Status code: {response.status_code}")

In [None]:
import re

def sentence_tokenizer(text):
    """Split a paragraph into a list of sentences.

    A sentence boundary is a single whitespace character that directly
    follows '.', '?', '!' or the danda, unless the text just before it looks
    like a dotted abbreviation ("e.g.") or a capitalised two-letter title
    ("Mr.").

    Args:
        text: The paragraph to split.

    Returns:
        The sentences in order, each stripped of surrounding whitespace;
        empty fragments are left out.
    """
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!|।)\s'
    result = []
    for fragment in re.split(boundary, text):
        fragment = fragment.strip()
        if fragment:
            result.append(fragment)
    return result

# Alternatives are tried left to right, so the more specific token types
# (URLs, emails, dates, decimals) must come before the generic ones.
_GUJARATI_TOKEN_RE = re.compile(
    r'(?:https?://|www\.)\S*[^\s.,;:!?।"\')\]]'  # URLs, minus trailing sentence punctuation
    r'|[\w.-]+@[\w-]+(?:\.[\w-]+)*'              # email addresses (domain cannot end in a dot)
    r'|\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'         # dates such as 12/05/2023 or 1-1-24
    r'|\b\d+\.\d+\b'                              # decimal numbers
    r'|\b\d+\b'                                   # integers
    r'|[\u0A80-\u0AFF]+'                          # Gujarati words (vowel signs included)
    r'|\w+'                                       # words in other scripts, e.g. Latin
    r'|[^\s\w]+'                                  # runs of punctuation / symbols
)


def word_tokenizer(sentence):
    """Split a sentence into word-level tokens.

    Recognised token types, in priority order: URLs, email addresses, dates,
    decimal numbers, integers, Gujarati words, words in other scripts, and
    runs of punctuation. Whitespace is never returned.

    URLs are matched as a single token, and a sentence-final punctuation mark
    after a URL or an email address is emitted as its own token. Words outside
    the Gujarati block are kept rather than silently dropped.

    Args:
        sentence: The sentence text to tokenize.

    Returns:
        The tokens in the order they appear in the sentence.
    """
    return _GUJARATI_TOKEN_RE.findall(sentence)

# Paragraph -> list of sentences.
df['sentences'] = df['text'].apply(sentence_tokenizer)
# List of sentences -> list of token lists, one inner list per sentence.
df['words'] = df['sentences'].apply(
    lambda paragraph: list(map(word_tokenizer, paragraph))
)

preview_columns = ['text', 'sentences', 'words']
print(df[preview_columns].head())

In [None]:
# Persist the tokenized corpus: one sentence per line, tokens separated by a
# single space.
with open('tokenized_gujarati_text.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        ' '.join(tokens) + '\n'
        for paragraph in df['words']
        for tokens in paragraph
    )

In [None]:
# Count the sentences of each paragraph, then add the counts up.
sentences_per_paragraph = df['sentences'].apply(len)
total_sentences = sentences_per_paragraph.sum()
print(f"Total number of sentences: {total_sentences}")

In [None]:
# Tokens per paragraph = sum of the token counts of its sentences.
words_per_paragraph = df['words'].apply(
    lambda paragraph: sum(map(len, paragraph))
)
total_words = words_per_paragraph.sum()
print(f"Total number of words: {total_words}")

In [None]:
# Length of every raw paragraph (whitespace and punctuation included), summed.
characters_per_paragraph = df['text'].apply(len)
total_characters = characters_per_paragraph.sum()
print(f"Total number of characters: {total_characters}")

In [None]:
# Average sentence length: tokens per sentence.
average_sentence_length = total_words / total_sentences

# Average word length is measured over the tokens themselves. Dividing the raw
# paragraph length (total_characters) by the token count would also count the
# whitespace between tokens and overstate the figure; summing token lengths
# matches how the Hindi section of this notebook computes the same statistic.
token_characters = sum(
    len(word)
    for sentences in df['words']
    for sentence in sentences
    for word in sentence
)
average_word_length = token_characters / total_words

print(f"Average Sentence Length: {average_sentence_length:.2f} words per sentence")
print(f"Average word length: {average_word_length:.2f} characters per word")

In [None]:
# Type-Token Ratio (TTR): distinct tokens divided by total tokens.
# Flatten paragraphs -> sentences -> tokens into one list first.
all_words = []
for paragraph in df['words']:
    for tokens in paragraph:
        all_words.extend(tokens)

unique_words = set(all_words)
ttr = len(unique_words) / len(all_words)

print(f"Type/Token Ratio (TTR): {ttr:.4f}")



---



---



---



---



## Hindi

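> The same tokenization and corpus statistics, applied to the Hindi (`hin_Deva`) split, which is streamed with `datasets.load_dataset`.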




In [None]:
from datasets import load_dataset

# Open the Hindi split lazily; streaming=True avoids downloading the whole split.
dataset = load_dataset("ai4bharat/IndicCorpV2", name="indiccorp_v2", split="hin_Deva", streaming=True)

# Pull a single record from the stream to inspect its structure.
first_example = next(iter(dataset))
print(first_example)

In [None]:
def custom_sentence_tokenizer(text):
    """Split Hindi text into sentences.

    The text is cut at every run of whitespace that follows a danda, '!' or
    '?'; the terminator stays attached to the sentence it ends.

    Args:
        text: The paragraph to split.

    Returns:
        The stripped, non-empty sentences in their original order.
    """
    pieces = re.split(r'(?<=[।!?])\s+', text.strip())
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]

In [None]:
import re

# Alternatives are tried left to right, so specific token types come first.
_HINDI_TOKEN_RE = re.compile(
    r'\b[\w\.-]+@[\w\.-]+\.\w+\b'                    # emails
    r'|https?://\S+'                                 # URLs
    r'|\d+\.\d+|\d+'                                 # numbers
    # Devanagari words (matras included). The danda U+0964, double danda
    # U+0965 and abbreviation sign U+0970 are punctuation and are left out of
    # the range, otherwise a sentence-final danda is glued onto the last word.
    r'|[\u0900-\u0963\u0966-\u096F\u0971-\u097F]+'
    r'|\w+'                                          # words in other scripts, e.g. Latin
    r'|[\u0964\u0965\u0970]'                         # Devanagari punctuation
    r'|[^\s\w\u0900-\u097F]'                         # any other punctuation / symbol
)


def custom_word_tokenizer(text):
    """Split Hindi text into word-level tokens.

    Recognised token types, in priority order: email addresses, URLs,
    numbers, Devanagari words, words in other scripts, and single punctuation
    characters. The danda is returned as its own token instead of being
    attached to the preceding word, and non-Devanagari words are kept rather
    than silently dropped.

    Args:
        text: The text (typically one sentence) to tokenize.

    Returns:
        The tokens in the order they appear in the text.
    """
    return _HINDI_TOKEN_RE.findall(text)

In [None]:
from datasets import load_dataset

dataset = load_dataset(
    "ai4bharat/IndicCorpV2",
    name="indiccorp_v2",
    split="hin_Deva",
    streaming=True,
)

# Running corpus statistics, accumulated while streaming.
total_sentences = 0
total_words = 0
total_chars = 0
unique_words = set()

MAX_PARAGRAPHS = 500_000  # Optional limit for speed

for paragraphs_seen, example in enumerate(dataset, start=1):
    for sent in custom_sentence_tokenizer(example["text"]):
        words = custom_word_tokenizer(sent)
        total_sentences += 1
        total_words += len(words)
        # Characters are counted over tokens only, so whitespace is excluded.
        total_chars += sum(map(len, words))
        unique_words.update(words)

    # Stop right after the MAX_PARAGRAPHS-th paragraph, before another
    # record is pulled from the stream.
    if paragraphs_seen >= MAX_PARAGRAPHS:
        break

# Final statistics
print(f"Total sentences: {total_sentences}")
print(f"Total words: {total_words}")
print(f"Total characters: {total_chars}")
print(f"Average sentence length (words/sentence): {total_words / total_sentences:.2f}")
print(f"Average word length (characters/word): {total_chars / total_words:.2f}")
print(f"Type/Token Ratio: {len(unique_words) / total_words:.4f}")

In [None]:
import re

def find_gujarati_words(text):
    """Return every maximal run of Gujarati-block characters found in text.

    Characters outside U+0A80-U+0AFF (Latin letters, spaces, ASCII
    punctuation, ...) act as separators and never appear in the result.

    Args:
        text: The string to scan.

    Returns:
        The matched runs, in order of appearance.
    """
    gujarati_run = re.compile(r'[\u0A80-\u0AFF]+')
    return gujarati_run.findall(text)

# Demo: only the two Gujarati words should be picked out of the English sentence.
text = "This is a test with some Gujarati words: ગુજરાતી શબ્દો"
gujarati_words = find_gujarati_words(text)
print(gujarati_words)

In [None]:
# Mixed-script check: the Latin word "and" and the punctuation marks lie
# outside the Gujarati block, so only the three Gujarati words are printed.
text = "કેમ છો? and મજામા."
gujarati_words = find_gujarati_words(text)
print(gujarati_words)

['કેમ', 'છો', 'મજામા']
