In [None]:
!pip install contractions

Imports

In [None]:
import random
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import contractions

# Ensure the required NLTK resources are downloaded before any tokenizer is used.
# NOTE(review): 'punkt' and 'punkt_tab' are presumably the models behind
# sent_tokenize / word_tokenize -- confirm both are needed for the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

Preprocessing

In [None]:
# Preprocessing function
def preprocess_text(text):
    """Normalize raw text into lowercase, tag-annotated text ready for ``str.split``.

    Steps, in order:
      1. Lowercase the text.
      2. Strip Project Gutenberg boilerplate: ``*** ... ***`` marker spans,
         metadata lines (``title:``, ``author:``, ...) and
         "project gutenberg license" lines.
      3. Drop the symbols ``* ( ) [ ] { }`` and collapse runs of whitespace.
      4. Expand contractions with ``contractions.fix``.
      5. Replace URLs, times/dates, percentages, ages, mentions, hashtags,
         ordinals and any remaining digit runs with placeholder tags
         (``<URL>``, ``<TIME>``, ``<DATE>``, ``<PERCENT>``, ``<AGE>``,
         ``<USER_MENTION>``, ``<TOPIC_HASHTAG>``, ``<ORDINAL>``, ``<NUM>``).
      6. Remove ``chapter <roman numeral>.`` headings.
      7. Remove every character other than ``a-z``, ``0-9``, whitespace,
         ``<`` and ``>`` while keeping the placeholder tags intact.

    Args:
        text (str): Raw text; may span several lines.

    Returns:
        str: The cleaned text. Words are lowercase; placeholder tags are
        uppercase and enclosed in angle brackets.
    """
    text = text.lower()

    # --- Project Gutenberg boilerplate and metadata -----------------------
    # These must run before the symbol stripping and whitespace
    # normalization below: the marker pattern needs the literal '*'
    # characters, and the metadata patterns rely on '\n' to stop at the end
    # of the line instead of swallowing the rest of the text.
    text = re.sub(r'\*\*\*.*?\*\*\*', '', text, flags=re.DOTALL)  # START/END OF GUTENBERG EBOOK markers
    text = re.sub(r'(title|author|release date|language|e-text|prepared by)[^:]*:.*?(\n|$)', '', text)
    text = re.sub(r'(project gutenberg license).*?(\n|$)', '', text, flags=re.DOTALL)

    # --- Symbols and whitespace -------------------------------------------
    text = re.sub(r'[\*\(\)\[\]{}]', '', text)  # Remove *, (, ), [, ], {, }
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces

    text = contractions.fix(text)  # Expand contractions

    # --- Replace patterns with contextual tags ----------------------------
    text = re.sub(r'http\S+|www\S+|ftp\S+', '<URL>', text)  # URLs
    text = re.sub(r'\b(\d{1,2}[:.]\d{2}\s?[ap][m]|\b(?:[a-z]+ \d{1,2},? \d{4})\b)', '<TIME>', text)  # Times and dates
    text = re.sub(r'\d+%', '<PERCENT>', text)  # Percentages
    text = re.sub(r'\d+\s?(?:years?|yrs?)\s?old', '<AGE>', text)  # Ages
    text = re.sub(r'@\w+', '<USER_MENTION>', text)  # Mentions
    text = re.sub(r'#\w+', '<TOPIC_HASHTAG>', text)  # Hashtags

    # Ordinals such as 1st, 2nd, 3^rd (the optional '^' covers superscript notation)
    text = re.sub(r'(\d+\^?(?:st|nd|rd|th))', '<ORDINAL>', text)

    # Dates like "01 january 1813". This has to happen before the generic
    # number replacement, otherwise the digits it looks for are already gone.
    text = re.sub(r'\b(\d{1,2} \w+ \d{4})\b', '<DATE>', text)

    # Any digits still left are plain numbers.
    text = re.sub(r'\d+', '<NUM>', text)

    text = re.sub(r'chapter\s+[ivxlcdm]+\.', '', text)

    text = contractions.fix(text)  # Expand contractions

    # Keep only lowercase alphanumerics, whitespace and angle brackets.
    # The tags are uppercase (and may contain '_'), so they are matched first
    # and passed through unchanged; a plain character filter would reduce
    # every tag to "<>".
    text = re.sub(
        r'(<[A-Z_]+>)|[^a-z0-9\s<>]',
        lambda match: match.group(1) or '',
        text,
    )

    return text

def load_and_split_data(sentences, test_size=1000, val_size=2000, seed=None):
    """Shuffle sentences and partition them into train/validation/test lists.

    The input is copied before shuffling, so the caller's sequence keeps its
    original order.

    Args:
        sentences: Iterable of sentences (any item type).
        test_size (int): Number of shuffled items assigned to the test split.
        val_size (int): Number of shuffled items assigned to the validation split.
        seed: Optional seed for a private ``random.Random`` instance, giving a
            reproducible split. When ``None`` (default) the module-level
            ``random`` generator is used, so a prior ``random.seed(...)`` call
            still controls the result.

    Returns:
        tuple[list, list, list]: ``(train_sentences, val_sentences, test_sentences)``.
        If fewer than ``test_size + val_size`` items are supplied, the later
        splits are simply shorter (the train split may be empty).
    """
    shuffled = list(sentences)  # copy: do not reorder the caller's data
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    held_out = test_size + val_size
    test_sentences = shuffled[:test_size]
    val_sentences = shuffled[test_size:held_out]
    train_sentences = shuffled[held_out:]
    return train_sentences, val_sentences, test_sentences


# Tokenization function
def tokenize_text(text, test_size=1000, val_size=2000, split=False):
    """Split text into sentences, clean each one and tokenize it into words.

    Sentences are produced by NLTK's ``sent_tokenize``; each sentence is then
    passed through ``preprocess_text`` and split on whitespace.

    Args:
        text (str): Raw text to tokenize.
        test_size (int): Number of sentences for the test split (used only
            when ``split`` is true).
        val_size (int): Number of sentences for the validation split (used
            only when ``split`` is true).
        split (bool): When true, the sentences are shuffled and partitioned
            by ``load_and_split_data``; otherwise every sentence goes to the
            train split and the other two splits are empty.

    Returns:
        tuple[list[list[str]], list[list[str]], list[list[str]]]:
        ``(train, val, test)``, each a list of sentences where a sentence is a
        list of word tokens.
    """
    sentences = sent_tokenize(text)

    if split:
        # Forward the caller's sizes rather than fixed values.
        train_sentences, val_sentences, test_sentences = load_and_split_data(
            sentences, test_size=test_size, val_size=val_size
        )
    else:
        train_sentences, val_sentences, test_sentences = sentences, [], []

    def _tokenize_all(batch):
        # Clean each sentence, then split it into whitespace-delimited words.
        return [preprocess_text(sentence).split() for sentence in batch]

    return (
        _tokenize_all(train_sentences),
        _tokenize_all(val_sentences),
        _tokenize_all(test_sentences),
    )
