Custom Regex Sentence Tokenizer

In [1]:
import pandas as pd
import re
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize

# Load the dataset into a DataFrame.
# The path below is hard-coded; change it if the CSV lives somewhere else.
df = pd.read_csv("/kaggle/input/imdb-movie-reviews/IMDB Dataset.csv")

# The rest of this script reads the text to tokenize from the 'review' column,
# so the CSV must provide a column with that name.
# Print the first rows as a quick sanity check of what was loaded.
print(df.head())

# Using NLTK's sentence tokenizer
def nltk_sentence_tokenizer(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    The argument is passed to ``sent_tokenize`` unchanged and whatever it
    returns is handed back to the caller as-is.
    """
    sentences = sent_tokenize(text)
    return sentences

# Run the NLTK sentence tokenizer on every value of the 'review' column and
# store the per-row result in a new 'sentences_nltk' column.
df["sentences_nltk"] = df["review"].apply(nltk_sentence_tokenizer)

# Using a Custom Regex-based Sentence Tokenizer
# Compiled once so the pattern is not looked up again for every row when the
# tokenizer is applied to a whole column.
#
# A sentence boundary is a run of whitespace that
#   * directly follows '.', '?' or '!'                      (?<=[.?!])
#   * does not follow a dotted initialism such as "e.g."    (?<!\w\.\w.)
#     (the last '.' in that lookbehind is unescaped and matches any character)
#   * does not follow a capitalised two-letter title such
#     as "Mr." or "Dr."                                     (?<![A-Z][a-z]\.)
# The whole whitespace run (\s+) is consumed so that double spaces or newlines
# between sentences do not leak into the following sentence.
_SENTENCE_BOUNDARY = re.compile(
    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s+'
)


def regex_sentence_tokenizer(text):
    """
    Tokenize text into sentences using a regular expression.

    The text is split on whitespace that follows '.', '!' or '?'. Two kinds
    of abbreviation are protected from splitting: dotted initialisms such as
    "e.g." / "i.e." and capitalised two-letter titles such as "Mr." / "Dr.".
    Longer abbreviations (for example "Mrs." or "etc.") are NOT recognised
    and will still end a sentence.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings with surrounding whitespace removed.
        Empty pieces are dropped, so an empty or whitespace-only input
        yields an empty list.

    Raises:
        TypeError: If ``text`` is not a string (raised by the ``re`` module).
    """
    # Strip each piece and discard empties: trailing whitespace after the
    # final punctuation mark would otherwise produce a spurious '' sentence.
    pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [piece for piece in pieces if piece]

# Run the custom regex-based tokenizer on every value of the 'review' column
# and store the per-row result in a new 'sentences_regex' column.
df["sentences_regex"] = df["review"].apply(regex_sentence_tokenizer)

# Print the first rows of the original text next to both tokenizer outputs
# so the two approaches can be compared side by side.
print(df[["review", "sentences_nltk", "sentences_regex"]].head())

# Save the full DataFrame (including both tokenized columns) to a new CSV.
# index=False leaves the DataFrame's row index out of the file.
# NOTE(review): the tokenized columns hold Python lists, which are presumably
# written as their string representation; reading this CSV back would then
# give strings rather than lists - confirm if the file is consumed downstream.
df.to_csv("tokenized_sentences.csv", index=False)
print("Tokenized sentences saved to 'tokenized_sentences.csv'.")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      sentences_nltk  \
0  [One of the other reviewers has mentioned that...   
1  [A wonderful little production., <br /><br />T...   
2