In [1]:
import pandas as pd
from datasets import load_dataset
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load IMDb dataset, train split
imdb_dataset = load_dataset('imdb', split='train')

# Create new dataframe with 'review_text' and 'sentiment' columns
df = pd.DataFrame({
    'review_text': imdb_dataset['text'],
    'sentiment': imdb_dataset['label']
})

# Map sentiment labels (0->'negative', 1->'positive')
label_map = {0: 'negative', 1: 'positive'}
df['sentiment'] = df['sentiment'].map(label_map)

# Save processed dataset as CSV
df.to_csv('movie_reviews.csv', index=False, encoding='utf-8')

# Display sample rows
df.head()

Unnamed: 0,review_text,sentiment
0,I rented I AM CURIOUS-YELLOW from my video sto...,negative
1,"""I Am Curious: Yellow"" is a risible and preten...",negative
2,If only to avoid making this type of film in t...,negative
3,This film was probably inspired by Godard's Ma...,negative
4,"Oh, brother...after hearing about this ridicul...",negative


In [4]:
# Convert text to lowercase
df['processed_text'] = df['review_text'].str.lower()

# Remove punctuation
translator = str.maketrans('', '', string.punctuation)
df['processed_text'] = df['processed_text'].apply(lambda x: x.translate(translator))

# Remove stopwords using NLTK
stop_words = set(stopwords.words('english'))
df['processed_text'] = df['processed_text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

df[['review_text', 'processed_text']].head()

Unnamed: 0,review_text,processed_text
0,I rented I AM CURIOUS-YELLOW from my video sto...,rented curiousyellow video store controversy s...
1,"""I Am Curious: Yellow"" is a risible and preten...",curious yellow risible pretentious steaming pi...
2,If only to avoid making this type of film in t...,avoid making type film future film interesting...
3,This film was probably inspired by Godard's Ma...,film probably inspired godards masculin fémini...
4,"Oh, brother...after hearing about this ridicul...",oh brotherafter hearing ridiculous film umptee...


Text Preprocessing

What is the purpose of text preprocessing in NLP applications?

To clean and standardize text data, reducing noise and variability so models can better understand and analyze the content.

How does stop word removal affect text analysis?

It removes common words that often don't carry significant meaning, which helps to focus on important keywords and reduces dimensionality.

Why is case normalization important in text preprocessing?

To ensure words are treated uniformly regardless of capitalization (e.g., "The" and "the" are treated as the same word).

What are potential drawbacks of removing stop words?

Sometimes stop words can carry important contextual or semantic meaning, so removing them may lose nuance or change meaning.



In [5]:
#Text Tokenization
import pandas as pd
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from nltk.tokenize import TreebankWordTokenizer

# Sample training text for tokenizer
sample_train_text = """
This is a movie review. It has multiple sentences. Some reviews are long. Others are short.
This example is used to train a basic PunktSentenceTokenizer.
"""

#Train a sentence tokenizer manually
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(sample_train_text)

sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())

#Use TreebankWordTokenizer
word_tokenizer = TreebankWordTokenizer()

#Apply tokenization to your dataframe
df = pd.read_csv("movie_reviews.csv")

# Apply sentence and word tokenization
df['sentences'] = df['review_text'].apply(lambda x: sentence_tokenizer.tokenize(str(x)))
df['word_tokens'] = df['review_text'].apply(lambda x: word_tokenizer.tokenize(str(x)))

df[['review_text', 'sentences', 'word_tokens']].head()


Unnamed: 0,review_text,sentences,word_tokens
0,I rented I AM CURIOUS-YELLOW from my video sto...,[I rented I AM CURIOUS-YELLOW from my video st...,"[I, rented, I, AM, CURIOUS-YELLOW, from, my, v..."
1,"""I Am Curious: Yellow"" is a risible and preten...","[""I Am Curious: Yellow"" is a risible and prete...","[``, I, Am, Curious, :, Yellow, '', is, a, ris..."
2,If only to avoid making this type of film in t...,[If only to avoid making this type of film in ...,"[If, only, to, avoid, making, this, type, of, ..."
3,This film was probably inspired by Godard's Ma...,[This film was probably inspired by Godard's M...,"[This, film, was, probably, inspired, by, Goda..."
4,"Oh, brother...after hearing about this ridicul...","[Oh, brother...after hearing about this ridicu...","[Oh, ,, brother, ..., after, hearing, about, t..."


Text Tokenization

What is the difference between word and sentence tokenization?

Sentence tokenization splits text into sentences, while word tokenization splits sentences into individual words or tokens.

Why is tokenization important for text analysis?

Tokenization breaks down raw text into manageable units for processing, enabling analysis at word or sentence level.

How do regular expressions help in tokenization?

They allow defining flexible patterns to identify tokens, such as words, numbers, or punctuation.

What challenges might you encounter when tokenizing social media text?

Informal language, emojis, hashtags, slang, and misspellings can confuse tokenizers.

In [6]:
import string
from nltk.corpus import stopwords
minimal_stopwords = set([
    "a", "an", "the", "and", "or", "but", "if", "while", "this", "is", "of", "in", 
    "to", "with", "on", "for", "as", "by", "it", "was", "that", "are", "be"
])

# Load CSV from previous task
df = pd.read_csv("movie_reviews.csv")

#Convert to lowercase
df['processed_text'] = df['review_text'].str.lower()

#Remove punctuation
translator = str.maketrans('', '', string.punctuation)
df['processed_text'] = df['processed_text'].apply(lambda x: x.translate(translator))

#Remove stopwords (using fallback list above)
df['processed_text'] = df['processed_text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in minimal_stopwords])
)

df[['review_text', 'processed_text']].head()


Unnamed: 0,review_text,processed_text
0,I rented I AM CURIOUS-YELLOW from my video sto...,i rented i am curiousyellow from my video stor...
1,"""I Am Curious: Yellow"" is a risible and preten...",i am curious yellow risible pretentious steami...
2,If only to avoid making this type of film in t...,only avoid making type film future film intere...
3,This film was probably inspired by Godard's Ma...,film probably inspired godards masculin fémini...
4,"Oh, brother...after hearing about this ridicul...",oh brotherafter hearing about ridiculous film ...


In [7]:
# Bag-of-words with top 1000 features on processed text
vectorizer = CountVectorizer(max_features=1000)
bow_matrix = vectorizer.fit_transform(df['processed_text'])

# Calculate word frequencies
word_freq = np.asarray(bow_matrix.sum(axis=0)).flatten()

# Top 20 most frequent words
vocab = vectorizer.get_feature_names_out()
word_freq_dict = dict(zip(vocab, word_freq))
top_20_words = sorted(word_freq_dict.items(), key=lambda x: x[1], reverse=True)[:20]

print("Top 20 Most Frequent Words:")
for word, freq in top_20_words:
    print(f"{word}: {freq}")


Top 20 Most Frequent Words:
br: 57145
movie: 41813
film: 37461
not: 30193
you: 29509
his: 29252
have: 27667
he: 26655
one: 25511
its: 25055
at: 23364
all: 23161
they: 20961
from: 20392
who: 20375
so: 19894
like: 19645
her: 18138
just: 17632
about: 17241


Text Vectorization Questions

What information does a bag-of-words model capture?

It captures word frequency counts without considering word order or grammar.

How does vectorization help in text analysis?

It converts text into numerical features suitable for machine learning models.

What are the limitations of basic frequency counts?

They ignore context, word order, and semantics; rare but important words might be undervalued.



In [8]:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from nltk.tokenize import TreebankWordTokenizer

#Train a simple Punkt tokenizer
sample_train_text = """
This is a sentence. That is another one. Reviews can be long or short. Some have lots of punctuation!
"""

trainer = PunktTrainer()
trainer.train(sample_train_text)
sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())

#Use Treebank tokenizer for word tokenization
word_tokenizer = TreebankWordTokenizer()

#Tokenize reviews
df['sentences'] = df['review_text'].apply(lambda x: sentence_tokenizer.tokenize(str(x)))
df['word_tokens'] = df['review_text'].apply(lambda x: word_tokenizer.tokenize(str(x)))

df[['review_text', 'sentences', 'word_tokens']].head()


Unnamed: 0,review_text,sentences,word_tokens
0,I rented I AM CURIOUS-YELLOW from my video sto...,[I rented I AM CURIOUS-YELLOW from my video st...,"[I, rented, I, AM, CURIOUS-YELLOW, from, my, v..."
1,"""I Am Curious: Yellow"" is a risible and preten...","[""I Am Curious: Yellow"" is a risible and prete...","[``, I, Am, Curious, :, Yellow, '', is, a, ris..."
2,If only to avoid making this type of film in t...,[If only to avoid making this type of film in ...,"[If, only, to, avoid, making, this, type, of, ..."
3,This film was probably inspired by Godard's Ma...,[This film was probably inspired by Godard's M...,"[This, film, was, probably, inspired, by, Goda..."
4,"Oh, brother...after hearing about this ridicul...","[Oh, brother...after hearing about this ridicu...","[Oh, ,, brother, ..., after, hearing, about, t..."


In [9]:
# Create comparison DataFrame
df['num_sentences'] = df['sentences'].apply(len)
df['num_words_before'] = df['word_tokens'].apply(len)
df['num_words_after'] = df['processed_text'].apply(lambda x: len(x.split()))

comparison_df = df[['review_text', 'processed_text', 'num_sentences', 'num_words_before', 'num_words_after']]

# Calculate statistics
avg_sentences = comparison_df['num_sentences'].mean()
avg_words_before = comparison_df['num_words_before'].mean()
avg_words_after = comparison_df['num_words_after'].mean()

print(f"Average number of sentences per review: {avg_sentences:.2f}")
print(f"Average number of words before preprocessing: {avg_words_before:.2f}")
print(f"Average number of words after preprocessing: {avg_words_after:.2f}")

# Save processed data
comparison_df.to_csv('processed_reviews_comparison.csv', index=False, encoding='utf-8')

# Save BoW representation as DataFrame and CSV
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vocab)
bow_df.to_csv('bow_representation.csv', index=False, encoding='utf-8')

# Display sample comparisons
comparison_df.head()


Average number of sentences per review: 10.92
Average number of words before preprocessing: 273.90
Average number of words after preprocessing: 162.56


Unnamed: 0,review_text,processed_text,num_sentences,num_words_before,num_words_after
0,I rented I AM CURIOUS-YELLOW from my video sto...,i rented i am curiousyellow from my video stor...,10,327,205
1,"""I Am Curious: Yellow"" is a risible and preten...",i am curious yellow risible pretentious steami...,11,244,149
2,If only to avoid making this type of film in t...,only avoid making type film future film intere...,3,119,66
3,This film was probably inspired by Godard's Ma...,film probably inspired godards masculin fémini...,7,151,81
4,"Oh, brother...after hearing about this ridicul...",oh brotherafter hearing about ridiculous film ...,7,414,219


Summary:

This code covers loading the IMDb dataset, preprocessing text (lowercasing, punctuation removal, stopwords removal), tokenizing sentences and words, vectorizing text using bag-of-words, analyzing word frequencies, and saving detailed comparison data.