In [3]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vedan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vedan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vedan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
# Replace this path if needed
df = pd.read_csv("IMDB Dataset.csv")

# View sample
df.head(2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


In [10]:
# Create custom stopwords list
custom_stopwords = set(stopwords.words('english'))
extra_words = {"movie", "film", "one", "make", "character", "like"}
custom_stopwords.update(extra_words)

# Cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in custom_stopwords and len(t) > 1]
    return tokens

# Apply cleaning
df['cleaned'] = df['review'].apply(clean_text)

# View cleaned tokens
df['cleaned'].iloc[0]


['reviewers',
 'mentioned',
 'watching',
 'oz',
 'episode',
 'youll',
 'hooked',
 'right',
 'exactly',
 'happened',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'manyaryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'moreso',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goes',
 'shows',
 '

In [None]:
# Applying Stemming and Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Apply stemming
df['stemmed'] = df['cleaned'].apply(lambda tokens: [stemmer.stem(t) for t in tokens])

# Apply lemmatization
df['lemmatized'] = df['cleaned'].apply(lambda tokens: [lemmatizer.lemmatize(t) for t in tokens])

# Sample outputs
print("Stemmed:", df['stemmed'].iloc[0])
print("Lemmatized:", df['lemmatized'].iloc[0])


In [None]:
# Vocabulary Size Comparison
stemmed_vocab = set([word for tokens in df['stemmed'] for word in tokens])
lemm_vocab = set([word for tokens in df['lemmatized'] for word in tokens])

print("Vocabulary size after stemming:", len(stemmed_vocab))
print("Vocabulary size after lemmatization:", len(lemm_vocab))


In [None]:
#Bar Plots – Top 30 Frequent Words
def plot_top_words(tokens_list, title):
    all_words = [word for tokens in tokens_list for word in tokens]
    word_freq = Counter(all_words).most_common(30)
    words, counts = zip(*word_freq)
    
    plt.figure(figsize=(12,6))
    plt.bar(words, counts, color='orange')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

plot_top_words(df['stemmed'], "Top 30 Words - Stemming")
plot_top_words(df['lemmatized'], "Top 30 Words - Lemmatization")


In [None]:
#Word Clouds
def show_wordcloud(tokens_list, title):
    text = ' '.join([' '.join(tokens) for tokens in tokens_list])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

show_wordcloud(df['stemmed'], "Word Cloud - Stemming")
show_wordcloud(df['lemmatized'], "Word Cloud - Lemmatization")



In [None]:
#Analysis Summary
print("----- Analysis Summary -----\n")
print("1. Stemming yields a smaller vocabulary by aggressively trimming word forms.")
print("2. Lemmatization retains more context and yields human-readable words.")
print("3. Bar plots and word clouds confirm that lemmatized results are semantically richer.")
print("4. Use stemming when performance matters more than interpretability.")
print("5. Use lemmatization for better accuracy in downstream NLP tasks.")
