In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

# Load dataset
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

# Basic cleaning function
def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing to all reviews
df['clean_review'] = df['review'].apply(preprocess)

# Display some results
df[['review', 'clean_review', 'sentiment']].head()


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,clean_review,sentiment
0,One of the other reviewers has mentioned that ...,one reviewers mentioned watching 1 oz episode ...,positive
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...,positive
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...,positive
3,Basically there's a family where a little boy ...,basically theres family little boy jake thinks...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love time money visually stunni...,positive


In [3]:
from collections import Counter

# Combine all cleaned reviews into one big string
all_text = ' '.join(df['clean_review'])

# Split into individual words
all_words = all_text.split()

# Total number of words (including repetitions)
total_words = len(all_words)

# Total number of unique words (vocabulary size)
unique_words = len(set(all_words))

# Word frequency using Counter (optional)
word_freq = Counter(all_words)

# Display results
print("Total Words in Corpus:", total_words)
print("Total Unique Words (Vocabulary):", unique_words)

# Show 10 most common words (optional)
print("\nTop 10 Most Common Words:")
print(word_freq.most_common(10))


Total Words in Corpus: 5992075
Total Unique Words (Vocabulary): 222320

Top 10 Most Common Words:
[('movie', 83508), ('film', 74468), ('one', 50369), ('like', 38825), ('good', 28483), ('even', 24280), ('would', 24001), ('time', 23266), ('really', 22894), ('see', 22432)]


In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Use Keras Tokenizer to assign each word a unique integer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_review'])

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(df['clean_review'])

# Vocabulary dictionary (word -> index)
word_index = tokenizer.word_index

# Optional: Pad sequences to the same length for neural networks
padded_sequences = pad_sequences(sequences)

# Show one example
print("Original review:", df['clean_review'][0])
print("Integer encoded:", sequences[0])
print("Padded sequence shape:", padded_sequences.shape)


2025-06-02 07:47:51.632805: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748850471.874871      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748850471.945696      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Original review: one reviewers mentioned watching 1 oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far awayi would say main appeal show due fact goes shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well 

In [7]:
from keras.utils import to_categorical

# Example: Convert a sequence [3, 5, 1] to one-hot vectors (for a small vocab of size 6)
example_sequence = [3, 5, 1]
vocab_size = len(word_index) + 1
one_hot_encoded = to_categorical(example_sequence, num_classes=vocab_size)

print("One-hot encoded vectors:")
print(one_hot_encoded)


One-hot encoded vectors:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [6]:
# Assuming df is your DataFrame with original reviews in 'review' column
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)   # Remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text) # Remove punctuation and numbers
    text = re.sub(r'\s+', ' ', text).strip()
    return text

X_clean = df['review'].apply(clean_text)

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X_clean)

vocab = vectorizer.vocabulary_
word_counts = X_bow.sum(axis=0)

word_freq = {word: word_counts[0, idx] for word, idx in vocab.items()}
word_freq_sorted = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)

print("Top 10 frequent words:\n")
for word, freq in word_freq_sorted[:10]:
    print(f"{word}: {freq}")

print(f"\nVocabulary Size: {len(vocab)}")


Top 10 frequent words:

the: 650812
and: 319428
of: 288081
to: 266297
is: 210068
in: 183153
it: 151365
this: 145500
that: 135806
was: 95187

Vocabulary Size: 214594


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bi-gram vectorizer
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
X_bigram = bigram_vectorizer.fit_transform(X_clean)
bigram_vocab = bigram_vectorizer.vocabulary_

# Tri-gram vectorizer
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
X_trigram = trigram_vectorizer.fit_transform(X_clean)
trigram_vocab = trigram_vectorizer.vocabulary_

print(f"Vocabulary Size with Unigrams: {len(vocab)}")
print(f"Vocabulary Size with Bi-grams: {len(bigram_vocab)}")
print(f"Vocabulary Size with Tri-grams: {len(trigram_vocab)}")


Vocabulary Size with Unigrams: 214594
Vocabulary Size with Bi-grams: 2562875
Vocabulary Size with Tri-grams: 6653975


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform cleaned text data
X_tfidf = tfidf_vectorizer.fit_transform(X_clean)

# Vocabulary: word -> index
tfidf_vocab = tfidf_vectorizer.vocabulary_

# IDF scores
idf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

# Sort IDF scores (lowest to highest)
sorted_idf = sorted(idf_scores.items(), key=lambda x: x[1])

print(f"Vocabulary Size with TF-IDF: {len(tfidf_vocab)}\n")

print("Top 10 words with lowest IDF scores (most common words):")
for word, score in sorted_idf[:10]:
    print(f"{word}: {score:.4f}")

print("\nTop 10 words with highest IDF scores (most unique words):")
for word, score in sorted_idf[-10:]:
    print(f"{word}: {score:.4f}")


Vocabulary Size with TF-IDF: 214594

Top 10 words with lowest IDF scores (most common words):
the: 1.0093
and: 1.0365
of: 1.0532
to: 1.0630
this: 1.1074
is: 1.1124
in: 1.1312
it: 1.1707
that: 1.2289
for: 1.3458

Top 10 words with highest IDF scores (most unique words):
zyuranger: 11.1267
zzzzip: 11.1267
zzzzz: 11.1267
zzzzzs: 11.1267
zzzzzzzz: 11.1267
zzzzzzzzz: 11.1267
zzzzzzzzzzzz: 11.1267
zzzzzzzzzzzzz: 11.1267
zzzzzzzzzzzzzzzzzz: 11.1267
zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: 11.1267
