In [1]:
# Import libraries
import nltk
from nltk.corpus import movie_reviews
import random
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Download dataset (nltk reports "already up-to-date" when the corpus is present)
nltk.download('movie_reviews')

# Load movie reviews dataset as (raw_text, category) pairs, one per corpus file
documents = [(movie_reviews.raw(fileid), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle documents with a fixed seed so that a fresh kernel run reproduces
# the same document order, and therefore the same rows in the printed previews.
SEED = 42
random.seed(SEED)
random.shuffle(documents)

# Extract texts and labels; both lists stay index-aligned with `documents`
texts = [doc for doc, label in documents]
labels = [label for doc, label in documents]

# --- a. Bag of Words ---
# Raw term counts; the vocabulary is capped at 5000 features and
# scikit-learn's built-in English stop-word list is applied.
bow_vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_bow = bow_vectorizer.fit_transform(texts)

# Expand the fitted matrix into a DataFrame with one column per vocabulary
# term, then preview the first rows.
bow_df = pd.DataFrame(
    data=X_bow.toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
)
print("Bag of Words Representation (first 5 rows):", bow_df.head(), sep="\n")

# --- b. TF-IDF ---
# TF-IDF weights with the same vocabulary cap (5000 features) and English
# stop-word filtering as the Bag of Words step.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(texts)

# Expand the fitted matrix into a DataFrame with one column per vocabulary
# term, then preview the first rows.
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("\nTF-IDF Representation (first 5 rows):", tfidf_df.head(), sep="\n")


[nltk_data] Downloading package movie_reviews to C:\Users\anitta-
[nltk_data]     pc\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Bag of Words Representation (first 5 rows):
   000  10  100  11  12  13  13th  14  15  16  ...  york  young  younger  \
0    0   0    0   0   0   0     0   0   0   0  ...     0      2        0   
1    0   0    0   0   0   0     0   0   0   0  ...     0      0        0   
2    0  16    0   0   0   0     0   0   0   0  ...     0      1        0   
3    0   0    0   0   0   0     0   0   0   0  ...     0      1        0   
4    0   0    0   0   0   0     0   0   0   0  ...     0      1        0   

   youth  zane  zany  zellweger  zero  zeta  zone  
0      0     0     0          0     0     0     0  
1      0     0     0          0     0     0     0  
2      0     0     0          0     0     0     0  
3      0     0     0          0     0     0     0  
4      0     0     0          0     0     0     0  

[5 rows x 5000 columns]

TF-IDF Representation (first 5 rows):
   000        10  100   11   12   13  13th   14   15   16  ...  york  \
0  0.0  0.000000  0.0  0.0  0.0  0.0   0.0  0.0  0.