DATASET:
IMDB Dataset of 50K Movie Reviews
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

1. Finding a dataset for sentiment classification
2. Preparing the dataset by tokenization, stopwords removal, and stemming
3. Text vectorization
4. Training a classification model for sentiment classification

In [2]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     -------- ------------------------------- 0.3/1.5 MB 7.0 MB/s eta 0:00:01
     ------------------- -------------------- 0.7/1.5 MB 7.6 MB/s eta 0:00:01
     ----------------------------- ---------- 1.1/1.5 MB 7.8 MB/s eta 0:00:01
     ------------------------------------- -- 1.4/1.5 MB 8.2 MB/s eta 0:00:01
     ---------------------------------------- 1.5/1.5 MB 8.0 MB/s eta 0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.8.1
Note: you may need to restart the kernel to use updated packages.


In [6]:
# Finding a Dataset

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

import nltk
# Fetch the NLTK stop-word corpus; quiet=True suppresses the progress log on every run.
nltk.download("stopwords", quiet=True)

# Load the Kaggle CSV, expected next to the notebook (columns: review, sentiment).
data = pd.read_csv("IMDB Dataset.csv")

# Bare last expression -> rich table display instead of a plain-text print().
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\irene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [7]:
# Data Preparation, Tokenization, Stopwords Removal and Stemming

import re
import string

import nltk
from nltk.corpus import stopwords

# Ensure the stop-word corpus is present before it is read below.
nltk.download("stopwords")

# Shared by clean(): the English Snowball stemmer and the English stop-word set.
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words("english"))

def clean(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps, in order: lower-case; drop ``[bracketed]`` spans, URLs and HTML
    tags; strip ASCII punctuation; drop tokens that contain a digit; remove
    English stop words; stem what is left with the module-level ``stemmer``.

    Parameters
    ----------
    text : object
        Raw review text; coerced with ``str()`` before processing.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ("" if nothing survives).
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid Python string escapes.
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    # Tags and newlines become a space rather than "", so "great.<br />the"
    # does not collapse into the single bogus token "greatthe".
    text = re.sub(r"<.*?>+", " ", text)
    text = text.replace("\n", " ")
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    # split() without an argument collapses whitespace runs, so no empty
    # tokens reach the stop-word filter or the stemmer.
    return " ".join(
        stemmer.stem(word) for word in text.split() if word not in stopword
    )
# Run every review through clean(), replacing the raw text column.
data["review"] = data["review"].map(clean)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\irene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
pip install wordcloud

Collecting wordcloud
  Using cached wordcloud-1.8.2.2.tar.gz (220 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: wordcloud
  Building wheel for wordcloud (setup.py): started
  Building wheel for wordcloud (setup.py): finished with status 'error'
  Running setup.py clean for wordcloud
Failed to build wordcloud
Installing collected packages: wordcloud
  Running setup.py install for wordcloud: started
  Running setup.py install for wordcloud: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × python setup.py bdist_wheel did not run successfully.
  │ exit code: 1
  ╰─> [20 lines of output]
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib.win-amd64-cpython-311
      creating build\lib.win-amd64-cpython-311\wordcloud
      copying wordcloud\color_from_image.py -> build\lib.win-amd64-cpython-311\wordcloud
      copying wordcloud\tokenization.py -> build\lib.win-amd64-cpython-311\wordcloud
      copying wordcloud\wordcloud.py -> build\lib.win-amd64-cpython-311\wordcloud
      copying wordcloud\wordcloud_cli.py -> build\lib.win-amd64-cpython-311\wordcloud
      copying wordcloud\_version.py -> build\lib.win-amd64-cpython-311\wordcloud
      copying wordcloud\__init__.py -> build\lib.win-amd64-cpython-311\wordcloud
      copying wordcloud\__main__.py -> build\lib.win-amd64-cpython-311\wordcloud
      copying wordcloud\stopwords -> build\lib.win-amd64-cpython-311\wordcloud

In [9]:
# Visualise the most frequent terms of the cleaned reviews as a word cloud.
# Requires the `wordcloud` package to be importable in this kernel.
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

text = " ".join(data["review"])

# Distinct name on purpose: `stopwords` is already bound to the nltk.corpus
# reader imported in the preprocessing cell and must not be rebound here.
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(stopwords=wc_stopwords, background_color="white").generate(text)

plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

ModuleNotFoundError: No module named 'wordcloud'

In [10]:
# Text Vectorization

x = np.array(data["review"])
y = np.array(data["sentiment"])

# Split the raw text first so the vocabulary is learned from the training
# reviews only; the held-out reviews are then encoded with that vocabulary.
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    random_state=42)

cv = CountVectorizer()
X_train = cv.fit_transform(x_train)
X_test = cv.transform(x_test)

In [11]:
# Text Classification

from sklearn.linear_model import PassiveAggressiveClassifier

# Fixed seed: the classifier shuffles the training data between epochs, so
# without it the fitted weights differ from run to run.
model = PassiveAggressiveClassifier(random_state=42)
model.fit(X_train, y_train)

# Mean accuracy on the held-out split, shown as the cell's output.
model.score(X_test, y_test)

In [16]:
# testing

user = input("Enter a Text: ")

# Apply the same clean() used on the training reviews; raw text would not match
# the lower-cased, stemmed vocabulary the vectorizer was fitted on.
# A separate name is used so the `data` DataFrame is not overwritten, and the
# sparse matrix is passed straight to predict() (no dense copy needed).
user_features = cv.transform([clean(user)])
output = model.predict(user_features)
print(user)
print(output)

One of the best movies i have ever seen!
['negative']
