In [32]:
import numpy as np
import pandas as pd

import re # regex

# stopwords corpus within NLTK contains a collection of common words 
# that are often considered irrelevant for analysis
# and are thus typically removed from text data during preprocessing.
from nltk.corpus import stopwords

# tokenize module, text data can be split into individual words or tokens,
from nltk.tokenize import word_tokenize

# stem module applies the Snowball stemming algorithm 
# to reduce words to their root or base form
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

# model saver and loader
import pickle

The CountVectorizer from scikit-learn’s feature extraction module converts a collection of text documents into a matrix of token counts, representing the frequency of each word in the corpus, thereby enabling machine learning models to process textual data.

stopwordscorpus within NLTK contains a collection of common words that are often considered irrelevant for analysis and are thus typically removed from text data during preprocessing.

The pickle module in Python provides functionality for serializing and deserializing Python objects, allowing for easy storage and retrieval of data structures, such as lists or dictionaries, in a binary format.

In [33]:
data = pd.read_csv("IMDB Dataset.csv")

In [34]:
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [35]:
data.shape

(50000, 2)

In [36]:
data.describe(include='all')

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [37]:
data.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [38]:
data.sentiment.replace('positive',1,inplace=True)
data.sentiment.replace('negative',0,inplace=True)

# Pre-processing Steps

Any sentiment analysis workflow begins with loading data. But what do you do once the data’s been loaded? You need to process it through a natural language processing pipeline before you can do anything interesting with it.The necessary steps include (but aren’t limited to) the following:

- Remove HTML tags:
- Remove special characters
- Convert everything to lowercase
- Remove stopwords
- Stemming

All these steps serve to reduce the noise inherent in any human-readable text and improve the accuracy of your classifier’s results. There are lots of great tools to help with this, such as the Natural Language Toolkit, TextBlob, and spaCy. For this tutorial, you’ll use spaCy.

## 1. Remove HTML tags

In [39]:
# This Python function clean utilizes regular expressions to remove HTML tags 
# from the input text and returns the cleaned text.

def clean(text):
    cleaned = re.compile(r'<.*?>')
    return re.sub(cleaned, '', text)

data.review = data.review.apply(clean)

In [40]:
data.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.I would say the main appeal of the show is due to the fact that it goes where other shows wo

In [41]:
data.review[1]

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

## 2. Remove special characters

In [42]:
# isalnum = alphanumeric เป็นตัวอักษรและตัวเลข
# if it is non-alphanumeric => replace them with whitespace

def is_special(text):
    rem = ''
    for i in text:
        if i.isalnum(): rem += i
        else: rem += ' '
    return rem

data.review = data.review.apply(is_special)
data.review[1]

'A wonderful little production  The filming technique is very unassuming  very old time BBC fashion and gives a comforting  and sometimes discomforting  sense of realism to the entire piece  The actors are extremely well chosen  Michael Sheen not only  has got all the polari  but he has all the voices down pat too  You can truly see the seamless editing guided by the references to Williams  diary entries  not only is it well worth the watching but it is a terrificly written and performed piece  A masterful production about one of the great master s of comedy and his life  The realism really comes home with the little things  the fantasy of the guard which  rather than use the traditional  dream  techniques remains solid then disappears  It plays on our knowledge and our senses  particularly with the scenes concerning Orton and Halliwell and the sets  particularly of their flat with Halliwell s murals decorating every surface  are terribly well done '

## 3. Convert everything to lowercase

In [43]:
def to_lower(text):
    return text.lower()

data.review = data.review.apply(to_lower)
data.review[1]

'a wonderful little production  the filming technique is very unassuming  very old time bbc fashion and gives a comforting  and sometimes discomforting  sense of realism to the entire piece  the actors are extremely well chosen  michael sheen not only  has got all the polari  but he has all the voices down pat too  you can truly see the seamless editing guided by the references to williams  diary entries  not only is it well worth the watching but it is a terrificly written and performed piece  a masterful production about one of the great master s of comedy and his life  the realism really comes home with the little things  the fantasy of the guard which  rather than use the traditional  dream  techniques remains solid then disappears  it plays on our knowledge and our senses  particularly with the scenes concerning orton and halliwell and the sets  particularly of their flat with halliwell s murals decorating every surface  are terribly well done '

## 4. Remove stopwords

<b>Stop words</b> are words that may be important in human communication but are of little value for machines. nltk comes with a default list of stop words that you can customize. For now, you’ll see how you can use token attributes to remove stop words:

In [44]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alive\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

This Python code imports the Natural Language Toolkit (NLTK) library and downloads the stopwords corpus, which contains common words like “the,” “is,” and “and,” used in text processing tasks for filtering out irrelevant words.

In [45]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alive\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

This Python code imports the Natural Language Toolkit (NLTK) library and downloads the Punkt tokenizer models, which are used for tokenization tasks like splitting text into individual words or sentences.

In [46]:
def rem_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stop_words]

data.review = data.review.apply(rem_stopwords)
data.review[0]

['one',
 'reviewers',
 'mentioned',
 'watching',
 '1',
 'oz',
 'episode',
 'hooked',
 'right',
 'exactly',
 'happened',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'many',
 'aryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goes',
 'shows',
 'da

In [47]:
data.review[1]

['wonderful',
 'little',
 'production',
 'filming',
 'technique',
 'unassuming',
 'old',
 'time',
 'bbc',
 'fashion',
 'gives',
 'comforting',
 'sometimes',
 'discomforting',
 'sense',
 'realism',
 'entire',
 'piece',
 'actors',
 'extremely',
 'well',
 'chosen',
 'michael',
 'sheen',
 'got',
 'polari',
 'voices',
 'pat',
 'truly',
 'see',
 'seamless',
 'editing',
 'guided',
 'references',
 'williams',
 'diary',
 'entries',
 'well',
 'worth',
 'watching',
 'terrificly',
 'written',
 'performed',
 'piece',
 'masterful',
 'production',
 'one',
 'great',
 'master',
 'comedy',
 'life',
 'realism',
 'really',
 'comes',
 'home',
 'little',
 'things',
 'fantasy',
 'guard',
 'rather',
 'use',
 'traditional',
 'dream',
 'techniques',
 'remains',
 'solid',
 'disappears',
 'plays',
 'knowledge',
 'senses',
 'particularly',
 'scenes',
 'concerning',
 'orton',
 'halliwell',
 'sets',
 'particularly',
 'flat',
 'halliwell',
 'murals',
 'decorating',
 'every',
 'surface',
 'terribly',
 'well',
 'done']

This Python function rem_stopwords removes stopwords from the input text using NLTK’s English stopwords corpus and tokenizes the text into words, returning a list of words excluding the stopwords.

## 5. Stem the words

In [48]:
def stem_txt(text):
    ss = SnowballStemmer('english')
    return " ".join([ss.stem(w) for w in text])

data.review = data.review.apply(stem_txt)
data.review[0]

'one review mention watch 1 oz episod hook right exact happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus main emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side'

In [49]:
data.review[1]

'wonder littl product film techniqu unassum old time bbc fashion give comfort sometim discomfort sens realism entir piec actor extrem well chosen michael sheen got polari voic pat truli see seamless edit guid refer william diari entri well worth watch terrif written perform piec master product one great master comedi life realism realli come home littl thing fantasi guard rather use tradit dream techniqu remain solid disappear play knowledg sens particular scene concern orton halliwel set particular flat halliwel mural decor everi surfac terribl well done'

This Python function stem_txt stems the words in the input text using the Snowball Stemmer for English and returns a string where each word is replaced with its stem.

In [50]:
data.head(5)

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


# Model Training

## 1. Creating Bag Of Words (BOW)

In [51]:
X = np.array(data.iloc[:, 0].values)
Y = np.array(data.sentiment.values)

cv = CountVectorizer(max_features = 1000)

X = cv.fit_transform(data.review).toarray()

X.shape, Y.shape

((50000, 1000), (50000,))

This Python code segment converts the ‘review’ column of the ‘data’ object into a bag-of-words representation using CountVectorizer with a maximum of 1000 features, assigns it to ‘X’, and extracts the ‘sentiment’ column into ‘y’, then prints the shapes of ‘X’ and ‘y’.

Creating a Bag of Words (BOW) involves representing text data as a collection of unique words and their frequencies, disregarding grammar and word order, essentially converting text into numerical vectors for machine learning tasks.

## 2. Train test split

In [52]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2, random_state=69)
print("Train shapes: X = {}, Y = {}".format(train_x.shape, train_y.shape))
print("Test shapes: X = {}, Y = {}".format(test_x.shape, test_y.shape))

Train shapes: X = (40000, 1000), Y = (40000,)
Test shapes: X = (10000, 1000), Y = (10000,)


## 3. Defining the models and Training them

In [53]:
gnb = GaussianNB()
mnb = MultinomialNB(alpha=1.0, fit_prior=True)
bnb = BernoulliNB(alpha=1.0, fit_prior=True)

gnb.fit(train_x, train_y)
mnb.fit(train_x, train_y)
bnb.fit(train_x, train_y)

# Model Testing

## 4. Prediction and accuracy metrics to choose best model

In [54]:
gnb_pred = gnb.predict(test_x)
ypm_pred = mnb.predict(test_x)
ypb_pred = bnb.predict(test_x)

print("Gaussian = ", accuracy_score(test_y, gnb_pred))
print("Multinomial = ", accuracy_score(test_y, ypm_pred))
print("Bernoulli = ", accuracy_score(test_y, ypb_pred))

Gaussian =  0.7788
Multinomial =  0.834
Bernoulli =  0.8372


## Model saving

In [55]:
# Bernoulli Model
pickle.dump(bnb,open('model1.pkl','wb'))

## Performance Checking

In [57]:
rev =  """Terrible. Complete trash. Brainless tripe. Insulting to anyone who isn't an 8 year old fan boy. Im actually pretty disgusted that this movie is making the money it is - what does it say about the people who brainlessly hand over the hard earned cash to be 'entertained' in this fashion and then come here to leave a positive 8.8 review?? Oh yes, they are morons. Its the only sensible conclusion to draw. How anyone can rate this movie amongst the pantheon of great titles is beyond me.

So trying to find something constructive to say about this title is hard...I enjoyed Iron Man? Tony Stark is an inspirational character in his own movies but here he is a pale shadow of that...About the only 'hook' this movie had into me was wondering when and if Iron Man would knock Captain America out...Oh how I wished he had :( What were these other characters anyways? Useless, bickering idiots who really couldn't organise happy times in a brewery. The film was a chaotic mish mash of action elements and failed 'set pieces'...

I found the villain to be quite amusing.

And now I give up. This movie is not robbing any more of my time but I felt I ought to contribute to restoring the obvious fake rating and reviews this movie has been getting on IMDb."""

In [58]:
f1 = clean(rev)
f2 = is_special(f1)
f3 = to_lower(f2)
f4 = rem_stopwords(f3)
f5 = stem_txt(f4)

In [64]:
bag_of_words = []
words = word_tokenize(f5)

for w in words: 
    bag_of_words.append(words.count(w))
    
word_dict = cv.vocabulary_
pickle.dump(word_dict, open('bow.pkl', 'wb'))

In [66]:
inp = []

for i in word_dict: inp.append(f5.count(i[0]))

y_pred = bnb.predict(np.array(inp).reshape(1, 1000))

print(y_pred)

[1]


Reference: https://medium.com/@Coursesteach/using-machine-learning-to-predict-movie-reviews-82b0ab1db313