# Python NLTK: Sentiment Analysis on Movie Reviews

In [10]:

import nltk
# Fetch every corpus this notebook reads, so it runs on a fresh environment.
nltk.download('movie_reviews')
# The preprocessing cell calls stopwords.words('english'); without this
# download NLTK raises LookupError when the corpus is not installed locally.
nltk.download('stopwords')
from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Chrispdl\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


## Explore movie_reviews dataset

In [11]:
# Gather the corpus-wide token view, its length, and the review labels first.
words = movie_reviews.words()
num_words = len(words)
cat = movie_reviews.categories()

# Report them in the same order: tokens, total token count, categories.
print('List of all the words used:', words)
print('Total number of words used:', num_words)
print("the categories of the movies' reviews: ", cat)

List of all the words used: ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
Total number of words used: 1583820
the categories of the movies' reviews:  ['neg', 'pos']


## Preprocessing

In [12]:
# Lower-case every token so that frequency counts are case-insensitive.
words_lower = [word.lower() for word in movie_reviews.words()]

# Frequency of all words used
from nltk import FreqDist
words_freq = FreqDist(words_lower)
print('distribution frequency: ',words_freq)

print('The total number of distinct words in moview reviews:',len(words_freq))
# print the 15 most frequently occurring words
print ('The 15 most common words used:',words_freq.most_common(15))

# Remove the stopwords.
from nltk.corpus import stopwords
en_stopwords = stopwords.words('english')
# The membership test runs once per corpus token, so look words up in a set
# instead of scanning the stopword list each time. `en_stopwords` itself stays
# a list because later cells build their own set from it.
_en_stopwords_set = set(en_stopwords)
no_stopwords = [word for word in words_lower if word not in _en_stopwords_set]


# Remove punctuation.
# NOTE: string.punctuation is a str, so `in` is a substring test: single
# punctuation characters are dropped, but multi-character tokens such as '--'
# are kept because they are not a substring of string.punctuation.
import string
no_stopwords_punctuation = [word for word in no_stopwords if word not in string.punctuation]

# Frequency distribution of the word list after removing stopwords and punctuation
clean_words_freq = FreqDist(no_stopwords_punctuation)
print('clean words distribution frequency: ',clean_words_freq)
print('The total number of distinct words in moview reviews after removing punctuation and stopwords:',len(clean_words_freq))
print ('The 15 most common words used:',clean_words_freq.most_common(15))



distribution frequency:  <FreqDist with 39768 samples and 1583820 outcomes>
The total number of distinct words in moview reviews: 39768
The 15 most common words used: [(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
clean words distribution frequency:  <FreqDist with 39586 samples and 710578 outcomes>
The total number of distinct words in moview reviews after removing punctuation and stopwords: 39586
The 15 most common words used: [('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1911), ('well', 1906)]


##### This shows that removing only 182 distinct tokens (stopwords and punctuation) cuts the number of outcomes by more than half (from 1583820 to 710578).

### Create Word Feature using 4000 most frequently occurring words

In [26]:
# Keep the 4000 highest-count entries of the cleaned frequency distribution.
most_freq_words = clean_words_freq.most_common(4000)
print(most_freq_words[:15])
# Each entry is a (word, count) tuple; unpack it and keep only the word.
most_freq_features = [token for token, _count in most_freq_words]
print(most_freq_features[:15])
print(len(most_freq_words))

[('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1911), ('well', 1906)]
['film', 'one', 'movie', 'like', 'even', 'good', 'time', 'story', 'would', 'much', 'character', 'also', 'get', 'two', 'well']
4000


### Create a list of movie review documents


In [27]:
# Create the list of movie review documents: one (tokens, category) pair per file.
movies = []
print(movie_reviews.categories())
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        movies.append((list(movie_reviews.words(fileid)), category))


print (len(movies))
print(movies[1])

# Shuffle the list: documents were appended category by category, and the
# train/test split taken later is purely positional.
# Seed the generator first so the split, and therefore the reported accuracy
# and informative features, are reproducible across kernel restarts.
from random import seed, shuffle
seed(42)
shuffle(movies)

['neg', 'pos']
2000
(['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', 'review', 'damn', 'that', 'y2k', 'bug', '.', 'it', "'", 's', 'got', 'a', 'head', 'start', 'in', 'this', 'movie', 'starring', 'jamie', 'lee', 'curtis', 'and', 'another', 'baldwin', 'brother', '(', 'william', 'this', 'time', ')', 'in', 'a', 'story', 'regarding', 'a', 'crew', 'of', 'a', 'tugboat', 'that', 'comes', 'across', 'a', 'deserted', 'russian', 'tech', 'ship', 'that', 'has', 'a', 'strangeness', 'to', 'it', 'when', 'they', 'kick', 'the', 'power', 'back', 'on', '.', 'little', 'do', 'they', 'know', 'the', 'power', 'within', '.', '.', '.', 'going', 'for', 'the', 'gore', 'and', 'bringing', 'on', 'a', 'few', 'action', 'sequences', 'here', 'and', 'there', ',', 'virus', 'still', 'feels', 'very', 'empty', ',', 'like', 'a', 'movie', 'going', 'for', 'all', 'flash', 'and', 'no', 'substance', '.', 'we', 'don', "'", 't', 'know', 'why', 'the', 'crew', 'was', 'really', 'out', 'in', 'the', 'middle', 'of', 'nowhere', ',', '

### Create Feature Set
The feature set is used to train the classifier.
We define a feature extractor function that checks, for each word in the `most_freq_features` list, whether that word is present in a given document.

In [28]:
def create_features(movie, word_features=None):
    """Build a word-presence feature dict for one review.

    Args:
        movie: iterable of word tokens making up the review.
        word_features: iterable of vocabulary words to test for. When omitted,
            the notebook-level ``most_freq_features`` list is used, which is
            the original behaviour.

    Returns:
        dict mapping ``'contains(<word>)'`` to True if the word occurs in the
        review and False otherwise, with one entry per vocabulary word.
    """
    if word_features is None:
        # Fall back to the global vocabulary built earlier in the notebook.
        word_features = most_freq_features
    # A set removes duplicate tokens and makes each membership test O(1).
    movie_words = set(movie)
    return {'contains(%s)' % word: (word in movie_words) for word in word_features}

# Pair each review's feature dict with its label, in the order held by `movies`.
feature_set = [(create_features(tokens), label) for tokens, label in movies]
print (feature_set[4])



## Training Classifier

In [29]:
# Positional train/test split; the reviews were shuffled before the features
# were built, so the first 1600 entries train and the remainder tests.
train_set, test_set = feature_set[:1600], feature_set[1600:]
print ('train set length: ',len(train_set))
print ('test set length: ',len(test_set))

train set length:  1600
test set length:  400


In [30]:
# Fit a Naive Bayes classifier on the training features and score it on the
# held-out test features (other options: SVM, Decision Tree Classifier, CNN etc.).
from nltk import NaiveBayesClassifier, classify

classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
print(accuracy)

0.7675


In [31]:
# The 15 most informative features.
# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the table.
classifier.show_most_informative_features(15)


Most Informative Features
   contains(outstanding) = True              pos : neg    =     15.7 : 1.0
        contains(seagal) = True              neg : pos    =     11.6 : 1.0
       contains(winslet) = True              pos : neg    =      9.1 : 1.0
        contains(alicia) = True              neg : pos    =      8.9 : 1.0
         contains(sucks) = True              neg : pos    =      8.5 : 1.0
   contains(beautifully) = True              pos : neg    =      8.4 : 1.0
     contains(stupidity) = True              neg : pos    =      8.4 : 1.0
        contains(sinise) = True              neg : pos    =      8.3 : 1.0
     contains(insulting) = True              neg : pos    =      7.7 : 1.0
        contains(turkey) = True              neg : pos    =      7.7 : 1.0
         contains(upper) = True              pos : neg    =      7.6 : 1.0
     contains(ludicrous) = True              neg : pos    =      7.2 : 1.0
        contains(poorly) = True              neg : pos    =      7.1 : 1.0

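In the run shown above, the feature `contains(outstanding)` has a pos : neg ratio of 15.7 : 1.0, i.e. the word 'outstanding' occurs about 15.7 times more often in positive reviews than in negative ones. As a result, a movie review has a high chance of being classified as positive if it contains words like outstanding, winslet, or beautifully, while movie reviews that contain words like seagal, sucks, stupidity, or insulting have a high chance of being classified as negative. The exact words and ratios depend on the random shuffle of the reviews; other runs have surfaced words such as mulan, finest, wonderfully, and era on the positive side and awful and wasted on the negative side.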


## Bi-gram Features

In [34]:
from nltk import ngrams
from nltk.tokenize import word_tokenize

# Stopword removal helps unigrams, but for bigrams it can strip words that
# carry meaning in context (e.g. very, over, under). Build a bigram-specific
# stopword set that keeps those words.
important_words = [
    'above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no',
    'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but',
]
en_stopwords_for_bigrams = set(en_stopwords).difference(important_words)
no_stopwords_bigrams = [token for token in words_lower if token not in en_stopwords_for_bigrams]

def clean_words(words, stopwords_english):
    """Lower-case tokens and drop stopwords and punctuation.

    Args:
        words: iterable of string tokens.
        stopwords_english: container of lower-case stopwords to exclude.

    Returns:
        list of lower-cased tokens, in their original order, that are neither
        in ``stopwords_english`` nor a substring of ``string.punctuation``.
    """
    lowered = (word.lower() for word in words)
    # string.punctuation is a str, so the second check is a substring test.
    return [
        token
        for token in lowered
        if not (token in stopwords_english or token in string.punctuation)
    ]


# Feature extractor for n-grams (bigrams by default).
def bag_of_ngrams(words, n=2):
    """Return a dict mapping every distinct n-gram of ``words`` to True.

    Args:
        words: sequence of tokens.
        n: size of each n-gram passed to ``ngrams``; defaults to 2.

    Returns:
        dict whose keys are the n-gram tuples produced by ``ngrams(words, n)``
        and whose values are all True.
    """
    return {gram: True for gram in ngrams(words, n)}

def bag_of_ngrams_cleaned(words, n=2):
    """Clean a token sequence, then build its n-gram presence features.

    Args:
        words: iterable of string tokens for one review.
        n: size of the n-grams to extract; defaults to 2 (bigrams).

    Returns:
        dict mapping each n-gram tuple of the cleaned tokens to True.
    """
    # Use the bigram-specific stopword set so that words such as 'not' and
    # 'very' survive cleaning.
    words_clean_for_bigrams = clean_words(words, en_stopwords_for_bigrams)
    # Forward n. It was previously dropped, so every call produced bigrams
    # regardless of the argument.
    return bag_of_ngrams(words_clean_for_bigrams, n)


In [35]:
# Collect the token view of every review, one list per label.
# Comprehensions keep the loop variables local, so the notebook-level `words`
# variable from the exploration cell is no longer overwritten here.
pos_reviews = [movie_reviews.words(fileid) for fileid in movie_reviews.fileids('pos')]
neg_reviews = [movie_reviews.words(fileid) for fileid in movie_reviews.fileids('neg')]

# positive reviews feature set
pos_reviews_set = [(bag_of_ngrams_cleaned(review), 'pos') for review in pos_reviews]

# negative reviews feature set
neg_reviews_set = [(bag_of_ngrams_cleaned(review), 'neg') for review in neg_reviews]

# Randomize the reviews. Seed first so the split and the reported accuracy
# are reproducible across kernel restarts.
from random import seed, shuffle
seed(42)
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)

# The first 800 reviews of each label train the model; the rest are held out.
train_set_2grams = pos_reviews_set[:800] + neg_reviews_set[:800]
test_set_2grams = pos_reviews_set[800:] + neg_reviews_set[800:]
print(len(test_set_2grams))
classifier_2gram = NaiveBayesClassifier.train(train_set_2grams)
accuracy_2gram = classify.accuracy(classifier_2gram, test_set_2grams)
print('Accuracy using 2grams:',accuracy_2gram)

400
Accuracy using 2grams: 0.815


In [36]:
# The 15 most informative features.
# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the table.
classifier_2gram.show_most_informative_features(15)


Most Informative Features
        ('one', 'worst') = True              neg : pos    =     11.4 : 1.0
       ('waste', 'time') = True              neg : pos    =     11.3 : 1.0
     ('batman', 'robin') = True              neg : pos    =     11.0 : 1.0
       ('bad', 'acting') = True              neg : pos    =     11.0 : 1.0
       ('quite', 'well') = True              pos : neg    =     11.0 : 1.0
     ('bad', 'dialogue') = True              neg : pos    =     10.3 : 1.0
       ('well', 'worth') = True              pos : neg    =     10.3 : 1.0
     ('aspects', 'film') = True              pos : neg    =      9.7 : 1.0
         ('so', 'badly') = True              neg : pos    =      9.7 : 1.0
       ('also', 'quite') = True              pos : neg    =      9.0 : 1.0
        ('dark', 'side') = True              pos : neg    =      9.0 : 1.0
 ('more', 'appropriate') = True              neg : pos    =      9.0 : 1.0
      ('never', 'seems') = True              pos : neg    =      9.0 : 1.0

In the run shown above, the accuracy of the classifier using bigrams instead of unigrams increased from 0.7675 to 0.815 (the exact figures depend on the random shuffle of the reviews). Moreover, the most informative features make better sense than before.