## Sentiment analysis using supervised learning

In [1]:
!pip install nltk==3.6.2 
# setup & download data
import nltk
import random
from nltk.corpus import movie_reviews
# corpus of 2000 movie reviews
nltk.download('movie_reviews')

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/mjams001/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [2]:
# create a list of documents
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [3]:
documents[0]

(['plot',
  ':',
  'two',
  'teen',
  'couples',
  'go',
  'to',
  'a',
  'church',
  'party',
  ',',
  'drink',
  'and',
  'then',
  'drive',
  '.',
  'they',
  'get',
  'into',
  'an',
  'accident',
  '.',
  'one',
  'of',
  'the',
  'guys',
  'dies',
  ',',
  'but',
  'his',
  'girlfriend',
  'continues',
  'to',
  'see',
  'him',
  'in',
  'her',
  'life',
  ',',
  'and',
  'has',
  'nightmares',
  '.',
  'what',
  "'",
  's',
  'the',
  'deal',
  '?',
  'watch',
  'the',
  'movie',
  'and',
  '"',
  'sorta',
  '"',
  'find',
  'out',
  '.',
  '.',
  '.',
  'critique',
  ':',
  'a',
  'mind',
  '-',
  'fuck',
  'movie',
  'for',
  'the',
  'teen',
  'generation',
  'that',
  'touches',
  'on',
  'a',
  'very',
  'cool',
  'idea',
  ',',
  'but',
  'presents',
  'it',
  'in',
  'a',
  'very',
  'bad',
  'package',
  '.',
  'which',
  'is',
  'what',
  'makes',
  'this',
  'review',
  'an',
  'even',
  'harder',
  'one',
  'to',
  'write',
  ',',
  'since',
  'i',
  'generally',
  'a

In [4]:
movie_reviews.categories()

['neg', 'pos']

In [5]:
len(movie_reviews.fileids())

2000

In [6]:
movie_reviews.words('neg/cv000_29416.txt')

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [7]:
random.shuffle(documents)

In [8]:
N = 200
# create a freq list of the words, convert to lower case
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# keep just the top N
word_features = list(all_words)[:N]

# helper function to do feature extraction on each doc
def document_features(document): 
    document_words = set(document) 
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [9]:
all_words

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [10]:
word_features

[',',
 'the',
 '.',
 'a',
 'and',
 'of',
 'to',
 "'",
 'is',
 'in',
 's',
 '"',
 'it',
 'that',
 '-',
 ')',
 '(',
 'as',
 'with',
 'for',
 'his',
 'this',
 'film',
 'i',
 'he',
 'but',
 'on',
 'are',
 't',
 'by',
 'be',
 'one',
 'movie',
 'an',
 'who',
 'not',
 'you',
 'from',
 'at',
 'was',
 'have',
 'they',
 'has',
 'her',
 'all',
 '?',
 'there',
 'like',
 'so',
 'out',
 'about',
 'up',
 'more',
 'what',
 'when',
 'which',
 'or',
 'she',
 'their',
 ':',
 'some',
 'just',
 'can',
 'if',
 'we',
 'him',
 'into',
 'even',
 'only',
 'than',
 'no',
 'good',
 'time',
 'most',
 'its',
 'will',
 'story',
 'would',
 'been',
 'much',
 'character',
 'also',
 'get',
 'other',
 'do',
 'two',
 'well',
 'them',
 'very',
 'characters',
 ';',
 'first',
 '--',
 'after',
 'see',
 '!',
 'way',
 'because',
 'make',
 'life',
 'off',
 'too',
 'any',
 'does',
 'really',
 'had',
 'while',
 'films',
 'how',
 'plot',
 'little',
 'where',
 'people',
 'over',
 'could',
 'then',
 'me',
 'scene',
 'man',
 'bad',
 '

In [11]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 

{'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': True, 'contains(t)': False, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': True, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': True, 'contains(from)': True, 'contains(at)': False, 'contains(was)': False, 'contains(have)': True, 'contains(they)': True, 'contains(has)': True, 'contains(her)': False, 'conta

In [12]:
len(document_features(movie_reviews.words('pos/cv957_8737.txt')))

200

In [13]:
# do feature extraction on our data
featuresets = [(document_features(d), c) for (d,c) in documents]
# split into train and test
train_set, test_set = featuresets[100:], featuresets[:100]

In [14]:
# create a NB classifier
NBclassifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(NBclassifier, test_set))

0.7


In [15]:
NBclassifier.show_most_informative_features(5)

Most Informative Features
            contains(it) = False             neg : pos    =      3.0 : 1.0
           contains(bad) = True              neg : pos    =      2.0 : 1.0
            contains(as) = False             neg : pos    =      1.7 : 1.0
          contains(have) = False             pos : neg    =      1.6 : 1.0
          contains(both) = True              pos : neg    =      1.5 : 1.0


In [16]:
# create a DT classifier
DTclassifier = nltk.DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(DTclassifier, test_set))

0.55


In [17]:
# create a MaxEnt classifier
MEclassifier = nltk.MaxentClassifier.train(train_set, max_iter=3)
print(nltk.classify.accuracy(MEclassifier, test_set));
MEclassifier.show_most_informative_features(5)

  ==> Training (3 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.499
             2          -0.68977        0.716
         Final          -0.68644        0.719
0.69
  -0.010 contains(it)==False and label is 'pos'
   0.010 contains(and)==False and label is 'pos'
   0.010 contains(a)==False and label is 'neg'
   0.010 contains(of)==False and label is 'neg'
   0.010 contains(the)==False and label is 'neg'
