In [1]:
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [4]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [5]:
import random
random.shuffle(documents)
documents[0:5]

[(['it', "'", 's', 'always', 'a', 'bad', 'sign', 'when', ...], 'neg'),
 (['jacques', 'tati', "'", 's', '1953', 'classic', '"', ...], 'pos'),
 (['_soldier_', 'is', 'hands', 'down', 'one', 'of', ...], 'neg'),
 (['ralph', 'fiennes', 'is', 'carving', 'out', 'a', ...], 'pos'),
 (['errol', 'morris', ',', 'critically', 'acclaimed', ...], 'pos')]

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [7]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [8]:
from nltk import pos_tag
w = "better"
pos_tag([w])

[('better', 'RBR')]

In [9]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)
stops, string.punctuation

({'!',
  '"',
  '#',
  '$',
  '%',
  '&',
  "'",
  '(',
  ')',
  '*',
  '+',
  ',',
  '-',
  '.',
  '/',
  ':',
  ';',
  '<',
  '=',
  '>',
  '?',
  '@',
  '[',
  '\\',
  ']',
  '^',
  '_',
  '`',
  'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',


In [10]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [11]:
documents[0][1]

'neg'

In [12]:
range(len(documents))

range(0, 2000)

In [13]:
clean_review(documents[1][0])

['jacques',
 'tati',
 '1953',
 'classic',
 'le',
 'vacances',
 'de',
 'hulot',
 'also',
 'know',
 'mr',
 'hulot',
 'vacation',
 'english',
 'ought',
 'textbook',
 'non',
 'dialogue',
 'comedy',
 'film',
 'character',
 'hardly',
 'ever',
 'speak',
 'directly',
 'film',
 'operates',
 'paper',
 'thin',
 'premise',
 'monsieur',
 'hulot',
 'played',
 'tati',
 'awkward',
 'clumsy',
 'generally',
 'manages',
 'annoy',
 'guest',
 'beach',
 'resort',
 'hotel',
 'take',
 'week',
 'vacation',
 'yet',
 'film',
 'beside',
 'laugh',
 'consistently',
 'recent',
 'memory',
 'part',
 'reason',
 'film',
 'work',
 'well',
 'clever',
 'setup',
 'various',
 'gag',
 'slapstick',
 'often',
 'regard',
 'unsophisticated',
 'even',
 'crude',
 'form',
 'comedy',
 'think',
 'part',
 'reason',
 'often',
 'seem',
 'force',
 'see',
 'movie',
 'character',
 'start',
 'run',
 'something',
 'crash',
 'face',
 'first',
 'pole',
 'many',
 'time',
 'start',
 'wonder',
 'pole',
 'first',
 'place',
 'people',
 'watch',
 'go

In [14]:
documents  = [(clean_review(document),category) for document,category in documents ]

In [15]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [16]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [17]:
import nltk

In [18]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]

In [19]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'also',
 'come',
 'two',
 'bad',
 'look',
 'life',
 'way',
 'seem',
 'give',
 'know',
 'first',
 '--',
 'end',
 'work',
 'thing',
 'year',
 'plot',
 'say',
 'really',
 'play',
 'little',
 'show',
 'people',
 'man',
 'could',
 'never',
 'great',
 'star',
 'best',
 'try',
 'love',
 'director',
 'new',
 'performance',
 'want',
 'u',
 'many',
 'big',
 'watch',
 'actor',
 'find',
 'action',
 'think',
 'role',
 'act',
 'another',
 'back',
 'audience',
 'day',
 'something',
 'turn',
 'still',
 'world',
 'however',
 'set',
 'use',
 'every',
 'old',
 'guy',
 'begin',
 'feel',
 'real',
 'part',
 'cast',
 'though',
 'enough',
 'comedy',
 'interest',
 'write',
 'fact',
 'around',
 'young',
 'may',
 'funny',
 'point',
 'name',
 'run',
 'lot',
 'last',
 'minute',
 'actually',
 'script',
 'effect',
 'long',
 'right',
 'although',
 'almo

In [20]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [21]:
output = get_feature_dict(training_documents[0][0])
output

{'film': True,
 'movie': False,
 'one': True,
 'make': True,
 'like': True,
 'character': False,
 'get': False,
 'see': False,
 'go': False,
 'time': True,
 'well': True,
 'scene': False,
 'even': True,
 'good': False,
 'story': True,
 'take': False,
 'would': False,
 'much': False,
 'also': False,
 'come': True,
 'two': False,
 'bad': True,
 'look': False,
 'life': False,
 'way': True,
 'seem': True,
 'give': False,
 'know': False,
 'first': False,
 '--': True,
 'end': True,
 'work': False,
 'thing': False,
 'year': True,
 'plot': True,
 'say': False,
 'really': False,
 'play': False,
 'little': False,
 'show': False,
 'people': False,
 'man': False,
 'could': False,
 'never': True,
 'great': False,
 'star': False,
 'best': True,
 'try': False,
 'love': True,
 'director': False,
 'new': False,
 'performance': False,
 'want': True,
 'u': False,
 'many': False,
 'big': False,
 'watch': True,
 'actor': False,
 'find': False,
 'action': True,
 'think': True,
 'role': False,
 'act': False,

In [22]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]
testing_data = [(get_feature_dict(doc), category) for doc, category in training_documents]

In [23]:
training_data[0]

({'film': True,
  'movie': False,
  'one': True,
  'make': True,
  'like': True,
  'character': False,
  'get': False,
  'see': False,
  'go': False,
  'time': True,
  'well': True,
  'scene': False,
  'even': True,
  'good': False,
  'story': True,
  'take': False,
  'would': False,
  'much': False,
  'also': False,
  'come': True,
  'two': False,
  'bad': True,
  'look': False,
  'life': False,
  'way': True,
  'seem': True,
  'give': False,
  'know': False,
  'first': False,
  '--': True,
  'end': True,
  'work': False,
  'thing': False,
  'year': True,
  'plot': True,
  'say': False,
  'really': False,
  'play': False,
  'little': False,
  'show': False,
  'people': False,
  'man': False,
  'could': False,
  'never': True,
  'great': False,
  'star': False,
  'best': True,
  'try': False,
  'love': True,
  'director': False,
  'new': False,
  'performance': False,
  'want': True,
  'u': False,
  'many': False,
  'big': False,
  'watch': True,
  'actor': False,
  'find': False,
  'a

In [24]:
from nltk import NaiveBayesClassifier

In [25]:
classfier = NaiveBayesClassifier.train(training_data)

In [26]:
nltk.classify.accuracy(classfier, testing_data)

0.858

In [27]:
classfier.show_most_informative_features(15)

Most Informative Features
             wonderfully = True              pos : neg    =     21.5 : 1.0
               stupidity = True              neg : pos    =     17.2 : 1.0
             outstanding = True              pos : neg    =     11.5 : 1.0
                poignant = True              pos : neg    =     10.1 : 1.0
               ludicrous = True              neg : pos    =      9.1 : 1.0
                   jolie = True              neg : pos    =      8.4 : 1.0
                   anger = True              pos : neg    =      8.3 : 1.0
                   awful = True              neg : pos    =      8.2 : 1.0
                  castle = True              pos : neg    =      7.7 : 1.0
                   mulan = True              pos : neg    =      7.6 : 1.0
                religion = True              pos : neg    =      7.2 : 1.0
                  turkey = True              neg : pos    =      7.1 : 1.0
                    lame = True              neg : pos    =      6.9 : 1.0

In [28]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [29]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [30]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [31]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.818

In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)

In [34]:
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [35]:
nltk.classify.accuracy(classifier_sklearn1, testing_data)

0.988