In [1]:
from nltk.corpus import movie_reviews

In [3]:
import nltk
nltk.download('movie_reviews')
nltk.download

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/Eshita/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [4]:
movie_reviews.categories()

['neg', 'pos']

In [11]:
len(movie_reviews.fileids('neg')), len(movie_reviews.fileids('pos')) # no. of files in each categories

(1000, 1000)

In [12]:
movie_reviews.words(movie_reviews.fileids()[5]) # get words of a file

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [24]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [25]:
import random
random.shuffle(documents)
documents[0:5]

[(['to', 'put', 'it', 'bluntly', ',', 'ed', 'wood', ...], 'neg'),
 (['my', 'opinion', 'on', 'a', 'film', 'can', 'be', ...], 'neg'),
 (['i', 'admit', 'it', '.', 'i', 'thought', 'arnold', ...], 'neg'),
 (['you', "'", 've', 'got', 'to', 'love', 'disney', '.', ...], 'pos'),
 (['note', ':', 'some', 'may', 'consider', 'portions', ...], 'pos')]

In [35]:
from nltk.corpus import stopwords
import string

stops = set(stopwords.words('english'))
punctuations = string.punctuation
stops.update(punctuations)
stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [27]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [38]:
from nltk.corpus import wordnet
from nltk import pos_tag
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.ADJ
    elif tag.startswith('J'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [36]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [39]:
documents = [(clean_review(document), category) for document, category in documents]

#### Using Count Vectorizer

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

In [64]:
categories = [category for document, category in documents]

In [65]:
text_documents = [" ".join(document) for document, category in documents]

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories, random_state = 1)

In [93]:
count_vec = CountVectorizer(max_features = 2000, ngram_range=(1,3))
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 2, 0, 0],
        [0, 0, 0, ..., 2, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 1, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0]])

In [94]:
count_vec.get_feature_names_out()

array(['000', '10', '100', ..., 'young', 'young man', 'zero'],
      dtype=object)

In [95]:
x_test_features = count_vec.transform(x_test)
x_test_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [96]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(x_train_features, y_train)
y_pred = svc.predict(x_train_features)

In [97]:
svc.score(x_test_features, y_test)

0.832

#### Using NLTK 

In [40]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [42]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [45]:
import nltk
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]
features

['film',
 'one',
 'movie',
 'make',
 'like',
 'get',
 'go',
 'see',
 'well',
 'even',
 'time',
 'good',
 'take',
 'story',
 'would',
 'much',
 'character',
 'come',
 'bad',
 'also',
 '--',
 'two',
 'give',
 'seem',
 'characters',
 'first',
 'way',
 'life',
 'say',
 'really',
 'films',
 'plot',
 'end',
 'know',
 'little',
 'could',
 'look',
 'people',
 'man',
 'great',
 'scene',
 'never',
 'work',
 'best',
 'new',
 'many',
 'love',
 'big',
 'scenes',
 'u',
 'director',
 'want',
 'movies',
 'action',
 'watch',
 'another',
 'show',
 'world',
 'back',
 'still',
 'something',
 'act',
 'think',
 'try',
 'use',
 'funny',
 'long',
 'every',
 'however',
 'star',
 'old',
 'real',
 'around',
 'though',
 'cast',
 'write',
 'enough',
 'audience',
 'role',
 'young',
 'interest',
 'year',
 'performance',
 'may',
 'comedy',
 'actually',
 'find',
 'last',
 'things',
 'years',
 'almost',
 'script',
 'thing',
 'fact',
 'nothing',
 'played',
 'play',
 'screen',
 'john',
 'right',
 'since',
 'although',
 '

In [46]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [48]:
get_feature_dict(training_documents[0][0])

{'film': True,
 'one': True,
 'movie': False,
 'make': True,
 'like': True,
 'get': True,
 'go': True,
 'see': False,
 'well': False,
 'even': False,
 'time': False,
 'good': False,
 'take': False,
 'story': True,
 'would': True,
 'much': True,
 'character': True,
 'come': False,
 'bad': True,
 'also': True,
 '--': False,
 'two': True,
 'give': False,
 'seem': False,
 'characters': False,
 'first': False,
 'way': False,
 'life': True,
 'say': True,
 'really': False,
 'films': False,
 'plot': True,
 'end': True,
 'know': True,
 'little': False,
 'could': True,
 'look': True,
 'people': True,
 'man': False,
 'great': True,
 'scene': False,
 'never': False,
 'work': False,
 'best': True,
 'new': False,
 'many': False,
 'love': False,
 'big': False,
 'scenes': True,
 'u': False,
 'director': False,
 'want': False,
 'movies': False,
 'action': False,
 'watch': True,
 'another': True,
 'show': False,
 'world': False,
 'back': False,
 'still': True,
 'something': True,
 'act': False,
 'think'

In [49]:
training_data = [(get_feature_dict(doc),category)for doc,category in training_documents]
testing_data = [(get_feature_dict(doc),category)for doc,category in testing_documents]

In [51]:
from nltk import NaiveBayesClassifier

In [52]:
clf = NaiveBayesClassifier.train(training_data)

In [53]:
nltk.classify.accuracy(clf, testing_data)

0.836

In [54]:
clf.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     17.7 : 1.0
              uninspired = True              neg : pos    =     16.7 : 1.0
                   inept = True              neg : pos    =      8.8 : 1.0
                   jolie = True              neg : pos    =      8.8 : 1.0
               ludicrous = True              neg : pos    =      8.3 : 1.0
                    lame = True              neg : pos    =      8.0 : 1.0
                 balance = True              pos : neg    =      7.1 : 1.0
                   mulan = True              pos : neg    =      7.1 : 1.0
               showgirls = True              neg : pos    =      6.9 : 1.0
                   anger = True              pos : neg    =      6.7 : 1.0
               stupidity = True              neg : pos    =      6.6 : 1.0
                 refresh = True              pos : neg    =      6.5 : 1.0
                     sat = True              neg : pos    =      6.3 : 1.0

In [56]:
# converting data in a format accepted by sklearn classifier
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [58]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [59]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC())>

In [60]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.878

In [61]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
classifier_sklearn = SklearnClassifier(svc)
classifier_sklearn.train(training_data)
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.878