# Importing the data

In [1]:
from nltk.corpus import movie_reviews

In [2]:
import nltk

In [3]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\aruna\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [4]:
# movie_reviews.categories()

In [5]:
# movie_reviews.fileids('neg')

In [6]:
# movie_reviews.fileids('pos')

In [7]:
# movie_reviews.words(movie_reviews.fileids()[10])

# Creating the document array

In [26]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

# Randomizing the data

In [27]:
import random
random.shuffle(documents)
documents[:5]

[(['"', 'the', 'tailor', 'of', 'panama', '"', 'is', 'a', ...], 'pos'),
 (['it', "'", 's', 'a', 'fact', 'that', 'a', 'good', ...], 'pos'),
 (['the', 'promotion', 'for', 'fear', 'and', 'loathing', ...], 'pos'),
 (['you', "'", 've', 'got', 'to', 'think', 'twice', ...], 'pos'),
 (['i', 'was', 'born', 'in', '1970', ',', 'which', ...], 'pos')]

# Cleaning the data

## * Creating the stopWords

In [28]:
from nltk.corpus import stopwords
import string

In [29]:
stop = set(stopwords.words('english'))
punctuations = string.punctuation
stop.update(punctuations)

# * Lemmatizing the words

In [30]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

In [31]:
from nltk import pos_tag

In [32]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [33]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_word = lemma.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    
    return output_words

In [37]:
documents = [(clean_review(document),category) for document, category in documents]

In [35]:
all_words = []
training_documents = documents[:1500]
testing_documents = documents[1500:]

In [39]:
training_documents[:3][0]

(['"', 'the', 'tailor', 'of', 'panama', '"', 'is', 'a', ...], 'pos')

In [47]:
for doc in training_documents:
    all_words += list(doc[0])

In [48]:
import nltk

In [49]:
freq = nltk.FreqDist(all_words)

In [52]:
common = freq.most_common(3000)

In [54]:
features = [i[0] for i in common]

In [67]:
def get_feature_dictionary(words):
    current_features = {}
    word_set = set(words)
    for w in features:
        current_features = w in word_set
    return current_features

In [68]:
training_documents[0][0]

['"', 'the', 'tailor', 'of', 'panama', '"', 'is', 'a', ...]

In [69]:
training_data = [(get_feature_dictionary(doc), category) for doc, category in training_documents]

In [79]:
training_data[:1500]

[(False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'

In [71]:
testing_data = [(get_feature_dictionary(doc), category) for doc, category in testing_documents]

In [72]:
testing_data

[(False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (True, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (True, 'pos'),
 (True, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'neg'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (False, 'pos'),
 (True, 'neg'),
 (False, 'neg'),
 

In [76]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(training_data)
# nltk.classify.accuracy(classifier, testing_data)
# classifier.show_most_informative_features(15)

AttributeError: 'bool' object has no attribute 'items'