In [1]:
import random
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords

## Dataset 

In [2]:
# Build the dataset from the NLTK movie-review corpus: one (words, label)
# pair per review file, where the label is the corpus category of that file.
documents = [
    (movie_reviews.words(fileids=fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# total documents
print('total documents in our dataset',len(documents))

# Seed the generator before shuffling so that the document order (and every
# result derived from it further down) is the same after a kernel restart.
random.seed(42)
random.shuffle(documents)

# let's check the first five (words, label) pairs
documents[:5]

total documents in our dataset 2000


[(['welcome', 'to', 'your', 'oh', '-', 'so', 'typical', ...], 'neg'),
 (['susan', 'granger', "'", 's', 'review', 'of', '"', ...], 'neg'),
 (['the', 'long', 'kiss', 'goodnight', '(', 'r', ')', ...], 'pos'),
 (['i', 'heard', 'actor', 'skeet', 'ulrich', ...], 'neg'),
 (['long', 'ago', ',', 'films', 'were', 'constructed', ...], 'neg')]

## How to create a vocabulary 

In [3]:
# Every token of the movie review corpus, repeats included -- so the number
# printed below is a token count, not a count of distinct words.
vocab = movie_reviews.words()
token_total = len(vocab)
print('Length of vocabulary', token_total)

Length of vocabulary 1583820


In [11]:
# Let's check the distribution of words: count how often each token in
# `vocab` occurs.
distribution = nltk.FreqDist(vocab)

In [12]:
# What are the most common words? Let's see the top 10.
# Passing the count to most_common() avoids building and sorting the full
# list of (word, count) pairs just to keep the first ten.
distribution.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

In [4]:
# We need to remove the words that are common in natural language in general,
# for example
# words like 'the', 'a', 'an', 'this', 'that'.
# For our movie review classification these words are not relevant.
english_stopwords = stopwords.words('english')
english_stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

In [5]:
# Improved vocabulary for the movie review classification problem:
# every corpus token that is not an English stopword.
# The stopwords are put into a set first so that each membership test is a
# hash lookup instead of a scan through the whole stopword list per token.
stopword_set = set(english_stopwords)
vocab = [word for word in movie_reviews.words() if word not in stopword_set]

In [6]:
# What are the most common words in our improved vocab? Let's see the top 10.
# Recount the tokens now that the stopwords have been filtered out.
distribution = nltk.FreqDist(vocab)
# most_common(10) returns only the ten most frequent (word, count) pairs
# instead of sorting the whole distribution and slicing it afterwards.
distribution.most_common(10)

[(',', 77717),
 ('.', 65876),
 ("'", 30585),
 ('"', 17612),
 ('-', 15595),
 (')', 11781),
 ('(', 11664),
 ('film', 9517),
 ('one', 5852),
 ('movie', 5771)]

In [7]:
# Length of `vocab` after stopword removal (number of remaining tokens)
print('Length of vocabulary',len(vocab))


Length of vocabulary 955610


## Creating Feature vector

How can we take a review and turn it into a feature vector?

In [8]:
# Split the first document into its tokens and its label
sample_review, sample_sentiment = documents[0]

# a movie review
print(sample_review)
# number of tokens in that review
print(len(sample_review), 'words in this movie review')
# its sentiment label
print('sentiment : ', sample_sentiment)

['welcome', 'to', 'your', 'oh', '-', 'so', 'typical', ...]
1180 words in this movie review
sentiment :  neg


In [11]:
### Let's take the 500 most frequent words from the vocab

# Each of these words becomes one component of the feature vector:
# if the word is in the movie review we put a '1' or 'True',
# else we put '0' or 'False'.
# The result will be a multi-hot vector.
# most_common(500) yields (word, count) pairs; only the words are kept.
frequent_500 = [word for word, _count in distribution.most_common(500)]


In [12]:
%%time
## Multi hot vectors
feature_vectors = []

for review, sentiment in documents:
    multi_hot_vector = {}
    # we are converting reviews into a dict
    # words in review are keys 
    # and for every key we set True as value    
    review_lookup = {word:True for word in review}
    
    for word in frequent_500:
        try:
            if review_lookup[word]:
                multi_hot_vector[word] = True
        except:
            multi_hot_vector[word] = False
            
    feature_vectors.append(tuple([multi_hot_vector, sentiment]))  

CPU times: total: 2.12 s
Wall time: 2.11 s


In [13]:
# Sanity check: exactly one feature vector was produced per document
assert len(feature_vectors) == len(documents)

In [16]:
# Peek at the first ten components of the first feature vector
first_vector = feature_vectors[0][0]
list(first_vector.values())[:10]

[True, True, True, True, True, True, True, False, True, True]

In [17]:
# sentiment label stored alongside the first feature vector
leading_vector, leading_sentiment = feature_vectors[0]
leading_sentiment

'neg'

In [18]:
## This is a very naive approach; we will improve our features in later notebooks.
## This is just an intro to feature building.
## These features may or may not be good, because the components are simply the 500 most frequent words.
## If these 500 words are enough to represent the sentiment, then we are in luck.

## To create our feature vectors
## we first convert each review into a hash-based lookup structure;
## otherwise every membership test has to scan the whole review, which is a time complexity issue.
## For example, suppose our code were:

# '''

# for review, sentiment in documents:
#     multi_hot_vector = []
    
#     for word in frequent_500:
    
#         ###### to create each component we have to search the entire length of review ######

#         if word in review:                            <<<< TIME COMPLEXITY >>>>
#             multi_hot_vector.append(True)
#         else:
#             multi_hot_vector.append(False)
            
#     feature_vectors.append((multi_hot_vector, sentiment))  
    
# ''' 

# There are 2000 reviews, and for each one the naive version above needs
# ~ 500 * len(review) word comparisons to create the multi-hot vector.

In [19]:
# number of True (1) components in the feature vector of the first review
sum(1 for present in feature_vectors[0][0].values() if present)

144

In [20]:
# number of True components for the second review
sum(1 for present in feature_vectors[1][0].values() if present)

56

In [21]:
# number of True components for the third review
sum(1 for present in feature_vectors[2][0].values() if present)

95

In [22]:
# number of True components for the fourth review
sum(1 for present in feature_vectors[3][0].values() if present)

130

In [23]:
# number of True components for the fifth review
sum(1 for present in feature_vectors[4][0].values() if present)

85

In [24]:
# number of components in a feature vector (one per word in frequent_500)
len(feature_vectors[0][0])

500