# Load the "movie_reviews" dataset

In [1]:
import pandas as pd
import numpy as np

#Load movie reviews dataset
df = pd.read_csv( './data/movie_reviews.csv')
texts = df.text.values #pd.Series -> np.ndarray
label = df.label.values #pd.Series -> np.ndarray
df.head()
#print(texts.shape[0])

Unnamed: 0,label,text
0,1,"To an entire generation of filmgoers, it just ..."
1,1,Pixar classic is one of the best kids' movies ...
2,1,Apesar de representar um imenso avanço tecnoló...
3,1,"When Woody perks up in the opening scene, it's..."
4,1,Introduced not one but two indelible character...


# Tokenize the raw text strings

In [2]:
import nltk
# Transform each review string as a list of token strings. May take a few seconds
text_token = [nltk.word_tokenize(text) for text in texts]

n = 0 #arbitrary pick
print('Example review:\n   Raw: {} \n\n   Tokenized: {}'.format(texts[n], [i for i in text_token[n]]))

Example review:
   Raw: To an entire generation of filmgoers, it just might represent the most significant leap in storytelling that they will ever see... 

   Tokenized: ['To', 'an', 'entire', 'generation', 'of', 'filmgoers', ',', 'it', 'just', 'might', 'represent', 'the', 'most', 'significant', 'leap', 'in', 'storytelling', 'that', 'they', 'will', 'ever', 'see', '...']


In [3]:
from collections import Counter

#Note that we convert all tokens to lower case, otherwise words like *The* and *the* are different tokens.
text_counter = Counter(token.lower() for sentence in text_token for token in sentence)
top10 = text_counter.most_common()[:10]
for i, t in enumerate(top10):
    print('{:>2}.{:>5}  freq: {:>7}'.format(i+1, t[0], t[1]))

 1.  the  freq:  749124
 2.    ,  freq:  643961
 3.    .  freq:  573701
 4.    a  freq:  388669
 5.  and  freq:  380567
 6.   of  freq:  347832
 7.   to  freq:  303867
 8.   is  freq:  251629
 9.   ''  freq:  229560
10.   it  freq:  217043


# Cleaning text data, check the effectiveness of lemmatization

In [6]:
from nltk.corpus import stopwords
from string import punctuation
from itertools import chain

def clean_text(tokenized_list, sw, punct, lemmatize=True):
    new_list = []
    for doc in tokenized_list:
        new_list.append([token.lower() for token in doc if token.lower() not in chain(punct, sw)])
    return new_list

# Remove punctuations and stopwords, and lower-case text
sw = stopwords.words('english')
punct = punctuation
text_cleaned = clean_text(text_token, sw, punct)

idx = -1
count = 0
label_list = label.tolist()
for text in text_cleaned:
    idx = idx + 1
    if len(text) == 0:
        count = count + 1
        text_cleaned.remove(text)
        label_list.remove(label_list[idx])
label = np.array(label_list)

print('how many texts are empty:',count)
print('text cleaned:\n',text_cleaned[0:5])
#print(len(text_cleaned))
#Note that we convert all tokens to lower case, otherwise words like *The* and *the* are different tokens.
text_counter = Counter(token.lower() for sentence in text_cleaned for token in sentence)
top10 = text_counter.most_common()[:10]
for i, t in enumerate(top10):
    print('{:>2}.{:>5}  freq: {:>7}'.format(i+1, t[0], t[1]))

how many texts are empty: 15
text cleaned:
 [['entire', 'generation', 'filmgoers', 'might', 'represent', 'significant', 'leap', 'storytelling', 'ever', 'see', '...'], ['pixar', 'classic', 'one', 'best', 'kids', 'movies', 'time'], ['apesar', 'de', 'representar', 'um', 'imenso', 'avanço', 'tecnológico', 'força', 'filme', 'reside', 'carisma', 'de', 'seus', 'personagens', 'e', 'charme', 'de', 'sua', 'história'], ['woody', 'perks', 'opening', 'scene', "'s", 'toy', 'cowboy', 'comes', 'alive', "'re", 'watching', 'rebirth', 'art', 'form'], ['introduced', 'one', 'two', 'indelible', 'characters', 'pop', 'culture', 'pantheon', 'cowboy', 'rag-doll', 'woody', 'tom', 'hanks', 'plastic', 'space', 'ranger', 'buzz', 'lightyear', 'tim', 'allen', 'blu-ray']]
 1.   ''  freq:  229560
 2.   's  freq:  157679
 3.   ``  freq:  102253
 4.movie  freq:   95453
 5. film  freq:   92791
 6.  n't  freq:   74589
 7.  one  freq:   59797
 8. like  freq:   44255
 9.  ...  freq:   33143
10. good  freq:   32642


# BOW，TF-IDF，Word2Vec

Models discussed in this section are all trained on our "movie_reviews" dataset

## BOW

In [7]:
from gensim import corpora

# Create a dictionary from list of documents in order to create BOW model
dictionary = corpora.Dictionary(text_cleaned)
corpus = [dictionary.doc2bow(text) for text in text_cleaned]

#print dictionary and corpus
count=0
for key,value in dictionary.items():
    count = count + 1
    if count<10:
        print('{key}:{value}'.format(key = key, value = value))
        
for i in range(5):
    print(corpus[i])



0:...
1:entire
2:ever
3:filmgoers
4:generation
5:leap
6:might
7:represent
8:see
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]
[(11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]
[(18, 1), (19, 1), (20, 1), (21, 1), (22, 3), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]
[(35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1)]
[(15, 1), (40, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1)]


## TF-IDF

In [8]:
from gensim import corpora, models
import numpy as np

#Create a TFIDF Model for the corpus
tfidf = models.TfidfModel(corpus)

for i in range(5):
    print('Example review:\n',text_cleaned[i])
    print('TFIDF scores:\n:',np.round(tfidf[corpus[i]],3))
#print('Example review featurized with TF-IDF scores : \n{}'.format([(dictionary[i[0]], round(i[1],3)) for i in tfidf[corpus[n]]]))

Example review:
 ['entire', 'generation', 'filmgoers', 'might', 'represent', 'significant', 'leap', 'storytelling', 'ever', 'see', '...']
TFIDF scores:
: [[ 0.     0.122]
 [ 1.     0.233]
 [ 2.     0.152]
 [ 3.     0.475]
 [ 4.     0.32 ]
 [ 5.     0.403]
 [ 6.     0.187]
 [ 7.     0.38 ]
 [ 8.     0.126]
 [ 9.     0.344]
 [10.     0.323]]
Example review:
 ['pixar', 'classic', 'one', 'best', 'kids', 'movies', 'time']
TFIDF scores:
: [[11.     0.252]
 [12.     0.358]
 [13.     0.388]
 [14.     0.255]
 [15.     0.147]
 [16.     0.726]
 [17.     0.21 ]]
Example review:
 ['apesar', 'de', 'representar', 'um', 'imenso', 'avanço', 'tecnológico', 'força', 'filme', 'reside', 'carisma', 'de', 'seus', 'personagens', 'e', 'charme', 'de', 'sua', 'história']
TFIDF scores:
: [[18.     0.222]
 [19.     0.307]
 [20.     0.248]
 [21.     0.245]
 [22.     0.343]
 [23.     0.144]
 [24.     0.162]
 [25.     0.266]
 [26.     0.207]
 [27.     0.272]
 [28.     0.201]
 [29.     0.288]
 [30.     0.23 ]
 [31.   

## Word2Vec

In [9]:
from gensim import models

# Training word2vec model on already cleaned text. This may take a few minutes.
word2vec = models.Word2Vec(text_cleaned,
                        size = 300,
                        window = 5,
                        min_count = 1,                      # set "min_count" = 1 to make sure every word corresponds to a vector, in case something go wrong in the Naive Doc2Vec process
                        sg = 0,
                        alpha = 0.025,                      # if I set alpha = 0.01, performance will be much worse
                        iter=10,
                        batch_words = 10000)

In [10]:
# vector = model.wv['food']
# print(vector)
word2vec['representar']

  This is separate from the ipykernel package so we can avoid doing imports until


array([ 7.92698469e-03, -2.01397687e-02, -2.53254939e-02,  2.97184605e-02,
        4.93387831e-03,  3.49686816e-02, -6.64250180e-02,  3.59089039e-02,
        5.81033267e-02, -4.68371361e-02, -2.28206646e-02,  2.80665997e-02,
       -9.75980163e-02,  2.08341088e-02,  3.15778307e-03,  6.60883170e-03,
       -3.25060338e-02,  8.95441324e-02,  2.66021118e-03,  6.97508603e-02,
        3.22876424e-02, -1.52867977e-02, -2.22618449e-02, -5.51112089e-03,
        2.43894150e-03, -2.32183766e-02, -6.03412762e-02, -1.16388589e-01,
        7.80114755e-02,  2.99245119e-02, -5.32919429e-02, -3.17887333e-03,
       -4.18883860e-02,  7.06827417e-02,  4.22851332e-02, -2.74281092e-02,
        1.09859653e-01,  3.76331806e-03, -4.56196629e-02, -3.83603573e-02,
        4.16262150e-02,  5.43430820e-03, -1.28748771e-02,  3.60228983e-03,
       -5.37535772e-02, -1.69625729e-02,  1.05463624e-01, -5.87137826e-02,
       -5.74758835e-02, -3.43802869e-02,  7.84358233e-02,  5.03958762e-03,
       -1.24559822e-02, -

In [11]:
# words closest to the token 'boy'
word2vec.wv.most_similar(positive=['boy'])

  if np.issubdtype(vec.dtype, np.int):


[('girl', 0.7022062540054321),
 ('kid', 0.6464823484420776),
 ('orphan', 0.5869698524475098),
 ('lad', 0.5416067838668823),
 ('sister', 0.5377366542816162),
 ('son', 0.5364755392074585),
 ('boys', 0.5278647541999817),
 ('daughter', 0.5146600008010864),
 ('child', 0.5090888738632202),
 ('brother', 0.5007424354553223)]

#  Doc2Vec

## Gensim Doc2Vec method

In [12]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import reuters

tokenized_docs = text_cleaned

print('tokenized_docs:\n',tokenized_docs[0])

# Convert tokenized documents to TaggedDocuments
tagged_docs = [TaggedDocument(doc, tags=[idx]) for idx, doc in enumerate(tokenized_docs)]

print('tagged_docs:\n',tagged_docs[0])

# Create and train the doc2vec model. May take a few seconds
doc2vec = Doc2Vec(size=300, window=5, min_count=5, dm = 1, iter=10)

# Build the word2vec model from the corpus
doc2vec.build_vocab(tagged_docs)

# Train the models
doc2vec.train(tagged_docs, epochs=10, total_examples=doc2vec.corpus_count)

tokenized_docs:
 ['entire', 'generation', 'filmgoers', 'might', 'represent', 'significant', 'leap', 'storytelling', 'ever', 'see', '...']
tagged_docs:
 TaggedDocument(['entire', 'generation', 'filmgoers', 'might', 'represent', 'significant', 'leap', 'storytelling', 'ever', 'see', '...'], [0])




In [13]:
doc2vec.infer_vector(text_cleaned[0])

array([-0.11640756,  0.00959367,  0.0027819 , -0.03638881, -0.01558619,
        0.0022028 , -0.04430351, -0.02634423,  0.01320325,  0.03729074,
       -0.02903551,  0.01164167, -0.02520508, -0.04448992, -0.04869303,
        0.06765765, -0.07196426,  0.06566668, -0.01529197,  0.11903958,
        0.0127807 , -0.0612068 ,  0.04332183,  0.04714288, -0.02795585,
       -0.06994917,  0.22520344, -0.07885246,  0.08964897,  0.09731934,
       -0.04219134,  0.01476393, -0.06538308,  0.05677607,  0.0147265 ,
        0.0333243 ,  0.11887433, -0.07735306, -0.0982732 ,  0.11280705,
       -0.02923373, -0.06992949,  0.02145182, -0.04887317,  0.00080843,
        0.02430671,  0.04117583, -0.02046664, -0.09429718, -0.04877159,
        0.09297715, -0.03404789, -0.02101492, -0.0170896 , -0.024143  ,
        0.04043116, -0.00307455, -0.04466435, -0.03466206, -0.03094201,
       -0.00222797, -0.01393872,  0.02641464,  0.00348142, -0.06175144,
        0.01550744, -0.03016403, -0.00313367,  0.01504597, -0.18

## Naive Doc2Vec

Word2Vec each word and add all word vectors within the same text to make up the text vector(remember to normalize) 

In [14]:
import numpy as np

naive_doc2vec = np.zeros((len(text_cleaned),300))
idx = -1
for text in text_cleaned:
    idx = idx + 1
    num_words = len(text)
    for word in text:
        word_vector = word2vec[word]
        naive_doc2vec[idx,:] = naive_doc2vec[idx,:] + word_vector
    naive_doc2vec[idx,:] = naive_doc2vec[idx,:] / num_words

  if __name__ == '__main__':


In [15]:
print(naive_doc2vec[0]) 

[ 0.27868995 -0.01163037  0.02616349 -0.11340432  0.17190187 -0.11143054
 -0.20620216 -0.44378885  0.12310633  0.23665084 -0.18205675  0.12810443
 -0.30301312  0.05627629  0.00477885  0.0982552   0.30097052 -0.03005744
  0.14404079  0.63472626 -0.15718583  0.12605314  0.28221462  0.05259475
  0.34176543  0.04152222 -0.24683218 -0.48777249  0.75362668  0.22371268
 -0.12088574  0.48317955  0.33968828  0.08922517  0.24257583  0.15982012
  0.21012554 -0.43060528 -0.09351016  0.43406987 -0.22121981  0.27096202
 -0.11485293  0.014093   -0.22046518  0.00972378  0.07254423 -0.46137426
 -0.266652   -0.03980379  0.4158588   0.07519136 -0.30022692  0.20392187
  0.11854771 -0.35231757 -0.89835919 -0.09728083  0.22352538 -0.25093949
 -0.25188945  0.39162992  0.23554688  0.39454835 -0.04212488 -0.40168226
 -0.11605915 -0.31589135  0.10637169 -0.62099302 -0.27728408  0.06855654
  0.00346043 -0.20456885 -0.28070957  0.32892583  0.12659171 -0.24138619
 -0.25910763  0.42475589 -0.59049432  0.24278067  0

# Put Doc2Vec model into the classifier 

## Gensim "Doc2Vec"

In [16]:
doc2vec_data = np.zeros((len(text_cleaned),300))
for i in range (len(text_cleaned)):
    doc2vec_data[i] = doc2vec.infer_vector(text_cleaned[i])

print(doc2vec_data[0])

[-0.12993325  0.01495038 -0.01516713  0.00803873 -0.03303552  0.00829881
 -0.056199   -0.02831089  0.0205844   0.08184966 -0.03804151  0.0425854
 -0.04435135 -0.09430112 -0.06735714  0.00923983 -0.0673715   0.08545925
 -0.00976392  0.09746398  0.03904494 -0.01941598  0.02679177  0.09253461
 -0.03168099 -0.1321025   0.1341363  -0.073345    0.0878863   0.09139118
 -0.00156319  0.00442715 -0.01662348  0.08107968 -0.00208479  0.00113985
  0.06923959 -0.03663416 -0.1182862   0.08386566 -0.00256242 -0.04221505
 -0.01267311 -0.04487883  0.00712917  0.02610446  0.06214851 -0.03210143
 -0.08627249 -0.05941227  0.10935897 -0.0680353  -0.04132422 -0.00228863
 -0.00982817  0.02722536 -0.03139601 -0.02445724  0.01061779 -0.07104012
 -0.05112461 -0.02387296  0.00058168  0.02013861 -0.03728822  0.01044006
 -0.02754978 -0.05414862 -0.00578358 -0.16648366  0.06323683  0.0053912
  0.00907841 -0.08694317 -0.0680019  -0.03028907 -0.01701905 -0.00586718
  0.02852613  0.01246372 -0.10654023 -0.01166201 -0.0

In [17]:
from sklearn.model_selection import train_test_split

# split data into train and test sets
seed = 42
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(doc2vec_data, label, test_size=test_size, random_state=seed)
print(X_test.shape)

(50357, 300)


In [18]:
# instantiate a SVM regression model, and fit with X and y
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score


model = LogisticRegression()
model = model.fit(X_train, y_train.astype(int))

y_pred = model.predict(X_test)

print(y_pred[0:10])
# check the accuracy on the training set
print(confusion_matrix(y_true=y_test.astype(int), y_pred=y_pred))
score = model.score(X_test, y_test.astype(int))

p = precision_score(y_test.astype(int), y_pred, average='binary')
r = recall_score(y_test.astype(int), y_pred, average='binary')
f1score = f1_score(y_test.astype(int), y_pred, average='binary')
print('accuracy:',score)
print('precision:',p)
print('recall:',r)
print('f1score:',f1score)

[0 0 0 0 1 1 1 1 1 1]
[[ 8639 12131]
 [ 2974 26613]]
accuracy: 0.7000417022459638
precision: 0.6868934544703696
recall: 0.8994828809950316
f1score: 0.7789436712473109


## Naive Doce2Vec

In [19]:
from sklearn.model_selection import train_test_split

# split data into train and test sets
seed = 42
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(naive_doc2vec, label, test_size=test_size, random_state=seed)
print(X_test.shape)

(50357, 300)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score


model = LogisticRegression()
model = model.fit(X_train, y_train.astype(int))
y_pred = model.predict(X_test)

print(y_pred[0:10])
# check the accuracy on the training set
print(confusion_matrix(y_true=y_test.astype(int), y_pred=y_pred))
score = model.score(X_test, y_test.astype(int))

p = precision_score(y_test.astype(int), y_pred, average='binary')
r = recall_score(y_test.astype(int), y_pred, average='binary')
f1score = f1_score(y_test.astype(int), y_pred, average='binary')
print('accuracy:',score)
print('precision:',p)
print('recall:',r)
print('f1score:',f1score)

[0 0 1 0 1 0 1 1 1 1]
[[ 9054 11716]
 [ 4390 25197]]
accuracy: 0.680163631669877
precision: 0.6826050442933383
recall: 0.851624024064623
f1score: 0.7578045112781955


** Final results show that the performance of Gensim Doc2Vec model is better than Naive Doc2Vec model**