In [3]:
# Notes
#1. Word2vec can only use Python3 because Py2 have the ASC code error
#2. Therefore, use "source activate nlp3" and then "jupyter notebook"

# Word2Vec
#1. It's a kind of method which is faster than some deep learning algorithms
#2. Do not need labels, it can generated labels automatically based on relationships between words.
#3. Because it cares relationships, don't remove stop words and numbers.
#4. 關係性的比重如何算: Attemp1 Vector Average, Attemp2 Clustering

In [4]:
import pandas as pd

# Read data
train = pd.read_csv("labeledTrainData.tsv", delimiter = "\t", quoting = 3)
test = pd.read_csv("testData.tsv", delimiter = "\t", quoting = 3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", delimiter = "\t", quoting = 3)

# print
print(train["review"].size, test["review"].size, unlabeled_train["review"].size)
print("\n")
print(train["review"][0])
print("\n")
print(train.columns, test.columns, unlabeled_train.columns)

25000 25000 50000


"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit

In [5]:
# Clean data (remove html, non-letters, lower case, split)
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords = False):
    review_text = BeautifulSoup(review, "lxml").get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return(words)

In [6]:
# Test
list_1 = review_to_wordlist(train["review"][0])
list_1[0:10]

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with']

In [7]:
# Punkt tokenizer to split paragraphs to sentences
# Above "review_to_wordlist" function is to split sentences into wordlist
# paragraphs -> sentences -> wordlists 讓每句sentence都有自己的wordlist

import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords = False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for i in raw_sentences:
        if len(i) > 0:
            sentences.append(review_to_wordlist(i, remove_stopwords))
    return sentences

In [8]:
# Test
list_2 = tokenizer.tokenize(train["review"][0].strip())
print(list_2[0:2])
print("\n")
list_3 = review_to_sentences(train["review"][0], tokenizer, remove_stopwords = False)
print(list_3[0:2])

['"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again.', 'Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.']


[['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again'], ['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']]


In [9]:
# Start to make paragraph into list of words

sentences = []
for i in train["review"]:
    sentences += review_to_sentences(i, tokenizer)

print("Parsing now")
for j in unlabeled_train["review"]:
    sentences += review_to_sentences(j, tokenizer)

  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing now


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [10]:
print(len(sentences))
print("\n")
print(sentences[0])
print("\n")
print(sentences[1])

795538


['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


In [11]:
import logging
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)

from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers = 4, size = 300, min_count = 40, window = 10, \
                         sample = 0.001)

model.init_sims(replace = True)
model_name = "300features_40minwords"
model.save(model_name) #We can load it using Word2Vec.load()

2018-05-26 15:26:14,147 : INFO : collecting all words and their counts
2018-05-26 15:26:14,149 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-26 15:26:14,256 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types


Training model...


2018-05-26 15:26:14,354 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2018-05-26 15:26:14,450 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2018-05-26 15:26:14,549 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2018-05-26 15:26:14,678 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2018-05-26 15:26:14,755 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2018-05-26 15:26:14,840 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2018-05-26 15:26:14,929 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2018-05-26 15:26:15,011 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2018-05-26 15:26:15,089 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50

In [12]:
model.doesnt_match("man woman child kitchen".split())

  if __name__ == '__main__':


'kitchen'

In [13]:
model.doesnt_match("guy girl woman man monkey".split())

  if __name__ == '__main__':


'monkey'

In [15]:
model.most_similar("monkey")

  if __name__ == '__main__':


[('turtle', 0.6487868428230286),
 ('chicken', 0.6476317048072815),
 ('gorilla', 0.6274864673614502),
 ('rabbit', 0.6065736413002014),
 ('giant', 0.6062736511230469),
 ('sock', 0.6038341522216797),
 ('bowl', 0.5888091325759888),
 ('dinosaur', 0.5883542895317078),
 ('midget', 0.5821043252944946),
 ('pig', 0.5818576216697693)]

In [16]:
model.most_similar("girl")

  if __name__ == '__main__':


[('boy', 0.7197983860969543),
 ('woman', 0.6685178875923157),
 ('prostitute', 0.6315896511077881),
 ('teenager', 0.5830933451652527),
 ('girls', 0.5804745554924011),
 ('gal', 0.5762826800346375),
 ('nun', 0.56174236536026),
 ('lady', 0.559321403503418),
 ('lad', 0.5577070713043213),
 ('daughter', 0.5506519675254822)]

In [30]:
from gensim.models import Word2Vec
import numpy as np
model.wv.index2word
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    
#model = Word2Vec.load("300features_40minwords")

['the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'it',
 'in',
 'i',
 'this',
 'that',
 's',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'you',
 't',
 'on',
 'not',
 'he',
 'are',
 'his',
 'have',
 'be',
 'one',
 'all',
 'at',
 'they',
 'by',
 'who',
 'an',
 'from',
 'so',
 'like',
 'there',
 'her',
 'or',
 'just',
 'about',
 'out',
 'has',
 'if',
 'what',
 'some',
 'good',
 'can',
 'more',
 'when',
 'very',
 'she',
 'up',
 'no',
 'time',
 'even',
 'would',
 'my',
 'which',
 'their',
 'story',
 'only',
 'really',
 'see',
 'had',
 'were',
 'well',
 'we',
 'me',
 'than',
 'much',
 'bad',
 'get',
 'been',
 'people',
 'also',
 'into',
 'do',
 'great',
 'other',
 'will',
 'first',
 'because',
 'him',
 'how',
 'most',
 'don',
 'them',
 'made',
 'its',
 'make',
 'then',
 'way',
 'could',
 'too',
 'movies',
 'after',
 'any',
 'characters',
 'character',
 'think',
 'films',
 'two',
 'watch',
 'being',
 'many',
 'plot',
 'seen',
 'never',
 'where',
 'love',
 'life',
 'little',
 'acting

In [38]:
num_features = 300

def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,), dtype = "float32")
    nwords = 0
    index2word_set = set(model.wv.index2word)
    
    for i in words:
        if i in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[i])
            
    featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype = "float32")
    
    for j in reviews:
        if counter % 5000 == 0:
            print("Review %d of %d" % (counter, len(reviews)))
        reviewFeatureVecs[counter] = makeFeatureVec(j, model, num_features)
        
        counter  = counter + 1
    return reviewFeatureVecs

In [39]:
clean_train_reviews = []
for i in train["review"]:
    clean_train_reviews.append(review_to_wordlist(i, remove_stopwords = True))
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, 300)


clean_test_reviews = []
for j in test["review"]:
    clean_test_reviews.append(review_to_wordlist(j, remove_stopwords = True))
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, 300)

Review 0 of 25000




Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000


In [49]:
print(trainDataVecs.shape)
print(testDataVecs.shape)
print(trainDataVecs)

(25000, 300)
(25000, 300)
[[ 0.01300876  0.01047531  0.00367966 ..., -0.0239167  -0.00254921
  -0.00640002]
 [ 0.04700413  0.00239555 -0.02017518 ..., -0.01232704 -0.00812424
  -0.00573464]
 [-0.01612295  0.00396236  0.01739677 ..., -0.02867449  0.01314587
  -0.01016296]
 ..., 
 [ 0.0123734   0.00477476 -0.00576705 ..., -0.00718576 -0.01778827
   0.00567526]
 [ 0.02530028  0.00200104 -0.02043257 ..., -0.02031976  0.01739063
   0.01097241]
 [ 0.01760322 -0.02194046 -0.01950093 ..., -0.00689489  0.00377015
   0.01398756]]


In [50]:
# Fit a random forest to the training data, using 100 trees
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(trainDataVecs, train["sentiment"])
result = forest.predict(testDataVecs)

output = pd.DataFrame(data = {"id":test["id"], "sentiment": result})
output.to_csv("Word2Vec_trial1.csv", index = False, quoting = 3)


In [53]:
output[0:5]

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",0
4,"""12128_7""",1
