In [None]:
# Notes
#1. Word2vec can only use Python3 because Py2 have the ASC code error
#2. Therefore, use "source activate nlp3" and then "jupyter notebook"

# Word2Vec
#1. It's a kind of method which is faster than some deep learning algorithms
#2. Do not need labels, it can generated labels automatically based on relationships between words.
#3. Because it cares relationships, don't remove stop words and numbers.

In [4]:
import pandas as pd

# Read data
train = pd.read_csv("labeledTrainData.tsv", delimiter = "\t", quoting = 3)
test = pd.read_csv("testData.tsv", delimiter = "\t", quoting = 3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", delimiter = "\t", quoting = 3)

# print
print(train["review"].size, test["review"].size, unlabeled_train["review"].size)
print("\n")
print(train["review"][0])
print("\n")
print(train.columns, test.columns, unlabeled_train.columns)

25000 25000 50000


"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit

In [5]:
# Clean data (remove html, non-letters, lower case, split)
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords = False):
    review_text = BeautifulSoup(review, "lxml").get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return(words)

In [37]:
# Test
list_1 = review_to_wordlist(train["review"][0])
list_1[0:10]

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with']

In [31]:
# Punkt tokenizer to split paragraphs to sentences
# Above "review_to_wordlist" function is to split sentences into wordlist
# paragraphs -> sentences -> wordlists 讓每句sentence都有自己的wordlist

import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords = False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for i in raw_sentences:
        if len(i) > 0:
            sentences.append(review_to_wordlist(i, remove_stopwords))
    return sentences

In [39]:
# Test
list_2 = tokenizer.tokenize(train["review"][0].strip())
print(list_2[0:2])
print("\n")
list_3 = review_to_sentences(train["review"][0], tokenizer, remove_stopwords = False)
print(list_3[0:2])

['"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again.', 'Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.']


[['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again'], ['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']]


In [40]:
# Start to make paragraph into list of words

sentences = []
for i in train["review"]:
    sentences += review_to_sentences(i, tokenizer)

print("Parsing now")
for j in unlabeled_train["review"]:
    sentences += review_to_sentences(j, tokenizer)

  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing now


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [43]:
print(len(sentences))
print("\n")
print(sentences[0])
print("\n")
print(sentences[1])

795538


['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


In [44]:
import logging
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)

from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers = 4, size = 300, min_count = 40, window = 10, \
                         sample = 0.001)

model.init_sims(replace = True)
model_name = "300features_40minwords"
model.save(model_name) #We can load it using Word2Vec.load()

ImportError: No module named 'gensim'