In [23]:
# Notes
#1. Word2vec can only use Python3 because Py2 have the ASC code error
#2. Therefore, use "source activate nlp3" and then "jupyter notebook"

# Word2Vec
#1. It's a kind of method which is faster than some deep learning algorithms
#2. Do not need labels, it can generated labels automatically based on relationships between words.
#3. Because it cares relationships, don't remove stop words and numbers.
#4. 關係性的比重如何算: Attemp1 Vector Average, Attemp2 Clustering

# Trial1 - 5/29
#1. 萃取出500個具有關係性的單詞

In [24]:
import pandas as pd

# Read data
train = pd.read_csv("labeledTrainData.tsv", delimiter = "\t", quoting = 3)
test = pd.read_csv("testData.tsv", delimiter = "\t", quoting = 3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", delimiter = "\t", quoting = 3)

# print
print(train["review"].size, test["review"].size, unlabeled_train["review"].size)
print("\n")
print(train["review"][0])
print("\n")
print(train.columns, test.columns, unlabeled_train.columns)

25000 25000 50000


"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit

In [25]:
# Clean data (remove html, non-letters, lower case, split)
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review):
    review_text = BeautifulSoup(review, "lxml").get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    
    stops = set(stopwords.words("english"))
    words = [w for w in words if not w in stops]
    return(words)

In [26]:
# Test
list_1 = review_to_wordlist(train["review"][0])
list_1[0:10]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']

In [27]:
# Punkt tokenizer to split paragraphs to sentences
# Above "review_to_wordlist" function is to split sentences into wordlist
# paragraphs -> sentences -> wordlists 讓每句sentence都有自己的wordlist

import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for i in raw_sentences:
        if len(i) > 0:
            sentences.append(review_to_wordlist(i))
    return sentences

In [28]:
# Test
list_2 = tokenizer.tokenize(train["review"][0].strip())
print(list_2[0:2])
print("\n")
list_3 = review_to_sentences(train["review"][0], tokenizer)
print(list_3[0:2])

['"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again.', 'Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.']


[['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker'], ['maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent']]


In [29]:
# Start to make paragraph into list of words
sentences = []
for i in train["review"]:
    sentences += review_to_sentences(i, tokenizer)


  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [32]:
print(len(sentences))
print("\n")
print(sentences[0])
print("\n")
print(sentences[1])

266551


['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker']


['maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent']


In [33]:
import logging
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)

from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers = 4, size = 300, min_count = 40, window = 10, \
                         sample = 0.001)

model.init_sims(replace = True)
model_name = "300features_40minwords"
model.save(model_name) #We can load it using Word2Vec.load()


2018-05-29 10:47:45,567 : INFO : collecting all words and their counts
2018-05-29 10:47:45,569 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-29 10:47:45,688 : INFO : PROGRESS: at sentence #10000, processed 114931 words, keeping 17627 word types


Training model...


2018-05-29 10:47:45,768 : INFO : PROGRESS: at sentence #20000, processed 228988 words, keeping 24797 word types
2018-05-29 10:47:45,854 : INFO : PROGRESS: at sentence #30000, processed 339533 words, keeping 29883 word types
2018-05-29 10:47:45,920 : INFO : PROGRESS: at sentence #40000, processed 453983 words, keeping 34196 word types
2018-05-29 10:47:45,987 : INFO : PROGRESS: at sentence #50000, processed 565006 words, keeping 37609 word types
2018-05-29 10:47:46,057 : INFO : PROGRESS: at sentence #60000, processed 676637 words, keeping 40571 word types
2018-05-29 10:47:46,128 : INFO : PROGRESS: at sentence #70000, processed 789005 words, keeping 43180 word types
2018-05-29 10:47:46,196 : INFO : PROGRESS: at sentence #80000, processed 899771 words, keeping 45561 word types
2018-05-29 10:47:46,259 : INFO : PROGRESS: at sentence #90000, processed 1013453 words, keeping 47982 word types
2018-05-29 10:47:46,335 : INFO : PROGRESS: at sentence #100000, processed 1125135 words, keeping 50054 

In [50]:
from gensim.models import Word2Vec
import numpy as np
w2v_words = model.wv.index2word
w2v_words[0:10]
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    
#model = Word2Vec.load("300features_40minwords")

['movie',
 'film',
 'one',
 'like',
 'good',
 'time',
 'even',
 'would',
 'story',
 'really']

In [43]:
#Make function to clean data including remove html, punctuations, numbers, stopwords, lower case and split
def cleandata(raw_data):
    removehtml = BeautifulSoup(raw_data).get_text()
    removenonalphabet = re.sub("[^a-zA-Z]", " ", removehtml)
    lowersplit = removenonalphabet.lower().split()
    stops = set(stopwords.words("english"))
    removestop = [w for w in lowersplit if not w in stops]
    return(" ".join(removestop))

# Get the clean_train data
clean_train = []
train_size = train["review"].size
for i in range(0, train_size):
    if (i+1) % 5000 == 0:
        print("Reviews now %d in %d\n" % (i+1, train_size))
    clean_train.append(cleandata(train["review"][i]))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Reviews now 5000 in 25000

Reviews now 10000 in 25000

Reviews now 15000 in 25000

Reviews now 20000 in 25000

Reviews now 25000 in 25000



In [91]:
#Make vectorize & features
from sklearn.feature_extraction.text import CountVectorizer
feature_size = len(w2v_words)
vectorizer = CountVectorizer(analyzer = "word", stop_words = None, preprocessor = None, \
                             tokenizer = None, max_features = feature_size)


train_features = vectorizer.fit_transform(clean_train)
train_features = train_features.toarray()

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [96]:
len(train_features)

25000

In [87]:
#List the features
bag_words = vectorizer.get_feature_names()
dist = np.sum(train_features, axis = 0)

bag_words[0:10]

['aaron',
 'abandon',
 'abandoned',
 'abc',
 'abilities',
 'ability',
 'able',
 'abomination',
 'abortion',
 'abound']

In [88]:
w2v_words = sorted(w2v_words, key = str.lower)
print(len(w2v_words), len(bag_words))
w2v_words[0:10]

8160 8160


['aaron',
 'abandon',
 'abandoned',
 'abc',
 'abilities',
 'ability',
 'able',
 'abomination',
 'abortion',
 'abound']

In [89]:
vocab = [val for val in w2v_words if val in bag_words]
len(vocab)

8142

In [90]:
#Apply random forest
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators = 100)
model_rf = model_rf.fit(vocab, train["sentiment"])


ValueError: could not convert string to float: 'aaron'