# part 3 of kaggle-sentiment analysis


1.restore the word2vec model and get word vector

2.combine all word vector in one review into one vector

3.use random forest to generate a sentiment analysis model

In [1]:
import re
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier

#restore the trained model from word2vec part
model = Word2Vec.load("300features_40minwords_10contex")

#shape of the word vector
#more about word2vec : 
#     https://radimrehurek.com/gensim/models/word2vec.html
model.wv.syn0.shape

  app.launch_new_instance()


(16490, 300)

In [2]:
model.wv["flower"].shape

(300,)

From Words To Paragraphs, Attempt 1: Vector Averaging

One challenge with the IMDB dataset is the variable-length reviews. We need to find a way to take individual word vectors and transform them into a feature set that is the same length for every review.

Since each word is a vector in 300-dimensional space, we can use vector operations to combine the words in each review. One method we tried was to simply average the word vectors in a given review (for this purpose, we removed stop words, which would just add noise).

The following code averages the feature vectors, building on our code from Part 2.

In [3]:
def makeFeatureVec(words,model,num_features):
    
    #build up a zero feature vector
    featureVecs = np.zeros((num_features,),dtype="float32")
    #indicating how many words to average
    nwords = 0
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    
    for word in words:
        if word in index2word_set:
            featureVecs = np.add(featureVecs,model.wv[word])
            nwords += 1
    featureVecs = np.divide(featureVecs,nwords)
    return featureVecs

In [4]:
def getAvgFeatureVec(reviews,model,num_features):
    
    #initialize a counter
    counter = 0
    
    #initialize a zero matrix 
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    
    for review in reviews:
        if (counter+1)%1000 ==0:
            print("Review %d of %d" % (counter,len(reviews)))
        reviewFeatureVecs[counter] = makeFeatureVec(review,model,num_features)
        counter += 1
    return  reviewFeatureVecs

In [5]:
#clean review with option : whether remove stop word
def review_to_wordlist(review,remove_stopwords=False):
    #Function to convert a document to a sequence of words
    #1.remove Html
    raw_review = BeautifulSoup(review).get_text()
    #2.remove NON-charaters
    cleaned_words = re.sub("[^a-zA-Z]"," ",raw_review)
    #3.lower case and split review into word list
    words = cleaned_words.lower().split()
    #4.remove stopwords if said
    if remove_stopwords == True:
        stop_words = set(stopwords.words("english"))
        words = [word for word in words if word not in stop_words]
    return words

In [8]:
# train review feature vectors
cleaned_train_reviews = []
train = pd.read_csv("./dataset/labeledTrainData.tsv",
                    header=0,
                    delimiter='\t',
                    quoting=3)
for i in range(train["review"].size):
    cleaned_train_reviews.append(review_to_wordlist(train["review"][i],remove_stopwords=True))
trainDataVecs = getAvgFeatureVec(cleaned_train_reviews,model,300)
    
#test review feature vectors
cleaned_test_reviews = []
test = pd.read_csv("./dataset/testData.tsv",
                   header=0,delimiter='\t',
                   quoting=3)
for i in range(test["review"].size):
    cleaned_test_reviews.append(review_to_wordlist(test["review"][i],remove_stopwords=True))
testDataVecs = getAvgFeatureVec(cleaned_test_reviews,model,300)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Review 999 of 25000
Review 1999 of 25000
Review 2999 of 25000
Review 3999 of 25000
Review 4999 of 25000
Review 5999 of 25000
Review 6999 of 25000
Review 7999 of 25000
Review 8999 of 25000
Review 9999 of 25000
Review 10999 of 25000
Review 11999 of 25000
Review 12999 of 25000
Review 13999 of 25000
Review 14999 of 25000
Review 15999 of 25000
Review 16999 of 25000
Review 17999 of 25000
Review 18999 of 25000
Review 19999 of 25000
Review 20999 of 25000
Review 21999 of 25000
Review 22999 of 25000
Review 23999 of 25000
Review 24999 of 25000
Review 999 of 25000
Review 1999 of 25000
Review 2999 of 25000
Review 3999 of 25000
Review 4999 of 25000
Review 5999 of 25000
Review 6999 of 25000
Review 7999 of 25000
Review 8999 of 25000
Review 9999 of 25000
Review 10999 of 25000
Review 11999 of 25000
Review 12999 of 25000
Review 13999 of 25000
Review 14999 of 25000
Review 15999 of 25000
Review 16999 of 25000
Review 17999 of 25000
Review 18999 of 25000
Review 19999 of 25000
Review 20999 of 25000
Review 219

In [9]:
#initialize random forest classifier
forest = RandomForestClassifier(n_estimators=100)

#train the classifier
forest.fit(trainDataVecs,train["sentiment"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
#make prediction and store results
result = forest.predict(testDataVecs)
output = pd.DataFrame(data={"id":test["id"],"sentiment":result})
output.to_csv("Word2Vec_AverageVectors.csv",index=False,quoting=3)

From Words to Paragraphs, Attempt 2: Clustering 

Word2Vec creates clusters of semantically related words, so another possible approach is to exploit the similarity of words within a cluster. Grouping vectors in this way is known as "vector quantization." To accomplish this, we first need to find the centers of the word clusters, which we can do by using a clustering algorithm such as K-Means.

In K-Means, the one parameter we need to set is "K," or the number of clusters. How should we decide how many clusters to create? Trial and error suggested that small clusters, with an average of only 5 words or so per cluster, gave better results than large clusters with many words. Clustering code is given below. We use scikit-learn to perform our K-Means.

K-Means clustering with large K can be very slow; the following code took more than 40 minutes on my computer. Below, we set a timer around the K-Means function to see how long it takes.

In [6]:
from sklearn.cluster import KMeans
import time

In [23]:
start = time.time()
#ndarray word vector
word_vectors = model.wv.syn0

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
num_clusters = word_vectors.shape[0]//5

#initialize an KMeans object
kmeans_clustering = KMeans(n_clusters=num_clusters)
#make clusters for word vectors
idx = kmeans_clustering.fit_predict(word_vectors)
end = time.time()
print("Time taken for K Means clustering:{} seconds".format(end-start))

  This is separate from the ipykernel package so we can avoid doing imports until


Time taken for K Means clustering:357.40642619132996 seconds


In [39]:
#show clustering result
word_centroid_map = dict(zip(model.wv.index2word,idx))
for cluster in range(0,10):
    words = []
    for i in range(len(word_centroid_map)):
        if list(word_centroid_map.values())[i] == cluster:
            words.append(list(word_centroid_map.keys())[i])
    print("cluster :{}".format(cluster))
    print(words)

cluster :0
['blue', 'green', 'glowing', 'soylent']
cluster :1
['christopher', 'walken', 'baldwin', 'reeve', 'lambert', 'plummer']
cluster :2
['pursuit', 'midst', 'meantime']
cluster :3
['awkward', 'lazy', 'clumsy', 'phony', 'deliberate', 'limp', 'misplaced', 'strained', 'calculated', 'shabby', 'haphazard', 'functional', 'repetitious']
cluster :4
['iraqi']
cluster :5
['infamous', 'notorious', 'acclaimed', 'iconic', 'celebrated', 'influential', 'quintessential', 'prolific', 'pioneer', 'revered']
cluster :6
['dagger']
cluster :7
['kate', 'melissa', 'ashley', 'olsen', 'farrah', 'beckinsale', 'winslet', 'latifah']
cluster :8
['concludes', 'engages', 'replaces', 'guides', 'undergoes', 'indulges']
cluster :9
['front', 'block', 'cliff', 'roof', 'desk', 'deck', 'staircase', 'branch', 'balcony', 'lawn', 'heel', 'tips', 'dock', 'dangling']


In [44]:
def create_bag_of_centroids(wordlist,word_centroid_map):
    #Very much alike the bag of words idea
    #count the frequency of clusters happened in a review
    num_centroid = max(word_centroid_map.values())+1
    #create a zero vector(bag of cluster)
    bag_of_centroid = np.zeros(num_centroid,dtype="float32")
    #loop over the words in the review if the word is in the vocabulary,
    #find the cluster and plus one 
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroid[index] += 1
    return bag_of_centroid

In [45]:
# train review feature vectors
cleaned_train_reviews = []
train = pd.read_csv("./dataset/labeledTrainData.tsv",
                    header=0,
                    delimiter='\t',
                    quoting=3)
for i in range(train["review"].size):
    cleaned_train_reviews.append(review_to_wordlist(train["review"][i],remove_stopwords=True))
    
#test review feature vectors
cleaned_test_reviews = []
test = pd.read_csv("./dataset/testData.tsv",
                   header=0,delimiter='\t',
                   quoting=3)
for i in range(test["review"].size):
    cleaned_test_reviews.append(review_to_wordlist(test["review"][i],remove_stopwords=True))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [46]:
#initialize a zero matrix to store "bag of cluster"features for reviews
train_centroids = np.zeros((train["review"].size,num_clusters),dtype="float32")

#
counter = 0
for review in cleaned_train_reviews:
    train_centroids[counter] = create_bag_of_centroids(review,
                                                       word_centroid_map)
    counter += 1
#initialize a zero matrix to store "bag of cluster"features for reviews
test_centroids = np.zeros((test["review"].size,num_clusters),dtype="float32")

counter = 0
for review in cleaned_test_reviews:
    test_centroids[counter] = create_bag_of_centroids(review,
                                                     word_centroid_map)
    counter += 1


In [49]:
#fit a random forest and extract prediction

#initialize the random forest model
forest = RandomForestClassifier(n_estimators=100)

#train the model with bag of cluster feature
forest.fit(train_centroids,train["sentiment"])

#make prediction
result = forest.predict(test_centroids)

#write the test results
output = pd.DataFrame(data={"id":test["id"],"sentiment":result})
output.to_csv( "BagOfCentroids.csv", index=False, quoting=3 )