# Sentiment analysis using word2vec

In [1]:
# Note: word2vec generally performs better when it is trained on large datasets.
# In this example we use only the 25,000 training examples from the IMDB dataset,
# so the performance is similar to that of the "bag of words" model.

# Importing libraries
import numpy as np
import pandas as pd
# BeautifulSoup is used to remove html tags from the text
from bs4 import BeautifulSoup 
import re # For regular expressions

# Stopwords can be useful for understanding the semantics of a sentence.
# Therefore stopwords are not removed while creating the word2vec model,
# but they are removed while averaging the feature vectors.
from nltk.corpus import stopwords

In [2]:
# Load the training and test reviews from their tab-separated files.
# header=0 takes the column names from the first row; quoting=3 is csv.QUOTE_NONE.
train = pd.read_csv(
    "labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

test = pd.read_csv(
    "testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [3]:
# Cache for the English stop-word set so the corpus is read only once.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


# This function converts a text to a sequence of words.
def review_wordlist(review, remove_stopwords=False):
    """Convert a raw review string into a list of lowercase words.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: If True, drop NLTK English stop words from the result.

    Returns:
        list of str: lowercase alphabetic tokens in their original order.
    """
    # 1. Removing html tags. The parser is named explicitly so the result does
    #    not depend on which optional parsers happen to be installed.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # 2. Replacing every non-letter character with a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Converting to lower case and splitting on whitespace
    words = review_text.lower().split()
    # 4. Optionally remove stopwords
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    return words

In [4]:
# word2vec expects its input as a list of sentences, each sentence being a list of words.
# The NLTK punkt tokenizer is used for better splitting of a paragraph into sentences.

import nltk.data
# Uncomment to download NLTK data (presumably includes the punkt resource
# loaded below and the stopwords corpus -- TODO confirm).
#nltk.download('popular')

# Load the English punkt sentence tokenizer.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [5]:
# Split a review into sentences, each returned as a list of words.
def review_sentences(review, tokenizer, remove_stopwords=False):
    """Break a review into sentences and turn each sentence into a word list.

    Args:
        review: Raw review text.
        tokenizer: Object whose ``tokenize(text)`` method returns the sentences.
        remove_stopwords: Passed through to ``review_wordlist``.

    Returns:
        A list of word lists, one per non-empty sentence.
    """
    stripped = review.strip()
    return [
        review_wordlist(sentence, remove_stopwords)
        for sentence in tokenizer.tokenize(stripped)
        if len(sentence) > 0
    ]

In [6]:
# Build the word2vec training corpus: the word lists of every sentence
# of every training review, collected into one flat list.
print("Parsing sentences from training set")
sentences = []
for review in train["review"]:
    sentences.extend(review_sentences(review, tokenizer))
    

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [7]:
# Importing the built-in logging module
import logging
# Show INFO-level (and above) messages, formatted as "<time> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [8]:
# Hyper-parameters of the word2vec model
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of parallel threads
context = 10          # Context window size
downsampling = 1e-3   # (0.001) Downsample setting for frequent words

# Train the model on the parsed sentences
from gensim.models import word2vec
print("Training model....")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# Precompute the L2-normalised vectors, replacing the originals to save memory
model.init_sims(replace=True)

# Persist the model for later use; it can be reloaded with Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-11-28 22:40:08,825 : INFO : 'pattern' package not found; tag filters are not available for English
2018-11-28 22:40:08,831 : INFO : collecting all words and their counts
2018-11-28 22:40:08,832 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-28 22:40:08,868 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2018-11-28 22:40:08,907 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2018-11-28 22:40:08,944 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2018-11-28 22:40:08,983 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2018-11-28 22:40:09,019 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types


Training model....


2018-11-28 22:40:09,058 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2018-11-28 22:40:09,096 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2018-11-28 22:40:09,135 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2018-11-28 22:40:09,173 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2018-11-28 22:40:09,211 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
2018-11-28 22:40:09,249 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 52081 word types
2018-11-28 22:40:09,288 : INFO : PROGRESS: at sentence #120000, processed 2668775 words, keeping 54119 word types
2018-11-28 22:40:09,331 : INFO : PROGRESS: at sentence #130000, processed 2894303 words, keeping 55847 word types
2018-11-28 22:40:09,368 : INFO : PROGRESS: at sentence #140000, processed 3107005 words, kee

2018-11-28 22:40:36,101 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-28 22:40:36,107 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-28 22:40:36,108 : INFO : EPOCH - 5 : training on 5920724 raw words (4045425 effective words) took 7.7s, 524607 effective words/s
2018-11-28 22:40:36,109 : INFO : training on a 29603620 raw words (20222724 effective words) took 26.1s, 776297 effective words/s
2018-11-28 22:40:36,109 : INFO : precomputing L2-norms of word weight vectors
2018-11-28 22:40:36,183 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2018-11-28 22:40:36,184 : INFO : not storing attribute vectors_norm
2018-11-28 22:40:36,185 : INFO : not storing attribute cum_table
2018-11-28 22:40:36,401 : INFO : saved 300features_40minwords_10context


In [9]:
# Show the words in the model that are most similar to "recommend",
# together with their similarity scores
model.wv.most_similar("recommend")

[('suggest', 0.7905669808387756),
 ('enjoy', 0.7444268465042114),
 ('advise', 0.7254894375801086),
 ('recommended', 0.6676059365272522),
 ('encourage', 0.632380485534668),
 ('consider', 0.6173403859138489),
 ('appreciate', 0.5855698585510254),
 ('buy', 0.5642555952072144),
 ('adore', 0.5579320788383484),
 ('watch', 0.5548550486564636)]

In [10]:
# Shape of the word-vector matrix: (number of words in the vocabulary
# created from this dataset, word vector dimensionality)
model.wv.vectors.shape

(8306, 300)

In [11]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    """Average the word vectors of all in-vocabulary words in ``words``.

    Args:
        words: Iterable of word strings (one tokenized paragraph/review).
        model: Trained word2vec model; its ``wv`` attribute is used for the
            vocabulary membership test and the vector lookup.
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray of shape ``(num_features,)``: the mean vector of the
        words found in the vocabulary, or all zeros when none is found.
    """
    word_vectors = model.wv
    featureVec = np.zeros(num_features, dtype="float32")
    nwords = 0

    for word in words:
        # Membership is tested on the keyed vectors directly, which avoids
        # rebuilding a set of the whole vocabulary on every call.
        if word in word_vectors:
            nwords += 1
            # Vectors are looked up through ``wv``; indexing the model object
            # itself is deprecated in gensim.
            featureVec = np.add(featureVec, word_vectors[word])

    # Guard against division by zero: without it, an input with no
    # in-vocabulary word would produce a vector full of NaN.
    if nwords == 0:
        return featureVec

    # Dividing the sum by the number of words to get the average
    return np.divide(featureVec, nwords)

In [12]:
# Build the matrix of averaged feature vectors, one row per review
def getAvgFeatureVecs(reviews, model, num_features):
    """Compute the average feature vector of every review.

    Args:
        reviews: Sized sequence of tokenized reviews (each a list of words).
        model: Word2vec model forwarded to ``featureVecMethod``.
        num_features: Dimensionality of the word vectors.

    Returns:
        float32 array of shape ``(len(reviews), num_features)``.
    """
    total = len(reviews)
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")
    for row, review in enumerate(reviews):
        # Printing a status message every 1000th review
        if row % 1000 == 0:
            print("Review %d of %d" % (row, total))

        reviewFeatureVecs[row] = featureVecMethod(review, model, num_features)

    return reviewFeatureVecs

In [13]:
##### Submission data
# Read the subtask A trial file: no header row, last line of the file skipped.
test_data_for_subtaskA_predictions = pd.read_csv(
    "TrialData_SubtaskA_Test.csv",
    sep=",",
    header=None,
    skipfooter=1,
    engine="python",
)
test_data_for_subtaskA_predictions.columns = ["id", "Review", "Sugg_Class"]

# Calculating average feature vectors for the subtask A reviews.
# NOTE(review): stop words are kept here (remove_stopwords=False), whereas the
# IMDB training and test reviews are cleaned with remove_stopwords=True --
# confirm this difference is intended.
clean_validate_reviews = [
    review_wordlist(review, remove_stopwords=False)
    for review in test_data_for_subtaskA_predictions['Review']
]

ValidateDataVecs = getAvgFeatureVecs(clean_validate_reviews, model, num_features)


  del sys.path[0]


Review 0 of 591


In [14]:
ValidateDataVecs

array([[ 3.8570851e-02, -5.2160244e-03, -1.1007537e-02, ...,
         7.2561158e-03,  1.2660294e-02, -1.9059860e-03],
       [ 1.2300771e-02, -5.0368901e-02, -7.4674666e-02, ...,
        -5.0331812e-02,  4.1573968e-02, -7.4399561e-02],
       [ 2.5556121e-02, -3.6878309e-03,  1.9673318e-02, ...,
        -2.1263473e-02,  2.0651696e-02, -4.1768778e-04],
       ...,
       [ 2.4777399e-02,  4.7071842e-03, -1.5330539e-02, ...,
        -1.3544981e-02,  1.2782880e-02,  1.3583881e-02],
       [ 3.2923143e-02, -1.1218979e-02, -2.7119055e-02, ...,
        -1.7023390e-03,  4.4273183e-02,  3.2701306e-03],
       [ 2.8297424e-02, -1.6164076e-02,  5.7021156e-05, ...,
         1.2326315e-02,  3.1974025e-02, -6.6459284e-04]], dtype=float32)

In [15]:
# Clean every training review (stop words removed), then average its word vectors
clean_train_reviews = [
    review_wordlist(review, remove_stopwords=True)
    for review in train['review']
]

trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 25000


  del sys.path[0]


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000


In [16]:
# Clean every test review (stop words removed), then average its word vectors
clean_test_reviews = [
    review_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]

testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 25000


  del sys.path[0]


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000


In [17]:
# Fitting a random forest classifier to the training data
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the fitted forest, and therefore its predictions,
# repeatable across runs for the same feature vectors.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

print("Fitting random forest to training data....")
# Rows of trainDataVecs are the averaged review vectors; the labels come from
# the "sentiment" column of the training frame.
forest = forest.fit(trainDataVecs, train["sentiment"])

Fitting random forest to training data....


In [18]:
# Predict the sentiment of every test review, pair each prediction with the
# review id, and write the table to output.csv without an index column.
result = forest.predict(testDataVecs)
output = pd.DataFrame({"id": test["id"], "sentiment": result})
output.to_csv("output.csv", index=False, quoting=3)

In [19]:
result


array([1, 0, 1, ..., 1, 1, 0])

In [20]:
output

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",0
4,"""12128_7""",1
5,"""2913_8""",1
6,"""4396_1""",0
7,"""395_2""",0
8,"""10616_1""",0
9,"""9074_9""",1
