# Part 1: cleaning the text corpus

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
# importing nltk
import nltk
# download nltk data 
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
# importing the stopwords list
from nltk.corpus import stopwords
stopwords = list(set(stopwords.words("english")))
print(stopwords)

['who', 'down', 'further', 'was', 'll', 'couldn', "that'll", 'mightn', 'hasn', 'these', 'both', 'other', 'most', 'how', 'will', 'my', 'such', 'so', 'them', 'at', 'their', 'isn', 'they', 'into', 'nor', 'with', 'on', "it's", 'but', 'by', 'any', "you're", 'her', 'is', 'this', 'where', 'been', 'having', 'can', "couldn't", 'themselves', 'him', 'than', 'ours', 've', 'when', "wasn't", 'too', 'that', 'off', 'once', 'were', 'have', 'doing', 'didn', 'his', 're', 's', 'am', 'between', 'all', 'm', 'yourselves', 'again', 'before', 'y', "you'd", 'does', "shan't", 'she', 'should', 'now', 'yourself', 'what', "she's", 'few', "you've", 'has', 'shouldn', 'about', 'be', 'ain', 'hers', 'myself', 'haven', 'he', 'not', "hadn't", 'shan', 'being', "isn't", "needn't", "shouldn't", 'you', 'd', 'won', 'against', 'ma', 'are', 'the', 'itself', 'and', 'over', "hasn't", 'only', "haven't", 'wasn', 'do', "aren't", 'through', 'no', 'it', 'an', 'herself', 'we', 'did', 't', 'then', 'ourselves', 'more', 'from', "should've"

In [3]:
# loading dataset
data = (pd.read_csv("labeledTrainData.tsv", quoting=3, header=0, delimiter="\t"))
# limiting data to 10000 examples due to lack of computation power
data = data[0:100]
# data info
print("data shape: ", data.shape)
print("columns: ",data.columns.values)
data.head()

data shape:  (100, 3)
columns:  ['id' 'sentiment' 'review']


Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
# removing html markups 
review = [BeautifulSoup(i, "html5lib").getText() for i in data['review'] ]
# removing unnecessary characters
review = [re.sub("[^a-zA-Z']", " ", i) for i in review]
# converting the text corpus into lower case and splitting
review = [i.lower().split() for i in review]

In [5]:
# function for removing stop words
def remove_stopwords(text):
    text = [word for word in text if not word in stopwords]
    return text

# removing stop words
for i in range(len(review)):
    review[i] = remove_stopwords(review[i])

In [6]:
# creating vocabulary
vocab = list(set([word for line in review for word in line]))
# creating a word to index dictionary; will be useful in one_hot encoding
word_to_id = {word:i for i, word in enumerate(vocab)}
# creating a index to word dictionary; will be useful in one_hot decoding
id_to_word = {i:word for i, word in enumerate(vocab)}

# part 2: preparing data for word2vec encoding

In [7]:
import numpy as np

# function for one hot encoding
def one_hot(text, vocab):
    hot_matrix = np.zeros([len(text), len(vocab)])
    for i in range(len(text)):
        if not text[i] in word_to_id:
            hot_matrix[i, 0] = 0
        else:
            hot_matrix[i, word_to_id[text[i]]] = 1

    return hot_matrix

In [165]:
# this is a very important function

def context_words(text, skip):
    right = []; left = [];  final = []
    for i in range(len(text)):
        # words window at the right of the main word
        right.append([text[i+s] for s in range(1, skip+1) if (i+s)< len(text)])
        # words window at the left of the main word
        left.append(list(reversed([text[i-s] for s in range(1, skip+1) if (i-s) >= 0])))
        # concatenating final list of context words
        final.append(left[i] + right[i])
    # filling empty spaces
    final[0].insert(0, 'unk')
    final[0].insert(1, 'unk')
    final[1].insert(0, 'unk')
    final[-2].insert(0, 'unk')
    final[-1].insert(0, 'unk')
    final[-1].insert(1, 'unk')
    return final

In [157]:
# function to create sets of input and output data

def create_context_batch(text, skip, return_one_hot = False):
    # list of context words
    context_list = context_words(text, skip)
    # lists to hold training set and labels
    x = []; y = []
    # loop over each index in text
    for i in range(len(text)):
        # loop over each index inside context_list[i]
        for j in context_list[i]:
            # training set
            x.append(str(text[i]))
            # labels
            y.append(str(j))
    if return_one_hot == True:
        x = one_hot(x, vocab)
        y = one_hot(y, vocab)
    return x, y

# part 3: Training  the word embedding

In [158]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(output_dim = 400, activation = None, input_dim = len(vocab)))
model.add(Dense(output_dim = len(vocab), activation = 'sigmoid', input_dim = 400 ))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

for text in review[0:70]:
    # create training and labels
    x_train, y_train = create_context_batch(text, 2, return_one_hot = True)
    # training 
    model.fit(x_train, y_train, epochs=5)
    print('\n\ndone\n\n')

  """
  


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


done




# part 4: Creating word embeddings on trained model

In [187]:
# extracting weights and bias for the embedding layer
weights = model.get_weights()[0]
bias = model.get_weights()[1]

# final prediction
def get_embedding(text, w, b):
    text_input = one_hot(text, vocab)
    
    embedding = np.dot(text_input, w) + b
    
    return embedding

In [189]:
from pprint import pprint


print('input text: \n')
# example
print(data['review'][1])
# word embedding for the above example
embed = get_embedding(data['review'][1], weights, bias)

print('\n\nword embedding: \n')
pprint(embed)

input text: 

"\"The Classic War of the Worlds\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \"critics\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \"critics\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \"critics\" perceive to be its shortcomings."


word embedding: 

array([[ 0.190561