In [11]:
#import all necessary libraries
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.functional import F
from torch.optim import Adam
from torchsummary import summary

import os
import numpy as np
import re
from collections import defaultdict
from scipy.spatial.distance import cosine

In [12]:
# get all the words that appear in the wordsim file
wordsim = set()
with open("./combined.csv", "rt") as inf:
    for line in inf.readlines():
        ws = line.split(',')
        wordsim.add(ws[0])
        wordsim.add(ws[1])

In [3]:
len(wordsim)

439

In [13]:
# merge the contents of all the books
al = ""
for bookname in os.listdir('./books/'):
    with open("./books/" + bookname) as book:
        text = book.readlines()
        text = " ".join(text)
    al += text
    
len(al)

10411221

In [14]:
# remove all unnecessary words and whitespaces
al = re.sub('[0-9]+', '', al)
al = re.sub('[\s]{2,}', ' ', al)

In [15]:
# create bag of words from the books
cv = CountVectorizer(stop_words=stopwords.words('english'))
result = cv.fit_transform([al])

In [16]:
# create a dictionary with the word counts
cnt = defaultdict(int)
lst = cv.get_feature_names()
stpw = set(stopwords.words('english'))

for i, w in enumerate(lst):
    word = w.replace('_', '')
    if word not in stpw and len(word) > 2:
        cnt[word] += result.data[i]

In [17]:
# get the top 10000 most frequent words 
sorted_counts = dict(sorted(cnt.items(), key=lambda item: item[1], reverse=True))
chosen_words = set(list(sorted_counts.keys())[:10000])

# add the remaining words that appear in both
# the books and wordsim
for w in sorted_counts.keys():
    if w in wordsim:
        chosen_words.add(w)

In [10]:
len(chosen_words)

10268

In [18]:
# parse each book
WINDOW_SIZE = 2
illegal_chars = ".,!?/<>{}[]()\\|-+_=#@&*\"\'"
pairs = set()

for bookname in os.listdir('./books/'):
    print(bookname)
    with open("./books/" + bookname) as book:
        text = book.readlines()
        text = " ".join(text)
        text = text.lower()
    # tokenize text by sentences
    for sentence in sent_tokenize(text):
        # tokenize sentence by words
        words = word_tokenize(sentence)
        
        # iterate through words
        for i, word in enumerate(words):
            # exclude word if it is an illegal character
            if word in illegal_chars:
                continue
            
            # define lower and upper bounds of a words environment
            lower = max(i - WINDOW_SIZE, 0)
            upper = min(i + WINDOW_SIZE, len(words))
            
            # create wordpairs 
            for j in range(lower, upper):
                if words[j] == word or words[j] in illegal_chars:
                    continue
                pairs.add((word, words[j]))

pg20897.txt
28338-0.txt
pg14558.txt
pg22277.txt
6138-0.txt
pg40498.txt
pg9799.txt
pg40643.txt
pg20816.txt
pg34.txt
pg6679.txt
pg1612.txt
pg33962.txt
pg22002.txt
pg42069.txt
47367-0.txt
5352-0.txt
7843-0.txt
47436-0.txt
pg48344.txt


In [11]:
len(pairs)

1376492

In [19]:
# keep only those pairs where both words appear in wordsim
train_data = list(filter(lambda x: x[0] in chosen_words and x[1] in chosen_words, pairs))

In [20]:
len(train_data)

82312

In [21]:
# define dataset generator class
class EmbeddingSet(Dataset):
    def __init__(self, pairs, word_list):
        self.pairs = pairs
        self.word_list = word_list
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, idx):
        w1, w2 = self.pairs[idx]
        
        idx1 = self.word_list.index(w1)
        idx2 = self.word_list.index(w2)
        
        ohe_x = np.zeros((len(self.word_list),))
        ohe_y = np.zeros((len(self.word_list),))
        ohe_x[idx1] = 1
        ohe_x = ohe_x.astype('float32')
        
        return torch.tensor(ohe_x), torch.tensor(idx2)

In [22]:
# create dataset and dataloader
dataset = EmbeddingSet(train_data, list(chosen_words))
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)

In [23]:
# create NN model class
class EmbeddingModel(nn.Module):
    def __init__(self, in_features):
        super(EmbeddingModel, self).__init__()
        
        self.linear1 = nn.Linear(in_features, 100)
        self.linear2 = nn.Linear(100, in_features)
        
    def forward(self, x):
        out = F.relu(self.linear1(x))
        out = self.linear2(out)
        return out

In [24]:
# print summary of model
model = EmbeddingModel(len(chosen_words))
summary(model.cuda(), (len(chosen_words),))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 100]       1,026,900
            Linear-2                [-1, 10268]       1,037,068
Total params: 2,063,968
Trainable params: 2,063,968
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.04
Forward/backward pass size (MB): 0.08
Params size (MB): 7.87
Estimated Total Size (MB): 7.99
----------------------------------------------------------------


In [25]:
# define necessary parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cpu = torch.device("cpu")

optimizer = Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

n_epochs = 20
log_interval = 50
model.train()

EmbeddingModel(
  (linear1): Linear(in_features=10268, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=10268, bias=True)
)

In [26]:
# train the model
losses = np.zeros(len(dataloader) * n_epochs)

for epoch in range(n_epochs):
    for batch_idx, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        losses[len(dataloader) * epoch + batch_idx] = loss.item()
        
        if not batch_idx % log_interval:
            print("Epoch {}. Batches: {}/{}. Loss: {}.".format(
                epoch, batch_idx, len(dataloader), loss.item()))

Epoch 0. Batches: 0/824. Loss: 9.243396759033203.
Epoch 0. Batches: 50/824. Loss: 9.083513259887695.
Epoch 0. Batches: 100/824. Loss: 8.280298233032227.
Epoch 0. Batches: 150/824. Loss: 7.586933612823486.
Epoch 0. Batches: 200/824. Loss: 7.8843512535095215.
Epoch 0. Batches: 250/824. Loss: 7.7210693359375.
Epoch 0. Batches: 300/824. Loss: 7.8387370109558105.
Epoch 0. Batches: 350/824. Loss: 7.501501083374023.
Epoch 0. Batches: 400/824. Loss: 7.869247913360596.
Epoch 0. Batches: 450/824. Loss: 7.652331352233887.
Epoch 0. Batches: 500/824. Loss: 7.494274139404297.
Epoch 0. Batches: 550/824. Loss: 7.701819896697998.
Epoch 0. Batches: 600/824. Loss: 7.466142177581787.
Epoch 0. Batches: 650/824. Loss: 7.3699798583984375.
Epoch 0. Batches: 700/824. Loss: 7.584627628326416.
Epoch 0. Batches: 750/824. Loss: 7.120809555053711.
Epoch 0. Batches: 800/824. Loss: 7.679732799530029.
Epoch 1. Batches: 0/824. Loss: 7.404466152191162.
Epoch 1. Batches: 50/824. Loss: 7.644104480743408.
Epoch 1. Batches:

Epoch 9. Batches: 300/824. Loss: 6.543927192687988.
Epoch 9. Batches: 350/824. Loss: 6.5060224533081055.
Epoch 9. Batches: 400/824. Loss: 6.29942512512207.
Epoch 9. Batches: 450/824. Loss: 6.613790512084961.
Epoch 9. Batches: 500/824. Loss: 6.112674713134766.
Epoch 9. Batches: 550/824. Loss: 6.004948616027832.
Epoch 9. Batches: 600/824. Loss: 6.2785444259643555.
Epoch 9. Batches: 650/824. Loss: 6.289528846740723.
Epoch 9. Batches: 700/824. Loss: 6.222177028656006.
Epoch 9. Batches: 750/824. Loss: 6.145449161529541.
Epoch 9. Batches: 800/824. Loss: 6.04640007019043.
Epoch 10. Batches: 0/824. Loss: 6.080854415893555.
Epoch 10. Batches: 50/824. Loss: 6.383656024932861.
Epoch 10. Batches: 100/824. Loss: 6.101265907287598.
Epoch 10. Batches: 150/824. Loss: 6.27835750579834.
Epoch 10. Batches: 200/824. Loss: 6.136828422546387.
Epoch 10. Batches: 250/824. Loss: 6.277472496032715.
Epoch 10. Batches: 300/824. Loss: 6.163389205932617.
Epoch 10. Batches: 350/824. Loss: 6.014395236968994.
Epoch 10

Epoch 18. Batches: 450/824. Loss: 5.038995742797852.
Epoch 18. Batches: 500/824. Loss: 4.880331516265869.
Epoch 18. Batches: 550/824. Loss: 5.305757522583008.
Epoch 18. Batches: 600/824. Loss: 4.792581081390381.
Epoch 18. Batches: 650/824. Loss: 5.246639251708984.
Epoch 18. Batches: 700/824. Loss: 4.685539722442627.
Epoch 18. Batches: 750/824. Loss: 4.871416091918945.
Epoch 18. Batches: 800/824. Loss: 5.005054950714111.
Epoch 19. Batches: 0/824. Loss: 4.504660129547119.
Epoch 19. Batches: 50/824. Loss: 4.640068054199219.
Epoch 19. Batches: 100/824. Loss: 5.020140647888184.
Epoch 19. Batches: 150/824. Loss: 5.262938976287842.
Epoch 19. Batches: 200/824. Loss: 4.810288429260254.
Epoch 19. Batches: 250/824. Loss: 5.2104716300964355.
Epoch 19. Batches: 300/824. Loss: 4.875925540924072.
Epoch 19. Batches: 350/824. Loss: 5.323032855987549.
Epoch 19. Batches: 400/824. Loss: 5.025117874145508.
Epoch 19. Batches: 450/824. Loss: 4.952408313751221.
Epoch 19. Batches: 500/824. Loss: 4.819994926452

In [40]:
# save models parameters
torch.save(model.state_dict(), "./state_dict.pt")

In [38]:
# get weights of the first FC layer
embeddings = F.relu(model.linear1.weight.data).T
embeddings = embeddings.cpu().numpy()
embeddings.shape

(10268, 100)

In [28]:
# create bag-of-docs model
texts = []
for bookname in os.listdir("./books/"):
    with open("./books/" + bookname) as inf:
        text = " ".join(inf.readlines())
    texts.append(text)
    
cv = CountVectorizer(stop_words=stopwords.words('english'))
result = cv.fit_transform(texts)

In [29]:
result.toarray().T

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0, 37, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  2,  0],
       [ 0,  1,  0, ...,  0,  0,  0],
       [ 0,  1,  0, ...,  0,  0,  0]])

In [30]:
# save words from the wordsim together with their similarity values
a_words = list(chosen_words)
valid_pairs = []

with open("combined.csv") as inf:
    for line in inf.readlines():
        ws = line.split(",")
        if ws[0] in chosen_words and ws[1] in chosen_words:
            valid_pairs.append((ws[0], ws[1], float(ws[2])))

In [34]:
# calculate cosine similarity of two words 
def similarity(embeddings, w_list, w1, w2):
    idx1 = w_list.index(w1)
    idx2 = w_list.index(w2)
    
    v1 = embeddings[idx1, :]
    v2 = embeddings[idx2, :]
    
    dot_product = np.dot(v1, v2)
    norm_a = np.linalg.norm(v1)
    norm_b = np.linalg.norm(v2)
    
    return dot_product / (norm_a * norm_b)

In [35]:
# calculate mse
def calc_mse(embeddings, w_list, pairs):
    mse = .0
    for pair in pairs:
        w1 = pair[0]
        w2 = pair[1]
        sim = pair[2]
        sim_cal = similarity(embeddings, w_list, w1, w2)
    
        mse += (sim / 10. - sim_cal) ** 2
    
    mse /= len(valid_pairs)
    return mse

In [39]:
print("W2V embedding mse: {}".format(calc_mse(embeddings, dataset.word_list, valid_pairs)))
print("BOW embedding mse: {}".format(calc_mse(result.toarray().T, cv.get_feature_names(), valid_pairs)))

W2V embedding mse: 0.09040535084477971
BOW embedding mse: 0.12781699469779034
