In [2]:
import numpy as np
import gensim
import string
import re

## Setup

The first thing to do is to obtain a selection of pre-trained word embeddings that we would like to compare.

The candidates are:

* Plain `word2vec` from Google trained on the Google News corpus. Vector size 300. Obtained from [the official page](https://code.google.com/archive/p/word2vec/). [Link for the file.](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/)

* `GloVe` in two variants: one trained on Common Crawl (840B tokens) and one trained on Wikipedia + Gigaword (6B tokens). Vector size 300. Obtained from [the official page](https://nlp.stanford.edu/projects/glove/).

* ELMo Medium version with output size 256. Obtained from [AllenNLP](https://allennlp.org/elmo)

* BERT-Base, uncased version, obtained from [Transformers library](https://huggingface.co/transformers/index.html).

Installation (everything except `word2vec`):

In [None]:
# GloVe archives: 6B (Wikipedia + Gigaword) and 840B (Common Crawl).
!wget http://nlp.stanford.edu/data/glove.6B.zip
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
# ELMo weights and options for the 2x2048_256_2048cnn_1xhighway configuration.
!wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5
!wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json
# Cased BERT-Base checkpoint archive.
# NOTE(review): the BERT section of this notebook loads 'bert-base-uncased' through
# transformers, so this cased archive appears to be unused -- confirm before keeping it.
!wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip

In [None]:
# Unpack GloVe 6B, then delete the archive and the 50/100/200-dimensional files
# (only glove.6B.300d.txt is used later in the notebook).
!unzip glove.6B.zip
!rm glove.6B.zip glove.6B.50d.txt glove.6B.100d.txt glove.6B.200d.txt
# Unpack the 840B GloVe vectors and delete the archive.
!unzip glove.840B.300d.zip
!rm glove.840B.300d.zip
# Unpack the BERT checkpoint archive and delete it.
!unzip cased_L-12_H-768_A-12.zip
!rm cased_L-12_H-768_A-12.zip
# The word2vec archive is not fetched by this notebook: it must already have been
# downloaded manually (see the link in the Setup section) into the working directory.
!gunzip GoogleNews-vectors-negative300.bin.gz

## Preparation

Now let us create a sample text file that we will later convert into a point cloud. We will use one text from the SQuAD dataset.

In [None]:
def tokenize_from_file(filename, keep_punct = False):
    """Read a text file and split it into lowercase tokens.

    Every character of ``string.punctuation`` acts as a token boundary.
    A token that is not purely alphabetic is reduced to its alphabetic
    characters (``"19th"`` becomes ``"th"``) and dropped if nothing remains.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    keep_punct : bool, optional
        If true, every ``string.punctuation`` character is emitted as a
        separate token. If false (default), punctuation is discarded.

    Returns
    -------
    list of str
        Tokens in the order they occur in the file.
    """
    with open(filename, "r") as file:
        text = file.read()

    # Isolate punctuation with spaces (to keep it as its own token) or
    # replace it with a single space (to drop it).
    for punct in string.punctuation:
        text = text.replace(punct, (' ' + punct + ' ') if keep_punct else ' ')

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)

    punctuation = set(string.punctuation)
    result = []

    for token in text.lower().split(' '):
        if token.isalpha() or (keep_punct and token in punctuation):
            # Without the punctuation check the filter below would silently
            # discard the punctuation that keep_punct asked to keep.
            result.append(token)
        else:
            letters = ''.join(ch for ch in token if ch.isalpha())
            if letters:
                result.append(letters)

    return result

def get_vectors(wv, words):
    """Collect the embedding of every word that ``wv`` knows.

    Parameters
    ----------
    wv : mapping-like
        Object supporting ``wv[word]`` and raising ``KeyError`` for
        unknown words.
    words : iterable of str
        Words to look up.

    Returns
    -------
    numpy.ndarray
        One row per word found in ``wv``, in input order. A word that
        occurs several times contributes several rows.

    Raises
    ------
    ValueError
        If none of the words is present in ``wv``.
    """
    found = []
    for w in words:
        try:
            found.append(wv[w])
        except KeyError:
            # Out-of-vocabulary words are skipped on purpose.
            continue
    if not found:
        # np.stack([]) would raise a ValueError that does not name the cause.
        raise ValueError(
            "get_vectors: none of the given words were found in the embedding vocabulary"
        )
    return np.stack(found)

In [None]:
text = """Imperialism is a type of advocacy of empire. Its name originated from the Latin word "imperium", which means to rule over large territories. Imperialism is "a policy of extending a country's power and influence through colonization, use of military force, or other means". Imperialism has greatly shaped the contemporary world. It has also allowed for the rapid spread of technologies and ideas. The term imperialism has been applied to Western (and Japanese) political and economic dominance especially in Asia and Africa in the 19th and 20th centuries. Its precise meaning continues to be debated by scholars. Some writers, such as Edward Said, use the term more broadly to describe any system of domination and subordination organised with an imperial center and a periphery. Imperialism is defined as "A policy of extending a country’s power and influence through diplomacy or military force." Imperialism is particularly focused on the control that one group, often a state power, has on another group of people. This is often through various forms of "othering" (see other) based on racial, religious, or cultural stereotypes. There are "formal" or "informal" imperialisms. "Formal imperialism" is defined as "physical control or full-fledged colonial rule". "Informal imperialism" is less direct; however, it is still a powerful form of dominance."""

# Save the passage to test_text.txt (the BERT section reads this file again)
# and tokenize it with tokenize_from_file.
with open("test_text.txt", "w") as sample_file:
    sample_file.write(text)

text_words = tokenize_from_file("test_text.txt")

## Word vectors for a text

Then we need to create embeddings for each of the word vector types.

### `word2vec`

In [None]:
# Load the Google News vectors from the binary word2vec file.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Embed the tokens that word2vec knows (get_vectors skips unknown words) and
# store the matrix; it is read back later as "test_word2vec.npy".
word2vec_output = get_vectors(word2vec_model, text_words)
np.save("test_word2vec", word2vec_output)

### `GloVe`

In [None]:
# Convert both GloVe text files into word2vec-format text files that
# load_word2vec_format reads in the next cell, then delete the GloVe originals.
# NOTE(review): because the inputs are removed, this cell cannot be re-run
# without unpacking the GloVe archives again.
# NOTE(review): newer gensim releases reportedly deprecate this script in favour of
# KeyedVectors.load_word2vec_format(..., binary=False, no_header=True) -- confirm
# against the installed gensim version.
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="glove.6B.300d.txt", word2vec_output_file="glove.6B.300d.gensim.txt")
glove2word2vec(glove_input_file="glove.840B.300d.txt", word2vec_output_file="glove.840B.300d.gensim.txt")
!rm glove.6B.300d.txt glove.840B.300d.txt

In [None]:
# Load the converted GloVe vectors (text format, hence binary=False):
# the 840B file is the Common Crawl model, the 6B file the Wikipedia + Gigaword model.
glove_cc_model = gensim.models.KeyedVectors.load_word2vec_format('glove.840B.300d.gensim.txt', binary=False)
glove_wiki_model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.300d.gensim.txt', binary=False)

In [None]:
# Embed the sample tokens with each GloVe model (get_vectors skips unknown words,
# so the two matrices may have different numbers of rows) and store the results;
# they are read back later as "test_glove_cc.npy" and "test_glove_wiki.npy".
glove_cc_output = get_vectors(glove_cc_model, text_words)
glove_wiki_output = get_vectors(glove_wiki_model, text_words)
np.save("test_glove_cc", glove_cc_output)
np.save("test_glove_wiki", glove_wiki_output)

### `ELMo` 

In [None]:
# NOTE(review): allennlp.commands.elmo looks like the import path of older AllenNLP
# releases -- confirm that the installed version still provides it.
from allennlp.commands.elmo import ElmoEmbedder

# Build the embedder from the options/weights files downloaded in the Setup section.
elmo_embedder = ElmoEmbedder(options_file = "elmo_2x2048_256_2048cnn_1xhighway_options.json", \
                        weight_file = "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")

# The whole token list of the passage is embedded as one single sentence.
elmo_embeddings_raw = elmo_embedder.embed_sentence(text_words)

# Keep only the entry at index 2 of the result.
# NOTE(review): presumably the first axis enumerates the ELMo layers and index 2 is
# the top layer -- confirm against the AllenNLP documentation.
elmo_embeddings = elmo_embeddings_raw[2]

# Stored for later use; read back further down as "test_elmo.npy".
np.save("test_elmo",elmo_embeddings)

### `BERT`

In [12]:
import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input
# The raw passage is read back from the file written in the Preparation section;
# note that this rebinds the notebook-level name `text`.
with open("test_text.txt", "r") as f:
    text = f.read()
# NOTE(review): tokenizer.tokenize() on its own does not appear to add the
# [CLS]/[SEP] markers -- confirm whether they should be added here (and their
# rows dropped from the output afterwards).
tokenized_text = tokenizer.tokenize(text)

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Convert inputs to PyTorch tensors
# Wrapping the id list in another list adds a leading batch axis of size 1.
tokens_tensor = torch.tensor([indexed_tokens])

In [35]:
model = BertModel.from_pretrained('bert-base-uncased')

# Switch to evaluation mode before running inference.
model.eval()

# Gradients are not needed for a pure forward pass.
with torch.no_grad():
    outputs = model(tokens_tensor)
    
    # First element of the model output, then the only item of the batch.
    # NOTE(review): presumably outputs[0] is the last hidden state -- confirm
    # against the transformers documentation for the installed version.
    bert_encoded_layers = outputs[0][0]
    
# One row per BERT token of the passage.
bert_encoded_layers.shape

torch.Size([268, 768])

In [36]:
# Convert the tensor to a NumPy array and store it; read back later as "test_bert.npy".
np.save("test_bert", bert_encoded_layers.numpy())

In [3]:
# Reload from disk the embedding matrices saved in the sections above.
word2vec_embedding = np.load("test_word2vec.npy")
glove_cc_embedding = np.load("test_glove_cc.npy")
glove_wiki_embedding = np.load("test_glove_wiki.npy")
elmo_embedding = np.load("test_elmo.npy")
bert_embedding = np.load("test_bert.npy")

In [3]:
from sklearn.metrics.pairwise import euclidean_distances

# Pairwise Euclidean distances between all rows of the word2vec matrix.
dist_matrix = euclidean_distances(word2vec_embedding)

# Smallest, average and largest pairwise distance. The matrix includes each
# point's distance to itself, so the minimum is expected to be 0.
np.min(dist_matrix), np.mean(dist_matrix), np.max(dist_matrix)

(0.0, 3.1484153, 5.792499)

In [5]:
def vectors_to_perseus(vectors, filename, initial_radii = None):
    """Write a point cloud to a text file to be used as Perseus input.

    Before writing, all coordinates are divided by the mean of the full
    pairwise Euclidean distance matrix (its zero diagonal included).

    File layout:
      * line 1: number of coordinates per point;
      * line 2: the fixed string ``1 0.01 100``
        (NOTE(review): presumably scale factor, step size and number of
        steps of the Perseus point-cloud format -- confirm against the
        Perseus documentation);
      * then one line per point: its coordinates followed by its initial radius.

    Parameters
    ----------
    vectors : numpy.ndarray
        2-D array with one point per row.
    filename : str
        Path of the file to write.
    initial_radii : sequence of float, optional
        One initial radius per point (row of ``vectors``). ``None`` or an
        empty sequence means 0.1 for every point.
    """
    n_points, dim = vectors.shape
    if initial_radii is None or len(initial_radii) == 0:
        # One radius per point, not per coordinate: the list is indexed by
        # row below, so sizing it by the dimension raises IndexError as soon
        # as there are more points than coordinates.
        initial_radii = [0.1] * n_points
    dist_matrix = euclidean_distances(vectors)
    vectors = vectors / np.mean(dist_matrix)
    with open(filename, "w") as out:
        out.write(str(dim)+"\n")
        out.write("1 0.01 100\n")
        for i, vector in enumerate(vectors):
            out.write(" ".join(map(str,vector))+" " +str(initial_radii[i]))
            out.write("\n")

In [6]:
# Export every embedding matrix as "test_<name>_perseus".
perseus_inputs = {
    "word2vec": word2vec_embedding,
    "glove_cc": glove_cc_embedding,
    "glove_wiki": glove_wiki_embedding,
    "elmo": elmo_embedding,
    "bert": bert_embedding,
}
for name, embedding in perseus_inputs.items():
    vectors_to_perseus(embedding, filename="test_" + name + "_perseus")

In [18]:
# (number of embedded tokens, vector size) of the word2vec matrix.
word2vec_embedding.shape

(179, 300)

In [4]:
def vectors_to_plex(vectors, filename):
    """Write a point cloud as whitespace-separated text.

    The first line lists the column indices ``0 .. dim-1``; every following
    line holds the coordinates of one point. Coordinates are written as they
    are, without any rescaling.

    Parameters
    ----------
    vectors : numpy.ndarray
        2-D array with one point per row.
    filename : str
        Path of the file to write.
    """
    with open(filename, "w") as out:
        header = " ".join(str(col) for col in range(vectors.shape[1]))
        out.write(header + "\n")
        for row in vectors:
            out.write(" ".join(str(value) for value in row) + "\n")

In [7]:
# Export every embedding matrix as "test_<name>_plex".
plex_inputs = {
    "word2vec": word2vec_embedding,
    "glove_cc": glove_cc_embedding,
    "glove_wiki": glove_wiki_embedding,
    "elmo": elmo_embedding,
    "bert": bert_embedding,
}
for name, embedding in plex_inputs.items():
    vectors_to_plex(embedding, filename="test_" + name + "_plex")