## Generating pre-trained word embeddings for WOI with word2vec

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import numpy as np

### Read the word2vec text format and build a word → vector dictionary


In [2]:
#testing other version of code
import mmap
import numpy as np

def read_word_vectors(filename):
    """Lazily yield ``(word, vector)`` pairs from a text vectors file.

    Every non-blank line is expected to hold a word followed by its vector
    components, all separated by whitespace. Blank lines are skipped and an
    empty file simply yields nothing.

    NOTE: no header handling is done. A leading ``<vocab_size> <dim>`` line
    would be yielded like any other entry, as a "word" with a one-element
    vector.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Yields:
        Tuples ``(word, vector)`` where ``word`` is the first whitespace
        delimited token of the line and ``vector`` is a 1-D float NumPy
        array parsed from the remainder of the line.

    Raises:
        ValueError: If a non-blank line holds a word but no vector components.
    """
    # Plain line iteration replaces the earlier mmap-based reader: mmap cannot
    # map an empty file, and reading raw bytes through fileno() bypassed the
    # encoding declared in open(). newline="\n" keeps lines split on "\n"
    # only, exactly as the byte-level readline did.
    with open(filename, 'r', encoding='utf-8', newline='\n') as f:
        for line_no, line in enumerate(f, start=1):
            # Split the line into the word and the (still textual) vector
            fields = line.split(maxsplit=1)
            if not fields:
                # Blank or whitespace-only line: nothing to parse.
                continue
            if len(fields) < 2:
                raise ValueError(
                    f"{filename}:{line_no}: word {fields[0]!r} has no vector components"
                )
            word, vec = fields
            # Convert the word vector to a NumPy array and yield it
            yield word, np.fromstring(vec, sep=' ')

# Drain the streamed (word, vector) pairs into a lookup table; if a word
# occurs more than once, the last occurrence wins.
emb_map = dict(read_word_vectors("vectors_word2vec_de.txt"))

### Produce test vector

In [3]:
# Look up the embedding of the probe word. The b'...' wrapper is part of the
# key string itself (presumably the vectors file was written from Python bytes
# reprs -- TODO confirm against the file that produced it).
test_vector = np.array(emb_map["b'umwelt'"]) # other probe keys: "b'demokratie'", "b'populismus'"

### Generate the similarity matrix by calculating the cosine similarity between the test vector and all other word vectors

In [4]:
def generate_sim_matrix(test_vector):
    """Rank every word in ``emb_map`` by cosine similarity to ``test_vector``.

    Args:
        test_vector: NumPy array holding the probe embedding; it is reshaped
            to a single-row matrix for each comparison.

    Returns:
        A list of ``(word, similarity)`` tuples ordered from most to least
        similar. ``similarity`` is whatever ``cosine_similarity`` returns for
        the two single-row inputs (a 1x1 array in the notebook output), not a
        plain float.
    """
    scored = [
        (
            word,
            cosine_similarity(
                test_vector.reshape(1, -1),
                np.array(vector).reshape(1, -1),
            ),
        )
        for word, vector in emb_map.items()
    ]
    # Highest similarity first; ties keep their emb_map insertion order.
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

In [5]:
# Score every word in emb_map against the probe vector; the returned list is
# already sorted with the most similar words first.

sim_matrix = generate_sim_matrix(test_vector)

In [6]:
# Show the first 35 entries, i.e. the 35 most similar words (the list is sorted in descending order).
sim_matrix[:35]

[("b'umwelt'", array([[1.]])),
 ("b'umweltschutz'", array([[0.74999176]])),
 ("b'klimaschutz'", array([[0.70032432]])),
 ("b'naturschutz'", array([[0.67125974]])),
 ("b'gesundheit'", array([[0.67042043]])),
 ("b'nachhaltigkeit'", array([[0.66631933]])),
 ("b'klimawandel'", array([[0.65843577]])),
 ("b'nachhaltige'", array([[0.65502093]])),
 ("b'natur'", array([[0.63835597]])),
 ("b'raumordnung'", array([[0.63803572]])),
 ("b'umweltpolitik'", array([[0.63557845]])),
 ("b'arbeitsschutz'", array([[0.61883308]])),
 ("b'energiewende'", array([[0.61787225]])),
 ("b'energiepolitik'", array([[0.61762217]])),
 ("b'\\xc3\\xb6kologische'", array([[0.61610089]])),
 ("b'soziales'", array([[0.60977773]])),
 ("b'tierschutz'", array([[0.60484044]])),
 ("b'bmu'", array([[0.60274491]])),
 ("b'mobilit\\xc3\\xa4t'", array([[0.6019124]])),
 ("b'gew\\xc3\\xa4sserschutz'", array([[0.60137755]])),
 ("b'raumplanung'", array([[0.60009331]])),
 ("b'umweltbundesamt'", array([[0.59946164]])),
 ("b'nachhaltiges'", 

In [8]:
# Keep the 35 leading entries of the similarity ranking as the words of
# interest (WOI) for the probe word.
woi_umwelt = sim_matrix[:35]

# Persist the Python repr of the list. The explicit encoding makes the file
# independent of the platform's default text encoding.
with open("woi_35_umwelt.txt", "w", encoding="utf-8") as output:
    output.write(str(woi_umwelt))