In [1]:
!pip3 show gensim

Name: gensim
Version: 3.6.0
Summary: Python framework for fast Vector Space Modelling
Home-page: http://radimrehurek.com/gensim
Author: Radim Rehurek
Author-email: me@radimrehurek.com
License: LGPLv2.1
Location: /home/fox/.local/lib/python3.10/site-packages
Requires: numpy, scipy, six, smart-open
Required-by: 


In [21]:
from gensim.models import Word2Vec
from datetime import datetime
import os
import numpy as np
from collections import defaultdict

In [31]:
# Which kind of raw embedding to load; the loading cell below handles
# 'word2vec' (gensim .model file) and 'glove' (plain-text vector file).
model = 'word2vec'
#model = 'glove'
# Vector dimensionality; interpolated into the raw model's file name.
dim = 300
# Language code; interpolated into the raw model's file name
# ('he' is presumably Hebrew - confirm against the training corpus).
language = 'he'

In [32]:
def display_embedding_properties(embedding):
    """Print the number of entries and the vector dimension of an embedding.

    Args:
        embedding: Mapping from word to vector (any object supporting ``len``).

    The dimension is read from the first stored vector without copying the
    key set. An empty embedding reports a dimension of 0 instead of raising
    ``IndexError``.
    """
    print(f'Embedding entries: {len(embedding)}')
    # Peek at the first vector lazily; materialising all keys is wasteful for
    # vocabularies with hundreds of thousands of entries.
    first_vector = next(iter(embedding.values()), None)
    dimension = 0 if first_vector is None else len(first_vector)
    print(f'Embedding dimension: {dimension}')

In [33]:
def build_dict_from_keyedvectors(m):
    """Copy every vocabulary word of a loaded Word2Vec model into a mapping.

    Meant for Word2Vec .model files (the accompanying .npy files must be
    available as well). Relies on the ``wv.vocab`` attribute as exposed by
    gensim 3.6.

    Args:
        m: Loaded model exposing ``m.wv.vocab`` and ``m.wv[word]``.

    Returns:
        A ``defaultdict`` (without default factory) mapping word -> vector.
    """
    keyed_vectors = m.wv
    embedding = defaultdict()
    # Iterate the vocabulary directly and look each word's vector up by key.
    embedding.update((token, keyed_vectors[token]) for token in keyed_vectors.vocab)
    display_embedding_properties(embedding)
    return embedding

In [34]:
def build_dict_from_vector_file(path_to_vecs, filename):
    """Load a GloVe-style plain-text vector file into a ``{word: vector}`` dict.

    Every non-empty line is parsed as ``word v1 v2 ... vN`` with single-space
    separators. Might no longer be needed for newer versions of gensim, as the
    gensim load_word2vec_format function can handle GloVe outputs as well.

    Args:
        path_to_vecs: Directory holding the vector file (a trailing path
            separator is optional).
        filename: Name of the vector file inside ``path_to_vecs``.

    Returns:
        dict mapping each word to a 1-D numpy float array.

    Raises:
        FileNotFoundError: If the vector file does not exist.
    """
    vector_path = os.path.join(path_to_vecs, filename)
    if not os.path.isfile(vector_path):
        # Fail right away with an actionable message instead of printing a
        # hint and then crashing inside open().
        raise FileNotFoundError(
            f'File not found: {vector_path}. Generate it using GloVe.')

    embedding = {}
    # Read as UTF-8 explicitly so non-ASCII vocabularies do not depend on the
    # platform's default encoding.
    with open(vector_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    n_entries = len(lines)
    for i, line in enumerate(lines):
        # Blank lines (e.g. the one produced by a trailing newline) carry no
        # entry and must not create an empty-string key.
        if line:
            fields = line.split(' ')
            w = fields[0]
            try:
                embedding[w] = np.array([float(x) for x in fields[1:]])
            except ValueError:
                # Report the malformed line and skip it, rather than storing
                # a stale vector left over from the previous line.
                print(f'Line: {i}')
                print(f'Word: {w}')
                print(f'Vector: {fields[1:]}')
        if (i % 20000 == 0):
            print(f'Processed {i} / {n_entries} entries')
    display_embedding_properties(embedding)
    return embedding

In [35]:
# Load the raw model selected in the configuration cell and convert it into a
# plain word -> vector mapping named `embedding`.
# Note: in the word2vec branch the "Loading ended" timestamp is printed before
# the dict is built; in the glove branch it is printed after.
if model == 'word2vec':
    inp = f'word2vec_{dim}_{language}.model'
    print(f'Loading started {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    m = Word2Vec.load(f'./models/raws/{inp}')
    print(f'Loading ended {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    embedding = build_dict_from_keyedvectors(m)
elif model == 'glove':
    inp = f'glove_{dim}_{language}_vectors.txt'
    print(f'Loading started {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    embedding = build_dict_from_vector_file('./models/raws/', inp)
    print(f'Loading ended {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
else:
    # Without this guard an unknown value would leave `embedding` undefined,
    # or silently reuse a stale one from an earlier run of this cell.
    raise ValueError(
        f"Unsupported model type {model!r}; expected 'word2vec' or 'glove'.")

Loading started 2022-10-01 17:16:41
Loading ended 2022-10-01 17:16:49
Embedding entries: 595528
Embedding dimension: 300


['קובץ', 'mathematics', 'lecture', 'at', 'the', 'helsinki', 'university', 'of', 'technology', 'jpg']


array([-0.17736691,  0.33108684, -0.25844473,  0.2639548 ,  0.32674396,
        0.10968599, -0.30168736, -0.13412088, -0.02850869,  0.05250233,
        0.20113151, -0.04400417,  0.38172805,  0.20361969, -0.11396456,
        0.1479513 , -0.01490134,  0.05840242, -0.27284765, -0.10708918,
        0.1672865 , -0.12336667, -0.04416154, -0.04901268, -0.299237  ],
      dtype=float32)