# Creating embedding using gensim

In [1]:
!pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.0.1-cp37-cp37m-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 93 kB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.1


In [2]:
!git clone https://github.com/hgrif/wiki-word2vec.git

Cloning into 'wiki-word2vec'...
remote: Enumerating objects: 42, done.[K
remote: Total 42 (delta 0), reused 0 (delta 0), pack-reused 42[K
Unpacking objects: 100% (42/42), done.


In [3]:
#Get swahili data
!mkdir -p data/sw/
!wget -P data/sw/ https://dumps.wikimedia.org/swwiki/latest/swwiki-latest-pages-articles.xml.bz2


--2021-08-24 10:35:07--  https://dumps.wikimedia.org/swwiki/latest/swwiki-latest-pages-articles.xml.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.7, 2620:0:861:1:208:80:154:7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34806466 (33M) [application/octet-stream]
Saving to: ‘data/sw/swwiki-latest-pages-articles.xml.bz2’


2021-08-24 10:35:15 (4.36 MB/s) - ‘data/sw/swwiki-latest-pages-articles.xml.bz2’ saved [34806466/34806466]



In [4]:
import multiprocessing
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec



In [12]:
import multiprocessing
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec

wiki = WikiCorpus('data/sw/swwiki-latest-pages-articles.xml.bz2', 
                   dictionary={})
sentences = list(wiki.get_texts())


In [13]:
params = {'window': 10, 'min_count': 10, 
          'workers': max(1, multiprocessing.cpu_count() - 1), 'sample': 1E-3,}

In [14]:
word2vec = Word2Vec(sentences, **params, epochs =1)
word2vec.save("word2vec.model")

In [15]:
from scipy.spatial.distance import cosine

In [16]:
vector_man = word2vec.wv['mwanaume']  # get numpy vector of a word

vector_boy = word2vec.wv['mvulana']  # get numpy vector of a word
print("Cosine similarity between boy and man in "+ str(1) +" epochs, is: " + str(1 - cosine(vector_man, vector_boy)))

vector_woman = word2vec.wv['mwanamke']  # get numpy vector of a word

vector_queen = word2vec.wv['malkia']  # get numpy vector of a word
print("Cosine similarity between woman and queen in "+ str(1) +" epochs, is: " + str(1 - cosine(vector_woman, vector_queen)))

print("Most 10 similar words to man")
sims = word2vec.wv.most_similar('mwanaume', topn=10)  # get other similar words
print(sims)

Cosine similarity between boy and man in 1 epochs, is: 0.8737907409667969
Cosine similarity between woman and queen in 1 epochs, is: 0.6921840310096741
Most 10 similar words to man
[('aje', 0.8981823921203613), ('kumwambia', 0.8881581425666809), ('mohinder', 0.8873194456100464), ('yustus', 0.8758364915847778), ('mvulana', 0.8737908005714417), ('niko', 0.8714087605476379), ('mwenzie', 0.870768129825592), ('anazidi', 0.870003879070282), ('anampenda', 0.869492769241333), ('alipenda', 0.868550181388855)]


In [17]:
for i in range(4):  
  word2vec.train(sentences, total_examples=word2vec.corpus_count, epochs=1)
  vector_man = word2vec.wv['mwanaume']  # get numpy vector of a word
  vector_boy = word2vec.wv['mvulana']  # get numpy vector of a word

  print("Cosine similarity between boy and man in "+ str(i+2) +" epochs, is: " + str(1 - cosine(vector_man, vector_boy)))

  vector_woman = word2vec.wv['mwanamke']  # get numpy vector of a word

  vector_queen = word2vec.wv['malkia']  # get numpy vector of a word
  print("Cosine similarity between woman and queen in "+ str(i+2) +" epochs, is: " + str(1 - cosine(vector_woman, vector_queen)))

  print("Most 10 similar words to man")
  sims = word2vec.wv.most_similar('mwanaume', topn=10)  # get other similar words
  print(sims)

Cosine similarity between boy and man in 2 epochs, is: 0.7506097555160522
Cosine similarity between woman and queen in 2 epochs, is: 0.4878472089767456
Most 10 similar words to man
[('mahaba', 0.8240833878517151), ('anaona', 0.8000137805938721), ('nae', 0.7809977531433105), ('mwanamume', 0.7683743238449097), ('mwenzi', 0.7664150595664978), ('uchumba', 0.7661716938018799), ('atakuwa', 0.7606471180915833), ('anaonekana', 0.7572895884513855), ('msichana', 0.7555936574935913), ('msela', 0.7535034418106079)]
Cosine similarity between boy and man in 3 epochs, is: 0.6831587553024292
Cosine similarity between woman and queen in 3 epochs, is: 0.4275827407836914
Most 10 similar words to man
[('anaona', 0.7696461081504822), ('mahaba', 0.7653655409812927), ('msela', 0.7226854562759399), ('anaanza', 0.7118600606918335), ('aliye', 0.7118346095085144), ('mwenzi', 0.6991514563560486), ('anaonekana', 0.696577250957489), ('nae', 0.6944088339805603), ('atakuwa', 0.68865567445755), ('msichana', 0.68864744

In [18]:
for i in range(5, 10):  
  word2vec.train(sentences, total_examples=word2vec.corpus_count, epochs=5)
  vector_man = word2vec.wv['mwanaume']  # get numpy vector of a word
  vector_boy = word2vec.wv['mvulana']  # get numpy vector of a word

  print("Cosine similarity between boy and man in "+ str(5 + i*5) +" epochs, is: " + str(1 - cosine(vector_man, vector_boy)))

  vector_woman = word2vec.wv['mwanamke']  # get numpy vector of a word

  vector_queen = word2vec.wv['malkia']  # get numpy vector of a word
  print("Cosine similarity between woman and queen in "+ str(5 + i*5) +" epochs, is: " + str(1 - cosine(vector_woman, vector_queen)))

  print("Most 10 similar words to man")
  sims = word2vec.wv.most_similar('mwanaume', topn=10)  # get other similar words
  print(sims)

Cosine similarity between boy and man in 30 epochs, is: 0.5796116590499878
Cosine similarity between woman and queen in 30 epochs, is: 0.3289449214935303
Most 10 similar words to man
[('mwanamume', 0.619727373123169), ('anaona', 0.6174345016479492), ('mahaba', 0.6098613739013672), ('asiye', 0.6018980741500854), ('anayeitwa', 0.6004301905632019), ('aliye', 0.5986660718917847), ('msichana', 0.5888607501983643), ('yule', 0.582199215888977), ('mvulana', 0.5796117782592773), ('zopa', 0.5671093463897705)]
Cosine similarity between boy and man in 35 epochs, is: 0.5513689517974854
Cosine similarity between woman and queen in 35 epochs, is: 0.31930071115493774
Most 10 similar words to man
[('mwanamume', 0.5920594334602356), ('mahaba', 0.589745044708252), ('anaona', 0.5841481685638428), ('anayeitwa', 0.5664746165275574), ('yule', 0.5593873858451843), ('msichana', 0.5557315945625305), ('mvulana', 0.5513689517974854), ('aliye', 0.5466966032981873), ('yupo', 0.5394044518470764), ('shoga', 0.5323488

In [19]:
word2vec.save("word2vec.model")

In [20]:
model_swahili = Word2Vec.load("word2vec.model")

In [21]:
vector_man = model_swahili.wv['mwanaume']  # get numpy vector of a word
sims = model_swahili.wv.most_similar('mwanaume', topn=10)  # get other similar words

In [None]:
sims

[('mvulana', 0.7277518510818481),
 ('msichana', 0.7257815599441528),
 ('nae', 0.7235980033874512),
 ('bennet', 0.7187550067901611),
 ('nusura', 0.7069060802459717),
 ('dully', 0.6989878416061401),
 ('wakina', 0.6972491145133972),
 ('bi', 0.6946593523025513),
 ('mahaba', 0.6930020451545715),
 ('aje', 0.6900444626808167)]

In [None]:
vector_boy = model_swahili.wv['mvulana']  # get numpy vector of a word

In [None]:
1 - cosine(vector_man, vector_boy)

0.7277519106864929

In [None]:
vector_queen = model_swahili.wv['malkia']  # get numpy vector of a word
sims = model_swahili.wv.most_similar('malkia', topn=10)  # get other similar words

In [None]:
sims

[('mfalme', 0.8444384932518005),
 ('mtawala', 0.7251664996147156),
 ('mkabaila', 0.6911629438400269),
 ('farao', 0.6742110252380371),
 ('mtemi', 0.6704172492027283),
 ('mrithi', 0.6614590883255005),
 ('alirithi', 0.6611064672470093),
 ('kifalme', 0.6572237610816956),
 ('alitawala', 0.6564803719520569),
 ('mke', 0.6315830945968628)]

In [None]:
#mfalme -> king
#mtawala -> ruler
#mkabaila -> landlord
#farao -> pharaoh
#mtemi -> spit
#mrithi -> heir
#alirithi -> he does not inherit
#kifalme -> royal
#alitawala -> he ruled
#mke -> wife

# Creating a Keras model from embeddings

In [22]:
l = len(list(model_swahili.wv.index_to_key))

In [23]:
# TensorFlow imports
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras import models
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D, GlobalAveragePooling1D

In [24]:
from keras.preprocessing.text import Tokenizer
import numpy as np

In [25]:
all_words = list(model_swahili.wv.index_to_key)

In [26]:
vocab = model_swahili.wv.index_to_key    
t = Tokenizer()

vocab_size = len(all_words) + 1
t.fit_on_texts(all_words)

def get_weight_matrix():
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, model_swahili.vector_size))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for i in range(len(all_words)):
        weight_matrix[i + 1] = model_swahili.wv[all_words[i]]
    return weight_matrix

embedding_vectors = get_weight_matrix()
emb_layer = Embedding(vocab_size, output_dim=model_swahili.vector_size, weights=[embedding_vectors], trainable=False)

In [27]:
emb_layer

<keras.layers.embeddings.Embedding at 0x7f113e8b6e90>

In [28]:
# Create model instance
model = models.Sequential()

In [29]:
model.add(emb_layer)

In [30]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         3834400   
Total params: 3,834,400
Trainable params: 0
Non-trainable params: 3,834,400
_________________________________________________________________


In [31]:
t.texts_to_sequences(["mwanaume"])

[[4994]]

Checking similarity between keras model and the real model embeddings

In [32]:
vector_man = model(t.texts_to_sequences(["mwanaume"]))

In [33]:
vector_man

<tf.Tensor: shape=(100,), dtype=float32, numpy=
array([-3.0172608 , -1.5864413 , -1.4447039 , -0.14817056, -0.02159376,
       -2.7592428 ,  0.71549946, -1.5223407 , -1.1996162 ,  2.046477  ,
       -1.1711432 , -1.2182232 , -2.232007  , -0.25378558, -0.8981542 ,
        0.585518  ,  0.22908595, -2.8375628 ,  1.0793303 ,  1.7231343 ,
        0.11506318,  1.6207782 ,  1.112322  ,  0.31028506,  0.98109245,
        0.14098765,  0.9957303 , -0.60776424, -2.9587255 , -0.7362973 ,
        2.164359  ,  0.7275612 , -1.6150517 ,  1.9877877 , -1.5568479 ,
        2.7859802 , -1.9994256 , -0.54866195,  0.01171932, -0.6619937 ,
        2.1711128 ,  1.0920217 , -1.2081722 ,  0.19694555, -0.3982888 ,
        0.42264226, -0.16007124,  3.2177942 , -0.3301792 ,  1.2515919 ,
       -0.251235  , -3.4790795 , -1.4575737 , -0.6303651 ,  2.8331814 ,
        0.237851  ,  0.99203706,  1.102811  , -0.91195667, -2.6602073 ,
       -2.3903718 , -1.655804  , -2.7919054 , -1.315015  , -1.0214618 ,
        0.288926

In [34]:
model_swahili.wv['mwanaume']

array([-3.0172608 , -1.5864413 , -1.4447039 , -0.14817056, -0.02159376,
       -2.7592428 ,  0.71549946, -1.5223407 , -1.1996162 ,  2.046477  ,
       -1.1711432 , -1.2182232 , -2.232007  , -0.25378558, -0.8981542 ,
        0.585518  ,  0.22908595, -2.8375628 ,  1.0793303 ,  1.7231343 ,
        0.11506318,  1.6207782 ,  1.112322  ,  0.31028506,  0.98109245,
        0.14098765,  0.9957303 , -0.60776424, -2.9587255 , -0.7362973 ,
        2.164359  ,  0.7275612 , -1.6150517 ,  1.9877877 , -1.5568479 ,
        2.7859802 , -1.9994256 , -0.54866195,  0.01171932, -0.6619937 ,
        2.1711128 ,  1.0920217 , -1.2081722 ,  0.19694555, -0.3982888 ,
        0.42264226, -0.16007124,  3.2177942 , -0.3301792 ,  1.2515919 ,
       -0.251235  , -3.4790795 , -1.4575737 , -0.6303651 ,  2.8331814 ,
        0.237851  ,  0.99203706,  1.102811  , -0.91195667, -2.6602073 ,
       -2.3903718 , -1.655804  , -2.7919054 , -1.315015  , -1.0214618 ,
        0.28892642,  1.7666311 ,  1.3728402 , -2.0674553 , -1.43

In [None]:
tf.keras.models.save_model(
    model, "./model")





INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


In [None]:
!tar -czvf swahili_word2vec.tar.gz -C model .

./
./keras_metadata.pb
./assets/
./variables/
./variables/variables.index
./variables/variables.data-00000-of-00001
./saved_model.pb


In [None]:
import pickle

# saving
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(t, handle, protocol=pickle.HIGHEST_PROTOCOL)

Usage

In [None]:
# loading
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
!curl https://gsoc-tf.web.app/swahili_word2vec.tar.gz -o swahili.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13.4M  100 13.4M    0     0  5187k      0  0:00:02  0:00:02 --:--:-- 5185k


In [None]:
!tar -xzvf swahili.tar.gz -C model/

./
./keras_metadata.pb
./assets/
./variables/
./variables/variables.index
./variables/variables.data-00000-of-00001
./saved_model.pb


In [None]:
from tensorflow import keras
model = keras.models.load_model('model/')



In [None]:
!curl https://gsoc-tf.web.app/tokenizer.pkl -o tokenizer.pkl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1489k  100 1489k    0     0   848k      0  0:00:01  0:00:01 --:--:--  848k


In [None]:
import pickle
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
example = "mwanaume"

In [None]:
model(tokenizer.texts_to_sequences([example]))

<tf.Tensor: shape=(100,), dtype=float32, numpy=
array([ 0.62907344,  0.2276254 ,  0.22085622, -0.46895516,  0.27026492,
        0.2784383 ,  0.54466283,  0.21442215,  0.12079044,  0.94317925,
       -0.34540728, -0.01303885,  0.20965706,  0.23807919,  0.0609422 ,
        0.03674065,  0.21173401, -0.47123212,  0.4488169 ,  0.10567676,
       -0.65623206,  0.17985752, -0.03540061,  0.3520905 , -0.3233151 ,
       -0.24789533, -0.4004243 , -0.07531579, -0.07195444,  0.410435  ,
        0.3338795 ,  0.25405818, -0.8489223 ,  0.29918787, -1.1747959 ,
        0.47070527, -1.0429802 , -0.87005335,  0.696955  , -1.1065627 ,
        0.33444297,  0.53932905,  0.48503667, -0.3742581 ,  0.9630083 ,
        0.40159884, -0.8021837 , -0.07805784, -0.4203436 , -0.8308751 ,
        0.09017416, -0.45730403, -0.37233385,  0.07526768, -0.2897628 ,
       -0.62796044,  0.9930027 , -0.5539022 ,  0.09428282, -0.31144488,
       -0.49341264, -1.4873661 , -0.36284766, -0.21989343, -0.23004624,
       -0.355934