# Creating embedding using gensim

In [None]:
!pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.0.1-cp37-cp37m-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 1.3 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.1


In [None]:
# Clone the wiki-word2vec repository into ./wiki-word2vec.
# NOTE(review): nothing from this checkout is referenced by the cells visible
# in this notebook — confirm the clone is still needed.
!git clone https://github.com/hgrif/wiki-word2vec.git

Cloning into 'wiki-word2vec'...
remote: Enumerating objects: 42, done.[K
remote: Total 42 (delta 0), reused 0 (delta 0), pack-reused 42[K
Unpacking objects: 100% (42/42), done.


In [None]:
# Download the latest Swahili Wikipedia articles dump (bz2-compressed XML)
# into data/sw/, creating the directory first if it does not exist.
!mkdir -p data/sw/
!wget -P data/sw/ https://dumps.wikimedia.org/swwiki/latest/swwiki-latest-pages-articles.xml.bz2


--2021-08-13 22:46:48--  https://dumps.wikimedia.org/swwiki/latest/swwiki-latest-pages-articles.xml.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.7, 2620:0:861:1:208:80:154:7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34487959 (33M) [application/octet-stream]
Saving to: ‘data/sw/swwiki-latest-pages-articles.xml.bz2’


2021-08-13 22:46:55 (4.98 MB/s) - ‘data/sw/swwiki-latest-pages-articles.xml.bz2’ saved [34487959/34487959]



In [None]:
import multiprocessing
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec

# Open the downloaded Swahili dump as a gensim WikiCorpus.
# NOTE(review): the empty `dictionary` is presumably passed so that gensim
# does not build its own Dictionary from the dump — confirm against the
# WikiCorpus documentation.
wiki = WikiCorpus(
    'data/sw/swwiki-latest-pages-articles.xml.bz2',
    dictionary={},
)
# Materialise everything yielded by get_texts() as an in-memory list so the
# corpus can be iterated repeatedly during training.
sentences = [article_tokens for article_tokens in wiki.get_texts()]




In [None]:
# Keyword arguments forwarded to Word2Vec(...) in the next cell.
# `workers` uses all CPU cores but one, and never fewer than one.
params = dict(
    window=10,
    min_count=10,
    workers=max(1, multiprocessing.cpu_count() - 1),
    sample=1E-3,
)

In [None]:
# Fit a Word2Vec model on the in-memory corpus using the keyword arguments
# collected in `params`, then persist it to "word2vec.model".
word2vec = Word2Vec(sentences, **params)
word2vec.save("word2vec.model")

In [None]:
from scipy.spatial.distance import cosine

In [None]:
# Fetch the vectors for 'mwanaume' and 'mvulana' (the "man" and "boy" of the
# message printed below) from the freshly trained model.
vector_man, vector_boy = (word2vec.wv[word] for word in ('mwanaume', 'mvulana'))
# scipy's `cosine` is a distance, so 1 - distance is the cosine similarity.
print("Cosine between boy and man in 5 epochs, is: " + str(1 - cosine(vector_man, vector_boy)))

Cosine between boy and man in 5 epochs, is: 0.7277519106864929


In [None]:
# Continue training in ten rounds of 5 epochs each and report the
# 'mwanaume'/'mvulana' cosine similarity after every round.
# NOTE(review): each separate train() call appears to restart gensim's
# learning-rate schedule rather than continue it, which may be why the
# similarity drifts downward — confirm against the Word2Vec.train docs.
for i in range(10):  
  word2vec.train(sentences, total_examples=word2vec.corpus_count, epochs=5)
  vector_man = word2vec.wv['mwanaume']  # get numpy vector of a word
  vector_boy = word2vec.wv['mvulana']  # get numpy vector of a word
  # The similarity is measured AFTER this round's 5 epochs, on top of the
  # 5 epochs of the initial fit reported by the previous cell, so the
  # cumulative epoch count is (i + 2) * 5, not i * 5.
  print("Cosine between boy and man in "+ str((i + 2) * 5) +" epochs, is: " + str(1 - cosine(vector_man, vector_boy)))



Cosine between boy and man in 0 epochs, is: 0.6671359539031982




Cosine between boy and man in 5 epochs, is: 0.6383607387542725




Cosine between boy and man in 10 epochs, is: 0.5517609119415283




Cosine between boy and man in 15 epochs, is: 0.49562448263168335




Cosine between boy and man in 20 epochs, is: 0.49191808700561523




Cosine between boy and man in 25 epochs, is: 0.4959765076637268




Cosine between boy and man in 30 epochs, is: 0.47750186920166016




Cosine between boy and man in 35 epochs, is: 0.48383042216300964




Cosine between boy and man in 40 epochs, is: 0.4489280879497528
Cosine between boy and man in 45 epochs, is: 0.44285863637924194


In [None]:
# Save the current state of `word2vec` to "word2vec.model" (the same path the
# initial fit was saved to, so that file is overwritten).
word2vec.save("word2vec.model")

In [None]:
# Reload the persisted model from disk; the remaining cells work with this copy.
model_swahili = Word2Vec.load("word2vec.model")

In [None]:
# Vector and ten nearest neighbours of 'mwanaume' in the reloaded model.
vector_man = model_swahili.wv['mwanaume']  # get numpy vector of a word
sims = model_swahili.wv.most_similar('mwanaume', topn=10)  # get other similar words

In [None]:
# Display the (word, similarity) pairs returned for 'mwanaume'.
sims

[('mvulana', 0.7277518510818481),
 ('msichana', 0.7257815599441528),
 ('nae', 0.7235980033874512),
 ('bennet', 0.7187550067901611),
 ('nusura', 0.7069060802459717),
 ('dully', 0.6989878416061401),
 ('wakina', 0.6972491145133972),
 ('bi', 0.6946593523025513),
 ('mahaba', 0.6930020451545715),
 ('aje', 0.6900444626808167)]

In [None]:
# Vector of 'mvulana' from the reloaded model, compared with `vector_man` below.
vector_boy = model_swahili.wv['mvulana']  # get numpy vector of a word

In [None]:
#Import cosine distance
from scipy.spatial.distance import cosine

In [None]:
# `cosine` is a distance, so subtracting it from 1 gives the cosine similarity
# between the 'mwanaume' and 'mvulana' vectors.
1 - cosine(vector_man, vector_boy)

0.7277519106864929

In [None]:
# Vector and ten nearest neighbours of 'malkia'; note that `sims` is rebound
# here and no longer holds the 'mwanaume' neighbours.
vector_queen = model_swahili.wv['malkia']  # get numpy vector of a word
sims = model_swahili.wv.most_similar('malkia', topn=10)  # get other similar words

In [None]:
# Display the (word, similarity) pairs returned for 'malkia'.
sims

[('mfalme', 0.8444384932518005),
 ('mtawala', 0.7251664996147156),
 ('mkabaila', 0.6911629438400269),
 ('farao', 0.6742110252380371),
 ('mtemi', 0.6704172492027283),
 ('mrithi', 0.6614590883255005),
 ('alirithi', 0.6611064672470093),
 ('kifalme', 0.6572237610816956),
 ('alitawala', 0.6564803719520569),
 ('mke', 0.6315830945968628)]

In [None]:
#mfalme -> king
#mtawala -> ruler
#mkabaila -> landlord
#farao -> pharaoh
#mtemi -> chief
#mrithi -> heir
#alirithi -> he inherited
#kifalme -> royal
#alitawala -> he ruled
#mke -> wife

# Creating a Keras model from embeddings

In [None]:
# Number of words in the word2vec vocabulary.
l = len(model_swahili.wv.index_to_key)

In [None]:
# TensorFlow imports
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras import models
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D, GlobalAveragePooling1D

In [None]:
from keras.preprocessing.text import Tokenizer
import numpy as np

In [None]:
# Copy of the word2vec vocabulary as a plain list, in the model's index order.
all_words = list(model_swahili.wv.index_to_key)

In [None]:
vocab = model_swahili.wv.index_to_key    # NOTE(review): not referenced again in the visible cells
t = Tokenizer()  # Keras tokenizer created with its default settings

# One more row than there are vocabulary words: the weight matrix built below
# never assigns a word vector to row 0.
vocab_size = len(all_words) + 1
# Build the tokenizer's word -> integer id mapping from the vocabulary entries.
# NOTE(review): the Tokenizer presumably applies its default lower-casing and
# character filtering to each entry — confirm ids line up with `all_words`.
t.fit_on_texts(all_words)

def get_weight_matrix():
    """Build the embedding weight matrix indexed by the Tokenizer's word ids.

    Reads the notebook-level `vocab_size`, `model_swahili` and `t`.

    Rows are placed at the id the Tokenizer actually assigned to each word
    (`t.word_index`), not at the word's position in the word2vec vocabulary.
    The embedding layer is later fed ids produced by `t.texts_to_sequences`,
    so the two numberings must agree; positional filling is only correct if
    the Tokenizer happens to keep every vocabulary entry intact and in order.

    Returns:
        A `(vocab_size, model_swahili.vector_size)` NumPy array in which row
        `t.word_index[word]` holds the word2vec vector of `word`. Rows whose
        id has no matching word2vec entry (including row 0, which the
        tokenizer does not assign to any word) stay all-zero.
    """
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, model_swahili.vector_size))
    known_words = model_swahili.wv.key_to_index
    # step the Tokenizer's vocabulary, store vectors using its integer mapping
    for word, index in t.word_index.items():
        # Skip ids that fall outside the matrix and tokens the Tokenizer
        # produced (e.g. by splitting an entry) that word2vec has no vector for.
        if index < vocab_size and word in known_words:
            weight_matrix[index] = model_swahili.wv[word]
    return weight_matrix

# Build the weight matrix and wrap it in a frozen (trainable=False) Keras
# Embedding layer whose output dimension equals the word2vec vector size.
# NOTE(review): the `weights=` constructor argument may not be accepted by
# newer Keras releases — confirm against the installed version.
embedding_vectors = get_weight_matrix()
emb_layer = Embedding(vocab_size, output_dim=model_swahili.vector_size, weights=[embedding_vectors], trainable=False)

In [None]:
# Display the layer object to confirm it was created.
emb_layer

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7f2efac368d0>

In [None]:
# Create model instance
# Start from an empty Sequential model; the embedding layer is added next.
model = models.Sequential()

In [None]:
# The frozen embedding layer is the model's only layer.
# NOTE(review): re-running this cell adds the layer a second time.
model.add(emb_layer)

In [None]:
# Print the layer list and parameter counts of the embedding model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         3811500   
Total params: 3,811,500
Trainable params: 0
Non-trainable params: 3,811,500
_________________________________________________________________


In [None]:
# Show the integer id the tokenizer assigns to 'mwanaume'.
t.texts_to_sequences(["mwanaume"])

[[5006]]

In [None]:
# Feed the tokenizer ids for 'mwanaume' through the embedding model.
# NOTE(review): this rebinds `vector_man`, which earlier held the vector
# looked up directly from the gensim model.
vector_man = model(t.texts_to_sequences(["mwanaume"]))

In [None]:
# Inspect the shape of the model output for the single-word input.
vector_man.shape

TensorShape([100])

In [None]:
# Export the embedding-only Keras model to the ./model directory.
tf.keras.models.save_model(
    model, "./model")





INFO:tensorflow:Assets written to: ./model/assets


INFO:tensorflow:Assets written to: ./model/assets


In [None]:
# Pack the contents of the exported model/ directory into a gzip-compressed
# tarball; -C makes the archived paths relative to that directory.
!tar -czvf swahili_word2vec.tar.gz -C model .

./
./keras_metadata.pb
./assets/
./variables/
./variables/variables.index
./variables/variables.data-00000-of-00001
./saved_model.pb


In [None]:
import pickle

# saving: pickle the fitted tokenizer so the same word -> id mapping can be
# reused alongside the exported model
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(t, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# loading: read the tokenizer back from the file written above.
# NOTE: unpickling can execute arbitrary code — only load tokenizer.pkl files
# from a trusted source.
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)