Importing useful libraries and APIs.

In [0]:
import pickle
import pandas as pd
import numpy as np

import tensorflow_datasets as tfds

from gensim.models import Word2Vec, FastText

Loading the dataset into a DataFrame.

In [0]:
# Load the pickled dataset.
# NOTE(review): unpickling executes arbitrary code, so only load files from a trusted source.
dataset = pd.read_pickle('/content/drive/My Drive/dataset.pkl')

# Keep an independent copy of the `docstring_tokens` column
# (Series.copy() is a deep copy by default).
docstring_tokens = dataset.loc[:, 'docstring_tokens'].copy()

Building the vocabulary and training a Word2Vec model and a FastText model on it for 10 epochs each. The embedding vector dimension is set to 512.

In [0]:
# Instantiate both embedding models with 512-dimensional vectors.
# min_count=0 means no token is discarded for being too rare.
w2v = Word2Vec(size=512, min_count=0)
ft = FastText(size=512, min_count=0)

# Both models follow the same recipe: build the vocabulary from the
# docstring tokens, then train on those tokens for 10 epochs.
for model in (w2v, ft):
    model.build_vocab(docstring_tokens)
    model.train(docstring_tokens, total_examples=len(docstring_tokens), epochs=10)

Defining a function that creates embedding matrices. Each row contains the embedding vector of the corresponding word in the vocabulary.

In [0]:
def embedding_matrix(embedding, vocabulary, vector_size=None):
  """Build a matrix whose i-th row is the embedding of the i-th vocabulary word.

  Args:
    embedding: a trained model exposing its word vectors through `embedding.wv`
      (indexable by word).
    vocabulary: ordered sequence of words; row `i` of the result corresponds
      to `vocabulary[i]`.
    vector_size: width of each embedding vector. When omitted it is read from
      `embedding.wv.vector_size`.

  Returns:
    A NumPy array of shape `(len(vocabulary), vector_size)`.

  Any lookup error raised by `embedding.wv[word]` propagates to the caller.
  """
  if vector_size is None:
    vector_size = embedding.wv.vector_size

  # Use the `vocabulary` argument rather than a notebook-level global, so the
  # function does not depend on cells being executed in a particular order.
  vocab_size = len(vocabulary)

  # initializing the weight matrix.
  matrix = np.zeros((vocab_size, vector_size))

  for index, word in enumerate(vocabulary):
    matrix[index] = embedding.wv[word]

  return matrix

Creating the embedding matrices for each model.

In [0]:
# Materialize the Word2Vec vocabulary as a list; iterating the vocab
# mapping yields its keys (the words).
vocab_list = list(w2v.wv.vocab)

# One embedding matrix per model, both indexed by the same vocabulary list.
w2v_matrix = embedding_matrix(embedding=w2v, vocabulary=vocab_list)
ft_matrix = embedding_matrix(embedding=ft, vocabulary=vocab_list)

Exporting the vocabulary and the embedding matrices in pickle format.

In [0]:
# (destination path, object) pairs, written in this order.
exports = (
    ('/content/drive/My Drive/vocabulary.pkl', vocab_list),
    ('/content/drive/My Drive/w2v_matrix.pkl', w2v_matrix),
    ('/content/drive/My Drive/ft_matrix.pkl', ft_matrix),
)

# Serialize each object with the highest pickle protocol available.
for pkl_path, payload in exports:
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(payload, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)