Importing useful libraries.

In [0]:
import pickle
import pandas as pd
import numpy as np

from gensim.models import Word2Vec, FastText
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

Loading the dataset into a DataFrame.

In [0]:
# Read the pickled dataset.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
dataset = pd.read_pickle('/content/drive/My Drive/dataset.pkl')

# Deep-copy the two token columns so they are independent of `dataset`.
docstring_tokens, function_tokens = (
    dataset[column].copy(deep=True)
    for column in ('docstring_tokens', 'function_tokens')
)

Building the docstring vocabulary and using it to train a Word2Vec model and a FastText model, each for 10 epochs. The embedding dimension is set to 512.

In [0]:
def _fit_and_save(model_cls, sentences, save_path):
  """Instantiate `model_cls`, build its vocabulary, train it and save it.

  The model is created with 512-dimensional vectors and min_count=0,
  trained on `sentences` for 10 epochs, written to `save_path` and returned.
  """
  model = model_cls(size=512, min_count=0)
  model.build_vocab(sentences)
  model.train(sentences, total_examples=len(sentences), epochs=10)
  model.save(save_path)
  return model


# Word2Vec model.
w2v = _fit_and_save(Word2Vec, docstring_tokens, '/content/drive/My Drive/w2v_model')

# FastText model.
ft = _fit_and_save(FastText, docstring_tokens, '/content/drive/My Drive/ft_model')

Building the function vocabulary and using it to train a Doc2Vec model for 10 epochs. The embedding dimension is set to 512.

In [0]:
# Wrap each entry of function_tokens in a TaggedDocument tagged with its position.
tagged_data = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(function_tokens)
]

# Doc2Vec model: 512-dimensional vectors, min_count=0, 4 workers.
d2v = Doc2Vec(vector_size=512, min_count=0, workers=4)
d2v.build_vocab(tagged_data)
n_documents = len(tagged_data)
d2v.train(tagged_data, total_examples=n_documents, epochs=10)
d2v.save('/content/drive/My Drive/d2v_model')

Defining a function that creates embedding matrices. Each row contains the embedding vector of the corresponding word in the vocabulary (or, for Doc2Vec, the vector inferred for the corresponding token sequence).

In [0]:
def embedding_matrix(embedding, vocabulary, doc2vec=False, vector_size=None):
  """Build a matrix whose i-th row is the embedding of the i-th vocabulary item.

  Args:
    embedding: trained model. With ``doc2vec=False`` it must support
      ``embedding.wv[word]``; with ``doc2vec=True`` it must provide
      ``embedding.infer_vector(tokens)``.
    vocabulary: sized iterable of words, or of token sequences when
      ``doc2vec=True``. Row order follows iteration order.
    doc2vec: when True, every item is passed to ``infer_vector`` instead of
      being looked up in ``embedding.wv``.
    vector_size: number of columns of the matrix. When None (default) it is
      taken from ``embedding.vector_size`` and falls back to 512 if the model
      has no such attribute.

  Returns:
    A float64 ``numpy.ndarray`` of shape ``(len(vocabulary), vector_size)``.

  Raises:
    ValueError: if a vector's length does not match ``vector_size``
      (raised by numpy on row assignment).
  """
  if vector_size is None:
    # Read the dimension from the model instead of hard-coding it, so models
    # trained with a size other than 512 work as well.
    vector_size = getattr(embedding, 'vector_size', 512)

  # initializing the weight matrix.
  matrix = np.zeros((len(vocabulary), vector_size))

  for row, item in enumerate(vocabulary):
    matrix[row] = embedding.infer_vector(item) if doc2vec else embedding.wv[item]

  return matrix

Creating the embedding matrices for each model.

In [0]:
# Docstring vocabulary: the keys of the Word2Vec model's vocabulary, as a list.
docstring_vocab = list(w2v.wv.vocab)
# Function vocabulary: the keys of the Doc2Vec model's vocabulary, as a list.
function_vocab = list(d2v.wv.vocab)

# Word2Vec and FastText matrices: one row per entry of docstring_vocab, in that order.
w2v_matrix = embedding_matrix(embedding=w2v, vocabulary=docstring_vocab)
ft_matrix = embedding_matrix(embedding=ft, vocabulary=docstring_vocab)

# Doc2Vec matrix: one inferred vector per entry of function_tokens.
# NOTE(review): rows follow function_tokens, not function_vocab — confirm this is intended.
d2v_matrix = embedding_matrix(embedding=d2v, vocabulary=function_tokens, doc2vec=True)

Exporting the docstring and function vocabularies and the embedding matrices in pickle format.

In [0]:
# (destination path, object) pairs, written in this order.
exports = (
    ('/content/drive/My Drive/docstring_vocab.pkl', docstring_vocab),
    ('/content/drive/My Drive/function_vocab.pkl', function_vocab),
    ('/content/drive/My Drive/w2v_matrix.pkl', w2v_matrix),
    ('/content/drive/My Drive/ft_matrix.pkl', ft_matrix),
    ('/content/drive/My Drive/d2v_matrix.pkl', d2v_matrix),
)

# Serialize every object with the highest pickle protocol available.
for pickle_path, payload in exports:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)