Installing Spotify's Annoy.

In [0]:
pip install annoy

Installing the W&B package.

In [0]:
pip install wandb

Importing libraries and APIs.

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras

import re
import time
import datetime
import pickle
import math as m
import pandas as pd
import numpy as np
from annoy import AnnoyIndex

Initiating a run on W&B.

In [0]:
import wandb
# starting a W&B run under the 'self-attention-cosine' project.
wandb.init(project='self-attention-cosine')

Loading the TensorBoard notebook extension.

In [0]:
%load_ext tensorboard

Loading the dataset and the vocabularies.

In [0]:
# loading the dataset.
dataset = pd.read_pickle('/content/drive/My Drive/dataset.pkl')

# copying docstring_tokens column.
docstring_tokens = dataset['docstring_tokens'].copy(deep=True)
# copying function_tokens column.
function_tokens = dataset['function_tokens'].copy(deep=True)

# NOTE: unpickling can execute arbitrary code, so only trusted files should be loaded here.
# loading the docstring vocabulary; the context manager closes the file handle afterwards.
with open('/content/drive/My Drive/docstring_vocab.pkl', 'rb') as vocab_file:
  docstring_vocab = pickle.load(vocab_file)
# loading the function vocabulary.
with open('/content/drive/My Drive/function_vocab.pkl', 'rb') as vocab_file:
  function_vocab = pickle.load(vocab_file)

Defining the encoding functions.

*   *encode*: encodes all words in a dataset to integers.
*   *to_numpy*: converts the dataframes to numpy and pads them with zeros.

In [0]:
def encode(inp, tar, input_encoder, target_encoder):
  """Replaces every token of `inp` and `tar` with an integer id, in place.

  Each row is overwritten with the list of the first id that the matching
  encoder's `encode` returns for every token of that row. The input rows are
  processed before the target rows, and the two (mutated) containers are
  returned as a pair.
  """
  for rows, encoder in ((inp, input_encoder), (tar, target_encoder)):
    for position, tokens in enumerate(rows):
      # keeping only the first id produced for each token.
      rows[position] = [encoder.encode(token)[0] for token in tokens]

  return inp, tar

def to_numpy(inp, tar):
  """Converts two collections of ragged integer rows to zero-padded integer matrices."""
  def pad_to_matrix(rows):
    # a DataFrame built from ragged rows fills the missing cells with NaN.
    matrix = pd.DataFrame(list(rows)).to_numpy()
    # replacing the NaN filler with zeros and casting everything to integers.
    return np.nan_to_num(matrix).astype(int)

  return pad_to_matrix(inp), pad_to_matrix(tar)

Splitting the dataset into training and validation sets and applying the encoding functions to the docstring and function tokens.

In [0]:
# number of leading examples used for training; the remaining ones form the validation set.
TRAIN_SIZE = 480000

# building input_encoder.
# NOTE(review): newer tensorflow_datasets releases appear to expose the text encoders
# under tfds.deprecated.text - confirm against the installed version.
input_encoder = tfds.features.text.TokenTextEncoder(docstring_vocab)
# building target_encoder.
target_encoder = tfds.features.text.TokenTextEncoder(function_vocab)

# initializing original_input list of lists.
original_input = docstring_tokens.copy(deep=True)
# initializing original_target list.
original_target = function_tokens.copy(deep=True)

# splitting to training set.
training_input = original_input[:TRAIN_SIZE]
training_target = original_target[:TRAIN_SIZE]
# splitting to validation set; the index is reset so that encode() can address rows from 0.
validation_input = original_input[TRAIN_SIZE:]
validation_input = validation_input.reset_index(drop=True)
validation_target = original_target[TRAIN_SIZE:]
validation_target = validation_target.reset_index(drop=True)

# applying encoding to input and target data (encode() overwrites the rows in place).
encoded_training_input, encoded_training_target = encode(training_input, training_target, input_encoder, target_encoder)
encoded_validation_input, encoded_validation_target = encode(validation_input, validation_target, input_encoder, target_encoder)
# converting input and target data to numpy.
encoded_training_input, encoded_training_target = to_numpy(encoded_training_input, encoded_training_target)
encoded_validation_input, encoded_validation_target = to_numpy(encoded_validation_input, encoded_validation_target)

Creating the training and validation tensor datasets using Tensorflow Datasets API.

In [0]:
BATCH_SIZE = 64

def _build_pipeline(inputs, targets):
  """Wraps paired input/target arrays in a cached, batched and prefetched tf.data pipeline."""
  # slicing the paired arrays into one (input, target) example per row.
  pipeline = tf.data.Dataset.from_tensor_slices((inputs, targets))
  # caching first, then batching, then prefetching.
  return pipeline.cache().batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

# no shuffling is applied: the retrieval cells further down look up dataset.url by the
# position in which train_dataset yields its examples.
train_dataset = _build_pipeline(encoded_training_input, encoded_training_target)
valid_dataset = _build_pipeline(encoded_validation_input, encoded_validation_target)

Creating the Positional Encoding layer of the Transformer Encoder.

In [0]:
class PositionalEncoding(tf.keras.layers.Layer):
  """Adds a fixed sinusoidal positional signal to a batch of embeddings."""

  def __init__(self, position, d_model):
    super(PositionalEncoding, self).__init__()

    # the encoding matrix is computed once, at construction time.
    self.pe = self.positional_encoding(position, d_model)

  def positional_encoding(self, position, d_model):
    """Builds the (1, position, d_model) float32 positional encoding tensor."""
    # column vector of word positions and row vector of embedding component indices.
    positions = tf.range(position, dtype=tf.float32)[:, tf.newaxis]
    components = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]

    # every pair of neighbouring components shares the same angle rate.
    exponent = (2 * (components // 2)) / tf.cast(d_model, dtype=tf.float32)
    angles = tf.multiply(positions, 1 / tf.pow(10000, exponent))

    # sine of the even-indexed angles and cosine of the odd-indexed ones.
    sine_part = tf.sin(angles[:, 0::2])
    cosine_part = tf.cos(angles[:, 1::2])

    # the two halves are concatenated (not interleaved) and given a leading batch axis.
    encoding = tf.concat([sine_part, cosine_part], axis=-1)[tf.newaxis, ...]

    return tf.cast(encoding, dtype=tf.float32)

  def call(self, x):
    # only the first `sequence_length` rows of the encoding are added to the embeddings.
    sequence_length = tf.shape(x)[-2]
    return x + self.pe[:, :sequence_length, :]

Creating the Multi-Head Attention layer of the Transformer Encoder.

In [0]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head scaled dot-product attention.

  Args:
    num_heads: number of attention heads.
    d_model: size of the projected query/key/value vectors; it is split evenly
      across the heads, so it must be divisible by `num_heads`.

  Raises:
    ValueError: if `d_model` is not divisible by `num_heads`.
  """

  def __init__(self, num_heads, d_model):
    super(MultiHeadAttention, self).__init__()

    # failing early: an uneven split would otherwise only surface as a reshape error.
    if d_model % num_heads != 0:
      raise ValueError('d_model ({}) must be divisible by num_heads ({}).'.format(d_model, num_heads))

    self.num_heads = num_heads
    self.d_model = d_model

    self.head_size = d_model // num_heads

    # creating the weight matrices for each head.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # creating the weight matrix for the output.
    self.dense = tf.keras.layers.Dense(d_model)

  def _split_heads(self, x, batch_size):
    # reshaping to (batch, sequence, heads, head_size) and moving the heads axis forward.
    x = tf.reshape(x, [batch_size, -1, self.num_heads, self.head_size])
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, query, key, value, mask):
    """Applies attention of `query` over `key`/`value`.

    `mask` is either None or a tensor broadcastable to the score matrix that
    holds 1 for positions to keep and 0 for positions to mask out.
    Returns a tensor whose last axis has `d_model` entries.
    """
    # storing the batch size.
    batch_size = tf.shape(query)[-3]

    # projecting query, key and value and splitting them across the heads.
    query = self._split_heads(self.wq(query), batch_size)
    key = self._split_heads(self.wk(key), batch_size)
    value = self._split_heads(self.wv(value), batch_size)

    # calculating the similarity scores, scaled by 1 / sqrt(head depth).
    depth = tf.cast(tf.shape(key)[-1], dtype=tf.float32)
    scores = tf.matmul(query, key, transpose_b=True) / tf.sqrt(depth)

    # masking out key/value pairs. The mask is applied additively so that only
    # positions whose mask entry is 0 are suppressed; a score that merely happens
    # to equal zero is left untouched.
    if mask is not None:
      scores += (1.0 - mask) * -1e9

    # calculating the attention weights over the keys.
    weights = tf.nn.softmax(scores)

    # calculating the scaled dot-product attention for each head.
    attention = tf.matmul(weights, value)
    attention = tf.transpose(attention, [0, 2, 1, 3])

    # concatenating the attention heads.
    output = tf.reshape(attention, [batch_size, -1, self.d_model])
    # passing the concatenation as input to a dense layer.
    output = self.dense(output)

    return output

Creating the Feed Forward Network layer of the Transformer Encoder.

In [0]:
class FeedForwardNetwork(tf.keras.layers.Layer):
  """Two stacked dense layers: `dff` units with relu, then `d_model` units without activation."""

  def __init__(self, dff, d_model):
    super(FeedForwardNetwork, self).__init__()

    # hidden layer with relu activation followed by a linear output layer.
    self.fc1 = tf.keras.layers.Dense(dff, activation='relu')
    self.fc2 = tf.keras.layers.Dense(d_model)

  def call(self, x):
    # expanding to dff units and projecting back to d_model units.
    return self.fc2(self.fc1(x))

Creating the Encoder layer of the Transformer Encoder.

In [0]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and a feed forward network, each followed by
  dropout, a residual connection and layer normalization."""

  def __init__(self, num_heads, dff, d_model, rate):
    super(EncoderLayer, self).__init__()

    # the attention and feed forward sub-layers.
    self.mha = MultiHeadAttention(num_heads, d_model)
    self.ffn = FeedForwardNetwork(dff, d_model)

    # one dropout layer per sub-layer.
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

    # one normalization layer per sub-layer.
    self.normalization1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalization2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, x, padding_mask, training):
    # self-attention: x serves as query, key and value.
    attended = self.dropout1(self.mha(x, x, x, padding_mask), training=training)
    attended = self.normalization1(x + attended)

    # feed forward network on top of the normalized attention output.
    transformed = self.dropout2(self.ffn(attended), training=training)

    return self.normalization2(attended + transformed)

Creating the Encoder layer of the Matching Network.

In [0]:
class Encoder(tf.keras.layers.Layer):
  """Embeds token ids, adds the positional encoding and applies a stack of encoder layers."""

  def __init__(self, num_layers, vocab_size, position, num_heads, dff, d_model, rate):
    super(Encoder, self).__init__()

    self.num_layers = num_layers
    self.d_model = d_model

    # token embedding followed by the positional encoding.
    self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=d_model)
    self.pe = PositionalEncoding(position, d_model)

    # dropout applied to the position-aware embeddings.
    self.dropout = tf.keras.layers.Dropout(rate)

    # the stack of num_layers encoder layers.
    self.encoder_layers = [EncoderLayer(num_heads, dff, d_model, rate) for _ in range(num_layers)]

  def call(self, x, padding_mask, training):
    # embeddings are scaled by sqrt(d_model) before the positional encoding is added.
    scale = tf.sqrt(tf.cast(self.d_model, dtype=tf.float32))
    x = self.embedding(x) * scale
    x = self.dropout(self.pe(x), training=training)

    # feeding the output of each encoder layer to the next one.
    for encoder_layer in self.encoder_layers:
      x = encoder_layer(x, padding_mask, training)

    return x

Creating the Cosine Similarity layer of the Matching Network.

In [0]:
class Cosine(tf.keras.layers.Layer):
  """Computes the pairwise cosine between the rows of two tensors."""

  def __init__(self):
    super(Cosine, self).__init__()

  def call(self, x, y):
    # after L2-normalizing the last axis, the matrix product x . y^T holds the cosines.
    unit_x = tf.math.l2_normalize(x, axis=-1)
    unit_y = tf.math.l2_normalize(y, axis=-1)

    return tf.matmul(unit_x, unit_y, transpose_b=True)

Creating the model of the Matching Network.

In [0]:
class MatchingNetwork(tf.keras.Model):
  """Encodes inputs and targets with two separate encoders and scores every
  input/target pair of the batch with a softmax over their cosine similarities."""

  def __init__(self, num_layers, input_vocab_size, target_vocab_size, input_position, target_position, num_heads, dff, d_model, rate):
    super(MatchingNetwork, self).__init__()

    # creating the Encoders.
    self.encoder1 = Encoder(num_layers, input_vocab_size, input_position, num_heads, dff, d_model, rate)
    self.encoder2 = Encoder(num_layers, target_vocab_size, target_position, num_heads, dff, d_model, rate)

    # creating the Cosine Similarity layer.
    self.similarity = Cosine()

  def _pool(self, encoded, padding_mask):
    # Max-pools the encoded sequence over its non-padded positions and applies tanh.
    # Only the two broadcast axes are squeezed, so a batch (or a sequence) of size 1
    # keeps its own axis; an unqualified tf.squeeze would drop it as well.
    keep = tf.squeeze(padding_mask, axis=[1, 2])[:, :, tf.newaxis]
    # padded positions are pushed to -1e9 so that they never win the maximum.
    encoded = encoded + -1e9 * (1 - keep)
    pooled = tf.reduce_max(encoded, axis=-2)

    return tf.keras.activations.tanh(pooled)

  def call(self, x, y, padding_mask_x, padding_mask_y, training):
    """Returns (similarity, encoded1, encoded2).

    The padding masks hold 1 for real tokens and 0 for padding and are expected
    in the (batch, 1, 1, sequence) layout that the step functions build.
    `similarity` is the row-wise softmax of the cosine between every encoded
    input and every encoded target; `encoded1`/`encoded2` are the pooled vectors.
    """
    # passing the input data to its corresponding encoder and pooling the result.
    encoded1 = self._pool(self.encoder1(x, padding_mask_x, training), padding_mask_x)

    # passing the target data to its corresponding encoder and pooling the result.
    encoded2 = self._pool(self.encoder2(y, padding_mask_y, training), padding_mask_y)

    # calculating the similarity and the corresponding probabilities.
    similarity = self.similarity(encoded1, encoded2)
    similarity = tf.nn.softmax(similarity, axis=-1)

    return similarity, encoded1, encoded2

Creating the Matching Network with specific hyperparameters.

In [0]:
# architecture sizes.
NUM_LAYERS = 4
NUM_HEADS = 8
DFF = 1024
D_MODEL = 256
RATE = 0.1

# vocabulary sizes taken from the two encoders.
INPUT_VOCAB_SIZE = input_encoder.vocab_size
TARGET_VOCAB_SIZE = target_encoder.vocab_size

# NOTE(review): the positional encoding tables are sized by the vocabulary sizes;
# presumably the maximum sequence length would be enough - confirm before changing.
INPUT_POSITION = input_encoder.vocab_size
TARGET_POSITION = target_encoder.vocab_size

matching_network = MatchingNetwork(
    num_layers=NUM_LAYERS,
    input_vocab_size=INPUT_VOCAB_SIZE,
    target_vocab_size=TARGET_VOCAB_SIZE,
    input_position=INPUT_POSITION,
    target_position=TARGET_POSITION,
    num_heads=NUM_HEADS,
    dff=DFF,
    d_model=D_MODEL,
    rate=RATE)

Finding the best learning rate and creating the Adam optimizer.

In [0]:
# learning rate finder (kept disabled): an exponentially growing schedule.
#learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(1e-10, decay_steps=100, decay_rate=1.1, staircase=True)

# Adam optimizer with a fixed learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=4.5e-5)

Defining the checkpoint manager and restoring a checkpoint if it exists.

In [0]:
# tracking both the model and the optimizer state; at most 5 checkpoints are kept.
ckpt = tf.train.Checkpoint(matching_network=matching_network, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, directory='/content/drive/My Drive/COSINE', max_to_keep=5)

# resuming from the most recent checkpoint, when there is one.
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
  ckpt.restore(latest_checkpoint)
  print('Model restored.')

Defining the metrics of the training process.
*   *SquaredMarginLoss*: calculates a squared margin-like loss for the output probabilities of the Matching Network. The margin is set to 1, as probabilities cannot exceed that value.
*   *CategoricalAccuracy*: calculates the percentage of the correct predictions.

In [0]:
def SquaredMarginLoss(predictions, margin=1):
  """Returns one loss value per row of the square `predictions` matrix.

  The diagonal entries are treated as the positive pairs and are penalized by
  the squared distance to `margin`; every other entry of the row is a negative
  pair and contributes its squared (relu-clipped) value.
  """
  batch_size = tf.shape(predictions)[0]

  # positive term: squared gap between the margin and the diagonal scores.
  positive_loss = tf.pow(margin - tf.linalg.diag_part(predictions), 2)

  # negative term: the diagonal is pushed to -1e9 so that relu removes it.
  diagonal_suppressor = tf.linalg.diag(tf.fill(dims=[batch_size], value=-1e9))
  off_diagonal = tf.nn.relu(predictions + diagonal_suppressor)
  negative_loss = tf.reduce_sum(tf.pow(off_diagonal, 2), axis=-1)

  return positive_loss + negative_loss

# training metrics: a running mean for the loss and a categorical accuracy.
train_loss = tf.keras.metrics.Mean()
train_accuracy = tf.keras.metrics.CategoricalAccuracy()
# validation metrics: the same pair, kept separate from the training ones.
valid_loss = tf.keras.metrics.Mean()
valid_accuracy = tf.keras.metrics.CategoricalAccuracy()

Defining the Mean Reciprocal Rank (MRR) function. The MRR scores each example by the reciprocal of the rank that its correct prediction holds among all candidate predictions. For example, it is 1 for first place, 1/2 for second place, 1/3 for third place, etc.

In [0]:
def MRR(predictions):
  """Returns the reciprocal rank of the diagonal (correct) score of every row."""
  # the correct score of each row, shaped as a column for broadcasting.
  correct_scores = tf.expand_dims(tf.linalg.diag_part(predictions), axis=-1)
  # the rank is the number of scores in the row that are at least as large.
  at_least_as_large = tf.cast(predictions >= correct_scores, dtype=tf.float32)
  ranks = tf.reduce_sum(at_least_as_large, axis=-1)

  return 1 / ranks

Defining the training and validation step functions. Both functions use the @tf.function decorator to execute in graph mode, which speeds up the training process and uses fewer resources.
*   *train_step*: receives the predictions of the Matching Network, calculates the loss and the loss' gradients. Then it applies the gradients to the model's trainable variables. To avoid the exploding gradient phenomenon, gradient clipping is applied to the gradients. The function also calculates the accuracy and the MRR metrics for the predictions.
*   *valid_step*: receives the predictions of the Matching Network and calculates the loss. The function also calculates the accuracy and the MRR metrics for the predictions.

In [0]:
@tf.function
def train_step(inp, tar):
  """Runs one optimization step on a batch and returns the per-example MRR."""
  def build_padding_mask(tokens):
    # 1 for real tokens, 0 for padding (id 0), shaped (batch, 1, 1, sequence).
    keep = 1 - tf.cast(tf.equal(tokens, 0), dtype=tf.float32)
    return keep[:, tf.newaxis, tf.newaxis, :]

  padding_mask_inp = build_padding_mask(inp)
  padding_mask_tar = build_padding_mask(tar)

  # ground truth for the accuracy metric: the i-th input matches the i-th target.
  batch_size = tf.shape(inp)[-2]
  one_hot_y = tf.linalg.tensor_diag(tf.ones(batch_size))

  with tf.GradientTape() as tape:
    predictions, en1, en2 = matching_network(inp, tar, padding_mask_inp, padding_mask_tar, training=True)
    loss = SquaredMarginLoss(predictions)

  # computing the gradients, clipping them by global norm and applying them.
  variables = matching_network.trainable_variables
  clipped_gradients, _ = tf.clip_by_global_norm(tape.gradient(loss, variables), 1)
  optimizer.apply_gradients(zip(clipped_gradients, matching_network.trainable_variables))

  # updating the metrics.
  train_loss(loss)
  train_accuracy(one_hot_y, predictions)

  return MRR(predictions)

@tf.function
def valid_step(inp, tar):
  """Evaluates one validation batch and returns the per-example MRR.

  The model runs in inference mode (dropout disabled) and no gradients are
  recorded; the loss and accuracy are accumulated in `valid_loss` and
  `valid_accuracy`.
  """
  # creating the input padding mask.
  padding_mask_inp = 1 - tf.cast(tf.equal(inp, 0), dtype=tf.float32)
  padding_mask_inp = padding_mask_inp[:, tf.newaxis, tf.newaxis, :]
  # creating the target padding mask.
  padding_mask_tar = 1 - tf.cast(tf.equal(tar, 0), dtype=tf.float32)
  padding_mask_tar = padding_mask_tar[:, tf.newaxis, tf.newaxis, :]

  # creating the ground truth for the accuracy metric.
  diagonal = tf.ones(tf.shape(inp)[-2])
  one_hot_y = tf.linalg.tensor_diag(diagonal)

  # training=False keeps dropout off, so the validation metrics are deterministic.
  predictions, en1, en2 = matching_network(inp, tar, padding_mask_inp, padding_mask_tar, training=False)

  loss = SquaredMarginLoss(predictions)

  # calculating the metrics.
  valid_loss(loss)
  valid_accuracy(one_hot_y, predictions)
  valid_mrr = MRR(predictions)

  return valid_mrr

Creating a summary writer for TensorBoard logging.

In [0]:
# all TensorBoard summaries of the training loop are written under this directory.
summary_writer = tf.summary.create_file_writer(logdir='logs/gradient_tape/')

Creating the training loop. The loop:
*   calls the training and validation step functions.
*   outputs the metrics from both step functions for each batch and an average for each epoch.
*   logs all metrics to TensorBoard.
*   saves a checkpoint of the best model taking into account the best validation MRR.
*   calculates the time taken for each epoch, as well as the total training time.

In [0]:
stopwatch = []
step = 0
best_valid_mrr = 0

for epoch in range(50):
  # initializing the timer.
  start = time.time()

  # initializing the training metric storing lists.
  epoch_train_loss = []
  epoch_train_accuracy = []
  epoch_train_mrr = []

  # initializing the validation metric storing lists.
  epoch_valid_loss = []
  epoch_valid_accuracy = []
  epoch_valid_mrr = []

  for batch, (inp, tar) in enumerate(train_dataset):
    # resetting the loss and accuracy states so that the metrics describe this batch only.
    train_loss.reset_states()
    train_accuracy.reset_states()

    # calling the training step function and storing the metrics.
    epoch_train_mrr.append(tf.reduce_mean(train_step(inp, tar)))
    batch_train_loss = train_loss.result()
    batch_train_accuracy = train_accuracy.result()
    epoch_train_loss.append(batch_train_loss)
    epoch_train_accuracy.append(batch_train_accuracy)

    # outputting the training metrics every 1000 batches.
    if batch % 1000 == 0:
      print('Epoch {} Batch {} Train Loss {:.10f} Train Accuracy {:.10f}'.format(epoch + 1, batch, batch_train_loss, batch_train_accuracy))

    # logging the training metrics every 100 batches.
    if batch % 100 == 0:
      with summary_writer.as_default():
        tf.summary.scalar('train_loss', data=batch_train_loss, step=step)
        tf.summary.scalar('train_accuracy', data=batch_train_accuracy, step=step)
        #tf.summary.scalar('learning_rate', data=learning_rate(step), step=step)

    # global batch counter used as the x-axis of the per-batch summaries.
    step += 1

  for inp, tar in valid_dataset:
    # resetting the loss and accuracy states for every validation batch.
    valid_loss.reset_states()
    valid_accuracy.reset_states()

    # calling the validation step function and storing the metrics.
    epoch_valid_mrr.append(tf.reduce_mean(valid_step(inp, tar)))
    epoch_valid_loss.append(valid_loss.result())
    epoch_valid_accuracy.append(valid_accuracy.result())

  # averaging the per-batch metrics once; the values are reused for logging,
  # printing and model selection below.
  mean_train_loss = tf.reduce_mean(epoch_train_loss)
  mean_train_accuracy = tf.reduce_mean(epoch_train_accuracy)
  mean_train_mrr = tf.reduce_mean(epoch_train_mrr)
  mean_valid_loss = tf.reduce_mean(epoch_valid_loss)
  mean_valid_accuracy = tf.reduce_mean(epoch_valid_accuracy)
  mean_valid_mrr = tf.reduce_mean(epoch_valid_mrr)

  # logging the average training and validation metrics every epoch.
  with summary_writer.as_default():
    tf.summary.scalar('epoch_train_loss', data=mean_train_loss, step=epoch)
    tf.summary.scalar('epoch_train_accuracy', data=mean_train_accuracy, step=epoch)
    tf.summary.scalar('epoch_train_mrr', data=mean_train_mrr, step=epoch)
    tf.summary.scalar('epoch_valid_loss', data=mean_valid_loss, step=epoch)
    tf.summary.scalar('epoch_valid_accuracy', data=mean_valid_accuracy, step=epoch)
    tf.summary.scalar('epoch_valid_mrr', data=mean_valid_mrr, step=epoch)

  # outputting the average training metrics every epoch.
  print('Epoch {} Train Loss {:.10f} Train Accuracy {:.10f} Train MRR {:.10f}'.format(
      epoch + 1, mean_train_loss, mean_train_accuracy, mean_train_mrr))

  # outputting the average validation metrics every epoch.
  print('Epoch {} Valid Loss {:.10f} Valid Accuracy {:.10f} Valid MRR {:.10f}'.format(
      epoch + 1, mean_valid_loss, mean_valid_accuracy, mean_valid_mrr))

  # measuring the epoch time once so that the printed and the stored value agree.
  elapsed = time.time() - start
  print('Time taken for 1 epoch: {} seconds\n'.format(elapsed))
  stopwatch.append(elapsed)

  # saving the best model, judged by the average validation MRR.
  if mean_valid_mrr > best_valid_mrr:
    best_valid_mrr = mean_valid_mrr

    ckpt_manager.save()
    print('Model saved at epoch {}\n'.format(epoch+1))

# outputting the total training time.
print('Total training time: {} seconds\n'.format(tf.reduce_sum(stopwatch)))

Syncing the W&B run with Tensorboard.

In [0]:
!wandb sync logs/gradient_tape/

Opening the TensorBoard panel for log reviewing.

In [0]:
%tensorboard --logdir logs/gradient_tape/

Defining the raw input preprocessing functions.
*   *remove_special*: replaces all special characters in the raw input with an empty string.
*   *remove_empty*: removes all empty strings.


In [0]:
def remove_special(data):
  """Strips every character outside A-Z, a-z and 0-9 from each token, in place.

  `data` is a collection of mutable token lists. Tokens made up only of
  special characters become empty strings. The mutated `data` is returned.
  """
  for row in data:
    # enumerate gives the position directly; looking it up with row.index() would
    # rescan the row for every token and match the first equal token instead.
    for token_index, token in enumerate(row):
      # replacing special characters with an empty string.
      row[token_index] = re.sub(r'[^A-Za-z0-9]+', '', token)

  return data

def remove_empty(data):
  """Drops the empty tokens of every row of `data` and returns `data`.

  A row that contains at least one empty token is replaced by a filtered copy;
  rows without empty tokens are left as they are.
  """
  for position, tokens in enumerate(data):
    if any(not token for token in tokens):
      # removing empty strings from the list.
      data[position] = [token for token in tokens if token]

  return data

Reading the 99 queries from a TXT file and preprocessing them.

In [0]:
with open('/content/drive/My Drive/queries.txt', 'r') as f:
  queries_file = f.readlines()

# wrapping every stripped line in a single-element list.
queries_file = [[line.strip()] for line in queries_file]
# tokenizing the queries on whitespace (each entry holds exactly one query string).
queries = [entry[0].split() for entry in queries_file]
# dropping special characters and the tokens that end up empty.
queries = remove_empty(remove_special(queries))

# encoding the queries with the docstring encoder; there is no target side here.
encoded_queries, _ = encode(queries, [], input_encoder, None)
# padding the encoded queries into an integer matrix.
encoded_queries, _ = to_numpy(encoded_queries, [])
# creating a constant tensor with the encoded queries.
queries_set = tf.constant(encoded_queries)

Creating and storing the vector representations of the 99 queries and the whole training corpus.

In [0]:
def _padding_mask(tokens):
  # 1 for real tokens, 0 for padding (id 0), shaped (batch, 1, 1, sequence).
  keep = 1 - tf.cast(tf.equal(tokens, 0), dtype=tf.float32)
  return keep[:, tf.newaxis, tf.newaxis, :]

# encoding the queries; only the first encoder's output is kept.
# NOTE(review): the queries are also passed as the target argument, so encoder2 receives
# ids produced by the docstring encoder - confirm they are valid indices for its embedding.
padding_mask_inp = _padding_mask(queries_set)
_, query_representations, _ = matching_network(queries_set, queries_set, padding_mask_inp, padding_mask_inp, False)

# encoding the training functions batch by batch, in dataset order.
function_representations = []

for inp, tar in train_dataset:
  padding_mask_inp = _padding_mask(inp)
  padding_mask_tar = _padding_mask(tar)

  _, _, function_vectors = matching_network(inp, tar, padding_mask_inp, padding_mask_tar, False)

  function_representations.append(function_vectors)

# stacking the per-batch vectors into a single matrix with one row per function.
function_representations = tf.concat(function_representations, axis=-2)

Adding all function vectors to the AnnoyIndex, building the trees and saving the ANN.

In [0]:
# the vector size is read from the static shape so that Annoy receives a plain int.
vector_size = int(function_representations.shape[-1])
indices = AnnoyIndex(vector_size, 'angular')

# converting to numpy once instead of slicing the eager tensor for every item;
# the item id is the position of the function in the training set.
for index, vector in enumerate(function_representations.numpy()):
  indices.add_item(index, vector)

# building 10 trees and saving the index next to the other artefacts of this run.
# NOTE: the directory was spelled 'Cosine' here while the checkpoints and the
# predictions CSV use 'COSINE'; it is aligned with the latter.
indices.build(10)
indices.save('/content/drive/My Drive/COSINE/functions.ann')

Getting the top 100 predictions for the 99 queries using Annoy.

In [0]:
def get_predictions(vector, indices, n=100):
  """Returns the ids of and the distances to the `n` nearest neighbours of `vector`.

  Args:
    vector: the query vector.
    indices: an index object exposing `get_nns_by_vector`.
    n: how many neighbours to retrieve; defaults to 100.

  Returns:
    A (function_index, distance) pair as produced by `get_nns_by_vector`
    with `include_distances=True`.
  """
  function_index, distance = indices.get_nns_by_vector(vector, n=n, include_distances=True)

  return function_index, distance

Storing the queries, their predictions, as well as the prediction URLs.

In [0]:
predictions = []

for query_index, vector in enumerate(query_representations):
  function_index, distance = get_predictions(vector, indices)

  # one [query, language, url] row per retrieved function.
  query_text = queries_file[query_index][0]
  predictions.extend([query_text, 'java', dataset.url[index]] for index in function_index)

Creating a DataFrame of the predictions and exporting it as a CSV file.

In [0]:
# one row per (query, retrieved function) pair.
predictions_dataframe = pd.DataFrame(predictions, columns=['query', 'language', 'url'])
# writing the CSV without the DataFrame index column.
predictions_dataframe.to_csv('/content/drive/My Drive/COSINE/model_predictions.csv', index=False)

Saving the CSV file to the W&B run.

In [0]:
# attaching the exported predictions CSV to the current W&B run.
wandb.save('/content/drive/My Drive/COSINE/model_predictions.csv')

Testing section.

In [0]:
# running the model in inference mode on the first validation batch only;
# take(1) replaces the manual counter and break.
for inp, tar in valid_dataset.take(1):
  # 1 for real tokens, 0 for padding, shaped (batch, 1, 1, sequence).
  padding_mask_inp = 1 - tf.cast(tf.equal(inp, 0), dtype=tf.float32)
  padding_mask_inp = padding_mask_inp[:, tf.newaxis, tf.newaxis, :]

  padding_mask_tar = 1 - tf.cast(tf.equal(tar, 0), dtype=tf.float32)
  padding_mask_tar = padding_mask_tar[:, tf.newaxis, tf.newaxis, :]

  out, en1, en2 = matching_network(inp, tar, padding_mask_inp, padding_mask_tar, False)

In [0]:
out

In [0]:
t1 = en1

In [0]:
t2 = en2

In [0]:
# recomputing the pairwise cosine between the two sets of encoded vectors.
cosine = Cosine()
result = cosine(t1, t2)
result