In [0]:
pip install wandb

In [0]:
# running the wandb command line login in a shell.
!wandb login

In [0]:
import wandb
# starting a wandb run under the 'cosine' project.
wandb.init(project='cosine')

In [0]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras

import time
import datetime
import pickle
import math as m
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from __future__ import absolute_import, division, print_function, unicode_literals

In [0]:
# loading the tensorboard notebook extension (the %tensorboard magic is used further down).
%load_ext tensorboard

In [0]:
# loading the dataset.
# NOTE: unpickling executes arbitrary code; only load these files from a trusted source.
dataset = pd.read_pickle('/content/drive/My Drive/dataset.pkl')

# copying docstring_tokens column.
docstring_tokens = dataset['docstring_tokens'].copy(deep=True)
# copying function_tokens column.
function_tokens = dataset['function_tokens'].copy(deep=True)

# loading the docstring vocabulary (the context manager closes the file handle).
with open('/content/drive/My Drive/docstring_vocab.pkl', 'rb') as vocab_file:
  docstring_vocab = pickle.load(vocab_file)
# loading the function vocabulary.
with open('/content/drive/My Drive/function_vocab.pkl', 'rb') as vocab_file:
  function_vocab = pickle.load(vocab_file)

In [0]:
def encode(inp, tar, input_encoder, target_encoder):
  """Encodes every token of every row of the input and target data.

  Args:
    inp: iterable of token rows (e.g. a list of lists or a pandas Series of lists).
    tar: iterable of token rows for the target side.
    input_encoder: object whose `encode(token)` returns a sequence; its first
      element is kept as the id of the token.
    target_encoder: same contract as `input_encoder`, used for `tar`.

  Returns:
    A tuple `(encoded_inp, encoded_tar)`. A pandas Series input yields a new
    Series with the same index; any other iterable yields a list of lists.
    The arguments themselves are not modified.
  """
  def _encode_rows(rows, encoder):
    # rows are visited in order and the result is rebuilt from scratch, so a
    # Series whose index is not 0..n-1 is encoded row by row instead of being
    # written back through label-based `rows[position]` assignment.
    encoded = [[encoder.encode(token)[0] for token in row] for row in rows]
    if isinstance(rows, pd.Series):
      return pd.Series(encoded, index=rows.index, dtype=object)
    return encoded

  return _encode_rows(inp, input_encoder), _encode_rows(tar, target_encoder)

def to_numpy(inp, tar):
  """Converts the encoded input and target rows to integer numpy matrices.

  Each collection of rows goes through a pandas DataFrame; cells that come out
  as NaN are replaced by 0 before the matrix is cast to int.

  Returns:
    A tuple `(inp_matrix, tar_matrix)` of integer numpy arrays.
  """
  def _as_int_matrix(rows):
    # DataFrame -> numpy -> NaN replaced by 0 -> int.
    frame = pd.DataFrame(list(rows))
    return np.nan_to_num(frame.to_numpy()).astype(int)

  return _as_int_matrix(inp), _as_int_matrix(tar)

In [0]:
# building one token encoder per vocabulary.
input_encoder = tfds.features.text.TokenTextEncoder(docstring_vocab)
target_encoder = tfds.features.text.TokenTextEncoder(function_vocab)

# encoding deep copies of the token columns, then converting both results to numpy.
encoded_input, encoded_target = to_numpy(*encode(
    docstring_tokens.copy(deep=True),
    function_tokens.copy(deep=True),
    input_encoder,
    target_encoder,
))

In [0]:
BATCH_SIZE = 64

# building the training pipeline: slices of (input, target) pairs, cached,
# grouped in batches of BATCH_SIZE and prefetched with an autotuned buffer.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((encoded_input, encoded_target))
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [0]:
class PositionalEncoding(tf.keras.layers.Layer):
  """Layer that adds a fixed sinusoidal positional encoding to its input."""

  def __init__(self, position, d_model):
    super(PositionalEncoding, self).__init__()

    # the encoding table is computed once, when the layer is built.
    self.pe = self.positional_encoding(position, d_model)

  def positional_encoding(self, position, d_model):
    """Builds the encoding table with shape (1, position, d_model).

    The sines of the even-indexed angle columns come first along the last
    axis, followed by the cosines of the odd-indexed angle columns.
    """
    # column vector of positions and row vector of embedding components.
    positions = tf.range(position, dtype=tf.float32)[:, tf.newaxis]
    components = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]

    # one angle per (position, component) pair.
    exponent = (2 * (components // 2)) / tf.cast(d_model, dtype=tf.float32)
    angle = tf.multiply(positions, 1 / tf.pow(10000, exponent))

    # sine on the even columns, cosine on the odd columns, joined on the last axis.
    table = tf.concat([tf.sin(angle[:, 0::2]), tf.cos(angle[:, 1::2])], axis=-1)

    # a leading axis of size 1 lets the table broadcast over the batch.
    return tf.cast(table[tf.newaxis, ...], dtype=tf.float32)

  def call(self, x):
    # only the first rows of the table, one per position of x, are added.
    length = tf.shape(x)[-2]
    return x + self.pe[:, :length, :]

In [0]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head scaled dot-product attention.

  The mask given to `call` holds 1 for key positions that may be attended to
  and 0 for key positions that must be hidden; it has to broadcast against the
  score tensor of shape (batch, num_heads, query_length, key_length).
  """

  def __init__(self, num_heads, d_model):
    super(MultiHeadAttention, self).__init__()

    # the heads partition d_model evenly; anything else would make the
    # reshapes in `call` silently produce tensors of the wrong layout.
    if d_model % num_heads != 0:
      raise ValueError('d_model ({}) must be divisible by num_heads ({}).'.format(d_model, num_heads))

    self.num_heads = num_heads
    self.d_model = d_model

    self.head_size = d_model // num_heads

    # creating the weight matrices for each head.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # creating the weight matrix for the output.
    self.dense = tf.keras.layers.Dense(d_model)

  def _split_heads(self, x, batch_size):
    # (batch, length, d_model) -> (batch, num_heads, length, head_size).
    x = tf.reshape(x, [batch_size, -1, self.num_heads, self.head_size])
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, query, key, value, mask):
    """Returns the attention output with shape (batch, query_length, d_model).

    Args:
      query, key, value: tensors whose third axis from the end is the batch axis.
      mask: None, or a 1/0 tensor (1 = keep, 0 = hide) that broadcasts against
        the scores.
    """
    # storing the batch size.
    batch_size = tf.shape(query)[-3]

    # projecting query, key and value, then splitting them into heads.
    query = self._split_heads(self.wq(query), batch_size)
    key = self._split_heads(self.wk(key), batch_size)
    value = self._split_heads(self.wv(value), batch_size)

    # calculating the similarity scores, scaled by 1 / sqrt(head depth).
    depth = tf.cast(tf.shape(key)[-1], dtype=tf.float32)
    scores = tf.matmul(query, key, transpose_b=True) * (1 / tf.sqrt(depth))

    # masking out key/value pairs: a large negative number is added wherever
    # the mask is 0. Deciding from the mask itself (rather than from scores
    # that happen to equal 0) keeps valid positions with a zero score visible.
    if mask is not None:
      scores += (1.0 - tf.cast(mask, scores.dtype)) * -1e9

    # calculating the softmax of the scores over the key axis.
    softmax = tf.nn.softmax(scores)

    # calculating the attention for each head and moving the heads back next
    # to the depth axis.
    attention = tf.matmul(softmax, value)
    attention = tf.transpose(attention, [0, 2, 1, 3])

    # concatenating the attention heads.
    output = tf.reshape(attention, [batch_size, -1, self.d_model])
    # passing the concatenation as input to a dense layer.
    output = self.dense(output)

    return output

In [0]:
class FeedForwardNetwork(tf.keras.layers.Layer):
  """Two dense layers: dff units with relu, then d_model units without activation."""

  def __init__(self, dff, d_model):
    super(FeedForwardNetwork, self).__init__()

    # hidden layer (relu) followed by the linear output layer.
    self.fc1 = tf.keras.layers.Dense(dff, activation='relu')
    self.fc2 = tf.keras.layers.Dense(d_model)

  def call(self, x):
    # hidden layer first, output layer second.
    return self.fc2(self.fc1(x))

In [0]:
class EncoderLayer(tf.keras.layers.Layer):
  """Self-attention block followed by a feed forward block.

  Each block is followed by dropout, a residual connection and layer
  normalization.
  """

  def __init__(self, num_heads, dff, d_model, rate):
    super(EncoderLayer, self).__init__()

    # the two sub-layers.
    self.mha = MultiHeadAttention(num_heads, d_model)
    self.ffn = FeedForwardNetwork(dff, d_model)

    # one dropout layer per sub-layer.
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

    # one normalization layer per sub-layer.
    self.normalization1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalization2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, x, padding_mask, training):
    # self-attention: x is used as query, key and value.
    attended = self.dropout1(self.mha(x, x, x, padding_mask), training=training)
    attended = self.normalization1(x + attended)

    # feed forward network on top of the normalized attention output.
    transformed = self.dropout2(self.ffn(attended), training=training)

    return self.normalization2(attended + transformed)

In [0]:
class Encoder(tf.keras.layers.Layer):
  """Embedding, positional encoding, dropout and a stack of encoder layers."""

  def __init__(self, num_layers, vocab_size, position, num_heads, dff, d_model, rate):
    super(Encoder, self).__init__()

    self.num_layers = num_layers
    self.d_model = d_model

    # token embedding and positional encoding.
    self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=d_model)
    self.pe = PositionalEncoding(position, d_model)

    # dropout applied to the position-aware embeddings.
    self.dropout = tf.keras.layers.Dropout(rate)

    # num_layers encoder layers with identical hyper-parameters.
    self.encoder_layers = [EncoderLayer(num_heads, dff, d_model, rate) for _ in range(num_layers)]

  def call(self, x, padding_mask, training):
    # embeddings are scaled by sqrt(d_model) before the positional encoding is added.
    scale = tf.sqrt(tf.cast(self.d_model, dtype=tf.float32))
    x = self.embedding(x) * scale
    x = self.dropout(self.pe(x), training=training)

    # running the encoder layers one after the other.
    for encoder_layer in self.encoder_layers:
      x = encoder_layer(x, padding_mask, training)

    return x

In [0]:
class Cosine(tf.keras.layers.Layer):
  """Pairwise cosine between the rows of two tensors."""

  def __init__(self):
    super(Cosine, self).__init__()

  def call(self, x, y):
    # after l2-normalizing the last axis, the product x . y^T holds the cosines.
    return tf.matmul(
        tf.math.l2_normalize(x, axis=-1),
        tf.math.l2_normalize(y, axis=-1),
        transpose_b=True)

In [0]:
class MatchingNetwork(tf.keras.Model):
  """Two encoders whose pooled outputs are compared with a cosine similarity.

  Padding masks are expected with shape (batch, 1, 1, length), holding 1 for
  real tokens and 0 for padding.
  """

  def __init__(self, num_layers, input_vocab_size, target_vocab_size, input_position, target_position, num_heads, dff, d_model, rate):
    super(MatchingNetwork, self).__init__()

    self.encoder1 = Encoder(num_layers, input_vocab_size, input_position, num_heads, dff, d_model, rate)
    self.encoder2 = Encoder(num_layers, target_vocab_size, target_position, num_heads, dff, d_model, rate)

    self.similarity = Cosine()

  def _pool(self, encoded, padding_mask):
    # max-pools the encoder output over the sequence axis, ignoring padding, then applies tanh.
    # only the two singleton axes are squeezed: a bare tf.squeeze would also
    # drop a batch (or length) axis of size 1 and break the indexing below.
    keep = tf.squeeze(padding_mask, axis=[1, 2])[:, :, tf.newaxis]
    # padded positions are pushed to a very large negative value so that they never win the max.
    encoded = encoded + -1e9 * (1 - keep)
    encoded = tf.reduce_max(encoded, axis=-2)

    return tf.keras.activations.tanh(encoded)

  def call(self, x, y, padding_mask_x, padding_mask_y, training, inference):
    """Computes the softmax-normalized cosine similarity between x and y.

    Args:
      x: token ids fed to the first encoder.
      y: token ids fed to the second encoder when `inference` is falsy;
        otherwise it is passed to the cosine layer as is.
      padding_mask_x: padding mask of x.
      padding_mask_y: padding mask of y (only read when `inference` is falsy).
      training: forwarded to the encoders.
      inference: selects which of the two return forms is produced.

    Returns:
      `similarity` when `inference` is truthy, otherwise the tuple
      `(similarity, encoded1, encoded2)`.
    """
    encoded1 = self._pool(self.encoder1(x, padding_mask_x, training), padding_mask_x)

    if inference:
      similarity = self.similarity(encoded1, y)
      similarity = tf.nn.softmax(similarity, axis=-1)

      return similarity

    encoded2 = self._pool(self.encoder2(y, padding_mask_y, training), padding_mask_y)

    similarity = self.similarity(encoded1, encoded2)
    similarity = tf.nn.softmax(similarity, axis=-1)

    return similarity, encoded1, encoded2

In [0]:
NUM_LAYERS = 4
INPUT_VOCAB_SIZE = input_encoder.vocab_size
TARGET_VOCAB_SIZE = target_encoder.vocab_size
# the positional encoding table needs at least one row per position of the
# padded sequences; taking the max with the vocabulary size keeps the table at
# least as large as it was before, while also covering sequences that are
# longer than the vocabulary.
INPUT_POSITION = max(input_encoder.vocab_size, encoded_input.shape[1])
TARGET_POSITION = max(target_encoder.vocab_size, encoded_target.shape[1])
NUM_HEADS = 8
DFF = 512
D_MODEL = 128
RATE = 0.1

# building the model from the hyper-parameters above.
matching_network = MatchingNetwork(NUM_LAYERS, INPUT_VOCAB_SIZE, TARGET_VOCAB_SIZE,
                                   INPUT_POSITION, TARGET_POSITION, NUM_HEADS,
                                   DFF, D_MODEL, RATE)

In [0]:
# timestamping the run so that each execution gets its own log directory.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
# creating the summary writer for the training scalars.
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

In [0]:
# disabled experiment, kept for reference:
#learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(1e-10, decay_steps=100, decay_rate=1.3, staircase=True)

# Adam with a constant learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=7.5e-5)

In [0]:
def BoostedProbabilityLoss(predictions):
  """Returns one loss value per row of a square prediction matrix.

  For row i the loss is `1 - p_ii^2` plus the sum of the squared off-diagonal
  entries of that row.
  """
  rows = tf.shape(predictions)[0]

  # positive term, computed from the diagonal entries.
  matched = tf.linalg.diag_part(predictions)
  positive_loss = 1 - tf.pow(matched, 2)

  # negative term: a -1e9 diagonal followed by relu removes the diagonal entries.
  diagonal_blocker = tf.linalg.diag(tf.fill(dims=[rows], value=-1e9))
  unmatched = tf.nn.relu(predictions + diagonal_blocker)
  negative_loss = tf.reduce_sum(tf.pow(unmatched, 2), axis=-1)

  return positive_loss + negative_loss

# running mean of the loss values passed to it.
train_loss = tf.keras.metrics.Mean()
# categorical accuracy of the predictions against the one-hot targets.
train_accuracy = tf.keras.metrics.CategoricalAccuracy()

In [0]:
@tf.function
def train_step(inp, tar):
  """Runs one optimization step on a batch and updates the training metrics.

  Pair i of the batch is the positive example of row i, so the targets form an
  identity matrix with one row per example.
  """
  def as_padding_mask(tokens):
    # 1 for real tokens, 0 for padding (id 0); shaped (batch, 1, 1, length).
    is_token = 1 - tf.cast(tf.equal(tokens, 0), dtype=tf.float32)
    return is_token[:, tf.newaxis, tf.newaxis, :]

  inp_mask = as_padding_mask(inp)
  tar_mask = as_padding_mask(tar)

  # identity matrix used as the one-hot targets of the accuracy metric.
  one_hot_y = tf.linalg.tensor_diag(tf.ones(tf.shape(inp)[-2]))

  with tf.GradientTape() as tape:
    predictions, _, _ = matching_network(inp, tar, inp_mask, tar_mask, training=True, inference=False)

    loss = BoostedProbabilityLoss(predictions)

  # gradients are clipped to a global norm of 1 before being applied.
  variables = matching_network.trainable_variables
  clipped, _ = tf.clip_by_global_norm(tape.gradient(loss, variables), 1)
  optimizer.apply_gradients(zip(clipped, variables))

  train_loss(loss)
  train_accuracy(one_hot_y, predictions)

In [0]:
EPOCHS = 500

# global batch counter, shared by the tensorboard and wandb logs.
step = 0

for epoch in range(EPOCHS):
  start = time.time()

  # the metrics accumulate over one epoch only.
  train_loss.reset_states()
  train_accuracy.reset_states()

  for batch, (inp, tar) in enumerate(train_dataset):
    train_step(inp, tar)

    if batch % 100 == 0:
      print ('Epoch {} Batch {} Loss {:.10f} Accuracy {:.10f}'.format(epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    if batch % 10 == 0:
      with train_summary_writer.as_default():
        tf.summary.scalar('loss', data=train_loss.result(), step=step)
        tf.summary.scalar('accuracy', data=train_accuracy.result(), step=step)

      wandb.log({'loss': train_loss.result(), 'accuracy': train_accuracy.result()}, step=step)

    step += 1

  # the metrics were reset at the start of the epoch, so their result is
  # already the mean over every batch of this epoch; averaging the running
  # means collected after each batch would weight early batches more heavily.
  print ('Epoch {} Loss {:.10f} Accuracy {:.10f}'.format(epoch + 1,
                                                train_loss.result(),
                                                train_accuracy.result()))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

In [0]:
# opening tensorboard on the directory that contains the training logs.
%tensorboard --logdir logs/gradient_tape/

In [0]:
# testing: running the network on the first batch only, with training disabled.
for inp, tar in train_dataset.take(1):
  # 1 for real tokens, 0 for padding (id 0); shaped (batch, 1, 1, length).
  padding_mask_inp = 1 - tf.cast(tf.equal(inp, 0), dtype=tf.float32)
  padding_mask_inp = padding_mask_inp[:, tf.newaxis, tf.newaxis, :]

  padding_mask_tar = 1 - tf.cast(tf.equal(tar, 0), dtype=tf.float32)
  padding_mask_tar = padding_mask_tar[:, tf.newaxis, tf.newaxis, :]

  out, en1, en2 = matching_network(inp, tar, padding_mask_inp, padding_mask_tar, training=False, inference=False)

In [0]:
# displaying the similarity matrix returned for the test batch.
out

In [0]:
# keeping the pooled output of the first encoder for the test batch.
t1 = en1

In [0]:
# keeping the pooled output of the second encoder for the test batch.
t2 = en2

In [0]:
# recomputing the pairwise cosines of the two encodings, without the softmax applied by the model.
cosine = Cosine()
result = cosine(t1, t2)
# displaying the cosine matrix.
result