<a href="https://colab.research.google.com/github/AdamW1002/CodeCloneDetectionCOMP599/blob/main/codebertsimilar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 13.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 57.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 54.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
import psutil 
import matplotlib.pyplot as plt


In [3]:

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
codebert = AutoModel.from_pretrained("microsoft/codebert-base")
MAX_TOKEN_DIM = 384 #controls padding and input to classifier
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [4]:
def load_data():
  f = open("/content/drive/MyDrive/CloneData/data.jsonl") #read sniipets and indices
  entries = f.readlines()
  objects = [json.loads(x) for x in entries] #load all functions
  idx_to_function = dict()
 
  for snippet in objects:#map to associate index to func
    
    idx_to_function[snippet["idx"]] = snippet["func"]

  return idx_to_function

In [5]:
def pairify_file(lines : list, idx_to_function : dict) -> tuple:
  examples = []
  
  for line in lines:
    line_entries = line.replace("\t", " ").split(" ") #given line x y label, divide to find if x is y according to label
    #print(line)
    x = line_entries[0]
    y = line_entries[1]
    label = line_entries[2]
    
    examples.append((idx_to_function[x], idx_to_function[y], float(label))) #convert label to float for pytorch
  return examples


In [6]:
def split_and_label_data(idx_to_function : dict): #convert pairs to useful training examples
  return tuple(map(  lambda x : pairify_file(open(x).readlines(), idx_to_function)  , ["/content/drive/MyDrive/CloneData/train.txt","/content/drive/MyDrive/CloneData/test.txt", "/content/drive/MyDrive/CloneData/valid.txt"]))


In [7]:
def embed(x : str) -> tuple:
  with torch.no_grad():
    code_tokens=tokenizer.tokenize(x)

    if len(code_tokens) >= 510: #confirm tokes arent too big for model
      return None
    tokens=[tokenizer.cls_token]+code_tokens+[tokenizer.sep_token]

    tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
    context_embeddings=codebert(torch.tensor(tokens_ids)[None,:])[0]
    
    flattened = torch.flatten(context_embeddings)
    
    
    
    return torch.clamp(flattened, min = 0, max = 1) #return flattened embedding vector

In [8]:
def embed_data(data : list) -> list: #takes prog1, prog2, label and replaces prog with their embedding for every item in the list and filters out too long items
  embedded_data = []
  i = 0
  for x,y, label in data:
    print("using {} MB for {} of {}, embedded {}".format(psutil.Process().memory_info().rss / (1024 * 1024),i, len(data), len(embedded_data)))
    emb_x = embed(x)
    emb_y = embed(y)
   
    if emb_x != None and emb_y != None: #check code isnt too long
      x_embed = emb_x #Standardize embeddings lengths since they depend on #of tokens
      y_embed = emb_y
     
      padding_length_x  = (MAX_TOKEN_DIM * 768 - x_embed.size()[0])
      padding_length_y  = (MAX_TOKEN_DIM * 768 - y_embed.size()[0])
      
      x_padded = torch.nn.functional.pad(x_embed, (int(padding_length_x/2), int(padding_length_x/2)))
      y_padded = torch.nn.functional.pad(y_embed, (int(padding_length_y/2), int(padding_length_y/2)))
      embedded_data.append((x_padded,y_padded, label))
    i += 1
  return embedded_data 

In [9]:
class CloneDataset(Dataset):

  def __init__(self,x : list ,y : list,labels : list):
    assert len(x) == len(y) and len(y) == len(labels) #make sure all the same size
    #standard boilerplate
    self.x = (x)
    self.y = (y)
    self.labels = torch.tensor(labels)
    self.length = len(x)
  def __getitem__(self, idx):
    return self.x[idx], self.y[idx], self.labels[idx]
  
  def __len__(self):
    return self.length

In [41]:
idx_to_function = load_data()
train_data, test_data,validation_data = split_and_label_data(idx_to_function)
train_data = embed_data(train_data[:1000])
#test_data = embed_data(test_data)
#validation_data = embed_data(validation_data)

using 9494.37109375 MB for 436 of 1000, embedded 141
using 9494.37109375 MB for 437 of 1000, embedded 142
using 9494.37109375 MB for 438 of 1000, embedded 143
using 9494.37109375 MB for 439 of 1000, embedded 143
using 9494.37109375 MB for 440 of 1000, embedded 144
using 9494.37109375 MB for 441 of 1000, embedded 144
using 9494.37109375 MB for 442 of 1000, embedded 145
using 9494.37109375 MB for 443 of 1000, embedded 146
using 9494.37109375 MB for 444 of 1000, embedded 146
using 9494.37109375 MB for 445 of 1000, embedded 147
using 9494.37109375 MB for 446 of 1000, embedded 147
using 9494.37109375 MB for 447 of 1000, embedded 147
using 9494.37109375 MB for 448 of 1000, embedded 148
using 9494.37109375 MB for 449 of 1000, embedded 148
using 9494.37109375 MB for 450 of 1000, embedded 148
using 9494.37109375 MB for 451 of 1000, embedded 148
using 9494.37109375 MB for 452 of 1000, embedded 149
using 9494.37109375 MB for 453 of 1000, embedded 149
using 9494.37109375 MB for 454 of 1000, embedd

In [11]:
def build_dataset(data : list):
  x_list = []
  y_list = []
  label_list = []
  for x,y,l in data:#convert list of tuples to 3 separate lists
    x_list.append(torch.flatten(x))
    y_list.append(torch.flatten(y))
    label_list.append(l)

  return CloneDataset(x_list, y_list, label_list)

In [42]:
train_data = build_dataset(train_data)
trainLoader = DataLoader(train_data, batch_size= 10, shuffle = False)

In [13]:
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        # Number of input features is 12.
        #self.layer_1 = nn.Linear(12, 64) 
        #self.layer_2 = nn.Linear(64, 64)
        #self.layer_out = nn.Linear(64, 1) 
        #
        #self.relu = nn.ReLU()
        #self.dropout = nn.Dropout(p=0.1)
        #self.batchnorm1 = nn.BatchNorm1d(64)
        #self.batchnorm2 = nn.BatchNorm1d(64)
        
        #A note on architecture for those interested, we eat CodeBERT embeddings of size X  * 768 which have been flattened
        # Now those vectors are each fed into FF layer(s)
        #Then they're concatnated and fed thru more FF layer(s)
        # Then their dimensionality is shrunk down to 1, which is sigmoided
        layer2_size = 256
        self.xlayer_1 = nn.Linear(MAX_TOKEN_DIM * 768, layer2_size)
        self.ylayer_1 = nn.Linear(MAX_TOKEN_DIM * 768, layer2_size)

        self.ff1 = nn.Linear(2 * layer2_size, 1 )
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        nn.init.xavier_normal_(self.xlayer_1.weight)
        nn.init.xavier_normal_(self.ylayer_1.weight)
        nn.init.xavier_normal_(self.ff1.weight)


    def forward(self, x,y):
       #x = self.relu(self.layer_1(inputs))
       #x = self.batchnorm1(x)
       #x = self.relu(self.layer_2(x))
       #x = self.batchnorm2(x)
       #x = self.dropout(x)
       #x = self.layer_out(x)
       #
       #return x
       xtemp = self.xlayer_1(x)
       xtemp = self.relu(xtemp)

       ytemp = self.ylayer_1(y)
       ytemp = self.relu(ytemp)

       
       combined = torch.cat((xtemp, ytemp),1)
      
       out = self.ff1(combined)
       
       out = self.sigmoid(out)
       return out


In [38]:
def train():
  epochs  = 15 #standard boilerplate
  model = Classifier()
  criterion = nn.BCELoss()
  #criterion = nn.BCEWithLogitsLoss()
  optimizer = optim.Adam(model.parameters())
  

  loss_history = []
  f1_history = []
  for epoch in range(epochs): #standard training procedure
    
    epoch_loss = 0
    
    tp_count = 0 #setup for f1 score
    fp_count = 0
    fn_count = 0
    f1 = 0
    
    i = 0
    for x,y, label in trainLoader:
      
      start = datetime.now()
      optimizer.zero_grad()
      #print("max x {}".format(torch.max(x)))
      pred = model(x,y)

      #print(label.shape)
      #print(pred.view(10).shape)
      loss = criterion(torch.flatten(pred.unsqueeze(1)),torch.flatten(label.unsqueeze(1)))
      loss.backward()
      optimizer.step()
      #print("pred is {}".format(pred))
      epoch_loss += loss.item()

      #calculate scores
      pred_rounded = torch.round(pred)
      for i in range(label.shape[0]):
        if pred_rounded[i] == 1 and label[i] == 1:
          tp_count += 1
        elif pred_rounded[i] == 1 and label[i] == 0:
          fp_count += 1
        elif pred_rounded[i] == 0 and label[i] == 1:
          fn_count += 1
      
      end = datetime.now()
      delta_t = end-start 

      
      if (tp_count + .5 * (fp_count + fn_count)) != 0: #dont get 0 for denom of f1
        f1 = tp_count/(tp_count + .5 * (fp_count + fn_count))
      if i %1 == 0:
        print("time per iteration {} s".format(delta_t.microseconds / 10**6))
        print("at iteration{} of epoch {} total loss is {} , f1 is {}, tp is {}, current loss is {}".format(i,epoch,epoch_loss, f1, tp_count, loss.item()))
      i+=1
      loss_history.append(loss.item())
      f1_history.append(f1)

  return (loss_history,f1_history)

In [43]:
loss_history, f1_history = train()

time per iteration 0.988705 s
at iteration9 of epoch 4 total loss is 1140.0 , f1 is 0.7323943661971831, tp is 156, current loss is 30.0
time per iteration 0.010508 s
at iteration9 of epoch 4 total loss is 1180.0 , f1 is 0.7330316742081447, tp is 162, current loss is 40.0
time per iteration 0.012671 s
at iteration9 of epoch 4 total loss is 1230.0 , f1 is 0.7308533916849015, tp is 167, current loss is 50.0
time per iteration 0.001951 s
at iteration9 of epoch 4 total loss is 1260.0 , f1 is 0.7341772151898734, tp is 174, current loss is 30.0
time per iteration 0.999887 s
at iteration9 of epoch 4 total loss is 1320.0 , f1 is 0.7295081967213115, tp is 178, current loss is 60.0
time per iteration 0.886065 s
at iteration0 of epoch 4 total loss is 1420.0 , f1 is 0.7280163599182005, tp is 178, current loss is 100.0
time per iteration 0.013867 s
at iteration9 of epoch 5 total loss is 30.0 , f1 is 0.8235294117647058, tp is 7, current loss is 30.0
time per iteration 0.996891 s
at iteration9 of epoc

KeyboardInterrupt: ignored

In [None]:
plt.scatter(list(range(len(f1_history))), f1_history)
plt.title("F1 score")
plt.xlabel("Iterations")
plt.ylabel("F1")
plt.show()
plt.scatter(list(range(len(loss_history))), loss_history)
plt.title("Loss")
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.show()