<a href="https://colab.research.google.com/github/AdamW1002/CodeCloneDetectionCOMP599/blob/main/codebertsimilar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 46.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [11]:
!unzip data.zip

Archive:  data.zip
   creating: content/CodeT5/data/clone/
  inflating: content/CodeT5/data/clone/train.txt  
  inflating: content/CodeT5/data/clone/data.jsonl  
  inflating: content/CodeT5/data/clone/test.txt  
  inflating: content/CodeT5/data/clone/valid.txt  


In [62]:
from transformers import AutoTokenizer, AutoModel
import torch
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")
MAX_TOKEN_DIM = 384 #controls padding and input to classifier

In [54]:
def load_data():
  f = open("data.jsonl") #read sniipets and indices
  entries = f.readlines()
  objects = [json.loads(x) for x in entries] #load all functions
  idx_to_function = dict()
 
  for snippet in objects:#map to associate index to func
    
    idx_to_function[snippet["idx"]] = snippet["func"]

  return idx_to_function

In [52]:
def pairify_file(lines : list, idx_to_function : dict) -> tuple:
  examples = []
  
  for line in lines:
    line_entries = line.replace("\t", " ").split(" ") #given line x y label, divide to find if x is y according to label
    #print(line)
    x = line_entries[0]
    y = line_entries[1]
    label = line_entries[2]
    
    examples.append((idx_to_function[x], idx_to_function[y], float(label))) #convert label to float for pytorch
  return examples


In [48]:
def split_and_label_data(idx_to_function : dict): #convert pairs to useful training examples
  return tuple(map(  lambda x : pairify_file(open(x).readlines(), idx_to_function)  , ["train.txt","test.txt", "valid.txt"]))


In [55]:
idx_to_function = load_data()
train_data, test_data,validation_data = split_and_label_data(idx_to_function)

In [None]:
print(test_data[0][0])
print(test_data[0][1])
print(test_data[0][2])

In [97]:
def embed(x : str) -> tuple:
  code_tokens=tokenizer.tokenize(x)
  tokens=[tokenizer.cls_token]+code_tokens[:510]+[tokenizer.sep_token]

  tokens_ids=tokenizer.convert_tokens_to_ids(tokens)
  context_embeddings=model(torch.tensor(tokens_ids)[None,:])[0]
  return torch.flatten(context_embeddings) #return flattened embedding vector

In [91]:
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        # Number of input features is 12.
        #self.layer_1 = nn.Linear(12, 64) 
        #self.layer_2 = nn.Linear(64, 64)
        #self.layer_out = nn.Linear(64, 1) 
        #
        #self.relu = nn.ReLU()
        #self.dropout = nn.Dropout(p=0.1)
        #self.batchnorm1 = nn.BatchNorm1d(64)
        #self.batchnorm2 = nn.BatchNorm1d(64)
        
        #A note on architecture for those interested, we eat CodeBERT embeddings of size X  * 768 which have been flattened
        # Now those vectors are each fed into FF layer(s)
        #Then they're concatnated and fed thru more FF layer(s)
        # Then their dimensionality is shrunk down to 1, which is sigmoided
        layer2_size = 256
        self.xlayer_1 = nn.Linear(MAX_TOKEN_DIM * 768, layer2_size)
        self.ylayer_1 = nn.Linear(MAX_TOKEN_DIM * 768, layer2_size)

        self.ff1 = nn.Linear(2 * layer2_size, 1 )
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()



    def forward(self, x,y):
       #x = self.relu(self.layer_1(inputs))
       #x = self.batchnorm1(x)
       #x = self.relu(self.layer_2(x))
       #x = self.batchnorm2(x)
       #x = self.dropout(x)
       #x = self.layer_out(x)
       #
       #return x
       xtemp = self.xlayer_1(x)
       xtemp = self.relu(xtemp)

       ytemp = self.ylayer_1(y)
       ytemp = self.relu(ytemp)

       combined = torch.cat((xtemp, ytemp),0)
       out = self.ff1(combined)
       out = self.sigmoid(out)
       return out


In [101]:
def train():
  epochs  = 3 #standard boilerplate
  model = Classifier()
  criterion = nn.BCELoss()
  optimizer = optim.Adam(model.parameters())
  
  for epoch in range(epochs): #standard training procedure
    
    epoch_loss = 0
    
    tp_count = 0 #setup for f1 score
    fp_count = 0
    fn_count = 0
    f1 = 0
    
    i = 0
    for x,y, label in train_data:
      optimizer.zero_grad()
      x_embed = embed(x) #get and pad embeddings
      y_embed = embed(y)
     
      padding_length_x  = (MAX_TOKEN_DIM * 768 - x_embed.size()[0])
      padding_length_y  = (MAX_TOKEN_DIM * 768 - y_embed.size()[0])
      
      x_padded = torch.nn.functional.pad(x_embed, (int(padding_length_x/2), int(padding_length_x/2)))
      y_padded = torch.nn.functional.pad(y_embed, (int(padding_length_y/2), int(padding_length_y/2)))
     
       
      pred = model(x_padded,y_padded)

      loss = criterion(pred,torch.tensor([label]))
      loss.backward()

      epoch_loss += loss.item()

      #calculate scores
      pred_rounded = torch.round(pred)
      if pred_rounded == 1 and label == 1:
        tp_count += 1
      elif pred_rounded == 1 and label == 0:
        fp_count += 1
      elif pred_rounded == 0 and label == 1:
        fn_count += 1
      
      if (tp_count + .5 * (fp_count + fn_count)) != 0:
        f1 = tp_count/(tp_count + .5 * (fp_count + fn_count))
      if i %1 == 0:
        print("loss is {} and f1 is {}".format(epoch_loss, f1))
      i+=1




In [None]:
train()

loss is 57.41894340515137 and f1 is 0.5609756097560976
torch.Size([393216])
loss is 58.153509855270386 and f1 is 0.5542168674698795
torch.Size([176640])
