In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '6, 7'
import copy
import json
import random
import numpy as np
from tqdm import tqdm
import multiprocessing
from tree_sitter import Language, Parser
from parser import remove_comments_and_docstrings, tree_to_token_index, index_to_code_token

# Load one tree-sitter parser per supported language from the compiled grammar
# bundle 'parser/my-languages.so'.
# NOTE: the loop variable `parser` below is a tree-sitter Parser instance; only
# functions were imported from the local `parser` package above, so rebinding
# the name here does not affect them.
parsers = {}        # maps language name -> configured tree-sitter Parser
for lang in ['python', 'java', 'ruby', 'go', 'php', 'javascript']:
    LANGUAGE = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    parsers[lang] = parser

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
from transformers import WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import recall_score, precision_score, f1_score

In [None]:
class arguments:
    """Plain attribute container for the notebook's run configuration.

    Instances start empty; later cells attach settings as attributes.
    """

    def __init__(self):
        """Create a namespace with no attributes set."""


# Shared configuration object populated by the cells below.
args = arguments()

In [None]:
# Sequence length and epoch / batch sizes. `total_length` is the fixed length
# every encoded function is truncated and padded to during feature conversion.
args.total_length = 512
args.epochs = 1
args.train_batch_size = 16
args.eval_batch_size = 16

# Optimisation hyper-parameters consumed by the optimizer / training-loop cells.
args.gradient_accumulation_steps = 1
args.max_grad_norm = 1.0
args.learning_rate = 5e-5
args.weight_decay = 0.0
args.adam_epsilon = 1e-8

# Runtime: use CUDA when available, otherwise fall back to CPU. `n_gpu` decides
# later whether the model is wrapped in DataParallel.
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()
args.seed = 978

In [None]:
def set_seed(seed=None):
    """Seed the Python, NumPy and PyTorch RNGs for reproducible runs.

    Args:
        seed: Integer seed to use. When omitted, falls back to ``args.seed``
            so existing ``set_seed()`` calls behave as before.
    """
    if seed is None:
        seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Documented as silently ignored when CUDA is unavailable, so no GPU-count
    # guard is required here.
    torch.cuda.manual_seed_all(seed)
# Seed all RNGs once, before the model below is constructed.
set_seed()

class RobertaClassificationHead(nn.Module):
    """Two-way classification head over pairs of encoded sequences.

    The first-token vectors of every two consecutive rows of the input are
    joined into a single vector of width ``2 * hidden_size`` and passed through
    dropout -> dense -> tanh -> dropout -> projection to 2 outputs.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width * 2, width)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(width, 2)

    def forward(self, features, **kwargs):
        """Return one row of 2 scores for every two rows of ``features``."""
        # First position of each sequence (the <s> token, equivalent to [CLS]).
        first_token = features[:, 0, :]
        # Fold every two consecutive rows into one pair representation.
        paired = first_token.reshape(-1, 2 * first_token.size(-1))
        activated = torch.tanh(self.dense(self.dropout(paired)))
        return self.out_proj(self.dropout(activated))
        
class Model(nn.Module):
    """Clone-detection model: a RoBERTa encoder followed by a pair classifier.

    Both sequences of every pair are encoded in one batch; the classifier then
    scores each pair from the two first-token vectors.
    """

    def __init__(self, encoder,config,tokenizer,args):
        super(Model, self).__init__()
        self.encoder = encoder
        self.config = config
        self.tokenizer = tokenizer
        self.classifier = RobertaClassificationHead(config)
        self.args = args

    def forward(self, inputs_ids_1, position_idx_1, inputs_ids_2, position_idx_2, labels = None):
        """Score each (sequence 1, sequence 2) pair.

        Returns ``(loss, prob)`` when ``labels`` is given, otherwise ``prob``.
        ``prob`` is the softmax over the classifier's two outputs.
        """
        batch_size, seq_len = inputs_ids_1.size()
        flat_shape = (batch_size * 2, seq_len)

        # Interleave the two sides so rows 2k and 2k+1 belong to pair k.
        paired_ids = torch.cat((inputs_ids_1.unsqueeze(1), inputs_ids_2.unsqueeze(1)), 1)
        inputs_ids = paired_ids.view(*flat_shape)
        paired_positions = torch.cat((position_idx_1.unsqueeze(1), position_idx_2.unsqueeze(1)), 1)
        position_idx = paired_positions.view(*flat_shape)

        # Embed the tokens explicitly, then run the encoder on the embeddings
        # together with the precomputed position ids.
        roberta = self.encoder.roberta
        token_embeddings = roberta.embeddings.word_embeddings(inputs_ids)
        hidden_states = roberta(inputs_embeds = token_embeddings, position_ids = position_idx)[0]

        logits = self.classifier(hidden_states)
        prob = F.softmax(logits, dim = 1)
        if labels is None:
            return prob
        loss = CrossEntropyLoss()(logits, labels)
        return loss, prob

# Build the clone-detection model on top of the locally pre-trained encoder.
config = RobertaConfig.from_pretrained('microsoft/graphcodebert-base')
# NOTE(review): the classification head defined above hard-codes 2 outputs, so
# no code visible in this notebook reads this value - confirm it is still needed.
config.num_labels = 1
tokenizer = RobertaTokenizer.from_pretrained('microsoft/graphcodebert-base')

encoder = RobertaForMaskedLM(config)
# map_location='cpu' keeps the checkpoint loadable when the device it was saved
# from is not present (e.g. the CPU fallback of args.device); model.to() below
# moves the weights to the target device afterwards.
encoder.load_state_dict(torch.load('../saved-model/pretrain/4-graphcodebert-reform.bin',
                                   map_location = 'cpu'))

model = Model(encoder, config, tokenizer, args)
model.to(args.device)
if args.n_gpu > 1:
    # Replicate across all visible GPUs; the training loop averages the per-replica losses.
    model = torch.nn.DataParallel(model)

In [None]:
def extract_dataflow(code, parser,lang):
    """Return the code tokens of ``code`` as located by a tree-sitter parse.

    Args:
        code: Source text of one function.
        parser: Parser object exposing ``parse(bytes)``; the result's
            ``root_node`` is handed to ``tree_to_token_index``.
        lang: Language name; ``"php"`` sources are wrapped in ``<?php ... ?>``
            before parsing.

    Returns:
        A list with one entry per token index, each produced by
        ``index_to_code_token`` from the source split into lines.
    """
    try:
        code = remove_comments_and_docstrings(code,lang)
    except Exception:
        # Best effort: when comment stripping fails, tokenize the original
        # source instead. Only Exception is caught so KeyboardInterrupt and
        # SystemExit still stop a long preprocessing run.
        pass
    if lang == "php":
        code = "<?php" + code + "?>"
    tree = parser.parse(bytes(code,'utf8'))
    tokens_index = tree_to_token_index(tree.root_node)
    code_lines = code.split('\n')
    return [index_to_code_token(x, code_lines) for x in tokens_index]

class InputFeatures(object):
    """Encoded form of one example: two token sequences and their label.

    For each side (suffix ``_1`` / ``_2``) the instance keeps the tokens, the
    token ids and the position ids exactly as passed in.
    """
    def __init__(self, input_tokens_1, input_ids_1, position_idx_1,
                       input_tokens_2, input_ids_2, position_idx_2, label):
        # First side of the pair.
        (self.input_tokens_1,
         self.input_ids_1,
         self.position_idx_1) = input_tokens_1, input_ids_1, position_idx_1
        # Second side of the pair.
        (self.input_tokens_2,
         self.input_ids_2,
         self.position_idx_2) = input_tokens_2, input_ids_2, position_idx_2
        self.label = label

def _encode_function(func, tokenizer, args, parser):
    """Encode one function's source as (tokens, ids, position ids), padded to args.total_length."""
    pieces = []
    for position, token in enumerate(extract_dataflow(func, parser, 'java')):
        if position == 0:
            pieces.extend(tokenizer.tokenize(token))
        else:
            # Tokenize with a leading '@ ' and drop the first piece; presumably
            # this gives non-initial tokens the tokenizer's mid-sequence form
            # - TODO confirm.
            pieces.extend(tokenizer.tokenize('@ ' + token)[1:])

    # Leave room for the special tokens added below.
    pieces = pieces[: args.total_length - 3]
    source_tokens = [tokenizer.cls_token, *pieces, tokenizer.sep_token]
    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)

    pad_id = tokenizer.pad_token_id
    n_pad = args.total_length - len(source_ids)
    # Real tokens get consecutive positions starting right after the pad id;
    # padding slots reuse the pad id as their position.
    position_idx = list(range(pad_id + 1, pad_id + 1 + len(source_tokens)))
    position_idx += [pad_id] * n_pad
    source_ids += [pad_id] * n_pad
    return source_tokens, source_ids, position_idx


def convert_examples_to_features(item):
    """Build the InputFeatures for one pair of functions.

    ``item`` is the tuple ``(url1, url2, label, tokenizer, args, cache,
    url_to_code)``. Each function is encoded at most once: results are stored
    in ``cache`` (mutated in place) under the function's id and reused.
    """
    url1, url2, label, tokenizer, args, cache, url_to_code = item
    parser = parsers['java']

    for url in (url1, url2):
        if url in cache:
            continue
        cache[url] = _encode_function(url_to_code[url], tokenizer, args, parser)

    return InputFeatures(*cache[url1], *cache[url2], label)

In [None]:
class TextDataset(Dataset):
    """Clone-detection pairs, fully converted to features at construction time.

    ``file_path`` is a tab-separated index file with ``id1<TAB>id2<TAB>label``
    per line. The function sources are read from a ``data.jsonl`` file in the
    same directory, one JSON object per line with ``idx`` and ``func`` keys.
    Pairs that reference an id missing from ``data.jsonl`` are skipped. When
    ``'valid'`` occurs in ``file_path`` only a random 10% of the pairs is kept.
    """

    def __init__(self, tokenizer, args, file_path = 'train'):
        self.examples = []
        self.args = args
        index_filename = file_path

        # data.jsonl sits next to the index file. os.path also handles a bare
        # filename (no directory part), which would otherwise resolve to
        # '/data.jsonl' at the filesystem root.
        data_filename = os.path.join(os.path.dirname(index_filename), 'data.jsonl')
        url_to_code = {}
        with open(data_filename) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                js = json.loads(line)
                url_to_code[js['idx']] = js['func']

        # Collect the labelled pairs; `cache` is shared by all pairs so each
        # function is encoded only once during feature conversion.
        data = []
        cache = {}
        with open(index_filename) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank / trailing lines instead of failing the unpack.
                    continue
                url1, url2, label = line.split('\t')
                if url1 not in url_to_code or url2 not in url_to_code:
                    continue
                # Any label other than the literal '0' counts as a positive pair.
                label = 0 if label == '0' else 1
                data.append((url1, url2, label, tokenizer, args, cache, url_to_code))

        if 'valid' in file_path:
            # Only use 10% of the validation data to keep model selection fast;
            # no progress bar for this smaller split.
            data = random.sample(data, int(len(data) * 0.1))
            bar = data
        else:
            bar = tqdm(data, total = len(data))
        self.examples = [convert_examples_to_features(x) for x in bar]

    def __len__(self):
        """Number of converted pairs."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return (ids_1, positions_1, ids_2, positions_2, label) as tensors."""
        example = self.examples[item]
        return (torch.tensor(example.input_ids_1),
                torch.tensor(example.position_idx_1),
                torch.tensor(example.input_ids_2),
                torch.tensor(example.position_idx_2),
                torch.tensor(example.label))

# Build the training set and a shuffling loader over it.
# NOTE: TextDataset converts every pair to features inside its constructor, so
# all preprocessing for the training split happens in this cell.
train_dataset = TextDataset(tokenizer, args, file_path = '../data/clonedetection/train.txt')
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler = train_sampler,
                              batch_size = args.train_batch_size, num_workers = 4)

In [None]:
def evaluate(args, model, tokenizer, is_test):
    """Score ``model`` on the test split (``is_test``) or the validation split.

    The dataset is rebuilt on every call. NOTE: for the validation split this
    means TextDataset draws a fresh random 10% sample each time, so successive
    validation scores are not computed on the same pairs.

    Args:
        args: Run configuration; ``eval_batch_size`` and ``device`` are read.
        model: Model returning ``(loss, prob)`` when called with labels.
        tokenizer: Tokenizer forwarded to TextDataset.
        is_test: True for ``test.txt`` (with a progress bar), False for
            ``valid.txt``.

    Returns:
        Dict with macro-averaged ``eval_recall``, ``eval_precision`` and
        ``eval_f1`` at the fixed ``eval_threshold`` of 0.5, plus ``eval_loss``,
        the mean per-batch loss.

    Raises:
        ValueError: if the selected split yields no examples.
    """
    #build dataloader
    if is_test:
        split_path = '../data/clonedetection/test.txt'
    else:
        split_path = '../data/clonedetection/valid.txt'
    eval_dataset = TextDataset(tokenizer, args, file_path = split_path)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler = eval_sampler,
                                 batch_size = args.eval_batch_size, num_workers = 4)

    eval_loss = 0.0
    nb_eval_steps = 0
    probs = []
    y_trues = []
    model.eval()

    # Progress bar only for the (larger) test split.
    bar = tqdm(eval_dataloader, total = len(eval_dataloader)) if is_test else eval_dataloader
    for batch in bar:
        (inputs_ids_1, position_idx_1,
         inputs_ids_2, position_idx_2, labels) = [x.to(args.device) for x in batch]
        with torch.no_grad():
            lm_loss, prob = model(inputs_ids_1, position_idx_1,
                                  inputs_ids_2, position_idx_2, labels)
        # .mean() collapses the per-replica losses returned under DataParallel.
        eval_loss += lm_loss.mean().item()
        probs.append(prob.cpu().numpy())
        y_trues.append(labels.cpu().numpy())
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        # np.concatenate would raise the same exception type with a message
        # that does not say which split was empty.
        raise ValueError("no evaluation examples loaded from " + split_path)

    #calculate scores
    result = {}
    best_threshold = 0.5
    probs = np.concatenate(probs,0)
    y_trues = np.concatenate(y_trues,0)
    # Column 1 is the probability of the positive (label 1) class.
    y_preds = probs[:, 1] > best_threshold
    result["eval_recall"] = float(recall_score(y_trues, y_preds, average = 'macro'))
    result["eval_precision"] = float(precision_score(y_trues, y_preds, average = 'macro'))
    result["eval_f1"] = float(f1_score(y_trues, y_preds, average = 'macro'))
    result["eval_threshold"] = best_threshold
    result["eval_loss"] = eval_loss / nb_eval_steps
    return result

In [None]:
# The scheduler is stepped once per optimizer update, and the training loop
# performs one update every `gradient_accumulation_steps` batches, so count
# updates rather than batches.
args.max_steps = args.epochs * (len(train_dataloader) // args.gradient_accumulation_steps)
# Validate roughly ten times per epoch. Clamp to 1: with fewer than 10 batches
# the integer division yields 0, which the training loop would use as a modulus.
args.save_steps = max(1, len(train_dataloader) // 10)
# Warm up over the first 20% of the updates.
args.warmup_steps = args.max_steps // 5

# Prepare optimizer and schedule (linear warmup and decay).
# Parameters whose name contains one of these substrings are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': args.weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr = args.learning_rate, eps = args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = args.warmup_steps,
                                            num_training_steps = args.max_steps)

In [None]:
# Fine-tune the model, validating periodically and keeping the weights with the
# best validation F1 on disk.
best_f1 = 0
save_dir = '../saved-model/clonedetection'
# Number of batches between validations; max(1, ...) guards against a zero modulus.
eval_every = args.gradient_accumulation_steps * max(1, args.save_steps)

model.zero_grad()
for epoch_num in range(args.epochs):
    train_num = 0
    train_loss = 0
    bar = tqdm(train_dataloader, total = len(train_dataloader))

    for step, batch in enumerate(bar):
        (inputs_ids_1, position_idx_1,
         inputs_ids_2, position_idx_2, labels) = [x.to(args.device) for x in batch]
        # evaluate() switches the model to eval mode, so re-enable training mode every step.
        model.train()
        loss, logits = model(inputs_ids_1, position_idx_1,
                             inputs_ids_2, position_idx_2, labels)

        if args.n_gpu > 1:
            # DataParallel returns one loss per replica.
            loss = loss.mean()
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        loss.backward()

        train_num += 1
        train_loss += loss.item()
        avg_loss = round(train_loss / train_num, 2)
        bar.set_description("{}: loss {} f1 {}".format(epoch_num, avg_loss, round(best_f1 * 100, 2)))

        if ((step + 1) % args.gradient_accumulation_steps == 0):
            # Clip once, on the fully accumulated gradient, right before the
            # update; clipping after every micro-batch would rescale partial sums.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

            if ((step + 1) % eval_every == 0):
                results = evaluate(args, model, tokenizer, False)
                print(results)
                if results['eval_f1'] > best_f1:
                    best_f1 = results['eval_f1']
                    os.makedirs(save_dir, exist_ok = True)
                    # Save the underlying module so the checkpoint has no DataParallel 'module.' prefix.
                    model_to_save = model.module if hasattr(model, 'module') else model
                    torch.save(model_to_save.state_dict(), save_dir + '/4-graphcodebert-reform.bin')

In [None]:
# Report test metrics for the best-validation weights rather than whatever the
# last training step left in memory. The training loop writes this checkpoint
# only when validation F1 improves, so best_f1 > 0 means it was produced by
# this run (and is not a stale file from an earlier one).
best_checkpoint = '../saved-model/clonedetection/4-graphcodebert-reform.bin'
if best_f1 > 0 and os.path.exists(best_checkpoint):
    model_to_load = model.module if hasattr(model, 'module') else model
    model_to_load.load_state_dict(torch.load(best_checkpoint, map_location = args.device))
results = evaluate(args, model, tokenizer, True)
print('Test Set :', results)