In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4, 5'
import json
import torch
import random
import numpy as np
from tqdm import tqdm
import multiprocessing
from torch.utils.data import DataLoader, Dataset, RandomSampler
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
from transformers import WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup

In [None]:
class arguments:
    """Empty attribute bag; settings are attached to an instance as plain attributes."""


# Single shared settings object that the rest of the notebook reads and writes.
args = arguments()

In [None]:
# Every feature is padded to this many positions; it is also the side length of
# the square attention mask built per example.
args.total_length = 512
# Upper bound on graph nodes kept per example; 0 truncates the node list to nothing.
args.graph_length = 0
args.epochs = 1
args.train_batch_size = 16
# Only referenced by the commented-out evaluation loader.
args.eval_batch_size = 16

# Optimisation settings consumed by the optimizer-setup and training-loop cells.
args.gradient_accumulation_steps = 1
args.max_grad_norm = 1.0
args.learning_rate = 5e-5
args.weight_decay = 0.0
args.adam_epsilon = 1e-8

# Runtime environment: use CUDA when available, and remember how many devices are visible.
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()
# Seed applied to random, numpy and torch by set_seed().
args.seed = 978

In [None]:
def set_seed():
    """Seed Python's, NumPy's and PyTorch's random generators from ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed_value = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed_value)

# Seed every RNG before the model is built.
set_seed()
config = RobertaConfig.from_pretrained('microsoft/graphcodebert-base')
# NOTE(review): num_labels is set although a masked-LM head is loaded below;
# presumably it has no effect here — confirm.
config.num_labels = 1
tokenizer = RobertaTokenizer.from_pretrained('microsoft/graphcodebert-base')
model = RobertaForMaskedLM.from_pretrained('microsoft/graphcodebert-base', config = config)
model.to(args.device)
# Wrap the model for multi-GPU data parallelism when more than one device is visible.
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [None]:
def read_data(filename):
    """Load one record per non-blank line of ``filename``.

    Each non-blank line is stripped and evaluated as a Python expression.

    Args:
        filename: path of the text file to read.

    Returns:
        list holding one evaluated object per non-blank line, in file order.
    """
    with open(filename) as f:
        # Blank lines (e.g. a trailing empty line at EOF) would make eval()
        # raise SyntaxError, so they are dropped before parsing.
        text = [stripped for stripped in (line.strip() for line in f) if stripped]

    # SECURITY NOTE(review): eval() executes whatever code the file contains.
    # Only use this on trusted data; if every record is a plain literal,
    # ast.literal_eval would be a safer replacement — confirm before switching.
    dataset = []
    for x in tqdm(text, total=len(text)):
        dataset.append(eval(x))
    return dataset

# Parse the whole corpus into memory; the path is relative to the working directory.
dataset = read_data('../data/py150/washed_python150k.txt')

In [None]:
class InputFeatures:
    """Plain record holding the four per-example fields produced by feature conversion."""

    def __init__(self, code_ids, position_idx, edges, cross_edges):
        # Token-side sequences.
        self.code_ids, self.position_idx = code_ids, position_idx
        # Graph-side connectivity: node-to-node edges and token-span-to-node edges.
        self.edges, self.cross_edges = edges, cross_edges

def convert_example_to_feature(example):
    """Turn one parsed example into fixed-length model inputs.

    Args:
        example: mapping with the keys 'tokens' (list of source tokens),
            'nodes' (list of graph nodes), 'edges' (pairs of node indices) and
            'cross_edges' (pairs of (token index, node index)).

    Returns:
        InputFeatures whose ``code_ids`` and ``position_idx`` both contain
        exactly ``args.total_length`` entries, together with the edges and
        cross edges that survive truncation. Each surviving cross edge carries
        the token's (start, end) sub-token span instead of its original index.
    """
    tokens = example['tokens']
    nodes = example['nodes']
    edges = example['edges']
    cross_edges = example['cross_edges']

    # Sub-token budget: total length minus the kept graph nodes minus three
    # reserved slots (CLS, SEP and one spare so the strict length assertion holds).
    code_length = args.total_length - min(args.graph_length, len(nodes)) - 3
    tokens = tokens[: code_length]
    if tokens:
        # Every token after the first is tokenized behind a dummy '@ ' prefix and
        # the first resulting piece is discarded — presumably so the tokenizer
        # yields the mid-sequence form of the token; TODO confirm.
        tokens = [tokenizer.tokenize(tokens[0])] \
               + [tokenizer.tokenize('@ ' + x)[1 :] for x in tokens[1 :]]
    # An example without tokens previously crashed on tokens[0]; it now simply
    # produces a CLS/SEP-only sequence.

    # Map original token index -> (start, end) span in the flattened sub-token list.
    ori2cur_pos = {-1 : (0, 0)}
    for i, pieces in enumerate(tokens):
        start = ori2cur_pos[i - 1][1]
        ori2cur_pos[i] = (start, start + len(pieces))
    tokens = [piece for pieces in tokens for piece in pieces]

    # Truncating: keep only what fits, then drop edges touching removed items.
    tokens = tokens[: code_length]
    nodes = nodes[: args.graph_length]
    edges = [(a, b) for (a, b) in edges if (a < len(nodes)) and (b < len(nodes))]
    cross_edges = [(ori2cur_pos[a], b) for (a, b) in cross_edges
                   if (a in ori2cur_pos) and (ori2cur_pos[a][1] < len(tokens)) and (b < len(nodes))]

    # Adding code tokens: positions start right after the padding id.
    code_tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    code_ids = tokenizer.convert_tokens_to_ids(code_tokens)
    position_idx = [i + tokenizer.pad_token_id + 1 for i in range(len(code_tokens))]

    # Adding graph nodes: each node is an UNK id at position 0.
    code_ids += [tokenizer.unk_token_id] * len(nodes)
    position_idx += [0] * len(nodes)
    assert(len(code_ids) == len(position_idx))
    assert(len(code_ids) < args.total_length)

    # Padding: both sequences are filled with the padding id up to total_length.
    padding_length = args.total_length - len(code_ids)
    code_ids += [tokenizer.pad_token_id] * padding_length
    position_idx += [tokenizer.pad_token_id] * padding_length
    return InputFeatures(code_ids, position_idx, edges, cross_edges)

def convert_examples_to_features(examples, processes = 24):
    """Convert every example to features using a pool of worker processes.

    Args:
        examples: iterable of examples accepted by ``convert_example_to_feature``.
        processes: number of worker processes (default 24, the value that was
            previously hard-coded).

    Returns:
        list of converted features in the same order as ``examples``.
    """
    examples = list(examples)
    if not examples:
        # Nothing to convert: avoid spawning workers at all.
        return []
    # The context manager releases the workers even if a conversion raises;
    # map() batches the work into chunks instead of one task per example.
    with multiprocessing.Pool(processes = processes) as pool:
        return pool.map(convert_example_to_feature, examples)

# Disabled 67/33 train/eval split, kept for reference:
# train_examples = dataset[: int(len(dataset) * 0.67)]
# eval_examples = dataset[int(len(dataset) * 0.67) :]
# train_features = convert_examples_to_features(train_examples)
# eval_features = convert_examples_to_features(eval_examples)
# The whole dataset is converted for training; no held-out split is built here.
train_features = convert_examples_to_features(dataset)

In [None]:
class TextDataset(Dataset):
    """Dataset that builds masked-LM inputs and an attention mask per example.

    ``examples`` is a sequence of objects exposing ``code_ids``,
    ``position_idx``, ``edges`` and ``cross_edges``.
    """

    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        """Return ``(input_ids, position_idx, attn_mask, labels)`` tensors for one example.

        ``attn_mask`` is a boolean ``(args.total_length, args.total_length)``
        matrix. ``labels`` is -100 everywhere except at positions replaced by
        the mask token, where it holds the original id.
        """
        example = self.examples[item]

        # The `np.bool` alias was removed in NumPy 1.24; builtin `bool` is the same dtype.
        attn_mask = np.zeros((args.total_length, args.total_length), dtype = bool)
        # Positions > 1 are code tokens (incl. CLS/SEP); position 1 is padding,
        # so `max_length` counts code tokens plus graph nodes.
        node_index = sum([i > 1 for i in example.position_idx])
        max_length = sum([i != 1 for i in example.position_idx])

        # Code tokens attend to each other.
        attn_mask[: node_index, : node_index] = True
        for i, x in enumerate(example.code_ids):
            if x in [tokenizer.cls_token_id, tokenizer.sep_token_id]:
                attn_mask[i, 0 : max_length] = True # [cls/sep, all]
                # (the symmetric [all, cls/sep] variant was tried and left disabled)
        # NOTE(review): when an example has no graph nodes (max_length == node_index)
        # slot `node_index` is a padding position, so the two assignments below
        # connect code tokens to padding — confirm this is intended.
        attn_mask[1 : node_index - 1, node_index] = True # cross edge (token, graph ROOT)
        attn_mask[node_index, 1 : node_index - 1] = True # cross edge (graph ROOT, token)
        for ((a, b), c) in example.cross_edges:
            # +1 shifts the sub-token span past the leading CLS token.
            attn_mask[a + 1 : b + 1, node_index + c] = True # cross edge (token, graph node)
            attn_mask[node_index + c, a + 1 : b + 1] = True # cross edge (graph node, token)
        for (a, b) in example.edges:
            attn_mask[node_index + a, node_index + b] = True # edge (source, target)
            # (the reverse (target, source) direction was tried and left disabled)

        # Special ids are never masked and never contribute to the loss.
        special_ids = [tokenizer.cls_token_id, tokenizer.sep_token_id,
                       tokenizer.unk_token_id, tokenizer.pad_token_id]
        input_ids = []
        labels = []
        for x in example.code_ids:
            if x in special_ids:
                input_ids.append(x)
                labels.append(-100)
            elif (random.randint(0, 99) < 15):
                # 15% of ordinary tokens are replaced by the mask token.
                input_ids.append(tokenizer.mask_token_id)
                labels.append(x)
            else:
                input_ids.append(x)
                labels.append(-100)

        return (torch.tensor(input_ids),
                torch.tensor(example.position_idx),
                torch.tensor(attn_mask),
                torch.tensor(labels))

# Training batches are drawn in random order by four loader workers; an
# incomplete final batch is discarded (drop_last = True).
train_data = TextDataset(train_features)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, drop_last = True,
                              batch_size = args.train_batch_size, num_workers = 4)
# Disabled evaluation loader, kept for reference:
# eval_data = TextDataset(eval_features)
# eval_sampler = RandomSampler(eval_data)
# eval_dataloader = DataLoader(eval_data, sampler = eval_sampler, shuffle = False, drop_last = False,
#                              batch_size = args.eval_batch_size, num_workers = 4)

In [None]:
# def evaluate():
#     bar = tqdm(eval_dataloader, total = len(eval_dataloader))
#     total = 0
#     correct = 0
#     model.eval()
#     for batch in bar:
#         (input_ids, position_ids, attention_mask, labels) = [x.to(args.device) for x in batch]
#         with torch.no_grad():
#             output = model(input_ids = input_ids,
#                            position_ids = position_ids,
#                            attention_mask = attention_mask)
#         _, predicted = torch.max(output.logits, 2)
#         predicted = predicted.view(1, -1).squeeze()
#         labels = labels.view(1, -1).squeeze()
#         total += (labels != -100).sum().item()
#         correct += (predicted == labels).sum().item()
#     return correct / total
# print(round(evaluate() * 100, 2))

In [None]:
# Total number of optimizer/scheduler updates. The training loop steps the
# scheduler once per `gradient_accumulation_steps` batches, so the schedule
# length must be counted in updates, not in batches.
args.max_steps = args.epochs * (len(train_dataloader) // args.gradient_accumulation_steps)
args.save_steps = len(train_dataloader) // 10
# Linear warmup over the first fifth of the updates.
args.warmup_steps = args.max_steps // 5

# Biases and LayerNorm weights are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': args.weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr = args.learning_rate, eps = args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = args.warmup_steps,
                                            num_training_steps = args.max_steps)
# optimizer = torch.optim.Adam(model.parameters(), lr = args.learning_rate)

In [None]:
# Masked-LM training loop: one pass over `train_dataloader` per epoch, with an
# optimizer/scheduler update every `args.gradient_accumulation_steps` batches.
optimizer.zero_grad()
for epoch_id in range(args.epochs):
    train_num = 0
    train_loss = 0
    avg_loss = 0
    bar = tqdm(train_dataloader, total = len(train_dataloader))
    # Training mode only needs to be (re-)enabled once per epoch.
    model.train()

    for step, batch in enumerate(bar):
        (input_ids, position_ids, attention_mask, labels) = [x.to(args.device) for x in batch]
        output = model(input_ids = input_ids,
                       position_ids = position_ids,
                       attention_mask = attention_mask,
                       labels = labels)
        loss = output.loss

        if args.n_gpu > 1:
            # DataParallel returns one loss per replica.
            loss = loss.mean()
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        loss.backward()

        # Running mean of the (accumulation-scaled) loss shown on the progress bar.
        train_num += 1
        train_loss += loss.item()
        avg_loss = round(train_loss / train_num, 2)
        bar.set_description("{}: loss {}".format(epoch_id, avg_loss))

        if (step + 1) % args.gradient_accumulation_steps == 0:
            # Clip once per update, on the fully accumulated gradient; clipping
            # after every micro-batch would rescale partial sums repeatedly.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

#     print(evaluate())

In [None]:
save_dir = '../saved-model/pretrain'
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(save_dir, exist_ok = True)
# NOTE(review): if the model was wrapped in torch.nn.DataParallel (n_gpu > 1), the
# saved keys carry a "module." prefix, so the checkpoint layout depends on the
# GPU count at training time — confirm that the loading code expects this.
torch.save(model.state_dict(), save_dir + '/3-graphcodebert-baseline.bin')