In [None]:
import os
# Must be set before torch is imported so that CUDA only sees the chosen device.
os.environ['CUDA_VISIBLE_DEVICES'] = '4'
# os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import copy
import json
import multiprocessing
import random
import tokenize

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import recall_score, precision_score, f1_score
from torch.autograd import Variable
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
from transformers import WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup
# from parser import remove_comments_and_docstrings

# from sentence_transformers import SentenceTransformer
# sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def _read_first_json_line(path):
    """Parse and return the JSON document stored on the first line of ``path``.

    Only the first line is read. The file is closed even if reading or
    parsing raises.
    """
    with open(path, 'r') as handle:
        return json.loads(handle.readline())

dataset0 = _read_first_json_line('../data/dataset.jsonl')
graphs2 = _read_first_json_line('../data/graphs2.jsonl')
labels01 = _read_first_json_line('../data/labels01.jsonl')
# Every label row has one entry per topic, so the first row gives the topic count.
topic_number = len(labels01[0])

In [None]:
class arguments:
    """Empty attribute bag; run settings are attached to an instance after creation."""

args = arguments()

In [None]:
# Hyperparameters and run state, stored as attributes on ``args``.
vars(args).update(
    # optimisation schedule
    epochs = 25,
    gradient_accumulation_steps = 32,
    learning_rate = 1e-5,
    # NOTE(review): the next three values are not referenced by the code
    # visible in this notebook -- confirm whether they are still needed.
    max_grad_norm = 1.0,
    weight_decay = 0.0,
    adam_epsilon = 1e-8,
    # input geometry: each example becomes input_limit rows of total_length
    # token ids, plus a graph with graph_length nodes
    input_limit = 10,
    total_length = 512,
    graph_length = 200,
    # run state
    current_topic = 0,
    topic_number = topic_number,
    seed = 978438233,
)
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()

In [None]:
def set_seed():
    """Seed Python's ``random``, NumPy and PyTorch from ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
# Seed every RNG before any model object is created.
set_seed()

# Tokenizer, config and masked-LM weights all come from the same CodeBERT checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
config = RobertaConfig.from_pretrained('microsoft/codebert-base')
config.num_labels = 1
pretrain_model = RobertaForMaskedLM.from_pretrained(
    'microsoft/codebert-base',
    config = config,
)

In [None]:
# Width of one node feature vector; must match the 384-wide layers in Model.
NODE_FEATURE_DIM = 384

# Build [code, node_features, edges, labels] records, skipping examples that
# have no code or no positive label at all.
dataset = []
for x, E, y in tqdm(zip(dataset0, graphs2, labels01), total = len(graphs2)):
    if len(x) != 0 and sum(y) != 0:
        # Placeholder node features: every node gets the same one-hot vector.
        # The row count follows args.graph_length because Model.forward
        # reshapes this matrix to (args.graph_length, -1).
        V = [[1] + [0] * (NODE_FEATURE_DIM - 1) for _ in range(args.graph_length)]
        dataset.append([x, V, E, y])
random.shuffle(dataset)

# dataset = []
# for x, (V, E), y in tqdm(zip(dataset0, graphs2, labels01), total = len(graphs2)):
#     if (len(x) != 0 and sum(y) != 0):
#         for i in range(200 - len(V)):
#             V.append([0] * 384)
#         dataset.append([x, V, E, y])
# random.shuffle(dataset)

In [None]:
class TextDataset(Dataset):
    """Turns ``[code, V, M, labels]`` records into model-ready tensors.

    ``code`` is a list of token-id sequences. On every access the sequences
    are put in a random order, stripped of padding, joined with a single pad
    token after each one, and cut or padded to exactly
    ``args.input_limit * args.total_length`` ids.
    """

    def __init__(self, examples):
        """Store ``examples``, a list of ``[code, V, M, labels]`` records."""
        self.examples = examples

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.examples)

    def __getitem__(self, item):
        """Build the tensors for example ``item``.

        Returns:
            A tuple ``(code_ids, position_ids, nodes, edges, label)``:
            ``code_ids`` and ``position_ids`` have shape
            ``(args.input_limit, args.total_length)``; ``nodes`` is
            ``torch.tensor(V)``; ``edges`` is ``M`` returned untouched;
            ``label`` is a one-element float tensor holding the label of
            ``args.current_topic``.
        """
        code, V, M, labels = self.examples[item]
        # Shuffle a copy: reading an item must not reorder the stored example,
        # which is shared with the list the dataset was built from.
        code = list(code)
        random.shuffle(code)

        pad_id = tokenizer.pad_token_id
        code_ids = []
        position_ids = []
        for snippet in code:
            tokens = [tok for tok in snippet if tok != pad_id]
            # One pad token closes every snippet.
            code_ids.extend(tokens + [pad_id])
            # Positions restart at pad_id + 1 for each snippet; the closing
            # pad token gets the padding position.
            position_ids.extend([pad_id + 1 + k for k in range(len(tokens))] + [pad_id])

        # Cut or pad to a fixed number of ids so the result can be reshaped
        # into rows of args.total_length.
        length = args.input_limit * args.total_length
        code_ids = code_ids[: length]
        position_ids = position_ids[: length]
        code_ids.extend([pad_id] * (length - len(code_ids)))
        position_ids.extend([pad_id] * (length - len(position_ids)))

        code_ids = torch.tensor(code_ids).view(-1, args.total_length)
        position_ids = torch.tensor(position_ids).view(-1, args.total_length)
        return code_ids, position_ids, torch.tensor(V), M, torch.Tensor([labels[args.current_topic]])

# The last 25% of the shuffled examples are held out for evaluation.
test_data = TextDataset(dataset[int(len(dataset) * 0.75) :])
# Evaluation metrics do not depend on example order, so iterate sequentially:
# this keeps evaluation from drawing on the global RNG every epoch.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, drop_last = False)#, num_workers = 4)

In [None]:
class Model(nn.Module):
    """Binary topic classifier combining a text branch and a graph branch.

    Text branch: every row of ``code_ids`` is encoded with ``encoder.roberta``
    and the first-token vectors are fed, row by row, through an LSTM.
    Graph branch: 20 rounds of sampled-neighbour mean aggregation over the
    node features, followed by a per-node projection to 40 values.
    Both results are concatenated and classified by a two-layer head.

    The head expects a single example per call (the concatenated vector must
    have ``hidden_size + args.graph_length * 40`` entries).
    """

    def __init__(self, encoder, config):
        """
        Args:
            encoder: module exposing ``roberta`` and
                ``roberta.embeddings.word_embeddings``.
            config: object exposing ``hidden_size``.
        """
        super(Model, self).__init__()
        self.W = nn.Linear(384 * 2, 384)
        self.encoder = encoder
        self.config = config
        self.rnn = nn.LSTM(config.hidden_size, config.hidden_size, 1)
        self.first_dense = nn.Linear(384, 40)

        self.dense = nn.Linear(config.hidden_size + args.graph_length * 40, config.hidden_size * 6)
        self.dropout = nn.Dropout(0.1)
        self.out_proj = nn.Linear(config.hidden_size * 6, 2)

    def forward(self, code_ids, position_ids, nodes, edges, labels):
        """Return ``(prob, loss)`` for one example.

        Args:
            code_ids: token ids, indexed as ``(batch, rows, tokens)``.
            position_ids: position ids with the same layout as ``code_ids``.
            nodes: node features, reshaped here to ``(args.graph_length, -1)``.
            edges: ``edges[u]`` is the list of neighbour indices of node ``u``.
            labels: target value(s) compared against ``prob`` with MSE.

        Returns:
            ``prob``: one-element tensor, the softmax weight of class 1.
            ``loss``: mean squared error between ``prob`` and ``labels``.
        """
        hidden_size = self.config.hidden_size

        # ---- graph branch ----
        # Cast once: the features may arrive as integers, the layers need floats.
        nodes = nodes.view(args.graph_length, -1).float()
        # Aggregate used for nodes without any neighbour.
        no_neighbours = torch.zeros(nodes.size(1)).to(nodes.device)
        for _ in range(20):
            aggregated = []
            for u in range(nodes.size(0)):
                # Average at most five randomly chosen neighbours.
                sampled = random.sample(edges[u], min(5, len(edges[u])))
                if sampled:
                    aggregated.append(nodes[sampled].sum(dim = 0) / len(sampled))
                else:
                    aggregated.append(no_neighbours)
            # One batched projection for all nodes instead of one call per node.
            h = F.relu(self.W(torch.cat((nodes, torch.stack(aggregated, dim = 0)), dim = 1)))
            # The epsilon keeps rows that ReLU zeroed out at zero instead of 0/0 = NaN.
            # NOTE(review): the divisor is the squared norm, not the norm --
            # confirm this is the intended normalisation.
            nodes = h / ((h * h).sum(dim = 1, keepdim = True) + 1e-12)
        y = F.relu(self.first_dense(nodes)).view(-1)

        # ---- text branch ----
        # NOTE(review): the LSTM state starts from random noise, also in eval
        # mode, so repeated evaluations of the same input differ -- confirm.
        h = torch.randn(1, code_ids.size(0), hidden_size).to(code_ids.device)
        c = torch.randn(1, code_ids.size(0), hidden_size).to(code_ids.device)
        code_ids = code_ids.transpose(0, 1)
        position_ids = position_ids.transpose(0, 1)
        code_embeddings = self.encoder.roberta.embeddings.word_embeddings(code_ids)
        for i in range(code_embeddings.size(0)):
            # NOTE(review): no attention mask is passed, so padding tokens are
            # attended to like ordinary tokens -- confirm this is intended.
            bert_output = self.encoder.roberta(inputs_embeds = code_embeddings[i],
                                               position_ids = position_ids[i])
            first_token = bert_output[0][:, 0, :].view(1, -1, hidden_size)
            _, (h, c) = self.rnn(first_token, (h, c))
        x = h[0].view(-1)

        # ---- classification head ----
        x = torch.cat((x, y), dim = 0)
        x = self.dropout(x)
        x = F.relu(self.dense(x))
        x = self.dropout(x)
        x = self.out_proj(x)
        # Keep only the weight of class 1.
        x = F.softmax(x, dim = 0)[1:]
        loss_function = MSELoss()
        loss = loss_function(x.view(-1), labels.view(-1))
        return x, loss

In [None]:
def evaluate(model, epoch_id):
    """Score ``model`` on ``test_dataloader`` for the topic in ``args.current_topic``.

    Prints the confusion counts, F1, recall, precision and the mean loss, and
    writes the same numbers to ``result/concat<topic>-<epoch>.txt``.

    Args:
        model: module called as ``model(code_ids, position_ids, nodes, edges, labels)``
            that returns ``(prob, loss)``.
        epoch_id: epoch index; only used in the name of the result file.

    Returns:
        The F1 score as a float.

    Raises:
        ValueError: if ``test_dataloader`` yields no batches.
    """
    loss_sum = 0
    loss_cnt = 0
    y_trues = []
    y_preds = []
    # Switching to eval mode and disabling autograd once covers the whole loop.
    model.eval()
    with torch.no_grad():
        for data in tqdm(test_dataloader, total = len(test_dataloader)):
            code_ids, position_ids, nodes, edges, labels = data
            code_ids = code_ids.to(args.device)
            position_ids = position_ids.to(args.device)
            nodes = nodes.to(args.device)
            # The loader wraps every neighbour index in a tensor; unwrap to ints.
            edges = [[b.item() for b in a] for a in edges]
            labels = labels.to(args.device)
            prob, loss = model(code_ids, position_ids, nodes, edges, labels)
            prob = prob.view(-1)
            if args.n_gpu > 1:
                loss = loss.mean()
            loss_sum = loss_sum + loss.item() * code_ids.size(0)
            loss_cnt = loss_cnt + code_ids.size(0)
            y_preds.append((prob > 0.5).long().cpu().numpy())
            y_trues.append(labels.long().view(-1).cpu().numpy())
    if loss_cnt == 0:
        raise ValueError('evaluate() received an empty test_dataloader')

    y_trues = np.concatenate(y_trues, 0)
    y_preds = np.concatenate(y_preds, 0)
    TP = sum([x == 1 and y == 1 for x, y in zip(y_trues, y_preds)])
    FP = sum([x == 0 and y == 1 for x, y in zip(y_trues, y_preds)])
    TN = sum([x == 0 and y == 0 for x, y in zip(y_trues, y_preds)])
    FN = sum([x == 1 and y == 0 for x, y in zip(y_trues, y_preds)])
    print('TP FP TN FN =', TP, FP, TN, FN)

    f1 = float(f1_score(y_trues, y_preds))
    rs = float(recall_score(y_trues, y_preds))
    ps = float(precision_score(y_trues, y_preds))
    mean_loss = loss_sum / loss_cnt
    print('f1:', f1)
    print('recall:', rs)
    print('precision:', ps)
    print('loss:', mean_loss)

    # Create the output directory without spawning a shell.
    os.makedirs('result', exist_ok = True)
    result_path = 'result/concat' + str(args.current_topic).zfill(2) + '-' + str(epoch_id).zfill(3) + '.txt'
    with open(result_path, 'w') as f:
        print(f1, rs, ps, mean_loss, TP, FP, TN, FN, file = f)
    return f1

In [None]:
def get_dataloader():
    """Build a shuffled training loader for the topic in ``args.current_topic``.

    Works on the first 75% of ``dataset``. When negatives outnumber positives,
    a random subset of negatives as large as the positive set (but never
    smaller than one example) is kept. Prints both class sizes before
    subsampling.
    """
    train_split = dataset[: int(len(dataset) * 0.75)]
    topic = args.current_topic
    posi_data = [example for example in train_split if example[3][topic]]
    nega_data = [example for example in train_split if not example[3][topic]]
    print(len(posi_data), len(nega_data))
    if len(nega_data) > len(posi_data):
        nega_data = random.sample(nega_data, max(1, len(posi_data)))
    train_data = TextDataset(posi_data + nega_data)
    return DataLoader(train_data, sampler = RandomSampler(train_data), drop_last = False)#, num_workers = 4)

# Train and evaluate one binary classifier per topic.
for i in range(args.topic_number):
    args.current_topic = i
    # Drop the previous topic's model and optimizer before allocating new ones.
    model = optimizer = None
    # Every topic fine-tunes its own copy of the pretrained encoder. Wrapping
    # ``pretrain_model`` directly would let the weights fine-tuned on the
    # previous topic leak into this one.
    model = Model(copy.deepcopy(pretrain_model), config)
    model.to(args.device)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    optimizer = torch.optim.Adam(model.parameters(), lr = args.learning_rate)

    for epoch_num in range(args.epochs):
        step = 0
        # Five passes per epoch; each pass draws a fresh balanced training set.
        for t in range(5):
            train_dataloader = get_dataloader()
            bar = tqdm(train_dataloader, total = len(train_dataloader))
            for data in bar:
                code_ids, position_ids, nodes, edges, labels = data
                code_ids = code_ids.to(args.device)
                position_ids = position_ids.to(args.device)
                nodes = nodes.to(args.device)
                # The loader wraps every neighbour index in a tensor; unwrap to ints.
                edges = [[b.item() for b in a] for a in edges]
                labels = labels.to(args.device)
                # evaluate() leaves the model in eval mode, so switch back here.
                model.train()
                _, loss = model(code_ids, position_ids, nodes, edges, labels)
                if args.n_gpu > 1:
                    loss = loss.mean()
                # Scale so the accumulated gradient is an average over the window.
                loss = loss / args.gradient_accumulation_steps
                loss.backward()
                bar.set_description("topic {} epoch {}".format(i, epoch_num))
                step += 1
                if (step % args.gradient_accumulation_steps == 0):
                    optimizer.step()
                    optimizer.zero_grad()
        # Apply whatever gradient is left from an incomplete accumulation window.
        if (step % args.gradient_accumulation_steps != 0):
            optimizer.step()
            optimizer.zero_grad()
        evaluate(model, epoch_num)

In [None]:
# End the Python session; as the last cell, this runs after the training cell
# above has finished when the notebook is executed top to bottom.
exit()