In [1]:
import os
import torch
import tensorflow as tf
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [2]:
!pwd

/scratch/sj3233


In [3]:
# The tokenizer. Megatron was trained with standard tokenizer(s).
# Build a GPT-2 BPE tokenizer from the vocab/merges files stored in the
# megatron-gpt2-345m directory.
tokenizer = GPT2Tokenizer(
    vocab_file='/home/sj3233/project21S/nvidia/megatron-gpt2-345m/vocab.json',
    merges_file='/home/sj3233/project21S/nvidia/megatron-gpt2-345m/merges.txt')
# Register the special tokens used by this notebook. '<to-buggy>' is the
# delimiter that BuggyPatchDataset places between the patched code and the
# buggy target. As the last expression of the cell, the return value of
# add_special_tokens (the number of tokens newly added to the vocabulary)
# is shown as the cell output.
tokenizer.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
    "additional_special_tokens": ["<to-buggy>"]})

1

In [4]:
# Load a GPT2LMHeadModel from the checkpoint directory below. Note that this
# is a previously saved checkpoint under /scratch, not the original
# megatron-gpt2-345m directory the tokenizer files are read from.
# NOTE(review): the two numbers in the directory name look like the train and
# validation loss of the run that produced it — confirm.
directory = '/scratch/sj3233/model_mask_0.10379147529602051_0.09535404294729233'
model = GPT2LMHeadModel.from_pretrained(directory)

In [5]:
# Resize the token embeddings to the tokenizer's current vocabulary size
# (special tokens were registered on the tokenizer above).
model.resize_token_embeddings(len(tokenizer))
# All training/evaluation below runs on the GPU.
device = torch.device("cuda")
model.to(device)
# Presumably printed so the cell ends on a print rather than on the
# model.to(...) expression, whose repr would otherwise be echoed.
print(' ')

 


In [6]:
# Freeze the first 422 parameter tensors (in named_parameters() order) and
# print the names of the ones that stay trainable. With this cutoff the
# printed names are those of transformer.h.35 and transformer.ln_f.
i = 0
for name, param in model.named_parameters():
    if i >= 422:   # 398 410 422
        # Left trainable: report it.
        print(name)
    else:
        param.requires_grad = False
    i += 1

transformer.h.35.ln_1.weight
transformer.h.35.ln_1.bias
transformer.h.35.attn.c_attn.weight
transformer.h.35.attn.c_attn.bias
transformer.h.35.attn.c_proj.weight
transformer.h.35.attn.c_proj.bias
transformer.h.35.ln_2.weight
transformer.h.35.ln_2.bias
transformer.h.35.mlp.c_fc.weight
transformer.h.35.mlp.c_fc.bias
transformer.h.35.mlp.c_proj.weight
transformer.h.35.mlp.c_proj.bias
transformer.ln_f.weight
transformer.ln_f.bias


In [7]:
from warnings import simplefilter
# Silence every FutureWarning for the rest of the session.
# NOTE(review): presumably aimed at deprecation notices from the libraries
# used below — confirm which warning this was meant to hide.
simplefilter(action='ignore', category=FutureWarning)

In [8]:
import pandas as pd
import os
# Load the dataset. The file uses '@' as its field separator.
# The columns read later in this notebook are `buggy_code`, `patched_code`
# and `before`.
path = '/scratch/sj3233/result1000.csv'
df = pd.read_csv(path, sep='@')
# Number of rows loaded.
print(len(df))

225202


In [9]:
# Fixed token length of every encoded sample: passed to the data loaders as
# `max_len`, which the dataset forwards to the tokenizer as `max_length`.
MAX_LEN = 1024

In [10]:
from torch.utils.data import Dataset

class BuggyPatchDataset(Dataset):
  """Dataset of (context, patched code, buggy code) triples for patch -> bug generation.

  Every item is rendered as the text
  ``<s>{before}{patched}<to-buggy>{before}{buggy}</s>`` and tokenized to exactly
  ``max_len`` token ids (truncated or padded to that length).

  Args:
    buggy_code: indexable sequence of buggy code snippets.
    patched_code: indexable sequence of the corresponding patched snippets.
    before: indexable sequence with the context that is prepended to both
      snippets.
    tokenizer: object exposing ``encode_plus`` in the Hugging Face tokenizer
      style; it must have a padding token configured.
    max_len: number of token ids per encoded sequence.

  NOTE(review): values are passed through ``str()``, so a missing value (NaN)
  would end up in the sample as the literal text ``nan`` — confirm the source
  data has no empty fields.
  """

  def __init__(self, buggy_code, patched_code, before, tokenizer, max_len):
    self.buggy_code = buggy_code
    self.patched_code = patched_code
    self.before = before
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of samples (length of ``buggy_code``)."""
    return len(self.buggy_code)

  def _encode(self, text):
    """Tokenize ``text`` into fixed-length ``max_len`` PyTorch tensors."""
    return self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        truncation=True,
        # `padding='max_length'` is the supported spelling of the deprecated
        # `pad_to_max_length=True`.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

  def __getitem__(self, item):
    """Encode sample ``item``.

    Returns:
      dict of 1-D tensors:
        ``input_tps``: ids of the full ``prompt + target`` sample,
        ``attention_mask``: attention mask of that full sample,
        ``patch_ids``: ids of the prompt only (``<s>`` .. ``<to-buggy>``),
        ``buggy_ids``: ids of the target only (context + buggy code + ``</s>``).
    """
    buggy = str(self.buggy_code[item])
    patched = str(self.patched_code[item])
    before = str(self.before[item])

    prompt_text = '<s>' + before + patched + '<to-buggy>'
    target_text = before + buggy + '</s>'

    encoding = self._encode(prompt_text + target_text)
    encoding_p = self._encode(prompt_text)
    encoding_b = self._encode(target_text)

    return {
        'input_tps': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'patch_ids': encoding_p['input_ids'].flatten(),
        'buggy_ids': encoding_b['input_ids'].flatten()
    }

In [11]:
from sklearn.model_selection import train_test_split
# 80% of the rows go to training; the remaining 20% is halved into
# validation and test. Both splits use the same fixed seed.
df_train, val_test = train_test_split(df, train_size=0.8, random_state=42)
df_val, df_test = train_test_split(val_test, train_size=0.5, random_state=42)
# One shape per line: train, validation, test.
print(df_train.shape, df_val.shape, df_test.shape, sep='\n')

(180161, 4)
(22520, 4)
(22521, 4)


In [12]:
from torch.utils.data import DataLoader

def creat_data_loader(df, tokenizer, max_len, batch_size):
  """Build a DataLoader over the code columns of ``df``.

  The ``buggy_code``, ``patched_code`` and ``before`` columns are converted to
  NumPy arrays and wrapped in a ``BuggyPatchDataset``; the loader reads it with
  two worker processes in batches of ``batch_size``.
  """
  columns = {
      column: df[column].to_numpy()
      for column in ('buggy_code', 'patched_code', 'before')
  }
  dataset = BuggyPatchDataset(tokenizer=tokenizer, max_len=max_len, **columns)
  return DataLoader(dataset, batch_size=batch_size, num_workers=2)

In [13]:
# Number of samples per batch for all three loaders.
BATCH_SIZE = 8

# NOTE(review): creat_data_loader does not pass `shuffle` to DataLoader, so
# the training loader visits the rows of df_train in the same order every
# epoch — confirm this is intended.
train_data_loader = creat_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = creat_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = creat_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [14]:
# Pull the first validation batch and inspect what the loader yields.
data = next(iter(val_data_loader))

# Keys produced by BuggyPatchDataset.__getitem__, followed by the shapes of
# three of the tensors and the dtype of the token ids.
print(data.keys())
print(data['input_tps'].shape)
print(data['attention_mask'].shape)
print(data['buggy_ids'].shape)
print(data['buggy_ids'].dtype)

dict_keys(['input_tps', 'attention_mask', 'patch_ids', 'buggy_ids'])
torch.Size([8, 1024])
torch.Size([8, 1024])
torch.Size([8, 1024])
torch.int64


In [15]:
def get_output_ids(out_logits):
    """Return the index of the largest value along dimension 2 of ``out_logits``.

    Called on model logits to obtain the highest-scoring token id per position.
    """
    return out_logits.argmax(dim=2)

In [16]:
def label_mask(input_tps, delim_token_id=52001):
    """Build training labels that ignore everything before the delimiter token.

    For every row of ``input_tps`` (a 2-D tensor of token ids) the positions
    before the first occurrence of ``delim_token_id`` are set to -100; -100 is
    the default ``ignore_index`` of PyTorch's cross-entropy loss, so those
    positions do not contribute to the loss. The delimiter itself and
    everything after it keep their ids.

    Rows that do not contain the delimiter are returned unchanged.

    Args:
        input_tps: 2-D integer tensor of token ids, on any device.
        delim_token_id: id of the delimiter token. 52001 is the value
            hard-coded throughout this notebook (presumably the id of
            ``<to-buggy>`` — verify with ``tokenizer.convert_tokens_to_ids``).

    Returns:
        A new CPU tensor with the same shape and dtype as ``input_tps``. The
        input tensor is never modified, even when it already lives on the CPU.
    """
    # clone(): .cpu() is a no-op for CPU tensors, and writing -100 into the
    # caller's input ids would corrupt the model input.
    labels = input_tps.detach().cpu().clone()

    # Each match is a (row, column) pair in row-major order. Key by the
    # match's own row index; the position of the match in the list is only
    # equal to the row when every row has exactly one delimiter.
    first_delim_col = {}
    for row, col in (labels == delim_token_id).nonzero().tolist():
        first_delim_col.setdefault(row, col)

    for row, col in first_delim_col.items():
        labels[row, :col] = -100

    return labels

In [17]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

def compute_metrics_bleu(labels, output_ids, tokenizer, delim_token_id=52001, pad_token_id=1):
    """Smoothed sentence-level BLEU between the target tokens and the predicted tokens.

    ``labels`` is shifted left by one and ``output_ids`` is cut by one at the
    end, so that the prediction made at position ``t`` is compared with the
    label at position ``t + 1``. For every row that contains the delimiter,
    the tokens from the first delimiter onwards are collected; the collected
    tokens of all rows are concatenated into a single reference / hypothesis
    pair, padding ids are dropped, and one BLEU score is computed for the
    batch.

    Args:
        labels: 2-D tensor of label ids (may contain -100 before the delimiter).
        output_ids: 2-D tensor of predicted ids, same shape as ``labels``.
        tokenizer: object providing ``convert_ids_to_tokens``.
        delim_token_id: id of the delimiter token (52001 is the value
            hard-coded throughout this notebook).
        pad_token_id: id removed from both sequences before scoring (1 is the
            value hard-coded throughout this notebook; presumably the id of
            ``<pad>`` — verify against the tokenizer).

    Returns:
        The BLEU score as a float (``SmoothingFunction().method1`` smoothing).
    """
    labels_np = labels[..., 1:].cpu().numpy()
    output_np = output_ids[..., :-1].cpu().numpy()

    # np.argwhere yields (row, column) pairs in row-major order. Use the row
    # stored in each pair; the pair's position in the list matches the row
    # only when every row has exactly one delimiter.
    first_delim_col = {}
    for row, col in np.argwhere(labels_np == delim_token_id):
        first_delim_col.setdefault(int(row), int(col))

    expect_out = []
    actual_out = []
    for row, col in first_delim_col.items():
        expect_out.extend(labels_np[row, col:].tolist())
        actual_out.extend(output_np[row, col:].tolist())

    # Padding is removed from each sequence independently.
    expect_out = [token_id for token_id in expect_out if token_id != pad_token_id]
    actual_out = [token_id for token_id in actual_out if token_id != pad_token_id]

    label_token = tokenizer.convert_ids_to_tokens(expect_out)
    output_token = tokenizer.convert_ids_to_tokens(actual_out)

    smooth = SmoothingFunction()
    return sentence_bleu([label_token], output_token, smoothing_function=smooth.method1)

In [18]:
# Sanity check: compute_metrics_bleu drops the first label position and the
# last output position, so feeding the inputs shifted against themselves
# compares identical token sequences and should score 1.0.
compute_metrics_bleu(data['input_tps'][..., :-1], data['input_tps'][..., 1:], tokenizer)

1.0

In [19]:
def eval_model(model, val_data_loader, device):
  """Compute the mean loss and mean BLEU of ``model`` over ``val_data_loader``.

  Relies on the notebook-level ``tokenizer``, ``label_mask``,
  ``get_output_ids`` and ``compute_metrics_bleu``.

  Args:
    model: language model called as ``model(ids, attention_mask=..., labels=...)``
      whose output exposes ``.loss`` and ``.logits``.
    val_data_loader: iterable of batches with ``input_tps`` and
      ``attention_mask`` tensors; must support ``len()``.
    device: device the batches are moved to.

  Returns:
    ``(val_loss, val_bleu)``: per-batch averages as Python floats.
  """
  # Remember the current mode so that calling this between training epochs
  # does not leave the model in eval mode (dropout off) afterwards.
  was_training = model.training
  model.eval()

  total_loss = 0.0
  total_bleu = 0.0
  num_batches = len(val_data_loader)

  try:
    for batch_num, batch in enumerate(val_data_loader, start=1):
      input_tps = batch['input_tps'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = label_mask(input_tps).to(device)

      with torch.no_grad():
        outputs = model(input_tps, attention_mask=attention_mask, labels=labels)
      output_ids = get_output_ids(outputs.logits)

      # Accumulate plain floats rather than tensors.
      total_loss += outputs.loss.item()
      total_bleu += compute_metrics_bleu(labels, output_ids, tokenizer)

      print("\r evaluating...{:d}/{:d} ".format(batch_num, num_batches), end='',  flush=True)
  finally:
    model.train(was_training)

  val_loss = total_loss / num_batches
  val_bleu = total_bleu / num_batches

  print('\neval loss:', val_loss)
  print('eval bleu:', val_bleu)

  return val_loss, val_bleu

In [None]:
import os
import time
from transformers import AdamW, get_linear_schedule_with_warmup
from collections import defaultdict

EPOCHS = 10
history = defaultdict(list)
# Best validation loss seen so far; +inf guarantees the first epoch is saved.
min_loss = float('inf')
# Directory the best checkpoint (lowest validation loss) is written to.
savepath = 'model_mask_best'

device = torch.device("cuda")
model.to(device)
optim = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0.01)

print('** Start **')

for epoch in range(EPOCHS):
  # eval_model() switches the model to eval mode at the end of every epoch,
  # so training mode (dropout) has to be re-enabled at the start of each one.
  model.train()
  total_loss = 0.0
  total_bleu = 0.0
  num_batches = len(train_data_loader)
  print('EPOCH: %d/%d '%(epoch+1, EPOCHS))

  for batch_num, batch in enumerate(train_data_loader, start=1):

    optim.zero_grad()

    input_tps = batch['input_tps'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # Only the tokens from the delimiter onwards contribute to the loss.
    labels = label_mask(input_tps).to(device)

    outputs = model(input_tps, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    loss.backward()
    optim.step()

    # .item() detaches the value; summing the loss tensor itself would keep
    # every batch's autograd history alive for the whole epoch.
    total_loss += loss.item()

    output_ids = get_output_ids(outputs.logits.detach())
    total_bleu += compute_metrics_bleu(labels, output_ids, tokenizer)

    print("\r training...{:d}/{:d} ".format(batch_num, num_batches), end='',  flush=True)

  train_loss = total_loss / num_batches
  train_bleu = total_bleu / num_batches

  print('\ntrain loss:', train_loss)
  print('train bleu:', train_bleu)
  val_loss, val_bleu = eval_model(model, val_data_loader, device)

  history['train_loss'].append(train_loss)
  history['train_bleu'].append(train_bleu)
  history['val_loss'].append(val_loss)
  history['val_bleu'].append(val_bleu)

  print('-'*60)

  # Keep only the checkpoint with the lowest validation loss.
  if val_loss < min_loss:
    model.save_pretrained(savepath)
    min_loss = val_loss


print('** Finish **')

** Start **
EPOCH: 1/10 
 training...12106/22521 

In [55]:
# savepath = 'model_masktry'+str(epoch+1)+'_'+str(train_loss.item())+'_'+str(val_loss)
# model.save_pretrained(savepath)

In [22]:
def generate_bug(model, patch_ids, tokenizer, device, max_new_tokens=512, eos_token_id=2):
    """Greedily generate a continuation of ``patch_ids`` and print prompt and result.

    The prompt is fed to ``model`` and the highest-scoring next token is
    appended, one token at a time, until ``eos_token_id`` is produced,
    ``max_new_tokens`` tokens have been generated, or the model's context
    window (``model.config.n_positions``, when available) is full.

    Args:
        model: causal language model called as ``model(input_ids=...)`` with a
            ``(1, sequence_length)`` batch; its output must expose ``.logits``
            of shape ``(1, sequence_length, vocab_size)``.
        patch_ids: tensor of prompt token ids (flattened to 1-D). Its first
            and last ids are left out of the printed source code.
        tokenizer: object providing ``decode``.
        device: device the forward passes run on.
        max_new_tokens: upper bound on the number of generated tokens.
        eos_token_id: id that ends generation (2 is the value hard-coded in
            this notebook; presumably the id of ``</s>`` — verify).

    Returns:
        None. The decoded prompt and the decoded generation are printed.
    """
    model.eval()

    prompt = patch_ids.reshape(-1)
    prompt_len = prompt.shape[0]
    # Positions beyond the context window have no position embedding, so stop
    # before exceeding it when the model config tells us the limit.
    max_positions = getattr(getattr(model, 'config', None), 'n_positions', None)

    sequence = prompt.to(device)
    with torch.no_grad():
        for _ in range(max_new_tokens):
            if max_positions is not None and sequence.shape[0] >= max_positions:
                break
            logits = model(input_ids=sequence.unsqueeze(0)).logits
            # Greedy choice for the position after the last token.
            next_id = int(logits[0, -1].argmax())
            sequence = torch.cat([sequence, sequence.new_tensor([next_id])])
            if next_id == eos_token_id:
                break

    bug_ids = sequence[prompt_len:].cpu()
    # Drop the end-of-sequence marker only when generation actually ended on
    # it; otherwise the last token is real output and must be kept.
    if bug_ids.numel() > 0 and bug_ids[-1].item() == eos_token_id:
        bug_ids = bug_ids[:-1]

    print('source code: ')
    print(tokenizer.decode(prompt[1:-1]))
    print('-'*60)
    print('bug injection: ')
    print(tokenizer.decode(bug_ids))


In [23]:
# Take the second sample (index 1) of the first training batch as a prompt.
data = next(iter(train_data_loader))
sample_ids = data['patch_ids'][1].cpu().numpy()
# Remove every id equal to 1 (presumably the <pad> id — verify against the
# tokenizer) so the prompt ends where the encoded prompt text ends.
sample_ids = torch.from_numpy(np.delete(sample_ids, np.argwhere(sample_ids==1))) 

In [24]:
# Print the prompt and the model's generated continuation for the sample.
generate_bug(model, sample_ids, tokenizer, device)
# Reference output for the same sample, with ids equal to 1 removed
# (presumably padding — verify against the tokenizer).
actualbug_ids = data['buggy_ids'][1].cpu().numpy()
actualbug_ids = torch.from_numpy(np.delete(actualbug_ids, np.argwhere(actualbug_ids==1))) 
print('-'*60)
print('actual bug: ')
# The last id is left out of the decoded text (the target text ends with
# '</s>' when it was not truncated).
print(tokenizer.decode(actualbug_ids[...,:-1]))

source code: 
void sc_osc_handler::handle_message_int_address(ReceivedMessage const& message,
         break;
     case cmd_cmd:
         handle_cmd(message, msg_size, endpoint);
         break;
     case cmd_version:
------------------------------------------------------------
bug injection: 
void sc_handler::handle_handler_address(receivedMessage& message,
         break;
     case handle_size_size(message, break);
     case handle_command_command_version,
         break;
     case CMD_command_version, handle_version,
         break;
------------------------------------------------------------
actual bug: 
 void sc_osc_handler::handle_message_int_address(ReceivedMessage const& message,
         break;
     case cmd_cmd:
         handle_cmd(message, msg_size, endpoint, 4);
         break;
     case cmd_version:


In [25]:
def test_model(model, test_data_loader, batch_size, tokrnizer, device):
    """Print generated vs. actual buggy code for the samples of ``test_data_loader``.

    For each sample the prompt (``patch_ids`` without padding) is passed to
    ``generate_bug``, which prints the source and the generated bug; the
    reference bug (``buggy_ids`` without padding and without its last id) is
    printed below it.

    Args:
        model: model handed to ``generate_bug``.
        test_data_loader: iterable of batches with ``patch_ids`` and
            ``buggy_ids`` tensors.
        batch_size: maximum number of samples processed per batch.
        tokrnizer: tokenizer used for decoding. The misspelled name is kept
            so existing keyword callers keep working.
        device: device handed to ``generate_bug``.
    """
    # Use the tokenizer that was passed in rather than a notebook global.
    tokenizer = tokrnizer

    def _strip_pad(ids):
        # Drop every id equal to 1 (the padding id assumed across this
        # notebook) and return the result as a CPU tensor.
        ids = ids.cpu()
        return ids[ids != 1]

    for batch in test_data_loader:
        patch_batch = batch['patch_ids']
        buggy_batch = batch['buggy_ids']
        # The final batch can hold fewer than batch_size samples.
        for i in range(min(batch_size, len(patch_batch))):
            source_ids = _strip_pad(patch_batch[i])
            generate_bug(model, source_ids, tokenizer, device)
            actualbug_ids = _strip_pad(buggy_batch[i])
            print('-'*60)
            print('actual bug: ')
            print(tokenizer.decode(actualbug_ids[...,:-1]))
            print('+'*70)
    

In [None]:
# Print generated vs. actual bugs for every sample of the loader.
# NOTE(review): this passes val_data_loader, not test_data_loader — confirm
# the validation split is the one intended here.
test_model(model, val_data_loader, BATCH_SIZE, tokenizer, device)