# Prepare the model

In [None]:
import os
from collections import defaultdict

import tensorflow as tf
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

The pre-trained Megatron GPT2 model uses a sequence length of 1024 and an embedding size of 1280. It contains 36 transformer decoder blocks and has 20 attention heads for each attention layer. 

In [None]:
# Download the pre-trained Megatron GPT2 checkpoints
# Create the target directory; -p makes this a no-op when it already exists.
!mkdir -p /content/nvidia/megatron-gpt2-345m
# Fetch the archive; -O stores it as checkpoint.zip inside the target directory.
!wget --content-disposition https://moyix.net/~moyix/csrc_final.zip -O /content/nvidia/megatron-gpt2-345m/checkpoint.zip
# Unpack inside the checkpoint directory, then switch back to /content for the following cells.
%cd /content/nvidia/megatron-gpt2-345m
!unzip checkpoint.zip
%cd /content

To run this model using HuggingFace Transformers, we first need to convert the checkpoints.

In [None]:
# Clone the transformers fork whose Megatron GPT2 conversion script is run in the next cell.
!git clone https://github.com/ShumengJ/transformers.git /content/transformers
# Install the libraries from PyPI.
# NOTE(review): versions are unpinned and this installs the released package, not the clone above — confirm that is intended.
!pip install transformers
!pip install tokenizers

In [None]:
# Run the conversion script from the cloned fork on the downloaded checkpoint directory.
# NOTE(review): the script is given the directory rather than the zip file — confirm this matches the fork's command line.
!python /content/transformers/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py /content/nvidia/megatron-gpt2-345m/ 

Now we can load the model using `from_pretrained()`

In [None]:
# Load the converted checkpoint from /content/nvidia/megatron-gpt2-345m and move it to the GPU
directory = '/content/nvidia/megatron-gpt2-345m'
device = torch.device("cuda")
model = GPT2LMHeadModel.from_pretrained(directory)
model = model.to(device)  # Module.to() returns the module itself
print('** Model loaded **')

During fine-tuning, we only updated the parameters in the last decoder and the output linear layer.

In [6]:
# Freeze every parameter except those of the last decoder block and the final
# layer norm, and print the names of the parameters that remain trainable.
# The trainable set is selected by parameter name (derived from the model's
# configured number of blocks) instead of a hard-coded parameter index, so the
# cell keeps selecting the last block if a checkpoint of another depth is used.
last_block_prefix = 'transformer.h.{:d}.'.format(model.config.n_layer - 1)
trainable_prefixes = (last_block_prefix, 'transformer.ln_f.')
for name, param in model.named_parameters():
    if name.startswith(trainable_prefixes):
        print(name)
    else:
        param.requires_grad = False

transformer.h.35.ln_1.weight
transformer.h.35.ln_1.bias
transformer.h.35.attn.c_attn.weight
transformer.h.35.attn.c_attn.bias
transformer.h.35.attn.c_proj.weight
transformer.h.35.attn.c_proj.bias
transformer.h.35.ln_2.weight
transformer.h.35.ln_2.bias
transformer.h.35.mlp.c_fc.weight
transformer.h.35.mlp.c_fc.bias
transformer.h.35.mlp.c_proj.weight
transformer.h.35.mlp.c_proj.bias
transformer.ln_f.weight
transformer.ln_f.bias


# Tokenizer

In [None]:
# Silence FutureWarning messages for the rest of the session.
from warnings import simplefilter
simplefilter('ignore', FutureWarning)

Our model uses byte pair encoding. The pre-trained tokenizer contains 52001 different tokens, including special tokens:

In [None]:
# Megatron was trained with standard tokenizer(s).
# NOTE(review): the vocabulary files are read from a user-specific absolute path,
# not from the checkpoint directory the model was loaded from — confirm the path.
vocab_path = '/home/sj3233/project21S/nvidia/megatron-gpt2-345m/vocab.json'
merges_path = '/home/sj3233/project21S/nvidia/megatron-gpt2-345m/merges.txt'
tokenizer = GPT2Tokenizer(vocab_file=vocab_path, merges_file=merges_path)

# Register the special tokens; "<to-buggy>" is the extra delimiter token.
special_tokens = {
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
    "additional_special_tokens": ["<to-buggy>"],
}
tokenizer.add_special_tokens(special_tokens)

We add one additional special token: `<to-buggy>: 52001` as a delimiter between the patched code and buggy code, so the size of vocabulary is 52002.

In [None]:
# Resize the model's token embeddings to the tokenizer's current vocabulary size.
# NOTE(review): resizing may replace the embedding parameter; check that its
# requires_grad flag is still what the freezing cell intended — TODO confirm.
model.resize_token_embeddings(len(tokenizer))

# Dataset

In [4]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

Our dataset is a `.csv` file that contains 225202 datapoints, each with four columns. The first column is the bug `id`, which lets us trace the raw commit message the bug came from. The code diff is stored in the other three columns: `[before]` holds the code that appears before the bug and remains unchanged by the fix, while `[buggy_code]` and `[patched_code]` contain the bug and the patch respectively. Storing the data this way reduces the file size: the complete bug is obtained by concatenating `[before]` and `[buggy_code]`, and the complete patch by concatenating `[before]` and `[patched_code]`.

In [8]:
# Read the '@'-separated dataset, report its row count and preview the first three rows.
path = '/scratch/sj3233/result1000.csv'
df = pd.read_csv(path, sep='@')
print(df.shape[0])
df.head(3)

225202


In [9]:
# Length every tokenized sequence is padded or truncated to by the dataset.
MAX_LEN = 1024
# Number of sequences per DataLoader batch.
BATCH_SIZE = 8

Each input sequence (tps) is composed of the following parts: `<s> + [before] + [patched_code] + <to-buggy> + [before] + [buggy_code] + </s>`, i.e. the complete patch followed by the complete bug. 
All the sequences are padded or truncated to the same maximum length. 

In [10]:
class BuggyPatchDataset(Dataset):
  """Dataset of patched-code -> buggy-code sequences.

  Every item is tokenized three ways, each padded or truncated to ``max_len``:

  * ``input_tps``: ``<s> + before + patched + <to-buggy> + before + buggy + </s>``
    (returned together with its ``attention_mask``),
  * ``patch_ids``: ``<s> + before + patched + <to-buggy>``,
  * ``buggy_ids``: ``before + buggy + </s>``.

  Args:
    buggy_code: indexable collection of buggy code fragments.
    patched_code: indexable collection of patched code fragments.
    before: indexable collection of the unchanged code preceding each fragment.
    tokenizer: object exposing ``encode_plus`` (e.g. a GPT2Tokenizer).
    max_len: length every encoding is padded or truncated to.
  """

  def __init__(self, buggy_code, patched_code, before, tokenizer, max_len):
    self.buggy_code = buggy_code
    self.patched_code = patched_code
    self.before = before
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of samples (one per buggy code fragment)."""
    return len(self.buggy_code)

  @staticmethod
  def _as_text(value):
    """Return ``value`` as a string, mapping missing values (None / NaN) to ''.

    A plain ``str()`` would turn a missing cell into the literal text 'nan',
    which would then be tokenized as if it were source code.
    """
    if value is None or (isinstance(value, float) and value != value):
      return ''
    return str(value)

  def _encode(self, text):
    """Tokenize ``text`` into fixed-length PyTorch tensors of shape (1, max_len)."""
    return self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        truncation=True,
        # 'max_length' padding replaces the deprecated pad_to_max_length=True
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

  def __getitem__(self, item):
    """Return the flattened encodings of sample ``item`` as a dict of tensors."""
    buggy = self._as_text(self.buggy_code[item])
    patched = self._as_text(self.patched_code[item])
    before = self._as_text(self.before[item])

    prompt = '<s>' + before + patched + '<to-buggy>'
    target = before + buggy + '</s>'

    # NOTE: truncation cuts from the right, so a very long patch can push the
    # delimiter and the buggy code out of the full sequence.
    encoding = self._encode(prompt + target)
    encoding_p = self._encode(prompt)
    encoding_b = self._encode(target)

    return {
        'input_tps': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'patch_ids': encoding_p['input_ids'].flatten(),
        'buggy_ids': encoding_b['input_ids'].flatten()
    }

Split the dataset: 80% training, 10% validation, 10% test

In [11]:
# 80/10/10 split: take 80% for training, then halve the remainder into
# validation and test. The fixed seed makes the split repeatable.
df_train, val_test = train_test_split(df, train_size=0.8, random_state=42)
df_val, df_test = train_test_split(val_test, train_size=0.5, random_state=42)
for split in (df_train, df_val, df_test):
    print(split.shape)

(180161, 4)
(22520, 4)
(22521, 4)


Create a dataloader to load the data in batches.

In [12]:
def creat_data_loader(df, tokenizer, max_len, batch_size):
  """Wrap the code columns of ``df`` in a BuggyPatchDataset and return a DataLoader.

  The ``buggy_code``, ``patched_code`` and ``before`` columns are converted to
  NumPy arrays so the dataset can index them by position. The loader uses
  ``batch_size`` samples per batch and two worker processes.
  """
  columns = {
      name: df[name].to_numpy()
      for name in ('buggy_code', 'patched_code', 'before')
  }
  dataset = BuggyPatchDataset(tokenizer=tokenizer, max_len=max_len, **columns)
  return DataLoader(dataset, batch_size=batch_size, num_workers=2)

In [13]:
# One loader per split, all sharing the tokenizer, sequence length and batch size.
train_data_loader, val_data_loader, test_data_loader = (
    creat_data_loader(frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for frame in (df_train, df_val, df_test)
)

In [14]:
# Pull one validation batch and verify the size of the inputs and their datatype
data = next(iter(val_data_loader))
print(data.keys())
for key in ('input_tps', 'attention_mask', 'buggy_ids'):
    print(data[key].shape)
print(data['buggy_ids'].dtype)

dict_keys(['input_tps', 'attention_mask', 'patch_ids', 'buggy_ids'])
torch.Size([8, 1024])
torch.Size([8, 1024])
torch.Size([8, 1024])
torch.int64


# Train

In [None]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import AdamW

Get the token with highest probability from the output logits

In [15]:
def get_output_ids(out_logits):
    """Return, for each position, the index with the largest logit along dimension 2."""
    return out_logits.argmax(dim=2)

Mask all the tokens before the delimiter so that they are ignored when calculating the loss.

In [16]:
def label_mask(input_tps, delim_id=52001, ignore_index=-100):
    """Build labels in which every token before the delimiter is ignored.

    For each row of ``input_tps`` the positions before the first occurrence of
    ``delim_id`` are replaced by ``ignore_index``; the delimiter and everything
    after it keep their token ids. A row that contains no delimiter (e.g. a
    sample truncated before it) is masked completely.

    Args:
        input_tps: 2-D integer tensor of token ids, one row per sequence.
        delim_id: id of the delimiter token (default 52001).
        ignore_index: value written to masked positions (default -100).

    Returns:
        A new CPU tensor with the same shape and dtype as ``input_tps``;
        the input tensor itself is left untouched.
    """
    # clone() guarantees we never write through to the caller's tensor, which
    # would otherwise happen for CPU inputs because .cpu() returns the same storage.
    labels = input_tps.detach().cpu().clone()

    # Running count of delimiters per row: zero strictly before the first one.
    at_or_after_delim = (labels == delim_id).long().cumsum(dim=1) > 0
    labels[~at_or_after_delim] = ignore_index

    return labels

Monitor BLEU score for up to 4-grams using uniform weights (called BLEU-4) for both training and validation

In [17]:
def compute_metrics_bleu(labels, output_ids, tokenizer, delim_id=52001, pad_id=1):
    """Smoothed sentence-level BLEU between predicted and expected tokens of a batch.

    Labels are shifted left by one and predictions are cut by one at the end so
    that each prediction is compared with the token it is meant to predict. For
    every row only the part from the first delimiter onwards is scored; rows
    without a delimiter are skipped. The selected parts of all rows are
    concatenated, tokens equal to ``pad_id`` are dropped, and a single BLEU
    score is computed over the result.

    Args:
        labels: 2-D tensor of label ids (one row per sequence).
        output_ids: 2-D tensor of predicted ids with the same shape as ``labels``.
        tokenizer: tokenizer used to map ids back to token strings.
        delim_id: id of the delimiter token (default 52001).
        pad_id: id removed before scoring (default 1, presumably the padding
            token — confirm against the tokenizer).

    Returns:
        The BLEU score as a float.
    """
    labels_np = labels[..., 1:].cpu().numpy()
    output_np = output_ids[..., :-1].cpu().numpy()

    expect_ids = []
    actual_ids = []
    # Pair each row with its own delimiter position instead of assuming that
    # the n-th delimiter found in the batch belongs to the n-th row.
    for label_row, output_row in zip(labels_np, output_np):
        delim_pos = np.flatnonzero(label_row == delim_id)
        if delim_pos.size == 0:
            continue
        start = int(delim_pos[0])
        expect_ids.extend(label_row[start:].tolist())
        actual_ids.extend(output_row[start:].tolist())

    expect_ids = [token_id for token_id in expect_ids if token_id != pad_id]
    actual_ids = [token_id for token_id in actual_ids if token_id != pad_id]

    label_token = tokenizer.convert_ids_to_tokens(expect_ids)
    output_token = tokenizer.convert_ids_to_tokens(actual_ids)

    smooth = SmoothingFunction()
    return sentence_bleu([label_token], output_token, smoothing_function=smooth.method1)

In [19]:
def eval_model(model, val_data_loader, device):
  """Evaluate ``model`` on ``val_data_loader`` and return (mean loss, mean BLEU).

  Every batch is run without gradient tracking; the loss uses labels produced
  by ``label_mask`` and the BLEU score is computed by ``compute_metrics_bleu``
  with the notebook-level ``tokenizer``. Progress and the final averages are
  printed.

  The model is switched to evaluation mode for the duration of the call and
  its previous training/evaluation mode is restored afterwards, so calling
  this from inside a training loop does not silently disable dropout for the
  following epochs.

  Args:
    model: language model returning an object with ``loss`` and ``logits``.
    val_data_loader: loader yielding dicts with 'input_tps' and 'attention_mask'.
    device: device the batch tensors are moved to.

  Returns:
    Tuple ``(val_loss, val_bleu)`` of Python floats averaged over the batches.

  Raises:
    ValueError: if ``val_data_loader`` has no batches.
  """
  num_batches = len(val_data_loader)
  if num_batches == 0:
    raise ValueError('val_data_loader is empty')

  was_training = model.training
  model.eval()
  total_loss = 0.0
  total_bleu = 0.0

  try:
    for batch_num, batch in enumerate(val_data_loader, start=1):
      input_tps = batch['input_tps'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = label_mask(input_tps).to(device)

      with torch.no_grad():
        outputs = model(input_tps, attention_mask=attention_mask, labels=labels)
      output_ids = get_output_ids(outputs.logits)

      # Accumulate plain floats rather than device tensors.
      total_loss += outputs.loss.item()
      total_bleu += compute_metrics_bleu(labels, output_ids, tokenizer)

      print("\r evaluating...{:d}/{:d} ".format(batch_num, num_batches), end='',  flush=True)
  finally:
    # Put the model back into whichever mode the caller had it in.
    model.train(was_training)

  val_loss = total_loss/num_batches
  val_bleu = total_bleu/num_batches

  print('\neval loss:',val_loss)
  print('eval bleu:',val_bleu)

  return val_loss, val_bleu

Start training:

In [None]:
EPOCHS = 10
# Offset added to the epoch index in checkpoint names and in the training log.
# NOTE(review): presumably this run continued an earlier one — confirm the value.
EPOCH_OFFSET = 25
history = defaultdict(list)
min_loss = 100  # only needed by the disabled "save best checkpoint" logic below

device = torch.device("cuda")
model.to(device)
optim = torch.optim.AdamW(
    model.parameters(), 
    lr=2e-5,  
    betas=(0.9, 0.999), 
    eps=1e-08,
    weight_decay=0.01)  

print('** Start **')

for epoch in range(EPOCHS):
  # eval_model() puts the model into evaluation mode, so training mode has to
  # be re-enabled at the start of every epoch, not just once before the loop.
  model.train()
  total_loss = 0.0
  total_bleu = 0.0
  print('EPOCH: %d/%d '%(epoch+1, EPOCHS))

  for batch_num, batch in enumerate(train_data_loader, start=1):

    optim.zero_grad()

    input_tps = batch['input_tps'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = label_mask(input_tps).to(device)

    outputs = model(input_tps, attention_mask=attention_mask, labels=labels)
    output_ids = get_output_ids(outputs.logits)
    loss = outputs.loss

    loss.backward()
    optim.step()

    # .item() yields a plain float; summing the loss tensor itself would keep
    # autograd history of every batch alive for the whole epoch.
    total_loss += loss.item()
    total_bleu += compute_metrics_bleu(labels, output_ids, tokenizer)

    print("\r training...{:d}/{:d} ".format(batch_num, len(train_data_loader)), end='',  flush=True)

  train_loss = total_loss/len(train_data_loader)
  train_bleu = total_bleu/len(train_data_loader)

  print('\ntrain loss:',train_loss)
  print('train bleu:',train_bleu)
  val_loss, val_bleu = eval_model(model, val_data_loader, device)

  history['train_loss'].append(train_loss)
  history['train_bleu'].append(train_bleu)
  history['val_loss'].append(val_loss)
  history['val_bleu'].append(val_bleu)  

  print('-'*60)

  # A checkpoint is written after every epoch.
  savepath = 'model_mask_'+str(epoch+EPOCH_OFFSET)
  # if val_loss < min_loss:
  model.save_pretrained(savepath)
  # min_loss = val_loss

  # Append this epoch's metrics to the log; the context manager closes the
  # file even if the write fails.
  with open('model_train_record.txt', mode='a') as f:
    f.write('EPOCH: '+str(epoch+EPOCH_OFFSET)+'\n'+
            'train loss: '+str(train_loss)+'\n'+
            'train bleu: '+str(train_bleu)+'\n'+
            'eval loss: '+str(val_loss)+'\n'+
            'eval bleu: '+str(val_bleu)+'\n')


print('** Finish **')

# Generate

In [None]:
import random
import IPython.display as display

For bug generation with Transformers, the input is the patched code ending with the `<to-buggy>` delimiter. We use beam search for next-token selection, with 5 beams.

In [22]:
def generate_bug(model, patch_ids, tokenizer, device, num_beams, verbose=False):
    """Generate buggy-code token ids for one prompt.

    Args:
        model: model exposing ``generate``.
        patch_ids: tensor of shape (1, prompt_length) holding the prompt ids.
        tokenizer: tokenizer used to decode ids; only needed when ``verbose``.
        device: unused; kept so existing callers keep working.
        num_beams: number of beams passed to ``model.generate``.
        verbose: when True, print the decoded prompt and the decoded result.

    Returns:
        1-D tensor with the generated ids of the first returned sequence,
        without the prompt and without the final token.
    """
    output = model.generate(
        patch_ids, 
        # Sampling is enabled, so repeated calls can give different results.
        do_sample=True, 
        # top_p=0.95,
        # top_k=10,
        num_beams=num_beams, 
        early_stopping=True,
        max_length=1025)

    # Drop the prompt tokens at the front and the last token at the end.
    bug_ids = output[0,...,len(patch_ids[0]):-1]

    if verbose:
        print('source code: ')
        print(tokenizer.decode(patch_ids[0][...,1:-1]))
        print('-'*60)
        print('bug injection: ')
        print(tokenizer.decode(bug_ids))

    return bug_ids

In [24]:
# see one example

# data = next(iter(val_data_loader))
# Take the first prompt of the batch, drop every token with id 1 (presumably
# the padding id) and add a batch dimension.
sample_ids = data['patch_ids'][0].cpu().numpy()
sample_ids = torch.from_numpy(np.delete(sample_ids, np.argwhere(sample_ids==1)))
sample_ids = torch.unsqueeze(sample_ids, 0).to(device)

# generate_bug() returns the generated ids; keep them so they can be shown
# next to the prompt and the reference bug.
samplebug_ids = generate_bug(model, sample_ids, tokenizer, device, 10)

actualbug_ids = data['buggy_ids'][0].cpu().numpy()
actualbug_ids = torch.from_numpy(np.delete(actualbug_ids, np.argwhere(actualbug_ids==1))) 

print('source code: ')
print(tokenizer.decode(sample_ids[0][...,1:-1]))
print('-'*60)
print('bug injection: ')
print(tokenizer.decode(samplebug_ids))
print('-'*60)
print('actual bug: ')
print(tokenizer.decode(actualbug_ids[...,:-1]))

source code: 
void sc_osc_handler::handle_message_int_address(ReceivedMessage const& message,
         break;
     case cmd_cmd:
         handle_cmd(message, msg_size, endpoint);
         break;
     case cmd_version:
------------------------------------------------------------
bug injection: 
void sc_handler::handle_handler_address(receivedMessage& message,
         break;
     case handle_size_size(message, break);
     case handle_command_command_version,
         break;
     case CMD_command_version, handle_version,
         break;
------------------------------------------------------------
actual bug: 
 void sc_osc_handler::handle_message_int_address(ReceivedMessage const& message,
         break;
     case cmd_cmd:
         handle_cmd(message, msg_size, endpoint, 4);
         break;
     case cmd_version:


In [25]:
def generate_model(model, data_loader, batch_size, tokenizer, device, num_beams):
    """Print prompt, generated bug and reference bug for the samples of ``data_loader``.

    Args:
        model: model passed on to ``generate_bug``.
        data_loader: loader yielding dicts with 'patch_ids' and 'buggy_ids'.
        batch_size: maximum number of samples processed per batch.
        tokenizer: tokenizer used to decode token ids.
        device: device the prompt is moved to.
        num_beams: number of beams passed on to ``generate_bug``.
    """
    for batch in data_loader:
        # The last batch can hold fewer than batch_size samples; indexing up
        # to batch_size would raise an IndexError there.
        num_samples = min(batch_size, len(batch['patch_ids']))
        for i in range(num_samples):

            # Drop tokens with id 1 (presumably padding) and add a batch dimension.
            source_ids = batch['patch_ids'][i].cpu().numpy()
            source_ids = torch.from_numpy(np.delete(source_ids, np.argwhere(source_ids==1)))
            source_ids = torch.unsqueeze(source_ids, 0).to(device)

            bug_ids = generate_bug(model, source_ids, tokenizer, device, num_beams)
            actualbug_ids = batch['buggy_ids'][i].cpu().numpy()
            actualbug_ids = torch.from_numpy(np.delete(actualbug_ids, np.argwhere(actualbug_ids==1))) 

            print('source code: ')
            print(tokenizer.decode(source_ids[0][...,1:-1]))
            print('-'*60)
            print('bug injection: ')
            print(tokenizer.decode(bug_ids))
            print('-'*60)
            print('actual bug: ')
            print(tokenizer.decode(actualbug_ids[...,:-1]))
            print('+'*70)
            
# generate_model(model, val_data_loader, BATCH_SIZE, tokenizer, device, 5)

To evaluate the generated bugs, we calculated four values: 
- BLEU score between the actual buggy code and generated buggy code (BLEU-Model)
- BLEU score between actual buggy code and source code (BLEU-Baseline)
- The difference between BLEU-Model and BLEU-Baseline (delta)
- The proportion of the realistic bugs among all generated bugs

A positive delta value indicates that, compared to the patched code, the generated mutant is more similar to the actual bug.

In [None]:
def val_generate_bleu(model, data_loader, batch_size, tokenizer, device, num_beams):
    """Score generated bugs against the reference bugs of ``data_loader``.

    For every sample a bug is generated from the prompt and three quantities
    are accumulated:

    * BLEU between the reference bug and the generated bug,
    * the difference between that score and the BLEU between the reference
      bug and the prompt (delta),
    * whether the generated bug matches the reference exactly (BLEU == 1.0).

    The averages over all processed samples are printed and written to
    'eval_gen.txt'.

    Args:
        model: model passed on to ``generate_bug``.
        data_loader: loader yielding dicts with 'patch_ids' and 'buggy_ids'.
        batch_size: maximum number of samples processed per batch.
        tokenizer: tokenizer used to map ids to tokens.
        device: device the prompt is moved to.
        num_beams: number of beams passed on to ``generate_bug``.

    Raises:
        ValueError: if ``data_loader`` yields no samples.
    """
    total_bleu = 0
    total_delta = 0
    actualbug_num = 0
    sample_num = 0
    smooth = SmoothingFunction()

    for batch_num, batch in enumerate(data_loader, start=1):
        # The last batch can hold fewer than batch_size samples.
        num_samples = min(batch_size, len(batch['patch_ids']))
        for i in range(num_samples):
            # Drop tokens with id 1 (presumably padding) and add a batch dimension.
            source_ids = batch['patch_ids'][i].cpu().numpy()
            source_ids = torch.from_numpy(np.delete(source_ids, np.argwhere(source_ids==1)))
            source_ids = torch.unsqueeze(source_ids, 0).to(device)

            bug_ids = generate_bug(model, source_ids, tokenizer, device ,num_beams)
            actualbug_ids = batch['buggy_ids'][i].cpu().numpy()
            actualbug_ids = torch.from_numpy(np.delete(actualbug_ids, np.argwhere(actualbug_ids==1)))

            source = tokenizer.convert_ids_to_tokens(source_ids[0][...,1:-1])
            bug = tokenizer.convert_ids_to_tokens(bug_ids)
            actualbug = tokenizer.convert_ids_to_tokens(actualbug_ids[...,:-1])

            bleu_inject = sentence_bleu([actualbug], bug, smoothing_function=smooth.method1)
            bleu_baseline = sentence_bleu([actualbug], source, smoothing_function=smooth.method1)
            bleu_delta = bleu_inject - bleu_baseline

            if bleu_inject==1.0:
                actualbug_num = actualbug_num + 1

            total_bleu += bleu_inject
            total_delta += bleu_delta
            sample_num += 1

        display.clear_output(wait=True)
        print("\r generating...{:d}/{:d} ".format(batch_num, len(data_loader)), end='',  flush=True)
        # if batch_num==1000: break

    if sample_num == 0:
        raise ValueError('data_loader produced no samples')

    # Average over the samples actually scored, which differs from
    # len(data_loader)*batch_size whenever the last batch is not full.
    gen_bleu = total_bleu/sample_num
    gen_delta = total_delta/sample_num
    gen_actualbug = actualbug_num/sample_num

    print('\ngenerate bleu: ',gen_bleu)
    print('generate delta: ',gen_delta)
    print('generate actual bugs: ',gen_actualbug)

    with open('eval_gen.txt', mode='w') as f:
        f.write('generate bleu: '+str(gen_bleu)+'\n'+
                'generate delta: '+str(gen_delta)+'\n'+
                'generate actual bugs: '+str(gen_actualbug)+'\n')


In [None]:
# Score generation on the validation loader, passing 5 as num_beams; the averages are printed and written to eval_gen.txt.
val_generate_bleu(model, val_data_loader, BATCH_SIZE, tokenizer, device, 5)