## 环境准备

需要依赖：
- datasets：加载ruby数据集
- transformers：加载预训练模型
- pytorch：fine-tuning

In [1]:
!pip install -q datasets
!pip install -q transformers

In [2]:
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel, AdamW, get_linear_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

使用Colab挂载文件系统，便于训练时保存模型

In [3]:
# from google.colab import drive
#
# drive.mount('/content/drive', force_remount=True)
#
# %cd drive/MyDrive/NL-PL/models/

Mounted at /content/drive


## 数据处理

### 加载预训练模型与数据集

In [5]:
dataset = load_dataset('code_x_glue_ct_code_to_text', 'ruby')
checkpoint = 'microsoft/codebert-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
config = RobertaConfig.from_pretrained(checkpoint)
model = RobertaModel.from_pretrained(checkpoint)

Reusing dataset code_x_glue_ct_code_to_text (/root/.cache/huggingface/datasets/code_x_glue_ct_code_to_text/ruby/0.0.0/f8b7e9d51f609a87e7ec7c7431706d4ee0b402e3398560410313d4acc67060a0)


  0%|          | 0/3 [00:00<?, ?it/s]

### 数据规范化

将输入数据（code, docstring）tokenizer时进行padding，调整至同一长度

code的最大长度取256，labels的最大长度取128，均参考自CodeBERT的github示例



In [6]:
def data_modifier(data):
  code = data['code']
  docstring = data['docstring']

  inputs = tokenizer(code, max_length=256, padding='max_length', truncation=True)
  labels = tokenizer(docstring, max_length=128, padding='max_length', truncation=True)
  
  inputs['labels'] = labels['input_ids']
  inputs['label_mask'] = labels['attention_mask']
  return inputs

使用.map方法对每个batch的数据都做上述操作

In [7]:
dataset = dataset.map(data_modifier, batched=True)



  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

将数据调整到torch可用的格式，只保留需要的内容

再生成模型直接可用的DataLoader

In [9]:
dataset.set_format(type="torch",columns=['input_ids','attention_mask','labels', 'label_mask'])
train_loader = DataLoader(dataset['train'], shuffle=True, batch_size=16)
valid_loader = DataLoader(dataset['validation'], batch_size=16)
test_loader = DataLoader(dataset['test'], batch_size=16)

## 模型构建

使用pytorch进行网络的补全和训练，网络结构为Seq2Seq结构

encoder直接使用codeBERT即可，decoder使用nn.TransformerDecoderLayer实现

forward部分参考自CodeBERT Github仓库

forward的流程：

- 训练：
  1. input和mask输入encoder，得到encoder_output
  2. 在输入decoder之前，使用labels对encoder_output做嵌入得到词向量tgt
  3. 将词向量和encoder的输出，作为decoder的输入，得到decoder_output
  4. 使用tanh和线性层计算隐藏态hidden_state
  5. 隐藏态再次通过线性层，得到输出

- 预测：
  - 需要将每一步的输出代替labels进行embedding
  - decoder的输出需要进行一次Softmax
  - 使用束搜索(beam search)防止误差传递，提高预测准确率

**束搜索**：

每得到一轮的输出的分数后，累加到已有的分数上，取TopK作为下一轮的输入




In [12]:
# 束搜索类定义
class Beam():
  def __init__(self, beam_size, cls, sep):
    # topK的K值大小
    self.size = beam_size
    # 记录K个最高得分
    self.scores = torch.cuda.FloatTensor(beam_size).zero_()
    # 记录每个时间步之前的topK状态，用于回溯和计算最佳分数
    self.prev_outputs = []
    # 记录每个时间步的输出，初始化为0
    self.next_outputs = [torch.cuda.LongTensor(beam_size).fill_(0)]
    self.next_outputs[0][0] = cls
    # 记录已完成的搜索结果
    self.finished = []
    self.eos = sep
    self.eosTop = False

  # 获取当前时间步的输出  
  def current_outputs(self):
    return torch.cuda.LongTensor(self.next_outputs[-1]).view(-1, 1)
  
  # 获取当前时间步的前继状态
  def previous_outputs(self):
    return self.prev_outputs[-1]

  # 根据decoder的输出结果，计算得分，选取最高的K个
  def beam_search(self, out):
    word_num = out.size(1)

    # 更新当前的分数
    if len(self.prev_outputs) > 0:
      new_scores = out + self.scores.unsqueeze(1).expand_as(out)
      # 若当前结果是eos，则不应让其继续搜索，故赋予其较低分数
      for i in range(self.next_outputs[-1].size(0)):
        if self.next_outputs[-1][i] == self.eos:
          new_scores[i] = -1e20
    else:
      new_scores = out[0]
    new_scores = new_scores.view(-1)
    
    # 获取TopK
    topK_scores, topK_score_ids = new_scores.topk(self.size, 0, True, True)

    self.scores = topK_scores
    current_ids = topK_score_ids // word_num
    self.prev_outputs.append(current_ids)
    self.next_outputs.append((topK_score_ids - current_ids * word_num))

    for i in range(self.next_outputs[-1].size(0)):
      if self.next_outputs[-1][i] == self.eos:
        s = self.scores[i]
        self.finished.append((s, len(self.next_outputs) - 1, i))

    # 当最高得分已经是eos时，判断可以结束搜索
    if self.next_outputs[-1][0] == self.eos:
      self.eosTop = True    

  # 判断是否结束
  def is_finished(self):
    return self.eosTop and len(self.finished) >= self.size

  # 获取最终的搜索结果
  def get_final(self):
    if len(self.finished) == 0:
      self.finished.append((self.scores[0], len(self.next_outputs) - 1, 0))
    self.finished.sort(key=lambda x: -x[0])
    if len(self.finished) != self.size:
      unfinished = []
      for i in range(self.next_outputs[-1].size(0)):
        if self.next_outputs[-1][i] != self.eos:
          s = self.scores[i]
          unfinished.append((s, len(self.next_outputs) - 1, i))
      unfinished.sort(key=lambda x: -x[0])
      self.finished += unfinished[:self.size - len(self.finished)]
    return self.finished[:self.size]

  def get_hyp(self, finished):
    hyps = []
    for _, timestep, k in finished:
      hyp = []
      for j in range(len(self.prev_outputs[:timestep]) - 1, -1, -1):
        hyp.append(self.next_outputs[j + 1][k])
        k = self.prev_outputs[j][k]
      hyps.append(hyp[::-1])
    return hyps

  def build_target_tokens(self, preds):
    sentence = []
    for pred in preds:
      tokens = []
      for token in pred:
        if token == self.eos:
          break
        tokens.append(token)
      sentence.append(tokens)
    return sentence

In [13]:
class RubyCodeBERT(nn.Module):
  def __init__(self, encoder, decoder, config, beam_size, cls, sep):
    super(RubyCodeBERT, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.config = config
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
    self.softmax = nn.LogSoftmax(dim=-1)
    self.register_buffer("bias", torch.tril(torch.ones(2048, 2048)))

    self.beam_size = beam_size
    self.sos = cls
    self.eos = sep
    self.max_length = 128

  def forward(self, input_ids, attention_mask, labels=None, label_mask=None):
    # Training
    if labels is not None:
      outputs = self.encoder(input_ids, attention_mask=attention_mask)
      encoder_output = outputs[0].permute([1,0,2]).contiguous()
      attn_mask = -1e4*(1-self.bias[:labels.shape[1],:labels.shape[1]])
      tgt = self.encoder.embeddings(labels).permute([1,0,2]).contiguous()
      decoder_output = self.decoder(tgt, encoder_output, tgt_mask=attn_mask, memory_key_padding_mask=(1-attention_mask).bool())
      linear_output = self.dense(decoder_output)
      hidden_state = torch.tanh(linear_output).permute([1,0,2]).contiguous()
      lm_logits = self.lm_head(hidden_state)
      # 计算损失函数，排除mask=0的元素的影响
      active_loss = label_mask[..., 1:].ne(0).view(-1) == 1
      shift_logits = lm_logits[..., :-1, :].contiguous()
      shift_labels = labels[..., 1:].contiguous()
      loss_func = nn.CrossEntropyLoss(ignore_index=-1)
      loss = loss_func(shift_logits.view(-1, shift_logits.size(-1))[active_loss], shift_labels.view(-1)[active_loss])

      outputs = loss, loss*active_loss.sum(), active_loss.sum()

      return outputs
    # Prediction
    else:

      outputs = self.encoder(input_ids, attention_mask=attention_mask)
      encoder_output = outputs[0].permute([1,0,2]).contiguous()
      preds = []
      zero = torch.cuda.LongTensor(1).fill_(0)
      for i in range(input_ids.shape[0]):
        context = encoder_output[:, i:i+1]
        context_mask = attention_mask[i:i+1, :]
        beam = Beam(self.beam_size, self.sos, self.eos)
        inputs = beam.current_outputs()
        context = context.repeat(1, self.beam_size, 1)
        context_mask = context_mask.repeat(self.beam_size, 1)
        for _ in range(self.max_length):
          if beam.is_finished():
            break
          attn_mask = -1e4 * (1-self.bias[:inputs.shape[1],:inputs.shape[1]])
          tgt = self.encoder.embeddings(inputs).permute([1,0,2]).contiguous()
          out = self.decoder(tgt, context, tgt_mask=attn_mask, memory_key_padding_mask=(1-context_mask).bool())
          out = torch.tanh(self.dense(out))
          hidden_states = out.permute([1,0,2]).contiguous()[:,-1,:]
          out = self.softmax(self.lm_head(hidden_states)).data
          beam.beam_search(out)
          inputs.data.copy_(inputs.data.index_select(0, beam.previous_outputs()))
          inputs = torch.cat((inputs, beam.current_outputs()), -1)
        hyp = beam.get_hyp(beam.get_final())
        # print(hyp)
        pred = beam.build_target_tokens(hyp)[:self.beam_size]
        # print(pred)
        pred = [torch.cat([x.view(-1) for x in p] + [zero]*(self.max_length-len(p))).view(1, -1) for p in pred]
        # print(pred)
        preds.append(torch.cat(pred, 0).unsqueeze(0))

      preds = torch.cat(preds, 0)
      return preds

## Fine-Tuning

传入训练集与验证集，对网络进行微调。


In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)

net = RubyCodeBERT(model, decoder, config, 3, tokenizer.cls_token_id, tokenizer.sep_token_id)
net.to(device)

# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params':[p for n,p in net.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay':0.01},
#     {'params':[p for n,p in net.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay':0.0}]

epoch_num = 10
total_step = epoch_num * len(train_loader)

optimizer = AdamW(net.parameters(), lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1*total_step, num_training_steps=total_step)

# save_path = './NL-PL/models/'



### Training

学习率采用5e-5，epoch设为10，参考自CodeBERT提供的fine-tuning参数建议。

受限于colab的GPU使用时长，每个epoch完成训练后保存模型。

In [16]:
net.train()

# train_loader = cycle(train_loader)
# train_step = 20000

# for epoch in range(5):
for epoch in range(10):
  running_loss = 0
  for i, data in enumerate(train_loader):
    data = (t[1].to(device) for t in data.items())
    input_ids, input_masks, labels, label_masks = data
    # print(input_ids.shape)
    # print(i)
    loss, _, _ = net(input_ids, input_masks, labels, label_masks)
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    running_loss += loss.item()
    if (i + 1) % 200 == 0:
      print('[%d, %5d] loss: %.3f' % (epoch+1, i+1, running_loss / 200))
      running_loss = 0
  torch.save(net.state_dict(), 'codebert.pt')

[1,   200] loss: 10.288
[1,   400] loss: 8.527
[1,   600] loss: 6.841
[1,   800] loss: 6.149
[1,  1000] loss: 5.764
[1,  1200] loss: 5.417
[1,  1400] loss: 5.249
[2,   200] loss: 4.865
[2,   400] loss: 4.641
[2,   600] loss: 4.583
[2,   800] loss: 4.537
[2,  1000] loss: 4.456
[2,  1200] loss: 4.401
[2,  1400] loss: 4.318
[3,   200] loss: 4.164
[3,   400] loss: 4.148
[3,   600] loss: 4.073
[3,   800] loss: 4.058
[3,  1000] loss: 4.054
[3,  1200] loss: 4.015
[3,  1400] loss: 4.006
[4,   200] loss: 3.874
[4,   400] loss: 3.850
[4,   600] loss: 3.831
[4,   800] loss: 3.838
[4,  1000] loss: 3.793
[4,  1200] loss: 3.832
[4,  1400] loss: 3.796
[5,   200] loss: 3.649
[5,   400] loss: 3.687
[5,   600] loss: 3.642
[5,   800] loss: 3.679
[5,  1000] loss: 3.689
[5,  1200] loss: 3.629
[5,  1400] loss: 3.629
[6,   200] loss: 3.550
[6,   400] loss: 3.516
[6,   600] loss: 3.547
[6,   800] loss: 3.532
[6,  1000] loss: 3.528
[6,  1200] loss: 3.481
[6,  1400] loss: 3.473
[7,   200] loss: 3.443
[7,   400]

### Test

使用测试集进行模型的评估，度量方式采用naive BLEU计算

In [17]:
net = RubyCodeBERT(model, decoder, config, 3, tokenizer.cls_token_id, tokenizer.sep_token_id)
state_dict = torch.load('codebert.pt')
net.load_state_dict(state_dict)
net.to(device)

RubyCodeBERT(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [18]:
net.eval()

# eval_loss = 0
p = []
ground_truth = []
for batch in test_loader:
  input_ids, input_masks, labels, label_masks = (t[1].to(device) for t in batch.items())
  # ground_truth.append(tokenizer.decode(labels))
  labels = list(labels)
  for label in labels:
    label = label.cpu().numpy()
    label = list(label)
    if 1 in label:
      label = label[1:label.index(1)-1]
    ground_truth.append(tokenizer.decode(label))

  with torch.no_grad():
    preds = net(input_ids, input_masks)
    for pred in preds:
      t = pred[0].cpu().numpy()
      t = list(t)
      if 0 in t:
        t = t[:t.index(0)]
      text = tokenizer.decode(t, clean_up_tokenization_sapces=False)
      p.append(text)

# print(p)



In [19]:
def calculate_bleu(predictions, ground_truth):
  length = len(predictions)
  # print(length)
  bleu_sum = 0
  for i in range(length):
    pred = predictions[i].split(' ')
    truth = ground_truth[i].split(' ')
    count = 0
    for word in pred:
      if word in truth:
        count += 1
    bleu_sum += count / len(pred)
  final_score = bleu_sum / length
  return final_score

In [20]:
print(calculate_bleu(p, ground_truth))

0.13240009654173585
