## 环境准备

安装依赖：

- datasets：引入数据集
- transformers：预训练模型的加载
- pytorch：微调

In [1]:
!pip install -q datasets
!pip install -q transformers

[K     |████████████████████████████████| 362 kB 4.8 MB/s 
[K     |████████████████████████████████| 1.1 MB 58.0 MB/s 
[K     |████████████████████████████████| 101 kB 10.8 MB/s 
[K     |████████████████████████████████| 140 kB 22.4 MB/s 
[K     |████████████████████████████████| 212 kB 52.6 MB/s 
[K     |████████████████████████████████| 596 kB 47.4 MB/s 
[K     |████████████████████████████████| 127 kB 68.6 MB/s 
[K     |████████████████████████████████| 271 kB 66.7 MB/s 
[K     |████████████████████████████████| 144 kB 64.0 MB/s 
[K     |████████████████████████████████| 94 kB 4.3 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 3.1 MB/

In [2]:
from datasets import load_dataset
from transformers import RobertaTokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

import torch
from torch.utils.data import DataLoader

In [3]:
# from google.colab import drive
#
# drive.mount('/content/drive', force_remount=True)
#
# %cd drive/MyDrive/NL-PL/models/

Mounted at /content/drive
/content/drive/MyDrive/NL-PL/models


## 数据处理

### 加载预训练模型和数据集

In [4]:
dataset = load_dataset('code_x_glue_ct_code_to_text', 'ruby')
checkpoint = 'Salesforce/codet5-small'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

Downloading builder script:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/961 [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/475 [00:00<?, ?B/s]

Downloading and preparing dataset code_x_glue_ct_code_to_text/ruby (download: 118.40 MiB, generated: 54.79 MiB, post-processed: Unknown size, total: 173.19 MiB) to /root/.cache/huggingface/datasets/code_x_glue_ct_code_to_text/ruby/0.0.0/f8b7e9d51f609a87e7ec7c7431706d4ee0b402e3398560410313d4acc67060a0...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/112M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/24927 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1400 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1261 [00:00<?, ? examples/s]

Dataset code_x_glue_ct_code_to_text downloaded and prepared to /root/.cache/huggingface/datasets/code_x_glue_ct_code_to_text/ruby/0.0.0/f8b7e9d51f609a87e7ec7c7431706d4ee0b402e3398560410313d4acc67060a0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/687k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [5]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 24927
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 1261
    })
})


### 数据规范化

将输入数据进行tokenize，同时补齐至相同长度

同时，由于\<pad\>的存在，需要对其进行处理，防止在最后的损失函数中影响结果

In [6]:
def data_modifier(data):
  code = data['code']
  docstring = data['docstring']

  inputs = tokenizer(code, max_length=256, padding='max_length', truncation=True)
  labels = tokenizer(docstring, max_length=128, padding='max_length', truncation=True)
  # print(labels)

  inputs['labels'] = labels['input_ids']

  return inputs

In [7]:
dataset = dataset.map(data_modifier, batched=True)



  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

转化成pytorch可以使用的dataloader

In [8]:
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
train_loader = DataLoader(dataset['train'], shuffle=True, batch_size=16)
vaild_loader = DataLoader(dataset['validation'], batch_size=8)
test_loader = DataLoader(dataset['test'], batch_size=8)

## Fine-Tuning



In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

epoch_num = 5
total_step = epoch_num * len(train_loader)

optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1*total_step, num_training_steps=total_step)



### Training

In [10]:
model.train()

running_loss = 0
for epoch in range(epoch_num):
  running_loss = 0.0
  for i, data in enumerate(train_loader):
    data = (t[1].to(device) for t in data.items())
    input_ids, attention_mask, labels = data
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    # print(outputs)
    # break
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    running_loss += loss.item()
    if (i + 1) % 200 == 0:
      print('[%d, %5d] loss: %.3f' % (epoch+1, i+1, running_loss / 200))
      running_loss = 0
  torch.save(model.state_dict(), 'codet5.pt')

[1,   200] loss: 5.214
[1,   400] loss: 1.615
[1,   600] loss: 1.304
[1,   800] loss: 1.200
[1,  1000] loss: 1.152
[1,  1200] loss: 1.149
[1,  1400] loss: 1.101
[2,   200] loss: 1.060
[2,   400] loss: 1.055
[2,   600] loss: 1.043
[2,   800] loss: 1.052
[2,  1000] loss: 1.038
[2,  1200] loss: 1.068
[2,  1400] loss: 1.024
[3,   200] loss: 0.996
[3,   400] loss: 1.036
[3,   600] loss: 1.000
[3,   800] loss: 0.991
[3,  1000] loss: 0.993
[3,  1200] loss: 0.983
[3,  1400] loss: 0.974
[4,   200] loss: 0.972
[4,   400] loss: 0.973
[4,   600] loss: 0.942
[4,   800] loss: 0.975
[4,  1000] loss: 0.955
[4,  1200] loss: 0.991
[4,  1400] loss: 0.960
[5,   200] loss: 0.942
[5,   400] loss: 0.972
[5,   600] loss: 0.959
[5,   800] loss: 0.962
[5,  1000] loss: 0.941
[5,  1200] loss: 0.923
[5,  1400] loss: 0.928


### Test

In [12]:
model.eval()

p = []
ground_truth = []
for batch in test_loader:
  input_ids, input_masks, labels = (t[1].to(device) for t in batch.items())
  # ground_truth.append(tokenizer.decode(labels))
  labels = list(labels)
  for label in labels:
    label = label.cpu().numpy()
    label = list(label)
    if 1 in label:
      label = label[1:label.index(1)-1]
    ground_truth.append(tokenizer.decode(label, skip_special_tokens=True))

  with torch.no_grad():
    preds = model.generate(input_ids, attention_mask=input_masks, num_beams=5, max_length=128)
    for pred in preds:
      t = pred.cpu().numpy()
      text = tokenizer.decode(t, skip_special_tokens=True)
      p.append(text)

In [13]:
def calculate_bleu(predictions, ground_truth):
  length = len(predictions)
  # print(length)
  bleu_sum = 0
  for i in range(length):
    pred = predictions[i].split(' ')
    truth = ground_truth[i].split(' ')
    count = 0
    for word in pred:
      if word in truth:
        count += 1
    bleu_sum += count / len(pred)
  final_score = bleu_sum / length
  return final_score

In [14]:
print(calculate_bleu(p, ground_truth))

0.25617471750892584
