In [None]:
import torch

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Mon Oct 31 15:19:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    46W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers wget datasets sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset, load_metric
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
import math
from google.colab import drive, auth
import os
import gc
import numpy as np
import random

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
# Collect unreachable Python objects first so any tensors they still hold are
# released, then return PyTorch's now-unused cached CUDA blocks to the driver.
# (In the opposite order, memory freed by the collector would stay cached.)
gc.collect()
torch.cuda.empty_cache()

44

In [None]:
# Drive is mounted at `base_dir`; the user's own files live in its MyDrive folder.
base_dir = '/content/drive'
mount_dir = os.path.join(base_dir, 'MyDrive')

# Authenticate and mount only when the Drive folder is not visible yet,
# so re-running this cell does nothing once the mount exists.
if not os.path.exists(mount_dir):
  auth.authenticate_user()
  drive.mount(base_dir)

In [None]:
import pandas as pd

DATA_HOME = 'nlp'

# Both CSV splits are read from the same Drive sub-folder.
train_df, test_df = (
    pd.read_csv(os.path.join(mount_dir, DATA_HOME, csv_name))
    for csv_name in ('hw_train.csv', 'hw_test.csv')
)

# Display (rows, columns) of each split as a quick sanity check.
train_df.shape, test_df.shape

((4970, 4), (2130, 4))

In [None]:
# Peek at one randomly chosen training row.
train_df.sample(n=1)

Unnamed: 0,question,answer,difficulty,context
2360,Is Spanish the most spoken language in the world?,No,M,spanish is the primary language in 20 countrie...


In [None]:
# Name of the Drive sub-folder that holds the model checkpoints.
model_home = 'QG-t5'

# Create the folder under the mounted Drive (no error if it already exists).
# Done in Python rather than through the shell so the location follows
# `mount_dir` and a failure raises instead of leaving a bad `model_home`.
os.makedirs(os.path.join(mount_dir, model_home), exist_ok=True)
model_home

'QG-t5'

In [None]:
# Checkpoint to resume from: `<model_mark>-<ckpt_epoch>-<ckpt_batch>.ckpt`
# inside the model folder on Drive.
ckpt_epoch, ckpt_batch = 0, 0
model_mark = 'hw'
model_path = '{}/{}/{}-{}-{}.ckpt'.format(mount_dir, model_home, model_mark, ckpt_epoch, ckpt_batch)

# Tokenizer of the base model, extended with the `<sep>` token that separates
# the answer from the context in the encoder prompt.
base_model = "t5-base"
tkn = T5Tokenizer.from_pretrained(base_model)
tkn.sep_token = '<sep>'
tkn.add_tokens([tkn.sep_token])

if not os.path.exists(model_path):
  # No checkpoint yet: start from the pretrained weights and grow the embedding
  # matrix so it matches the extended tokenizer.
  print('Create new model')
  b2b = T5ForConditionalGeneration.from_pretrained(base_model).to(device)
  b2b.resize_token_embeddings(len(tkn))
else:
  # Checkpoint found: load it as-is (no embedding resize is applied here).
  print('Load existed model: {}'.format(model_path))
  b2b = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Load existed model: /content/drive/MyDrive/QG-t5/hw-0-0.ckpt


In [None]:
# Training schedule: number of passes over the data and examples per step.
epochs, batch_size = 5, 30
# Number of training examples (rows of the training frame).
train_size = len(train_df)

In [None]:
# Token budgets used when padding/truncating: encoder side (answer + context
# prompt) and decoder side (question text, also the generation length cap).
enc_max_length, dec_max_length = 512, 64

In [None]:
# Encoder inputs: one "answer: ... <sep> context: ..." prompt per training row,
# padded/truncated to exactly `enc_max_length` tokens.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# zipping the two columns avoids building a Series per row with `iterrows()`.
ctx_dict = tkn.batch_encode_plus(
    ['answer: %s <sep> context: %s' % (answer, context)
     for answer, context in zip(train_df['answer'], train_df['context'])],
    add_special_tokens=True,
    max_length=enc_max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)



In [None]:
# Decoder targets: one "question: ..." string per training row,
# padded/truncated to exactly `dec_max_length` tokens.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# iterating the column directly avoids building a Series per row with `iterrows()`.
que_dict = tkn.batch_encode_plus(
    ['question: %s' % (question,) for question in train_df['question']],
    add_special_tokens=True,
    max_length=dec_max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

In [None]:
# Shapes of the encoded prompts and encoded questions, in that order.
tuple(enc.input_ids.shape for enc in (ctx_dict, que_dict))

(torch.Size([4970, 512]), torch.Size([4970, 64]))

In [None]:
from itertools import cycle

# Bundle the aligned tensors so each dataset item is
# (context ids, context mask, question ids, question mask).
train_cmp_set = TensorDataset(*(
  getattr(enc, field)
  for enc in (ctx_dict, que_dict)
  for field in ('input_ids', 'attention_mask')
))

# Batches are drawn in random order through an explicit RandomSampler.
train_loader = DataLoader(
  train_cmp_set,
  batch_size=batch_size,
  sampler=RandomSampler(train_cmp_set),
)
train_loader_len = len(train_loader)

# Optimisation steps for the full schedule, and the optimiser itself.
# NOTE(review): `AdamW` here is the one imported from `transformers`; confirm
# whether it should be migrated to `torch.optim.AdamW`.
total_steps = epochs * train_loader_len
opt = AdamW(b2b.parameters(), lr=2e-5, eps=1e-8)



In [None]:
import time

print('Batch num: ', train_loader_len)

# Checkpoint naming used below: `<mark>-<epoch>-0.ckpt` is written once <epoch>
# has fully finished; `<mark>-<epoch>-<batch>.ckpt` (batch > 0) denotes a
# mid-epoch save taken after <batch> steps.
if os.path.exists(model_path):
  # The checkpoint file exists, so its work is already done: continue after the
  # finished epoch, or re-enter the interrupted one and skip its first
  # `ckpt_batch` steps.
  start_epoch = ckpt_epoch + 1 if ckpt_batch == 0 else ckpt_epoch
  resume_batch = ckpt_batch
else:
  # No checkpoint on disk (fresh model): train every epoch in full.
  start_epoch = 0
  resume_batch = 0

for epoch_i in range(start_epoch, epochs):
  b2b.train()

  # Only the epoch in which the checkpoint was taken has steps to skip.
  # The sampler reshuffles every epoch, so the skipped batches are not
  # necessarily the ones seen before the interruption.
  skip_batches = resume_batch if epoch_i == ckpt_epoch else 0

  total_loss = 0
  batches_run = 0   # batches actually trained on in this epoch
  seg_loss = 0
  seg_batches = 0   # batches accumulated since the last progress line
  t0 = time.time()

  for step, batch in enumerate(train_loader):
    step_up = step + 1
    if step_up <= skip_batches:
      continue

    b_ctx_ids = batch[0].to(device)
    b_ctx_attn = batch[1].to(device)
    b_que_ids = batch[2].to(device)
    b_que_attn = batch[3].to(device)

    # Padded target positions must not contribute to the loss: label -100 is
    # ignored by the model's cross-entropy, so mask every position whose
    # attention mask is 0.
    b_labels = b_que_ids.masked_fill(b_que_attn == 0, -100)

    out = b2b(
      input_ids=b_ctx_ids,
      attention_mask=b_ctx_attn,
      labels=b_labels,
      decoder_attention_mask=b_que_attn,
    )
    loss = out.loss
    loss_val = loss.item()

    total_loss += loss_val
    seg_loss += loss_val
    batches_run += 1
    seg_batches += 1

    opt.zero_grad()
    loss.backward()
    opt.step()

    # Progress line every 10 steps: last loss, mean loss of the segment, and
    # wall-clock seconds spent on the segment.
    if step_up % 10 == 0:
      t1 = time.time()
      time_cost = t1 - t0
      t0 = t1
      seg_avg_loss = seg_loss / seg_batches
      seg_loss = 0
      seg_batches = 0
      print('\r{}\t{}:\t{}\t{}\t{:.5}'.format(epoch_i, step_up, loss_val, seg_avg_loss, time_cost), end='')
    # if step_up % 1000 == 0:
    #   b2b.save_pretrained('{}/{}/{}-{}-{}.ckpt'.format(mount_dir, model_home, model_mark, epoch_i, step_up))

  # Average over the batches that were really processed (guarding against an
  # epoch in which every batch was skipped).
  avg_loss = total_loss / max(batches_run, 1)
  print('\repoch({}):\t{}'.format(epoch_i, avg_loss))

  # End-of-epoch checkpoint; batch index 0 marks "epoch finished".
  b2b.save_pretrained('{}/{}/{}-{}-0.ckpt'.format(mount_dir, model_home, model_mark, epoch_i))

Batch num:  166
epoch(1):	0.3758445832743702
2	30:	0.358169287443161	0.36276060044765474	8.9624

KeyboardInterrupt: ignored

In [None]:
# Switch the model to evaluation mode and keep a handle to it for inference.
# NOTE(review): `eval()` is expected to return the module itself, making
# `b2beval` an alias of `b2b` — confirm.
b2b.eval()
b2beval = b2b

In [None]:
# Pick one random test example by position. `random.randrange(n)` yields
# 0..n-1, whereas `random.randint(0, n)` includes n and can point one past the
# last row.
test_line = test_df.iloc[random.randrange(test_df.shape[0])]

# Encoder prompt for the sampled row, in the same format as the training
# prompts. `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
p_ctx_dict = tkn.batch_encode_plus(
    ['answer: %s <sep> context: %s' % (
      test_line['answer'],
      test_line['context'],
    )],
    add_special_tokens=True,
    max_length=enc_max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# Reference question encoded the same way as the training targets.
p_que_dict = tkn.batch_encode_plus(
    ['question: %s' % (
      test_line['question'],
    )],
    add_special_tokens=True,
    max_length=dec_max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

test_line



question      In the Arp peculiar galaxy catalog, which two ...
answer                                     NGC 772 and NGC 770.
difficulty                                                    H
context       ngc 772 is a spiral galaxy with an integrated ...
Name: 1509, dtype: object

In [None]:
# Beam-search a question for the sampled prompt. Only the first row of the
# encoded prompt is sent to the model, and the output is capped at the same
# length used for the training targets.
out = b2beval.generate(
    **{
        key: p_ctx_dict[key][:1].to(device)
        for key in ('input_ids', 'attention_mask')
    },
    max_length=dec_max_length,
    num_beams=4,
    early_stopping=True,
    length_penalty=1.5,
    no_repeat_ngram_size=3,
)

In [None]:
# Decode the first generated sequence, keeping special tokens in the text.
tkn.decode(out[0], skip_special_tokens=False)

'<pad> question: Which two galaxies together are also classified as Arp 78 in the Arp peculiar galaxy catalog?</s>'

In [None]:
# Reference question of the sampled row, to compare with the generated one.
test_line.loc['question']

'In the Arp peculiar galaxy catalog, which two galaxies are together classified as Arp 78?'