In [1]:
from google.colab import drive
import os
import sys
drive.mount('/content/gdrive/', force_remount=True)
outdir = "/content/gdrive/MyDrive/project"
assert not os.system("mkdir -p "+outdir)
sys.path.insert(0, outdir)

Mounted at /content/gdrive/


In [2]:
result = "/content/gdrive/MyDrive/result"
import transformers
from datasets import load_dataset, load_metric, load_from_disk
import datasets
import random
import pandas as pd
from IPython.display import display, HTML
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import nltk
import numpy as np
from transformers import AutoTokenizer
nltk.download('punkt', download_dir=result)
nltk.data.path.append(result)
raw_datasets = load_dataset("ccdv/pubmed-summarization", cache_dir=result)
metric = load_metric("rouge")

[nltk_data] Downloading package punkt to
[nltk_data]     /content/gdrive/MyDrive/result...
[nltk_data]   Package punkt is already up-to-date!


Downloading:   0%|          | 0.00/4.88k [00:00<?, ?B/s]

No config specified, defaulting to: pub_med_summarization_dataset/document
Reusing dataset pub_med_summarization_dataset (/content/gdrive/MyDrive/result/ccdv___pub_med_summarization_dataset/document/1.0.0/5792402f4d618f2f4e81ee177769870f365599daa729652338bac579552fec30)


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [3]:
def show_random_elements(dataset, num_examples=5):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

def preprocess_function(examples, max_input_length=4096, max_target_length = 256):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["abstract"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

# raw_datasets["train"][0]
# show_random_elements(raw_datasets["train"])
# fake_preds = ["hello there", "general kenobi"]
# fake_labels = ["hello there", "general kenobi"]
# metric.compute(predictions=fake_preds, references=fake_labels)
# tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
# tokenized_datasets.save_to_disk(result+"/tokenized_datasets")
#["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:

In [4]:
model_checkpoint = "t5-base"
#model_checkpoint = result+"/new-t5base-finetuned-pubmed/checkpoint-13000"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

tokenized_datasets = load_from_disk(result+"/tokenized_datasets")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, cache_dir=result)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [12]:
args = Seq2SeqTrainingArguments(
    f"{result}/t5base-finetuned-pubmed",
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    #weight_decay=0.01,
    eval_steps = 5000,
    save_total_limit=2,
    num_train_epochs=1,
    predict_with_generate=True,
    save_steps = 1000,
    max_steps=10000,
    fp16=True)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
Using amp fp16 backend


In [13]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: abstract, article.
***** Running training *****
  Num examples = 119924
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 10000


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
5000,1.8573,1.632931,13.7025,4.9971,12.3986,12.9104,19.0
10000,1.8445,1.632931,13.7025,4.9971,12.3986,12.9104,19.0


Saving model checkpoint to /content/gdrive/MyDrive/result/t5base-finetuned-pubmed/checkpoint-1000
Configuration saved in /content/gdrive/MyDrive/result/t5base-finetuned-pubmed/checkpoint-1000/config.json
Model weights saved in /content/gdrive/MyDrive/result/t5base-finetuned-pubmed/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/result/t5base-finetuned-pubmed/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/result/t5base-finetuned-pubmed/checkpoint-1000/special_tokens_map.json
Copy vocab file to /content/gdrive/MyDrive/result/t5base-finetuned-pubmed/checkpoint-1000/spiece.model
Saving model checkpoint to /content/gdrive/MyDrive/result/t5base-finetuned-pubmed/checkpoint-2000
Configuration saved in /content/gdrive/MyDrive/result/t5base-finetuned-pubmed/checkpoint-2000/config.json
Model weights saved in /content/gdrive/MyDrive/result/t5base-finetuned-pubmed/checkpoint-2000/pytorch_model.bin
tokenizer config 

TrainOutput(global_step=10000, training_loss=1.842577880859375, metrics={'train_runtime': 10787.8071, 'train_samples_per_second': 3.708, 'train_steps_per_second': 0.927, 'total_flos': 5.48062101504e+16, 'train_loss': 1.842577880859375, 'epoch': 0.33})

In [7]:
trainer.train(resume_from_checkpoint=result+"/new-t5base-finetuned-pubmed/checkpoint-11500")

Loading model from /content/gdrive/MyDrive/result/new-t5base-finetuned-pubmed/checkpoint-11500).
The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, abstract.
***** Running training *****
  Num examples = 119924
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 29981
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 11500
  Will skip the first 0 epochs then the first 11500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/11500 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0,,0.0,0.0,0.0,0.0,0.0


Saving model checkpoint to /content/gdrive/MyDrive/result/new-t5base-finetuned-pubmed/checkpoint-12000
Configuration saved in /content/gdrive/MyDrive/result/new-t5base-finetuned-pubmed/checkpoint-12000/config.json
Model weights saved in /content/gdrive/MyDrive/result/new-t5base-finetuned-pubmed/checkpoint-12000/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/result/new-t5base-finetuned-pubmed/checkpoint-12000/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/result/new-t5base-finetuned-pubmed/checkpoint-12000/special_tokens_map.json
Copy vocab file to /content/gdrive/MyDrive/result/new-t5base-finetuned-pubmed/checkpoint-12000/spiece.model
Deleting older checkpoint [/content/gdrive/MyDrive/result/new-t5base-finetuned-pubmed/checkpoint-10500] due to args.save_total_limit
Saving model checkpoint to /content/gdrive/MyDrive/result/new-t5base-finetuned-pubmed/checkpoint-12500
Configuration saved in /content/gdrive/MyDrive/result/new-t5base-fi

TrainOutput(global_step=29981, training_loss=0.05502376236395029, metrics={'train_runtime': 17131.1848, 'train_samples_per_second': 7.0, 'train_steps_per_second': 1.75, 'total_flos': 1.6431449865191424e+17, 'train_loss': 0.05502376236395029, 'epoch': 1.0})

In [None]:
tokenized_datasets["test"][1:10]

In [None]:
#trainer.evaluate()
outputs = trainer.predict(tokenized_datasets["test"])
output_str = tokenizer.batch_decode(outputs[0], skip_special_tokens=True)
raw_datasets["test"].to_csv(result+"/test.csv")
pd.DataFrame(output_str).to_csv(result+"/pred.csv")