In [1]:
import evaluate
import numpy as np
import tensorflow as tf
from transformers import (AutoTokenizer, DataCollatorForSeq2Seq, create_optimizer, 
                          AdamWeightDecay, TFAutoModelForSeq2SeqLM, AutoModelForSeq2SeqLM, 
                          Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline)
from datasets import Dataset

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/BBC_data.csv')

In [4]:
df.head()

Unnamed: 0,id,article,summary
0,1,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...
1,2,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...
2,3,Ask Jeeves tips online ad revival\n\nAsk Jeeve...,Ask Jeeves has become the third leading online...
3,4,Australia rates at four year high\n\nAustralia...,The Reserve Bank of Australia lifted interest ...
4,5,US company admits Benin bribery\n\nA US defenc...,A US defence and telecommunications company ha...


In [5]:
df = Dataset.from_pandas(df)

In [6]:
df

Dataset({
    features: ['id', ' article ', ' summary'],
    num_rows: 2225
})

In [7]:
df[' article '][0]

'Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sig

In [8]:
df[' summary'][0]

"TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn.Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.Time Warner's fourth quarter profits were slightly better than analysts' expectations."

In [9]:
df = df.train_test_split(test_size=0.2)

In [10]:
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [11]:
prefix = "summarize: "


def preprocess_function(dataset):
    inputs = [prefix + doc for doc in dataset[' article ']]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=dataset[' summary'], max_length=128, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [12]:
tokenized_dataset = df.map(preprocess_function, batched=True)

Map:   0%|          | 0/1780 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

In [13]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [14]:
rouge = evaluate.load("rouge")

In [15]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [16]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [17]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=1,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id,  summary,  article . If id,  summary,  article  are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1780
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 1780
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.3758,0.290318,0.226,0.1867,0.2162,0.2158,19.0


Saving model checkpoint to my_awesome_billsum_model\checkpoint-500
Configuration saved in my_awesome_billsum_model\checkpoint-500\config.json
Model weights saved in my_awesome_billsum_model\checkpoint-500\pytorch_model.bin
tokenizer config file saved in my_awesome_billsum_model\checkpoint-500\tokenizer_config.json
Special tokens file saved in my_awesome_billsum_model\checkpoint-500\special_tokens_map.json
Saving model checkpoint to my_awesome_billsum_model\checkpoint-1000
Configuration saved in my_awesome_billsum_model\checkpoint-1000\config.json
Model weights saved in my_awesome_billsum_model\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in my_awesome_billsum_model\checkpoint-1000\tokenizer_config.json
Special tokens file saved in my_awesome_billsum_model\checkpoint-1000\special_tokens_map.json
Saving model checkpoint to my_awesome_billsum_model\checkpoint-1500
Configuration saved in my_awesome_billsum_model\checkpoint-1500\config.json
Model weights saved in my_awesome

TrainOutput(global_step=1780, training_loss=0.4791894805565309, metrics={'train_runtime': 118949.2059, 'train_samples_per_second': 0.015, 'train_steps_per_second': 0.015, 'total_flos': 239975552679936.0, 'train_loss': 0.4791894805565309, 'epoch': 1.0})

In [27]:
text = "summarize: Gradient descent is an iterative optimization algorithm used to minimize a cost function in order to find the optimal parameters m and c for a machine learning model. It’s a fundamental technique used in training models to make accurate predictions by adjusting the model’s parameters in the right direction to reduce the error between predictions and actual values. The goal of gradient descent is to find the local minimum (or maximum) of a function. Gradient decent first assigns the model’s parameters with some initial value(generally m = 1 and c = 1) and checks the cost function. After that gradient decent algorithm calculates the gradient of the cost function(Partial derivation) with respect to the model’s parameters. The gradient essentially indicates the direction and magnitude of the steepest increase of the cost function. It’s a vector that points in the direction of the greatest increase in the cost."

In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("my_awesome_billsum_model/checkpoint-1500")
inputs = tokenizer(text, return_tensors="pt").input_ids

loading file spiece.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [29]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_billsum_model/checkpoint-1500")

loading configuration file my_awesome_billsum_model/checkpoint-1500\config.json
Model config T5Config {
  "_name_or_path": "my_awesome_billsum_model/checkpoint-1500",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
 

In [30]:
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [31]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Gradient decent first assigns the model’s parameters with some initial value(generally m = 1 and c = 1) and checks the cost function.After that gradient decent algorithm calculates the gradient of the cost function(Partial derivation) with respect to the model’s parameters.The goal of gradient descent is to find the local minimum (or maximum) of a function.After that gradient decent algorithm calculates the gradient of the cost function(Partial deriv'