In [1]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np
from transformers import pipeline

In [2]:
# Log in to the Hugging Face Hub. Later cells push the trained model there
# (push_to_hub=True in the training arguments, then trainer.push_to_hub()).
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\arjun\.cache\huggingface\token
Login successful


In [3]:
# Load only the "ca_test" split of the BillSum dataset.
billsum = load_dataset("billsum", split="ca_test")

Found cached dataset billsum (C:/Users/arjun/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc)


In [4]:
# Display the first record to see which fields each example carries.
billsum[0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nThe Legislature finds and declares all of the following:\n(a) (1) Since 1899 congressionally chartered veterans’ organizations have provided a valuable service to our nation’s returning service members. These organizations help preserve the memories and incidents of the great hostilities fought by our nation, and preserve and strengthen comradeship among members.\n(2) These veterans’ organizations also own and manage various properties including lodges, posts, and fraternal halls. These properties act as a safe haven where veterans of all ages and their families can gather together to find camaraderie and fellowship, share stories, and seek support from people who understand their unique experiences. This aids in the healing process for these returning veterans, and ensures their health and happiness.\n(b) As a result of congressional chartering of these veterans’ organizations, the United States Inte

In [5]:
# Keep only the first 10 records (presumably to keep this demo run small).
billsum = billsum.select(range(10))

In [6]:
# Drop the "title" column; preprocess_function reads only "text" and "summary".
# NOTE(review): this cell rebinds `billsum` in place, so re-running it after the
# column is gone is expected to fail — confirm and re-run from the load cell.
billsum = billsum.remove_columns("title")

In [7]:
# Split the selected records 80/20 into "train" and "test".
# The fixed seed makes the shuffle deterministic, so the split (and every
# metric computed from it) is the same after Restart Kernel -> Run All.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

In [8]:
# Display the first example of the "train" split produced above.
billsum["train"][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nThe Legislature finds and declares as follows:\n(a) More than $40 million of funding for the training of California’s primary care physicians is expiring in 2016.\n(b) Each year in California, only 368 slots are available to the thousands of medical students seeking to train in family medicine. If the funding is not replaced, 158 of those slots will be lost, creating a terrible deficit of primary care physicians in California’s underserved communities.\n(c) Only 36 percent of California’s active patient care physicians practice primary care. Twenty-three of California’s 58 counties fall below the minimum required primary care physician to population ratio.\n(d) As of 2010, California needed an estimated additional 8,243 primary care physicians by 2030 to prevent projected shortages in the state, which is about 412 new primary care physicians per year.\n(e) More than 32 percent of California’s practici

In [9]:
# Tokenizer for the same "t5-small" checkpoint that is loaded as the model below.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Every entry of ``examples["text"]`` is prepended with ``prefix`` and
    tokenized with truncation at 1024 tokens. The entries of
    ``examples["summary"]`` are tokenized as targets with truncation at 128
    tokens, and their ``input_ids`` are stored under ``"labels"`` in the
    encoding that is returned.
    """
    prompts = []
    for document in examples["text"]:
        prompts.append(prefix + document)

    encoded = tokenizer(prompts, max_length=1024, truncation=True)
    targets = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    encoded["labels"] = targets["input_ids"]
    return encoded

In [11]:
# Run preprocess_function over the splits; with batched=True the function
# receives lists of examples rather than one example at a time.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Load the pretrained "t5-small" checkpoint as a sequence-to-sequence LM.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/242M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [13]:
# Batch collator built from the tokenizer and model; handed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [18]:
# ROUGE metric object; compute_metrics calls rouge.compute on decoded text.
rouge = evaluate.load("rouge")

In [19]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is itself a tuple, its first element is used.

    Returns:
        dict: Everything returned by ``rouge.compute`` plus ``"gen_len"``
        (mean number of non-pad tokens per prediction), with each value
        rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a vocabulary id and cannot be decoded. Labels use it for
    # ignored positions, and predictions can contain it as well when generated
    # batches of different lengths are concatenated, so map it to the pad id
    # in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Count tokens on the cleaned predictions so -100 filler is not counted
    # as generated content.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [22]:
training_args = Seq2SeqTrainingArguments(
    # Local output directory; push_to_hub=True ties the run to a Hub repo.
    output_dir="Arjun2102/test_summarizer",
    push_to_hub=True,
    # Optimisation schedule.
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=False,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch, generating summaries so ROUGE can be computed.
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Cap on the number of checkpoints kept.
    save_total_limit=3,
)



PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Cloning https://huggingface.co/Arjun2102/test_summarizer into local empty directory.


In [None]:
# Wire the model, tokenized splits, collator and metric function into one trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    compute_metrics=compute_metrics,
)

In [23]:
# Run fine-tuning. With evaluation_strategy="epoch" the evaluation split is
# scored after every epoch (see the per-epoch table in the output).
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4
  Number of trainable parameters = 60506624
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33marjunrampalm2002[0m ([33mgeeky-void[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,4.643019,0.1355,0.0377,0.0999,0.0999,19.0
2,No log,4.591436,0.1355,0.0377,0.0999,0.0999,19.0
3,No log,4.556171,0.1355,0.0377,0.0999,0.0999,19.0
4,No log,4.538184,0.1355,0.0377,0.0999,0.0999,19.0


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 16
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

The following columns in the eva

TrainOutput(global_step=4, training_loss=5.122552871704102, metrics={'train_runtime': 236.2127, 'train_samples_per_second': 0.135, 'train_steps_per_second': 0.017, 'total_flos': 8661875294208.0, 'train_loss': 5.122552871704102, 'epoch': 4.0})

In [26]:
# Save the model and tokenizer files and upload them to the Hub repository.
trainer.push_to_hub()

Saving model checkpoint to Arjun2102/test_summarizer
Configuration saved in Arjun2102/test_summarizer\config.json
Configuration saved in Arjun2102/test_summarizer\generation_config.json
Model weights saved in Arjun2102/test_summarizer\pytorch_model.bin
tokenizer config file saved in Arjun2102/test_summarizer\tokenizer_config.json
Special tokens file saved in Arjun2102/test_summarizer\special_tokens_map.json
Copy vocab file to Arjun2102/test_summarizer\spiece.model


Upload file pytorch_model.bin:   0%|          | 32.0k/231M [00:00<?, ?B/s]

Upload file training_args.bin: 100%|##########| 3.56k/3.56k [00:00<?, ?B/s]

Upload file runs/Feb20_16-20-21_LAPTOP-SGNGK0NT/events.out.tfevents.1676890237.LAPTOP-SGNGK0NT.12476.0: 100%|#…

Upload file runs/Feb20_16-20-21_LAPTOP-SGNGK0NT/1676890237.2563534/events.out.tfevents.1676890237.LAPTOP-SGNGK…

Upload file spiece.model:   4%|4         | 32.0k/773k [00:00<?, ?B/s]

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/Arjun2102/test_summarizer
   ec01fad..6ebbfd6  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Arjun2102/test_summarizer
   ec01fad..6ebbfd6  main -> main

To https://huggingface.co/Arjun2102/test_summarizer
   6ebbfd6..b6254d2  main -> main

   6ebbfd6..b6254d2  main -> main



'https://huggingface.co/Arjun2102/test_summarizer/commit/6ebbfd6f1314a0a7c10e94e929a38444d10c8085'

In [30]:
# Sample input for the summarization pipeline below: a multi-paragraph passage about cricket.
text = '''Cricket is a bat-and-ball game played between two teams of eleven players on a field at the centre of which is a 22-yard (20-metre) pitch with a wicket at each end, each comprising two bails balanced on three stumps. The batting side scores runs by striking the ball bowled at one of the wickets with the bat and then running between the wickets, while the bowling and fielding side tries to prevent this (by preventing the ball from leaving the field, and getting the ball to either wicket) and dismiss each batter (so they are "out"). Means of dismissal include being bowled, when the ball hits the stumps and dislodges the bails, and by the fielding side either catching the ball after it is hit by the bat, but before it hits the ground, or hitting a wicket with the ball before a batter can cross the crease in front of the wicket. When ten batters have been dismissed, the innings ends and the teams swap roles. The game is adjudicated by two umpires, aided by a third umpire and match referee in international matches. They communicate with two off-field scorers who record the match's statistical information.

Forms of cricket range from Twenty20, with each team batting for a single innings of 20 overs (each "over" being a set of 6 fair opportunities for the batting team to score) and the game generally lasting three hours, to Test matches played over five days. Traditionally cricketers play in all-white kit, but in limited overs cricket they wear club or team colours. In addition to the basic kit, some players wear protective gear to prevent injury caused by the ball, which is a hard, solid spheroid made of compressed leather with a slightly raised sewn seam enclosing a cork core layered with tightly wound string.

The earliest reference to cricket is in South East England in the mid-16th century. It spread globally with the expansion of the British Empire, with the first international matches in the second half of the 19th century. The game's governing body is the International Cricket Council (ICC), which has over 100 members, twelve of which are full members who play Test matches. The game's rules, the Laws of Cricket, are maintained by Marylebone Cricket Club (MCC) in London. The sport is followed primarily in South Asia, Australasia, the United Kingdom, Southern Africa and the West Indies.[1]

Women's cricket, which is organised and played separately, has also achieved international standard. The most successful side playing international cricket is Australia, which has won seven One Day International trophies, including five World Cups, more than any other country and has been the top-rated Test side more than any other country.'''

In [31]:
# Build a summarization pipeline around the fine-tuned model and run it on
# the sample passage; the cell displays the list the pipeline returns.
summarizer = pipeline(
    task="summarization",
    model="Arjun2102/test_summarizer",
)
summarizer(text)

loading configuration file Arjun2102/test_summarizer\config.json
Model config T5Config {
  "_name_or_path": "Arjun2102/test_summarizer",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_d

[{'summary_text': 'cricket is played between two teams of eleven players on a 22-yard (20-metre) pitch . each wicket comprises two bails balanced on three stumps . ten batters have been dismissed, the innings ends and the teams swap roles .'}]