In [23]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [24]:
from datasets import load_dataset
billsum=load_dataset("billsum",split='ca_test')

In [25]:
billsum=billsum.train_test_split(test_size=0.2)

In [26]:
billsum['train'][0]

 'summary': 'Existing law, the Swimming Pool Safety Act, provides that it does not apply to any pool within the jurisdiction of any political subdivision that adopts an ordinance for swimming pools, as specified. The act further requires, when a building permit is issued for construction of a new swimming pool or spa, or the remodeling of an existing pool or spa, at a private, single-family home, that the pool or spa be equipped with at least 1 of 7 drowning prevention safety features. The act requires the local building code official to inspect and approve the drowning safety prevention devices before the issuance of a final approval for the completion of permitted construction or remodeling work.\nThis bill would instead require, when a building permit is issued, that the pool or spa be equipped with at least 2 of the 7 drowning prevention safety features. By imposing additional duties on local officials, this bill would impose a state-mandated local program. The bill would remove th

In [27]:
from transformers import AutoTokenizer
model_ckpt='t5-small'
tokenizer=AutoTokenizer.from_pretrained(model_ckpt)

In [28]:
prefix='summarize: '
def preprocess_function(examples):
    inputs=[prefix+doc for doc in examples['text']]
    model_inputs=tokenizer(text_target=examples['summary'],max_length=128,truncation=True)
    labels=tokenizer(text_target=examples['summary'],max_length=128,truncation=True)
    model_inputs['labels']=labels['input_ids']
    return model_inputs

In [29]:
tokenized_billsum=billsum.map(preprocess_function,batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [30]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_ckpt)

In [31]:
pip install evaluate rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [32]:
import evaluate
rouge=evaluate.load('rouge')

In [33]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [34]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

In [35]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_text_summarization_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.188945,0.2549,0.2401,0.2531,0.2533,19.0
2,No log,0.070482,0.2549,0.2409,0.2529,0.2529,19.0
3,No log,0.041371,0.2553,0.2414,0.2539,0.2541,19.0
4,No log,0.028024,0.2554,0.2419,0.2543,0.2545,19.0
5,No log,0.021679,0.2558,0.2424,0.2552,0.2555,19.0
6,No log,0.018071,0.2558,0.2424,0.2552,0.2555,19.0
7,No log,0.015542,0.2558,0.2424,0.2552,0.2555,19.0
8,No log,0.014364,0.2558,0.2424,0.2552,0.2555,19.0
9,0.210900,0.013775,0.2558,0.2424,0.2552,0.2555,19.0
10,0.210900,0.013521,0.2558,0.2424,0.2552,0.2555,19.0




TrainOutput(global_step=620, training_loss=0.1788851676448699, metrics={'train_runtime': 234.8303, 'train_samples_per_second': 42.116, 'train_steps_per_second': 2.64, 'total_flos': 334632604139520.0, 'train_loss': 0.1788851676448699, 'epoch': 10.0})

In [38]:
trainer.push_to_hub()

events.out.tfevents.1700611721.149fc0351f3e.47.0:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'https://huggingface.co/camie-cool-2903/my_text_summarization_model/tree/main/'

In [40]:
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."


tokenizer = AutoTokenizer.from_pretrained("camie-cool-2903/my_text_summarization_model")
inputs = tokenizer(text, return_tensors="pt").input_ids

model = AutoModelForSeq2SeqLM.from_pretrained("camie-cool-2903/my_text_summarization_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)


Downloading config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [41]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in American history. it'll ask the ultra-wealthy and corporations to pay their fair share."

In [None]:
# # from transformers import pipeline
# # summarizer = pipeline("summarization", model="my_text_summarization_model")
# # summarizer(text)
# def summarize(input):
#     output = summarizer(input)
#     return output[0]['summary_text']

In [None]:
# import gradio as gr
# gr.close_all()

In [None]:
# demo = gr.Interface(fn=summarize, 
#                     inputs=[gr.Textbox(label="Text to summarize", lines=6)],
#                     outputs=[gr.Textbox(label="Result", lines=3)],
#                     title="Text summarization with Mini T5",
#                     description="Summarize any text using the `T5 mini` model under the hood!"
#                    )
# demo.launch(share=True)