In [1]:
from datasets import load_dataset
import pandas as pd

# Load the California test split of BillSum (a smaller subset of the dataset)
ds = load_dataset("FiscalNote/billsum", split="ca_test")

# Keep only the bill text and its reference summary, as a pandas DataFrame
df = ds.to_pandas()[['text', 'summary']]

# Preview the first three rows (df already holds just these two columns)
print(df.head(3))

                                                text  \
0  The people of the State of California do enact...   
1  The people of the State of California do enact...   
2  The people of the State of California do enact...   

                                             summary  
0  Existing property tax law establishes a vetera...  
1  Existing law provides that the Board of Parole...  
2  The Sales and Use Tax Law imposes a tax on ret...  


In [None]:
!pip install --upgrade --force-reinstall torch torchvision torchaudio

In [3]:
pip install sentencepiece





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer
from torch.utils.data import Dataset

# Fetch the California test split of the FiscalNote billsum dataset
ds = load_dataset("FiscalNote/billsum", split="ca_test")

# Work on a small sample: the two needed columns, first 100 rows only
df = ds.to_pandas()[['text', 'summary']].head(100)

# Sanity check: show the first 3 rows
print(df.head(3))

# T5 expects a task prefix, so build the model input as "summarize: <bill text>"
df = df.assign(input_text='summarize: ' + df['text'])

# Hold out 10% of the sample for validation (fixed seed for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Tokenizer of the t5-small checkpoint
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Custom Dataset class for T5
class T5Dataset(Dataset):
    """Map-style dataset that tokenizes (input_text, summary) pairs on access.

    Args:
        df: Table with an ``input_text`` column (model input) and a
            ``summary`` column (target text).
        tokenizer: Callable tokenizer exposing ``pad_token_id``. It is called
            with ``max_length``, ``padding='max_length'``, ``truncation=True``
            and ``return_tensors='pt'``.
        max_len: Fixed length the inputs are padded/truncated to.
        target_max_len: Fixed length the targets are padded/truncated to.
            ``None`` (the default) reuses ``max_len``, which matches the
            previous behaviour.
    """

    def __init__(self, df, tokenizer, max_len=512, target_max_len=None):
        self.input_texts = df['input_text'].tolist()
        self.target_texts = df['summary'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.target_max_len = max_len if target_max_len is None else target_max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_texts)

    def _encode(self, text, max_length):
        """Tokenize one string to exactly ``max_length`` positions."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        input_encoding = self._encode(self.input_texts[idx], self.max_len)
        target_encoding = self._encode(self.target_texts[idx], self.target_max_len)

        # squeeze(0) removes only the leading batch axis added by
        # return_tensors='pt'; a bare squeeze() would also drop the sequence
        # axis whenever its length is 1.
        labels = target_encoding['input_ids'].squeeze(0)
        # Replace padding token ids in the labels by -100 so they are ignored
        # by the loss function. Use the tokenizer owned by this dataset rather
        # than a module-level global, which may be missing or a different one.
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

# Create datasets
# Wrap each split in a T5Dataset (default max_len), sharing the one tokenizer
train_dataset, val_dataset = (
    T5Dataset(split_df, tokenizer) for split_df in (train_df, val_df)
)


                                                text  \
0  The people of the State of California do enact...   
1  The people of the State of California do enact...   
2  The people of the State of California do enact...   

                                             summary  
0  Existing property tax law establishes a vetera...  
1  Existing law provides that the Board of Parole...  
2  The Sales and Use Tax Law imposes a tax on ret...  


In [40]:
# Batch iterators over the two datasets: 8 examples per batch, and only the
# training data is shuffled.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
)

In [None]:
pip install rouge_score evaluate

In [41]:


import evaluate


# Load the ROUGE metric through the `evaluate` library; compute_metrics calls rouge.compute on it
rouge = evaluate.load("rouge")

def compute_metrics(p):
    """Compute ROUGE between decoded predictions and decoded reference labels.

    Args:
        p: Evaluation result object exposing ``predictions`` and
            ``label_ids``. ``predictions`` may be a tuple (only its first
            element is used) and may hold either token ids or 3-D scores
            over the vocabulary, in which case the argmax over the last
            axis is taken.

    Returns:
        The value returned by ``rouge.compute`` for the decoded texts.
    """
    import numpy as np  # local import keeps this cell self-contained

    # If predictions is a tuple, get the first element
    predictions = p.predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(p.label_ids)

    # Convert predicted logits to token IDs if needed
    if predictions.ndim == 3:
        predictions = predictions.argmax(-1)

    # -100 is not a valid token id, so map it to the pad id before decoding.
    # This is done for labels (where -100 marks ignored positions) and, for
    # safety, for predictions as well. np.where does it in one vectorized
    # pass instead of a Python loop over every token.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    return rouge.compute(predictions=decoded_preds, references=decoded_labels)



In [42]:
from transformers import T5ForConditionalGeneration
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Pretrained t5-small checkpoint, moved onto the selected device
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model = model.to(device)


In [43]:
from transformers import Trainer, TrainingArguments

# Run settings and hyperparameters for fine-tuning
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="no",
)

# Trainer wired to the model, both dataset splits, and the ROUGE metric function
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Launch fine-tuning.
# Note: it will ask you to enter an API token; sign up at https://wandb.ai to get one.
trainer.train()



Step,Training Loss


TrainOutput(global_step=36, training_loss=5.7835133870442705, metrics={'train_runtime': 1125.3591, 'train_samples_per_second': 0.24, 'train_steps_per_second': 0.032, 'total_flos': 36542286397440.0, 'train_loss': 5.7835133870442705, 'epoch': 3.0})

In [44]:
# Save the fine-tuned model and the tokenizer files into the same directory,
# so both can later be reloaded from a single path.
model.save_pretrained('./my_finetuned_models')
tokenizer.save_pretrained('./my_finetuned_models')


('./my_finetuned_models\\tokenizer_config.json',
 './my_finetuned_models\\special_tokens_map.json',
 './my_finetuned_models\\spiece.model',
 './my_finetuned_models\\added_tokens.json')

In [None]:
# Run evaluation with the trainer, which was constructed with val_dataset as
# its eval_dataset and compute_metrics as its metric function.
trainer.evaluate()

In [46]:
def generate_summary(text, model, tokenizer, device, max_input_length=512, max_output_length=100):
    """Summarize ``text`` with ``model`` using beam search.

    Args:
        text: Raw text to summarize; the "summarize: " task prefix is added here.
        model: Object providing ``eval()`` and ``generate(...)``.
        tokenizer: Object providing ``encode(...)`` and ``decode(...)``.
        device: Target passed to ``.to(...)`` on the encoded input.
        max_input_length: Truncation length passed to the tokenizer.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    # Switch the model to evaluation mode before generating
    model.eval()

    prompt = "summarize: " + text
    encoded = tokenizer.encode(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    )
    encoded = encoded.to(device)

    # Generation settings: 4 beams, length penalty 2.0, early stopping enabled
    generation_options = dict(
        max_length=max_output_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    candidates = model.generate(encoded, **generation_options)

    first_candidate = candidates[0]
    return tokenizer.decode(first_candidate, skip_special_tokens=True)


In [47]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Pick the compute device: CUDA when available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Restore the tokenizer and the fine-tuned model from the saved directory
tokenizer = T5Tokenizer.from_pretrained('./my_finetuned_models')
model = T5ForConditionalGeneration.from_pretrained('./my_finetuned_models')

# Move the model to the device; as the cell's last expression its result is displayed
model.to(device)



T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [50]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Directory holding the saved fine-tuned model and tokenizer (relative path)
model_path = "my_finetuned_models"

# Run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Restore tokenizer and model, place the model on the device, switch to eval mode
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
model.eval()

# Build the prompt from the first dataset record, using the "summarize: " task prefix
input_text = "summarize: " + ds[0]["text"]

# Tokenize (truncated to 512 tokens) and move every returned tensor to the device
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Generate with max_length=100 and decode the first returned sequence
output_ids = model.generate(**inputs, max_length=100)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Show the prompt followed by the generated text
print("Input:\n", input_text)
print("\nOutput:\n", output_text)


Input:
 summarize: The people of the State of California do enact as follows:


SECTION 1.
The Legislature finds and declares all of the following:
(a) (1) Since 1899 congressionally chartered veterans’ organizations have provided a valuable service to our nation’s returning service members. These organizations help preserve the memories and incidents of the great hostilities fought by our nation, and preserve and strengthen comradeship among members.
(2) These veterans’ organizations also own and manage various properties including lodges, posts, and fraternal halls. These properties act as a safe haven where veterans of all ages and their families can gather together to find camaraderie and fellowship, share stories, and seek support from people who understand their unique experiences. This aids in the healing process for these returning veterans, and ensures their health and happiness.
(b) As a result of congressional chartering of these veterans’ organizations, the United States In

In [34]:
# Hand-written sample bill text(s) to run through the summarizer
examples = [
    "Text: The people of the State of California do enact as follows: SECTION 1. Section 123456 of the Health and Safety Code is amended to read: 123456. (a) Any facility that manages hazardous waste shall comply with the following requirements: (1) Obtain a permit from the Department of Toxic Substances Control. (2) Submit an annual report detailing the types and quantities of hazardous waste managed. (b) The Department of Toxic Substances Control shall establish regulations to ensure compliance with this section... [continues with more details about regulations, penalties, and implementation]. ",
]

# Print each example (numbered from 1) followed by its generated summary
for example_no, text in enumerate(examples, start=1):
    print(f"\nExample {example_no}")
    print("Original:", text)
    generated = generate_summary(text, model, tokenizer, device)
    print("Summary:", generated)



Example 1
Original: Text: The people of the State of California do enact as follows: SECTION 1. Section 123456 of the Health and Safety Code is amended to read: 123456. (a) Any facility that manages hazardous waste shall comply with the following requirements: (1) Obtain a permit from the Department of Toxic Substances Control. (2) Submit an annual report detailing the types and quantities of hazardous waste managed. (b) The Department of Toxic Substances Control shall establish regulations to ensure compliance with this section... [continues with more details about regulations, penalties, and implementation]. 
Summary: The people of the State of California do enact as follows: SECTION 1. Section 123456 of the Health and Safety Code is amended to read: 123456. (a) Any facility that manages hazardous waste shall comply with the following requirements: (1) Obtain a permit from the Department of Toxic Substances Control. (2) Submit an annual report detailing the types and quantities of h