In [5]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('test_1000.csv')
total_rows = len(df)
print(f"Total rows in dataset: {total_rows}")



Total rows in dataset: 1000


In [4]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

torch.cuda.empty_cache()

Using device: cuda


In [None]:
!pip install --upgrade --force-reinstall torch torchvision torchaudio

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader


# Add summarization prefix (optional but recommended)
df['input_text'] = 'summarize: ' + df['text']

# If not already present, create dummy summaries (for testing)
df['summary'] = df['text'].apply(lambda x: x[:100])  # placeholder

# Train-validation split
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Custom Dataset
class T5Dataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.input_texts = df['input_text'].tolist()
        self.target_texts = df['summary'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_encoding = self.tokenizer(
            self.input_texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target_encoding = self.tokenizer(
            self.target_texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

# Create datasets
train_dataset = T5Dataset(train_df, tokenizer)
val_dataset = T5Dataset(val_df, tokenizer)

# Load model (after dataset setup is totally fine)
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model = model.to(device)


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)

In [None]:
pip install rouge_score evaluate

In [None]:
import evaluate


rouge = evaluate.load("rouge")

def compute_metrics(p):
    # If predictions is a tuple, get the first element
    predictions = p.predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    labels = p.label_ids

    # Convert predicted logits to token IDs if needed
    if predictions.ndim == 3:
        predictions = predictions.argmax(-1)

    # Replace -100 in labels as tokenizer.decode can't handle them
    labels = [[(token if token != -100 else tokenizer.pad_token_id) for token in label] for label in labels]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # Return result directly, assuming it's already a dict of floats
    return result



In [9]:
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=1,   # batch size for training
    per_device_eval_batch_size=1,    # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    no_cuda=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,                     # the model to train
    args=training_args,              # training arguments
    train_dataset=train_dataset,     # training dataset
    eval_dataset=val_dataset,        # evaluation dataset
    compute_metrics=compute_metrics, # Rouge Metric
)

# Start training
trainer.train()
#It will ask you to enter an API token, you should sign up at https://wandb.ai to get a token 

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,1.7353
1000,0.039
1500,0.0285
2000,0.0254
2500,0.0247


TrainOutput(global_step=2700, training_loss=0.3449237339584916, metrics={'train_runtime': 266.0556, 'train_samples_per_second': 10.148, 'train_steps_per_second': 10.148, 'total_flos': 365422863974400.0, 'train_loss': 0.3449237339584916, 'epoch': 3.0})

In [10]:
trainer.evaluate()

{'eval_loss': 0.021979399025440216,
 'eval_rouge1': 0.9428657319959093,
 'eval_rouge2': 0.9389322103268396,
 'eval_rougeL': 0.9428445305734144,
 'eval_rougeLsum': 0.9430208292592478,
 'eval_runtime': 470.317,
 'eval_samples_per_second': 0.213,
 'eval_steps_per_second': 0.213,
 'epoch': 3.0}