In [1]:
import pandas as pd

# Read the training data from disk and report how many records it holds
df = pd.read_csv('train.csv')
total_rows = df.shape[0]
print(f"Total rows in dataset: {total_rows}")

Total rows in dataset: 117108


In [None]:
!pip install --upgrade --force-reinstall torch torchvision torchaudio

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader


# Prepend the T5 task prefix so the model knows which task it is asked to perform
df['input_text'] = 'summarize: ' + df['text']

# Fall back to placeholder targets (the first 100 characters of each text) only
# when the data has no 'summary' column, so reference summaries that were loaded
# from the CSV are never overwritten.
if 'summary' not in df.columns:
    df['summary'] = df['text'].apply(lambda x: x[:100])  # placeholder

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Load the tokenizer that belongs to the 't5-small' checkpoint
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Custom Dataset
class T5Dataset(Dataset):
    """Map-style dataset that tokenizes (input_text, summary) pairs on access.

    Args:
        df: DataFrame with an 'input_text' column (model inputs) and a
            'summary' column (target texts).
        tokenizer: Callable tokenizer exposing ``pad_token_id``. It is called
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` keyword arguments and must return a mapping with
            'input_ids' and 'attention_mask' tensors of shape (1, max_len).
        max_len: Length every input and target sequence is padded/truncated to.

    Each item is a dict with 1-D tensors 'input_ids', 'attention_mask' and
    'labels'. Padding positions in 'labels' hold ``IGNORE_INDEX`` so they do
    not contribute to the training loss; code that decodes labels back to text
    must map ``IGNORE_INDEX`` to the pad token id first.
    """

    # Label value skipped by the loss (default ignore_index of PyTorch cross-entropy)
    IGNORE_INDEX = -100

    def __init__(self, df, tokenizer, max_len=512):
        self.input_texts = df['input_text'].tolist()
        self.target_texts = df['summary'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_texts)

    def _encode(self, text):
        """Tokenize one string, padded/truncated to ``max_len``, as (1, max_len) tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Tokenize and return the example at position ``idx``."""
        input_encoding = self._encode(self.input_texts[idx])
        target_encoding = self._encode(self.target_texts[idx])

        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_len == 1.
        # clone() keeps the in-place masking below from touching the encoding.
        labels = target_encoding['input_ids'].squeeze(0).clone()

        # Mask padding so the loss is computed on real target tokens only
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            labels[labels == pad_id] = self.IGNORE_INDEX

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

# Wrap each split in a T5Dataset; examples are tokenized lazily on access
train_dataset, val_dataset = (
    T5Dataset(split_df, tokenizer) for split_df in (train_df, val_df)
)

# Instantiate the pretrained 't5-small' model (independent of the dataset setup above)
model = T5ForConditionalGeneration.from_pretrained('t5-small')


In [None]:
# Batch both splits eight examples at a time; only the training loader shuffles
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=8)

In [None]:
pip install rouge_score

In [None]:
from datasets import load_metric

# Load the ROUGE metric once so every evaluation call reuses the same object
# NOTE(review): `load_metric` may be unavailable in recent `datasets` releases — confirm against the installed version
rouge = load_metric('rouge')


def compute_metrics(p):
    """Compute ROUGE F-measures for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)`` as handed over by the Trainer.
            ``predictions`` may be token ids of shape (batch, seq), raw logits
            of shape (batch, seq, vocab), or a tuple of model outputs whose
            first element is the logits. ``labels`` holds target token ids and
            may contain -100 at ignored positions.

    Returns:
        dict mapping each ROUGE variant name to its mid f-measure.
    """
    import numpy as np

    predictions, labels = p

    # A plain Trainer returns every model output; the logits come first
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # Raw logits -> most likely token id at each position
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is a sentinel (ignored label / gather padding), not a vocabulary id,
    # so swap it for the pad token, which skip_special_tokens then drops.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions == -100, pad_id, predictions)
    labels = np.where(np.asarray(labels) == -100, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # Keep only the mid f-measure of each ROUGE variant
    return {key: value.mid.fmeasure for key, value in result.items()}


In [None]:
from transformers import Trainer, TrainingArguments

def _logits_to_token_ids(logits, labels):
    """Reduce raw model outputs to predicted token ids before they are accumulated.

    Without this hook the Trainer keeps the full vocabulary-sized logits of
    every evaluation example in memory until compute_metrics runs.
    `labels` is unused but part of the hook's signature.
    """
    # The model may return several outputs; the logits come first
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=1,   # batch size for training
    per_device_eval_batch_size=1,    # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
)

# Initialize Trainer
trainer = Trainer(
    model=model,                     # the model to train
    args=training_args,              # training arguments
    train_dataset=train_dataset,     # training dataset
    eval_dataset=val_dataset,        # evaluation dataset
    compute_metrics=compute_metrics, # ROUGE metric
    preprocess_logits_for_metrics=_logits_to_token_ids,  # keep ids, not logits
)

# Start training
# Note: this may prompt for a Weights & Biases API token; sign up at
# https://wandb.ai to get one.
trainer.train()

In [None]:
# Run evaluation on the eval_dataset the Trainer was constructed with; as the
# cell's last expression, the returned value is shown as the cell output.
trainer.evaluate()