In [None]:
# Install necessary libraries
!pip install transformers torch pandas sklearn

import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# ✅ Load the scraped data
df = pd.read_csv("hindustan_times_articles.csv")
# Select the two columns used for training *before* dropping NaNs, so that a
# missing value in some other CSV column does not discard an otherwise usable
# (Title, Article) pair.
df = df[["Title", "Article"]].dropna()
df = df[df["Article"].str.len() > 100]  # Remove short articles (<= 100 characters)

# ✅ Split into train & validation sets (90% / 10%, fixed seed for reproducibility)
train_data, val_data = train_test_split(df, test_size=0.1, random_state=42)

# ✅ Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# ✅ Custom dataset class
class SummarizationDataset(Dataset):
    """Map-style dataset pairing each article (input) with its title (target).

    Items are tokenized on access and padded/truncated to fixed lengths, so
    every item has the same shape and can be stacked into a batch.

    Args:
        df: DataFrame with an "Article" column (model input text) and a
            "Title" column (target text).
        max_input_length: Token length articles are padded/truncated to.
        max_target_length: Token length titles are padded/truncated to.
    """

    def __init__(self, df, max_input_length=512, max_target_length=150):
        self.articles = df["Article"].tolist()
        self.summaries = df["Title"].tolist()
        # Captured once from the module-level tokenizer; all methods go
        # through self.tokenizer rather than reaching for the global again.
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of (article, title) pairs."""
        return len(self.articles)

    def _encode(self, text, max_length):
        """Tokenize *text* to exactly *max_length* tokens as a (1, max_length) batch."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the tensors for pair *idx*.

        Returns:
            dict with "input_ids" and "attention_mask" for the article and
            "labels" for the title, each a 1-D tensor. Padding positions in
            "labels" are set to -100.
        """
        input_enc = self._encode(self.articles[idx], self.max_input_length)
        output_enc = self._encode(self.summaries[idx], self.max_target_length)

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also collapse any other
        # size-1 dimension (e.g. when a max length of 1 is requested).
        labels = output_enc["input_ids"].squeeze(0)
        target_mask = output_enc["attention_mask"].squeeze(0)
        # -100 is the index the loss ignores; without this the model is also
        # trained (and scored) on predicting padding tokens.
        labels = labels.masked_fill(target_mask == 0, -100)

        return {
            "input_ids": input_enc["input_ids"].squeeze(0),
            "attention_mask": input_enc["attention_mask"].squeeze(0),
            "labels": labels,
        }

# ✅ Create DataLoaders
# Wrap each split in the dataset class defined above.
train_dataset, val_dataset = (
    SummarizationDataset(split) for split in (train_data, val_data)
)

# Only the training loader shuffles; validation keeps the default order.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=4)

# ✅ Load Pretrained T5 Model
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.to(device)

# ✅ Set up optimizer
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

# ✅ Training Loop
# Each epoch: one optimisation pass over train_loader, then a gradient-free
# pass over val_loader so that train and validation loss can be compared.
EPOCHS = 3
for epoch in range(EPOCHS):
    print(f"🚀 Epoch {epoch+1}/{EPOCHS}")

    model.train()
    train_loss_sum = 0.0
    train_batches = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss_sum += loss.item()
        train_batches += 1

    # Report the mean of the per-batch losses rather than whatever the last
    # batch happened to produce; max(..., 1) keeps an empty loader from
    # raising instead of reporting 0.
    train_loss = train_loss_sum / max(train_batches, 1)

    # Validation pass: eval mode disables dropout, no_grad skips autograd
    # bookkeeping. model.train() at the top of the loop re-enables training
    # mode for the next epoch.
    model.eval()
    val_loss_sum = 0.0
    val_batches = 0
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device),
            )
            val_loss_sum += outputs.loss.item()
            val_batches += 1
    val_loss = val_loss_sum / max(val_batches, 1)

    print(f"✅ Epoch {epoch+1} - Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")

# ✅ Save the fine-tuned model to Google Drive
model_path = "/content/drive/MyDrive/Models/Text_Summary/fine_tuned_t5"
# The model and the tokenizer are both written into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)

print(f"🎉 Model saved to {model_path}")
