In [None]:
!pip install transformers

In [None]:
import torch
from transformers import AutoTokenizer, BartForConditionalGeneration, AdamW, get_scheduler
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import json

In [None]:
# Run settings for the notebook; the whole mapping is handed to train_model() as `config`.
CONFIG = dict(
    model_name="facebook/bart-large",  # checkpoint id passed to from_pretrained()
    train_batch_size=2,                # DataLoader batch size for the training set
    eval_batch_size=2,                 # DataLoader batch size for the evaluation set
    learning_rate=3e-5,                # optimizer learning rate
    num_train_epochs=4,                # number of full passes over the training loader
    warmup_steps=500,                  # num_warmup_steps of the linear LR scheduler
    weight_decay=0.01,
    logging_steps=10,
    save_steps=100,
    output_dir="/kaggle/working/finetuned_model",  # target of model.save_pretrained()
    use_cuda=True,
)

In [None]:
# Data Preprocessor
class LabeledDataset(Dataset):
    """Map-style dataset that tokenizes question/answer records on access.

    The data file is parsed as JSON and must yield an indexable collection
    (e.g. a JSON array) whose entries expose ``"question"`` and ``"answer"``
    keys. Tokenization happens lazily in ``__getitem__``.

    Args:
        data_file: Path of the JSON file to load.
        tokenizer_name: Identifier handed to ``AutoTokenizer.from_pretrained``.
        max_len: Length every example is padded / truncated to.
    """

    def __init__(self, data_file, tokenizer_name="facebook/bart-large", max_len=512):
        self.data = self.load_data(data_file)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_len = max_len

    def load_data(self, data_file):
        """Parse ``data_file`` as JSON and return the decoded object."""
        # JSON text is UTF-8; be explicit so decoding does not depend on the
        # platform's default locale encoding.
        with open(data_file, "r", encoding="utf-8") as f:
            return json.load(f)

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx``.

        Returns:
            dict with
              * ``"input_ids"`` / ``"attention_mask"``: the tokenizer output with
                its leading batch dimension removed;
              * ``"labels"``: a constant scalar tensor ``1`` (placeholder, it does
                not depend on the record).
        """
        item = self.data[idx]
        # Question and answer are passed as the tokenizer's first and second text
        # arguments, so they end up in a single padded/truncated encoding.
        inputs = self.tokenizer(
            item["question"],
            item["answer"],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) drops only the batch axis added by return_tensors="pt"; a bare
        # squeeze() would also collapse a length-1 sequence into a 0-d tensor and
        # break batching.
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": torch.tensor(1),
        }

In [None]:
# Model Trainer
def _mask_padding(input_ids, attention_mask):
    """Return a copy of ``input_ids`` with padded positions replaced by -100."""
    # -100 is the index the Hugging Face loss ignores; without it the loss is
    # dominated by the padding tokens of max-length padded batches.
    return input_ids.masked_fill(attention_mask == 0, -100)


def _evaluate(model, eval_loader, device):
    """Return the mean batch loss over ``eval_loader`` (``None`` if it is empty)."""
    model.eval()
    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=_mask_padding(input_ids, attention_mask),
            )
            total_loss += outputs.loss.item()
            num_batches += 1
    return total_loss / num_batches if num_batches else None


def train_model(train_dataset, eval_dataset, config):
    """Fine-tune a BART model to reconstruct its own (non-padded) input tokens.

    Args:
        train_dataset: Dataset whose items are dicts holding ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        eval_dataset: Dataset with the same item layout, evaluated after every
            epoch; pass ``None`` to skip evaluation.
        config: Mapping providing ``model_name``, ``train_batch_size``,
            ``eval_batch_size``, ``learning_rate``, ``num_train_epochs``,
            ``warmup_steps`` and ``output_dir``. Optional keys: ``use_cuda``
            (default ``True``) and ``weight_decay`` (default ``0.0``).

    Returns:
        None. The trained model is written to ``config["output_dir"]``.
    """
    # Honour the use_cuda switch instead of grabbing the GPU unconditionally.
    use_cuda = bool(config.get("use_cuda", True)) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print(f"🔧 Using device: {device}")

    model = BartForConditionalGeneration.from_pretrained(config["model_name"])
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=config["train_batch_size"], shuffle=True)
    eval_loader = None
    if eval_dataset is not None:
        eval_loader = DataLoader(eval_dataset, batch_size=config["eval_batch_size"])

    # torch's AdamW replaces the deprecated optimizer of the same name that
    # transformers used to ship.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config["learning_rate"],
        weight_decay=config.get("weight_decay", 0.0),
    )
    num_training_steps = len(train_loader) * config["num_train_epochs"]

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=config["warmup_steps"],
        num_training_steps=num_training_steps,
    )

    for epoch in range(config["num_train_epochs"]):
        # Re-enter train mode every epoch: evaluation below switches to eval mode.
        model.train()
        print(f"📚 Training Epoch {epoch + 1}")
        loop = tqdm(train_loader, leave=True)

        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=_mask_padding(input_ids, attention_mask),
            )
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            loop.set_description(f"Epoch {epoch + 1}")
            loop.set_postfix(loss=loss.item())

        if eval_loader is not None:
            eval_loss = _evaluate(model, eval_loader, device)
            if eval_loss is not None:
                print(f"📊 Eval loss after epoch {epoch + 1}: {eval_loss:.4f}")

    model.save_pretrained(config["output_dir"])
    print(f"✅ Model saved to {config['output_dir']}")

In [None]:
# Build the training and evaluation datasets; both are read from the same JSON file.
train_dataset, eval_dataset = (
    LabeledDataset("/kaggle/input/ros-youtube-question-answers/labeled_data.json")
    for _ in range(2)
)

In [None]:
# Launch fine-tuning with the datasets and settings defined in the cells above.
train_model(
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    config=CONFIG,
)