In [None]:
import pandas as pd 
import numpy as np 
import json 
import re 
import nltk

In [None]:
# Download the NLTK resources this notebook requests, in a fixed order.
nltk_resources = ('punkt', 'stopwords', 'wordnet')
nltk_download_results = [nltk.download(resource) for resource in nltk_resources]
# Keep the final call's return value as the cell output, as before.
nltk_download_results[-1]

In [None]:
import json
import os
import pandas as pd 

# Directory that holds train.csv / validation.csv / test.csv.
# Set the TEXT_SUMMARIZER_DATA_DIR environment variable to point the notebook
# at another location without editing this cell; when it is unset the original
# author's local checkout is used, so existing runs behave exactly as before.
DEFAULT_DATA_DIR = "C:\\Users\\anura\\OneDrive\\Documents\\GitHub\\Text-Summarizer\\data\\"
folder_path = os.environ.get("TEXT_SUMMARIZER_DATA_DIR", DEFAULT_DATA_DIR)

# os.path.join copes with folder_path values given with or without a trailing
# separator (plain string concatenation silently produced wrong paths otherwise).
train_path = os.path.join(folder_path, "train.csv")
validation_path = os.path.join(folder_path, "validation.csv")
test_path = os.path.join(folder_path, "test.csv")

# Fail early, naming every missing file, instead of stopping at the first read.
missing_files = [
    path for path in (train_path, validation_path, test_path)
    if not os.path.isfile(path)
]
if missing_files:
    raise FileNotFoundError(
        "Dataset file(s) not found: " + ", ".join(missing_files)
        + ". Set TEXT_SUMMARIZER_DATA_DIR to the folder containing the CSV files."
    )

train_df = pd.read_csv(train_path)
validation_df = pd.read_csv(validation_path)
test_df = pd.read_csv(test_path)

In [None]:
# (rows, columns) of the train, validation and test frames, in that order.
tuple(frame.shape for frame in (train_df, validation_df, test_df))

In [None]:
# Peek at one randomly sampled row from each split, with a dashed rule after each.
ds = [train_df, validation_df, test_df]

for df in ds:
    print(df.sample(1), "--" * 20, sep="\n")

# Preprocessing

In [None]:
from transformers import PreTrainedTokenizer
from typing import Dict, List

class SummarizationPreprocessor:
    """Tokenize batches of article/summary pairs for seq2seq fine-tuning.

    Instances are callable so they can be passed straight to
    ``Dataset.map(..., batched=True)``.

    Args:
        tokenizer: Callable tokenizer. It is invoked once with the source
            texts and once with ``text_target=`` for the summaries, and must
            return a mapping that contains ``"input_ids"``.
        max_input_length: Length every article is truncated/padded to.
        max_target_length: Length every summary is truncated/padded to.
    """

    # Label value that marks positions to be left out of the training loss.
    LABEL_IGNORE_INDEX = -100

    # The annotation is quoted so that defining the class does not require the
    # name to be resolvable at runtime.
    def __init__(self, tokenizer: "PreTrainedTokenizer",
                 max_input_length: int = 1024,
                 max_target_length: int = 128):
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __call__(self, batch: Dict[str, List[str]]) -> Dict[str, List[List[int]]]:
        """Tokenize one batch.

        Args:
            batch: Mapping with an ``"article"`` list (inputs) and a
                ``"highlights"`` list (target summaries) of equal length.

        Returns:
            The tokenizer output for the articles with an extra ``"labels"``
            entry: the summary token ids, where padding positions are replaced
            by ``LABEL_IGNORE_INDEX``.
        """
        # Tokenize the article (input text)
        inputs = self.tokenizer(
            batch["article"],
            max_length=self.max_input_length,
            truncation=True,
            padding="max_length"
        )

        # Tokenize the highlights (target summary). ``text_target`` replaces
        # the deprecated ``as_target_tokenizer()`` context manager.
        labels = self.tokenizer(
            text_target=batch["highlights"],
            max_length=self.max_target_length,
            truncation=True,
            padding="max_length"
        )

        # Mask padding in the labels so it does not contribute to the loss;
        # otherwise the model is also trained to predict pad tokens.
        # NOTE(review): if a tokenizer reuses its EOS id as the pad id, the EOS
        # position would be masked too — not the case for distinct pad/EOS ids.
        label_ids = labels["input_ids"]
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            label_ids = [
                [self.LABEL_IGNORE_INDEX if token == pad_id else token for token in sequence]
                for sequence in label_ids
            ]

        # Add labels to the inputs
        inputs["labels"] = label_ids
        return inputs

In [None]:
from datasets import load_dataset
from transformers import BartTokenizer

# Load the tokenizer from the same checkpoint as the model that is fine-tuned
# later in this notebook ("facebook/bart-base") so the two cannot drift apart.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Load your CSVs (each file is read on its own, so its rows sit under "train")
train_dataset = load_dataset("csv", data_files=train_path)["train"]
val_dataset = load_dataset("csv", data_files=validation_path)["train"]
test_dataset = load_dataset("csv", data_files=test_path)["train"]

# Create preprocessor
preprocessor = SummarizationPreprocessor(tokenizer)


def tokenize_split(dataset):
    """Tokenize one split in batches and drop its raw text columns.

    Args:
        dataset: A loaded split that has "article" and "highlights" columns.

    Returns:
        The result of mapping ``preprocessor`` over ``dataset`` with the
        "article" and "highlights" columns removed.
    """
    return dataset.map(
        preprocessor,
        batched=True,
        remove_columns=["article", "highlights"],
    )


# Apply it to every split through the single helper above
tokenized_train = tokenize_split(train_dataset)
tokenized_val = tokenize_split(val_dataset)
tokenized_test = tokenize_split(test_dataset)


In [None]:
# Summarise the three tokenized splits, one per line.
print(
    f"training dataset: {tokenized_train}",
    f"validation dataset: {tokenized_val}",
    f"test dataset: {tokenized_test}",
    sep="\n",
)

# Model training

In [None]:
from transformers import BartForConditionalGeneration

# Start fine-tuning from the pretrained "facebook/bart-base" checkpoint.
bart_checkpoint = "facebook/bart-base"
model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)

In [None]:
import torch
# Show the installed PyTorch version as this cell's output.
torch.__version__

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Upgrade accelerate in the kernel's environment to at least 0.26.0.
# NOTE(review): libraries already imported in this session may need a kernel
# restart before they see the upgraded package — confirm.
%pip install --upgrade "accelerate>=0.26.0"

In [None]:
# Print the metadata of the accelerate package installed in this environment.
%pip show accelerate

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:

# Hyper-parameters for fine-tuning. Evaluation and logging run every 500
# steps and a checkpoint is written every 1000 steps into ./results.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    save_steps=1000,
    per_device_train_batch_size=2,        # Small batch for limited GPU
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,        # Effective batch of 2 * 4 = 8 per device
    num_train_epochs=3,
    save_total_limit=2,                   # Keep only the two most recent checkpoints
    predict_with_generate=True,           # ✅ Needed for summarization
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # this cell fail on CPU-only machines, so switch it on only when a GPU exists.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    report_to="none",                     # Can be "wandb", "tensorboard", etc.
)

In [None]:
# Collect everything the trainer needs in one place before building it.
trainer_kwargs = dict(
    model=model,                     # the seq2seq model to fine-tune
    args=training_args,              # Seq2SeqTrainingArguments defined above
    train_dataset=tokenized_train,   # tokenized training split
    eval_dataset=tokenized_val,      # tokenized validation split
    tokenizer=tokenizer,             # same tokenizer that produced the splits
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

# Run fine-tuning; the call's return value is shown as the cell output.
trainer.train()