In [1]:
from datasets import load_dataset
# Load the "cnn_dailymail" dataset, config "3.0.0". Later cells map over
# `dataset` and iterate its "test" split.
# The recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'datasets'", so the `datasets` package
# has to be installed in the kernel environment before this can run.
dataset = load_dataset("cnn_dailymail", "3.0.0")

ModuleNotFoundError: No module named 'datasets'

In [None]:
from transformers import BartTokenizer
# Tokenizer for the "facebook/bart-large-cnn" checkpoint. preprocess_function
# reads this notebook-level name, so it must be bound before the map call runs.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

def preprocess_function(examples):
    """Tokenize a batch of articles and reference summaries for seq2seq training.

    Args:
        examples: Batch mapping with "article" and "highlights" entries. Each is
            iterated as a sequence of strings (this function is mapped with
            ``batched=True``).

    Returns:
        The tokenizer output for the articles (truncated to 1024 tokens) with an
        extra "labels" entry holding the token ids of the highlights (truncated
        to 128 tokens).
    """
    # NOTE(review): a "summarize: " task prefix is a T5 convention; confirm it
    # is actually wanted for a BART checkpoint. Kept so model inputs are unchanged.
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager. It is a separate call because the summaries use a different
    # max_length than the articles.
    labels = tokenizer(
        text_target=examples["highlights"], max_length=128, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run preprocess_function over `dataset` in batched mode. The "train" and
# "validation" splits of the result are what the BART Trainer cell consumes.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
import nltk
from rouge_score import rouge_scorer

# Fetch the 'punkt' resource; create_extractive_labels calls nltk.sent_tokenize.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead -- confirm
# against the installed NLTK version.
nltk.download('punkt')
# ROUGE-L scorer with stemming, read as a global by create_extractive_labels.
# The evaluation cell later rebinds `scorer` to a rouge1 + rougeL scorer.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def create_extractive_labels(example):
    """Attach sentence-level extractive labels to one example.

    The "article" text is split into sentences with NLTK. Each sentence is
    scored against the full "highlights" text and labelled 1 when its ROUGE-L
    F-measure is strictly greater than 0.5, otherwise 0.

    Args:
        example: Mapping with "article" and "highlights" strings.

    Returns:
        The same ``example`` object with two added entries: "labels" (list of
        0/1 ints) and "sentences" (the sentence list), aligned by position.
    """
    sentences = nltk.sent_tokenize(example["article"])
    reference = example["highlights"]
    # 0.5 is the selection threshold; adjust as needed.
    flags = [
        int(scorer.score(reference, sentence)['rougeL'].fmeasure > 0.5)
        for sentence in sentences
    ]
    example["labels"] = flags
    example["sentences"] = sentences
    return example

# Apply create_extractive_labels one example at a time (no batched=True), adding
# per-article "sentences" and "labels" lists alongside the original columns.
labeled_dataset = dataset.map(create_extractive_labels)

In [None]:
from transformers import (
    BartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

# Fine-tune the BART summarization checkpoint on the tokenized articles.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=3e-5,
    fp16=True,
)

# preprocess_function truncates but does not pad, so examples in a batch have
# different lengths. Without a padding collator the Trainer cannot stack them
# into tensors. DataCollatorForSeq2Seq pads inputs and labels per batch.
# This relies on `tokenizer` still being the BART tokenizer bound above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)
trainer.train()

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer

# NOTE: this rebinds the notebook-level `tokenizer` (previously the BART
# tokenizer) to the BERT tokenizer used by the sentence classifier.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_extractive(examples):
    """Flatten a batch of articles into one tokenized row per sentence.

    Args:
        examples: Batch mapping whose "sentences" and "labels" entries are
            lists (one per article) of per-sentence strings and 0/1 labels,
            as produced by create_extractive_labels.

    Returns:
        The tokenizer output for every sentence in the batch (padded to the
        model max length, truncated) with a "labels" entry holding one integer
        label per sentence. The number of output rows therefore differs from
        the number of input articles.
    """
    # Under batched=True each column is a list of per-article lists. Passing
    # that straight to the tokenizer would not yield one sentence per row, and
    # "labels" would stay a variable-length list per article.
    sentences = [sent for doc in examples["sentences"] for sent in doc]
    labels = [label for doc in examples["labels"] for label in doc]
    encoded = tokenizer(sentences, padding="max_length", truncation=True)
    encoded["labels"] = labels
    return encoded

# The row count changes (articles -> sentences), so the original columns must
# be dropped. Columns returned by the function, including "labels", are kept.
extractive_dataset = labeled_dataset.map(
    tokenize_extractive,
    batched=True,
    remove_columns=labeled_dataset["train"].column_names,
)

# Two-class BERT sequence classifier for the extractive (keep / drop) labels.
# NOTE: `model`, `training_args` and `trainer` rebind the names used by the
# BART training cell, so the BART objects are no longer reachable by those names.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Separate output directory from the abstractive run ("./results").
training_args = TrainingArguments(
    output_dir="./results_extractive",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
)

# Train on the "train" split of extractive_dataset; "validation" is registered
# as the evaluation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=extractive_dataset["train"],
    eval_dataset=extractive_dataset["validation"],
)
trainer.train()

In [None]:
from rouge_score import rouge_scorer

# ROUGE-1 and ROUGE-L scorer (with stemming) used for the final evaluation.
# This rebinds the `scorer` created for extractive labelling.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def evaluate_summary(predicted, reference):
    """Score a generated summary against its reference with ROUGE.

    Args:
        predicted: The generated summary text.
        reference: The reference (gold) summary text.

    Returns:
        The scorer's result mapping, keyed by 'rouge1' and 'rougeL'.
    """
    # RougeScorer.score expects (target, prediction). Passing the prediction
    # first would swap the reported precision and recall.
    return scorer.score(reference, predicted)

def _mean_fmeasure(scores, keys=('rouge1', 'rougeL')):
    """Average the F-measure of each ROUGE key over a list of score mappings.

    Each element of ``scores`` maps a ROUGE key to an object exposing
    ``.fmeasure``. Returns ``{key: mean F-measure}``, with 0.0 for every key
    when ``scores`` is empty.
    """
    if not scores:
        return {key: 0.0 for key in keys}
    return {
        key: sum(score[key].fmeasure for score in scores) / len(scores)
        for key in keys
    }

# Apply to test dataset
abstractive_scores = []
extractive_scores = []

# NOTE(review): generate_abstractive / generate_extractive are not defined in
# the visible cells; they must be defined elsewhere in the notebook before
# this cell runs.
for example in dataset["test"]:
    # Generate abstractive summary
    abstractive_summary = generate_abstractive(example["article"])
    # Generate extractive summary
    extractive_summary = generate_extractive(example["article"])
    # Compute scores
    abstractive_scores.append(evaluate_summary(abstractive_summary, example["highlights"]))
    extractive_scores.append(evaluate_summary(extractive_summary, example["highlights"]))

# Average scores. Each per-example entry is a Score tuple, not a number, so
# summing the entries directly raises TypeError; average the F-measure instead.
avg_abstractive_rouge = _mean_fmeasure(abstractive_scores)
avg_extractive_rouge = _mean_fmeasure(extractive_scores)