In [2]:
import os
import pandas as pd
from utils import prepare_labeled_sentences, prepare_labeled_sentences_spacy
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Read Datasets

In [3]:
# BBC Dataset
bbc_df = pd.read_csv("data/bbc/bbc_dataset.csv")

# CNN Datasets
# cnn_train_df = pd.read_csv("data/cnn/cnn_dailymail_train.csv")
# cnn_valid_df = pd.read_csv("data/cnn/cnn_dailymail_valid.csv")
# cnn_test_df = pd.read_csv("data/cnn/cnn_dailymail_test.csv")

imdb_df = pd.read_csv("data/imdb/imdb.csv")

In [4]:
# Preview to confirm structure
print("BBC Sample:")
display(bbc_df.head())

BBC Sample:


Unnamed: 0,Article,Summary
0,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...
1,"U2's desire to be number one\n\nU2, who have w...",But they still want more.They have to want to ...
2,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo..."
3,Snicket tops US box office chart\n\nThe film a...,A Series of Unfortunate Events also stars Scot...
4,Ocean's Twelve raids box office\n\nOcean's Twe...,"Ocean's Twelve, the crime caper sequel starrin..."


In [4]:
# print("CNN Sample:")
# display(cnn_train_df.head())

In [5]:
print("IMDB Sample:")
display(imdb_df.head())

IMDB Sample:


Unnamed: 0,Article,Summary
0,One of the other reviewers has mentioned that ...,One of the other reviewers has mentioned that ...
1,A wonderful little production The filming tech...,A wonderful little production The filming tech...
2,I thought this was wonderful way to spend time...,I thought it was proof that Woody Allen is sti...
3,Basically there a family where little boy Jake...,Basically there a family where little boy Jake...
4,Petter Mattei Love in the Time of Money is vis...,Petter Mattei Love in the Time of Money is vis...


Preprocess BBC Datasets

In [8]:
# Process the BBC dataset
bbc_labeled_data = prepare_labeled_sentences_spacy(bbc_df)

# Convert to DataFrame for modeling
bbc_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in bbc_labeled_data
    ]
)

Preprocessing articles: 100%|██████████| 2225/2225 [04:39<00:00,  7.96it/s]


In [7]:
bbc_processed_df.shape

(41677, 4)

In [8]:
# Count how many sentences are labeled as summary sentences
summary_count = bbc_processed_df['label'].sum()
total_count = len(bbc_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(bbc_processed_df[bbc_processed_df['label'] == 1].head(3))

Summary sentences: 16543 out of 41677 (39.69%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Musicians to tackle US red tape Musicians' gr...,musician tackle u red tape musician group tack...,1
1,0,A singer hoping to perform in the US can expec...,singer hop perform u expect pay simply obtain ...,1
4,0,Nigel McCune from the Musicians' Union said Br...,nigel mccune musician union say british musici...,1


In [9]:
bbc_processed_df.head(60)

Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Musicians to tackle US red tape Musicians' gr...,musician tackle u red tape musician group tack...,1
1,0,A singer hoping to perform in the US can expec...,singer hop perform u expect pay simply obtain ...,1
2,0,Groups including the Musicians' Union are call...,group include musician union call end raw deal...,0
3,0,US acts are not faced with comparable expense ...,u act face comparable expense bureaucracy visi...,0
4,0,Nigel McCune from the Musicians' Union said Br...,nigel mccune musician union say british musici...,1
5,0,A sponsor has to make a petition on their beha...,sponsor make petition behalf form amount nearl...,0
6,0,"""If you make a mistake on your form, you risk ...",make mistake form risk ban thus ability career...,0
7,0,"""The US is the world's biggest music market, w...",u world big music market mean something creaky...,1
8,0,"""The current situation is preventing British a...",current situation prevent british act maintain...,1
9,0,The Musicians' Union stance is being endorsed ...,musician union stance endorse music manager fo...,1


Preprocessed IMDB Dataset

In [9]:
# Process the BBC dataset
imdb_labeled_df = prepare_labeled_sentences_spacy(imdb_df[:4000])

# Convert to DataFrame for modeling
imdb_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in imdb_labeled_df
    ]
)

Preprocessing articles: 100%|██████████| 4000/4000 [03:23<00:00, 19.63it/s]


In [11]:
imdb_processed_df.shape

(13024, 4)

In [12]:
# Count how many sentences are labeled as summary sentences
summary_count = imdb_processed_df['label'].sum()
total_count = len(imdb_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(imdb_processed_df[imdb_processed_df['label'] == 1].head(3))

Summary sentences: 2934 out of 13024 (22.53%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1
11,4,Petter Mattei Love in the Time of Money is vis...,petter mattei love time money visually stunnin...,1


In [13]:
# print(imdb_processed_df["raw_sentence"][2])

In [14]:
imdb_processed_df.head(60)

Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,One of the other reviewers has mentioned that ...,one reviewer mention watch oz episode hook rig...,0
1,0,This show pulls no punches with regards to dru...,show pull punch regard drug sex violence hardc...,0
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
3,1,but he has all the voices down pat too You can...,voice pat truly see seamless edit guide refere...,0
4,1,but it is terrificly written and performed pie...,terrificly write perform piece masterful produ...,0
5,1,The realism really comes home with the little ...,realism really come home little thing fantasy ...,0
6,2,I thought this was wonderful way to spend time...,think wonderful way spend time hot summer week...,0
7,2,The plot is simplistic but the dialogue is wit...,plot simplistic dialogue witty character likab...,0
8,2,While some may be disappointed when they reali...,may disappoint realize match point risk addict...,0
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1


BiLSTM + Attention


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import sent_tokenize
from rouge import Rouge
import nltk
nltk.download('punkt')

def prepare_dataset(df):
    sentences = []
    labels = []
    for _, row in df.iterrows():
        article = row['article']
        summary = row['summary']
        sents = sent_tokenize(article)
        for sent in sents:
            sentences.append(sent)
            labels.append(1 if sent in summary else 0)
    return sentences, labels

def extract_summary(article, clf, vectorizer):
    sents = sent_tokenize(article)
    X_sents = vectorizer.transform(sents)
    preds = clf.predict(X_sents)
    extracted = [s for s, p in zip(sents, preds) if p == 1]
    # Return first sentence if no sentence predicted
    return " ".join(extracted) if extracted else sents[0]

def run_on_dataset(name, df):
    print(f"\n=== Running on {name} Dataset ===")
    
    # Prepare data
    sentences, labels = prepare_dataset(df)
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
    X = vectorizer.fit_transform(sentences)
    y = labels

    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Create Decision Tree classifier with tuned hyperparameters
    clf = DecisionTreeClassifier(
        max_depth=30,
        min_samples_leaf=10,
        min_samples_split=10,
        class_weight='balanced',
        random_state=42
    )
    clf.fit(X_train, y_train)

    # Train/Test accuracy
    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    print(f"Train Accuracy for {name}: {train_acc:.4f}")
    print(f"Test Accuracy for {name}: {test_acc:.4f}")

    # Classification report on test set
    y_pred = clf.predict(X_test)
    print(f"\nClassification Report for {name}:")
    print(classification_report(y_test, y_pred))

    # ROUGE evaluation on 200 samples
    sample_df = df.sample(n=200, random_state=42)
    rouge = Rouge()
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []

    for _, row in sample_df.iterrows():
        pred_summary = extract_summary(row['article'], clf, vectorizer)
        true_summary = row['summary']
        scores = rouge.get_scores(pred_summary, true_summary)[0]
        rouge_1_scores.append(scores['rouge-1']['f'])
        rouge_2_scores.append(scores['rouge-2']['f'])
        rouge_l_scores.append(scores['rouge-l']['f'])

    print(f"\nAverage ROUGE-1 (F1) for {name} sample (200 articles): {sum(rouge_1_scores)/len(rouge_1_scores):.4f}")
    print(f"Average ROUGE-2 (F1) for {name} sample (200 articles): {sum(rouge_2_scores)/len(rouge_2_scores):.4f}")
    print(f"Average ROUGE-L (F1) for {name} sample (200 articles): {sum(rouge_l_scores)/len(rouge_l_scores):.4f}")

# Make sure your datasets columns are correctly named
bbc_df = bbc_df.rename(columns={"Article": "article", "Summary": "summary"})
imdb_df = imdb_df.rename(columns={"Article": "article", "Summary": "summary"})

# Run on both datasets
run_on_dataset("BBC", bbc_df)
run_on_dataset("IMDB", imdb_df)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



=== Running on BBC Dataset ===
Train Accuracy for BBC: 0.7290
Test Accuracy for BBC: 0.6457

Classification Report for BBC:
              precision    recall  f1-score   support

           0       0.71      0.72      0.71      7616
           1       0.54      0.53      0.54      4810

    accuracy                           0.65     12426
   macro avg       0.63      0.62      0.62     12426
weighted avg       0.64      0.65      0.64     12426


Average ROUGE-1 (F1) for BBC sample (200 articles): 0.6494
Average ROUGE-2 (F1) for BBC sample (200 articles): 0.5573
Average ROUGE-L (F1) for BBC sample (200 articles): 0.6427

=== Running on IMDB Dataset ===
Train Accuracy for IMDB: 0.9709
Test Accuracy for IMDB: 0.9606

Classification Report for IMDB:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     11591
           1       0.11      0.47      0.18       109

    accuracy                           0.96     11700
   macro avg       0.55

In [17]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import time

# =====================
# Dataset Class
# =====================
class ExtractiveDataset(Dataset):
    def __init__(self, df):
        self.sentences = df['preprocessed_sentence'].values
        self.labels = df['label'].values

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.labels[idx]

# =====================
# Tokenizer + Vocab
# =====================
class Vocab:
    def __init__(self, texts, min_freq=1):
        from collections import Counter
        self.token2idx = {'<PAD>': 0, '<UNK>': 1}
        counter = Counter(word for text in texts for word in text.split())
        for word, freq in counter.items():
            if freq >= min_freq:
                self.token2idx[word] = len(self.token2idx)
        self.idx2token = {i: t for t, i in self.token2idx.items()}

    def encode(self, text, max_len=50):
        tokens = text.split()
        ids = [self.token2idx.get(t, self.token2idx['<UNK>']) for t in tokens]
        ids = ids[:max_len] + [0] * (max_len - len(ids))
        return torch.tensor(ids)

# =====================
# BiLSTM + Attention
# =====================
class BiLSTMAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attn = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.bilstm(embedded)
        attn_weights = torch.softmax(self.attn(lstm_out).squeeze(2), dim=1)
        context = torch.sum(lstm_out * attn_weights.unsqueeze(2), dim=1)
        out = self.sigmoid(self.fc(context)).squeeze(1)
        return out

# =====================
# Training / Evaluation
# =====================
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for texts, labels in loader:
        inputs = torch.stack([vocab.encode(t) for t in texts]).to(device)
        labels = torch.tensor(labels, dtype=torch.float32).to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for texts, labels in loader:
            inputs = torch.stack([vocab.encode(t) for t in texts]).to(device)
            outputs = model(inputs)
            preds = (outputs > 0.5).int().cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels)
    print(classification_report(all_labels, all_preds))
    print("Accuracy:", accuracy_score(all_labels, all_preds))

# =====================
# Run Function
# =====================
def run_pipeline(name, df):
    print(f"\n=== Running Extractive Summarization on {name} Dataset ===")
    start_time = time.time()

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    global vocab
    vocab = Vocab(train_df['preprocessed_sentence'].tolist())

    train_dataset = ExtractiveDataset(train_df)
    test_dataset = ExtractiveDataset(test_df)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BiLSTMAttention(len(vocab.token2idx)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()

    for epoch in range(3):
        loss = train_epoch(model, train_loader, optimizer, criterion, device)
        print(f"Epoch {epoch+1} Loss: {loss:.4f}")

    evaluate(model, test_loader, device)
    print("Execution Time: {:.2f} seconds".format(time.time() - start_time))

# =====================
# Load Your Datasets
# =====================
# Make sure these are loaded correctly beforehand
# imdb_processed_df and bbc_processed_df should have 'preprocessed_sentence' and 'label'

# Example call:
run_pipeline("BBC", bbc_processed_df)
run_pipeline("IMDB", imdb_processed_df)


=== Running Extractive Summarization on BBC Dataset ===


  labels = torch.tensor(labels, dtype=torch.float32).to(device)


Epoch 1 Loss: 0.6280
Epoch 2 Loss: 0.5823
Epoch 3 Loss: 0.5087
              precision    recall  f1-score   support

           0       0.70      0.78      0.74      5081
           1       0.59      0.48      0.53      3255

    accuracy                           0.67      8336
   macro avg       0.65      0.63      0.64      8336
weighted avg       0.66      0.67      0.66      8336

Accuracy: 0.6665067178502879
Execution Time: 122.48 seconds

=== Running Extractive Summarization on IMDB Dataset ===


  labels = torch.tensor(labels, dtype=torch.float32).to(device)


Epoch 1 Loss: 0.5160
Epoch 2 Loss: 0.4829
Epoch 3 Loss: 0.4578
              precision    recall  f1-score   support

           0       0.78      1.00      0.87      2021
           1       0.00      0.00      0.00       584

    accuracy                           0.78      2605
   macro avg       0.39      0.50      0.44      2605
weighted avg       0.60      0.78      0.68      2605

Accuracy: 0.7754318618042226
Execution Time: 40.92 seconds


In [10]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import time
from collections import defaultdict
from rouge import Rouge

# =====================
# Dataset Class
# =====================
class ExtractiveDataset(Dataset):
    def __init__(self, df):
        self.sentences = df['preprocessed_sentence'].values
        self.labels = df['label'].values

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.labels[idx]

# =====================
# Tokenizer + Vocab
# =====================
class Vocab:
    def __init__(self, texts, min_freq=1):
        from collections import Counter
        self.token2idx = {'<PAD>': 0, '<UNK>': 1}
        counter = Counter(word for text in texts for word in text.split())
        for word, freq in counter.items():
            if freq >= min_freq:
                self.token2idx[word] = len(self.token2idx)
        self.idx2token = {i: t for t, i in self.token2idx.items()}

    def encode(self, text, max_len=50):
        tokens = text.split()
        ids = [self.token2idx.get(t, self.token2idx['<UNK>']) for t in tokens]
        ids = ids[:max_len] + [0] * (max_len - len(ids))
        return torch.tensor(ids)

# =====================
# BiLSTM + Attention
# =====================
class BiLSTMAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attn = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.bilstm(embedded)
        attn_weights = torch.softmax(self.attn(lstm_out).squeeze(2), dim=1)
        context = torch.sum(lstm_out * attn_weights.unsqueeze(2), dim=1)
        out = self.sigmoid(self.fc(context)).squeeze(1)
        return out

# =====================
# Training / Evaluation
# =====================
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for texts, labels in loader:
        inputs = torch.stack([vocab.encode(t) for t in texts]).to(device)
        labels = torch.tensor(labels, dtype=torch.float32).to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, device):
    model.eval()
    all_preds, all_labels, all_texts = [], [], []
    with torch.no_grad():
        for texts, labels in loader:
            inputs = torch.stack([vocab.encode(t) for t in texts]).to(device)
            outputs = model(inputs)
            preds = (outputs > 0.5).int().cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels)
            all_texts.extend(texts)
    print(classification_report(all_labels, all_preds))
    print("Accuracy:", accuracy_score(all_labels, all_preds))
    return all_preds, all_texts

# =====================
# Run Function (Modified to return preds & texts)
# =====================
def run_pipeline(name, df):
    print(f"\n=== Running Extractive Summarization on {name} Dataset ===")
    start_time = time.time()

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    global vocab
    vocab = Vocab(train_df['preprocessed_sentence'].tolist())

    train_dataset = ExtractiveDataset(train_df)
    test_dataset = ExtractiveDataset(test_df)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BiLSTMAttention(len(vocab.token2idx)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()

    for epoch in range(5):
        loss = train_epoch(model, train_loader, optimizer, criterion, device)
        print(f"Epoch {epoch+1} Loss: {loss:.4f}")

    preds, pred_texts = evaluate(model, test_loader, device)

    # Optional ROUGE evaluation
    if 'reference_summary' in df.columns and 'article_id' in df.columns:
        print("Computing ROUGE Scores...")
        pred_by_article = defaultdict(list)
        for text, pred, row in zip(pred_texts, preds, test_df.itertuples()):
            if pred == 1:
                pred_by_article[row.article_id].append(text)

        rouge = Rouge()
        scores_1, scores_2, scores_l = [], [], []

        grouped_refs = test_df[['article_id', 'reference_summary']].drop_duplicates().set_index('article_id')['reference_summary']

        for aid, pred_sents in pred_by_article.items():
            if aid in grouped_refs:
                pred_summary = ' '.join(pred_sents)
                ref_summary = grouped_refs[aid]
                try:
                    score = rouge.get_scores(pred_summary, ref_summary)[0]
                    scores_1.append(score['rouge-1']['f'])
                    scores_2.append(score['rouge-2']['f'])
                    scores_l.append(score['rouge-l']['f'])
                except:
                    continue

        if scores_1:
            print(f"\nROUGE-1 F1: {sum(scores_1)/len(scores_1):.4f}")
            print(f"ROUGE-2 F1: {sum(scores_2)/len(scores_2):.4f}")
            print(f"ROUGE-L F1: {sum(scores_l)/len(scores_l):.4f}")
        else:
            print("No valid ROUGE scores could be calculated.")

    else:
        print("Skipping ROUGE score: 'reference_summary' column not found.")

    print("Execution Time: {:.2f} seconds".format(time.time() - start_time))
    return preds, pred_texts

In [15]:
# Run models
preds_bbc, pred_texts_bbc = run_pipeline("BBC", bbc_processed_df)
preds_imdb, pred_texts_imdb = run_pipeline("IMDB", imdb_processed_df)




=== Running Extractive Summarization on BBC Dataset ===


  labels = torch.tensor(labels, dtype=torch.float32).to(device)


Epoch 1 Loss: 0.6291
Epoch 2 Loss: 0.5809
Epoch 3 Loss: 0.5064
Epoch 4 Loss: 0.3803
Epoch 5 Loss: 0.2272
              precision    recall  f1-score   support

           0       0.72      0.68      0.70      5081
           1       0.54      0.59      0.56      3255

    accuracy                           0.64      8336
   macro avg       0.63      0.63      0.63      8336
weighted avg       0.65      0.64      0.64      8336

Accuracy: 0.6419145873320538
Skipping ROUGE score: 'reference_summary' column not found.
Execution Time: 184.46 seconds

=== Running Extractive Summarization on IMDB Dataset ===


  labels = torch.tensor(labels, dtype=torch.float32).to(device)


Epoch 1 Loss: 0.5147
Epoch 2 Loss: 0.4874
Epoch 3 Loss: 0.4543
Epoch 4 Loss: 0.4074
Epoch 5 Loss: 0.3538
              precision    recall  f1-score   support

           0       0.79      0.93      0.85      2021
           1       0.36      0.13      0.19       584

    accuracy                           0.75      2605
   macro avg       0.57      0.53      0.52      2605
weighted avg       0.69      0.75      0.70      2605

Accuracy: 0.7531669865642995
Skipping ROUGE score: 'reference_summary' column not found.
Execution Time: 60.23 seconds


In [16]:
def display_predicted_summaries(df, preds, texts, num_samples=5):
    from collections import defaultdict
    import random

    pred_by_article = defaultdict(list)
    for text, pred, row in zip(texts, preds, df.itertuples()):
        if pred == 1:
            pred_by_article[row.article_id].append(text)

    sampled_ids = random.sample(list(pred_by_article.keys()), min(num_samples, len(pred_by_article)))

    print("\n=== Sample Predicted Summaries ===")
    for aid in sampled_ids:
        print(f"\n--- Article ID: {aid} ---")
        print("Predicted Summary:")
        print(" ".join(pred_by_article[aid]))

# Show 5 predicted summaries from BBC test set
display_predicted_summaries(bbc_processed_df, preds_bbc, pred_texts_bbc)

# Optionally do the same for IMDB
display_predicted_summaries(imdb_processed_df, preds_imdb, pred_texts_imdb)


=== Sample Predicted Summaries ===

--- Article ID: 51 ---
Predicted Summary:
j robinson sale shark capt cueto sale shark tait newcastle j noon newcastle j lewsey wasp c hodgson sale shark dawson wasp g rowntree leicester thompson northampton j white leicester grewcock bath b kay leicester l moody leicester hazell gloucester j worsley wasp initially bmw say would produce mini model year vast cowley factory outskirt oxford target quickly reach raise time time martin mubanga claim observer officer play key role consign u camp cuba follow arrest zambia complaint make body board deputy british jew commission racial equality

--- Article ID: 21 ---
Predicted Summary:
every rural school africa would access library student oxford harvard currently project operate area main electricity take charge score pat sanderson kai horstman mathew tait rob thirlby fiji rally force tense finale nominee say make film reason something say annual figure mark best year since also well ahead 2.2 record report

In [38]:
import pandas as pd
from collections import defaultdict
import random
from sklearn.model_selection import train_test_split

random.seed(42)  # for reproducibility

# --- BBC ---

# Recreate test split for BBC
_, test_df_bbc = train_test_split(bbc_processed_df, test_size=0.2, random_state=42)

# Build predicted summaries dict for BBC
pred_by_article_bbc = defaultdict(list)
for pred, text, row in zip(preds_bbc, pred_texts_bbc, test_df_bbc.itertuples()):
    if pred == 1:
        pred_by_article_bbc[row.article_id].append(text)

predicted_summaries_bbc = {aid: ' '.join(sents) for aid, sents in pred_by_article_bbc.items()}

# Filter and sample article_ids for BBC
filtered_df_bbc = bbc_processed_df[bbc_processed_df['label'] == 1]
candidate_article_ids_bbc = filtered_df_bbc['article_id'].unique()
candidate_article_ids_bbc = [aid for aid in candidate_article_ids_bbc if aid in predicted_summaries_bbc]
sampled_article_ids_bbc = random.sample(list(candidate_article_ids_bbc), 5)

# Ensure article_id column in bbc_df
if 'article_id' not in bbc_df.columns:
    bbc_df = bbc_df.reset_index().rename(columns={'index': 'article_id'})

print("\n=== BBC Dataset Sample Summaries ===\n")
for aid in sampled_article_ids_bbc:
    pred_sum = predicted_summaries_bbc.get(aid, "[No predicted summary]")
    ref_sum_row = bbc_df.loc[bbc_df['article_id'] == aid, 'Summary']
    ref_sum = ref_sum_row.values[0] if len(ref_sum_row) > 0 else "[No original summary]"
    
    print(f"--- Article ID: {aid} ---")
    print("Predicted Summary:")
    print(pred_sum)
    print("\nReference Summary:")
    print(ref_sum)
    print("\n" + "="*60 + "\n")



# --- IMDB ---

# Recreate test split for IMDB
_, test_df_imdb = train_test_split(imdb_processed_df, test_size=0.2, random_state=42)

# Build predicted summaries dict for IMDB
pred_by_article_imdb = defaultdict(list)
for pred, text, row in zip(preds_imdb, pred_texts_imdb, test_df_imdb.itertuples()):
    if pred == 1:
        pred_by_article_imdb[row.article_id].append(text)

predicted_summaries_imdb = {aid: ' '.join(sents) for aid, sents in pred_by_article_imdb.items()}

# Filter and sample article_ids for IMDB
filtered_df_imdb = imdb_processed_df[imdb_processed_df['label'] == 1]
candidate_article_ids_imdb = filtered_df_imdb['article_id'].unique()
candidate_article_ids_imdb = [aid for aid in candidate_article_ids_imdb if aid in predicted_summaries_imdb]
sampled_article_ids_imdb = random.sample(list(candidate_article_ids_imdb), 5)

# Ensure article_id column in imdb_df
if 'article_id' not in imdb_df.columns:
    imdb_df = imdb_df.reset_index().rename(columns={'index': 'article_id'})

print("\n=== IMDB Dataset Sample Summaries ===\n")
for aid in sampled_article_ids_imdb:
    pred_sum = predicted_summaries_imdb.get(aid, "[No predicted summary]")
    ref_sum_row = imdb_df.loc[imdb_df['article_id'] == aid, 'Summary']
    ref_sum = ref_sum_row.values[0] if len(ref_sum_row) > 0 else "[No original summary]"
    
    print(f"--- Article ID: {aid} ---")
    print("Predicted Summary:")
    print(pred_sum)
    print("\nReference Summary:")
    print(ref_sum)
    print("\n" + "="*60 + "\n")


=== BBC Dataset Sample Summaries ===

--- Article ID: 1755 ---
Predicted Summary:
mr kennedy tour come labour leader tony blair conservative leader michael howard step campaign ahead next general election widely expect hold may mr kennedy say british public felt let labour issue iraq fee conservative ask critical question labour say lib dem vote could let tory tory say lib dems would mean high tax soft crime law power europe say people highly sceptical labour conservative promise tax

Reference Summary:
The Liberal Democrats say in the northern cities, the race is between them and Labour, while in southern seats - particularly the south west - it is between them and the Tories.Mr Kennedy said the British public felt let down by Labour on issues from Iraq to top-up fees and the Conservatives were not "asking the critical questions".Mr Kennedy's tour comes as he, Labour leader Tony Blair and Conservative leader Michael Howard all step up campaigning ahead of the next General Election, w