In [2]:
import os
import pandas as pd
from utils import prepare_labeled_sentences, prepare_labeled_sentences_spacy
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Read Datasets

In [3]:
# BBC Dataset
bbc_df = pd.read_csv("data/bbc/bbc_dataset.csv")

# CNN Datasets
# cnn_train_df = pd.read_csv("data/cnn/cnn_dailymail_train.csv")
# cnn_valid_df = pd.read_csv("data/cnn/cnn_dailymail_valid.csv")
# cnn_test_df = pd.read_csv("data/cnn/cnn_dailymail_test.csv")

imdb_df = pd.read_csv("data/imdb/imdb.csv")

In [4]:
# Preview to confirm structure
print("BBC Sample:")
display(bbc_df.head())

BBC Sample:


Unnamed: 0,Article,Summary
0,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...
1,"U2's desire to be number one\n\nU2, who have w...",But they still want more.They have to want to ...
2,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo..."
3,Snicket tops US box office chart\n\nThe film a...,A Series of Unfortunate Events also stars Scot...
4,Ocean's Twelve raids box office\n\nOcean's Twe...,"Ocean's Twelve, the crime caper sequel starrin..."


In [5]:
# print("CNN Sample:")
# display(cnn_train_df.head())

In [6]:
print("IMDB Sample:")
display(imdb_df.head())

IMDB Sample:


Unnamed: 0,Article,Summary
0,One of the other reviewers has mentioned that ...,One of the other reviewers has mentioned that ...
1,A wonderful little production The filming tech...,A wonderful little production The filming tech...
2,I thought this was wonderful way to spend time...,I thought it was proof that Woody Allen is sti...
3,Basically there a family where little boy Jake...,Basically there a family where little boy Jake...
4,Petter Mattei Love in the Time of Money is vis...,Petter Mattei Love in the Time of Money is vis...


Preprocess BBC Datasets

In [7]:
# Process the BBC dataset
bbc_labeled_data = prepare_labeled_sentences_spacy(bbc_df)

# Convert to DataFrame for modeling
bbc_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in bbc_labeled_data
    ]
)

Preprocessing articles: 100%|██████████| 2225/2225 [04:50<00:00,  7.66it/s]


In [8]:
bbc_processed_df.shape

(41677, 4)

In [9]:
# Count how many sentences are labeled as summary sentences
summary_count = bbc_processed_df['label'].sum()
total_count = len(bbc_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(bbc_processed_df[bbc_processed_df['label'] == 1].head(3))

Summary sentences: 16543 out of 41677 (39.69%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Musicians to tackle US red tape Musicians' gr...,musician tackle u red tape musician group tack...,1
1,0,A singer hoping to perform in the US can expec...,singer hop perform u expect pay simply obtain ...,1
4,0,Nigel McCune from the Musicians' Union said Br...,nigel mccune musician union say british musici...,1


In [10]:
bbc_processed_df.head(60)

Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Musicians to tackle US red tape Musicians' gr...,musician tackle u red tape musician group tack...,1
1,0,A singer hoping to perform in the US can expec...,singer hop perform u expect pay simply obtain ...,1
2,0,Groups including the Musicians' Union are call...,group include musician union call end raw deal...,0
3,0,US acts are not faced with comparable expense ...,u act face comparable expense bureaucracy visi...,0
4,0,Nigel McCune from the Musicians' Union said Br...,nigel mccune musician union say british musici...,1
5,0,A sponsor has to make a petition on their beha...,sponsor make petition behalf form amount nearl...,0
6,0,"""If you make a mistake on your form, you risk ...",make mistake form risk ban thus ability career...,0
7,0,"""The US is the world's biggest music market, w...",u world big music market mean something creaky...,1
8,0,"""The current situation is preventing British a...",current situation prevent british act maintain...,1
9,0,The Musicians' Union stance is being endorsed ...,musician union stance endorse music manager fo...,1


Preprocessed IMDB Dataset

In [11]:
# Process the BBC dataset
imdb_labeled_df = prepare_labeled_sentences_spacy(imdb_df[:4000])

# Convert to DataFrame for modeling
imdb_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in imdb_labeled_df
    ]
)

Preprocessing articles:   0%|          | 0/4000 [00:00<?, ?it/s]

Preprocessing articles: 100%|██████████| 4000/4000 [03:28<00:00, 19.19it/s]


In [12]:
imdb_processed_df.shape

(13024, 4)

In [13]:
# Count how many sentences are labeled as summary sentences
summary_count = imdb_processed_df['label'].sum()
total_count = len(imdb_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(imdb_processed_df[imdb_processed_df['label'] == 1].head(3))

Summary sentences: 2934 out of 13024 (22.53%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1
11,4,Petter Mattei Love in the Time of Money is vis...,petter mattei love time money visually stunnin...,1


In [14]:
# print(imdb_processed_df["raw_sentence"][2])

In [15]:
imdb_processed_df.head(60)

Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,One of the other reviewers has mentioned that ...,one reviewer mention watch oz episode hook rig...,0
1,0,This show pulls no punches with regards to dru...,show pull punch regard drug sex violence hardc...,0
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
3,1,but he has all the voices down pat too You can...,voice pat truly see seamless edit guide refere...,0
4,1,but it is terrificly written and performed pie...,terrificly write perform piece masterful produ...,0
5,1,The realism really comes home with the little ...,realism really come home little thing fantasy ...,0
6,2,I thought this was wonderful way to spend time...,think wonderful way spend time hot summer week...,0
7,2,The plot is simplistic but the dialogue is wit...,plot simplistic dialogue witty character likab...,0
8,2,While some may be disappointed when they reali...,may disappoint realize match point risk addict...,0
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1


BiLSTM + Attention


In [None]:
#Machine Using Decision Trees

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import sent_tokenize
from rouge import Rouge
import nltk
nltk.download('punkt')

def prepare_dataset(df):
    sentences = []
    labels = []
    for _, row in df.iterrows():
        article = row['article']
        summary = row['summary']
        sents = sent_tokenize(article)
        for sent in sents:
            sentences.append(sent)
            labels.append(1 if sent in summary else 0)
    return sentences, labels

def extract_summary(article, clf, vectorizer):
    sents = sent_tokenize(article)
    X_sents = vectorizer.transform(sents)
    preds = clf.predict(X_sents)
    extracted = [s for s, p in zip(sents, preds) if p == 1]
    # Return first sentence if no sentence predicted
    return " ".join(extracted) if extracted else sents[0]

def run_on_dataset(name, df):
    print(f"\n=== Running on {name} Dataset ===")
    
    # Prepare data
    sentences, labels = prepare_dataset(df)
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
    X = vectorizer.fit_transform(sentences)
    y = labels

    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Create Decision Tree classifier with tuned hyperparameters
    clf = DecisionTreeClassifier(
        max_depth=30,
        min_samples_leaf=10,
        min_samples_split=10,
        class_weight='balanced',
        random_state=42
    )
    clf.fit(X_train, y_train)

    # Train/Test accuracy
    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    print(f"Train Accuracy for {name}: {train_acc:.4f}")
    print(f"Test Accuracy for {name}: {test_acc:.4f}")

    # Classification report on test set
    y_pred = clf.predict(X_test)
    print(f"\nClassification Report for {name}:")
    print(classification_report(y_test, y_pred))

    # ROUGE evaluation on 200 samples
    sample_df = df.sample(n=200, random_state=42)
    rouge = Rouge()
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []

    for _, row in sample_df.iterrows():
        pred_summary = extract_summary(row['article'], clf, vectorizer)
        true_summary = row['summary']
        scores = rouge.get_scores(pred_summary, true_summary)[0]
        rouge_1_scores.append(scores['rouge-1']['f'])
        rouge_2_scores.append(scores['rouge-2']['f'])
        rouge_l_scores.append(scores['rouge-l']['f'])

    print(f"\nAverage ROUGE-1 (F1) for {name} sample (200 articles): {sum(rouge_1_scores)/len(rouge_1_scores):.4f}")
    print(f"Average ROUGE-2 (F1) for {name} sample (200 articles): {sum(rouge_2_scores)/len(rouge_2_scores):.4f}")
    print(f"Average ROUGE-L (F1) for {name} sample (200 articles): {sum(rouge_l_scores)/len(rouge_l_scores):.4f}")

# Make sure your datasets columns are correctly named
bbc_df = bbc_df.rename(columns={"Article": "article", "Summary": "summary"})
imdb_df = imdb_df.rename(columns={"Article": "article", "Summary": "summary"})

# Run on both datasets
run_on_dataset("BBC", bbc_df)
run_on_dataset("IMDB", imdb_df)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



=== Running on BBC Dataset ===
Train Accuracy for BBC: 0.7290
Test Accuracy for BBC: 0.6457

Classification Report for BBC:
              precision    recall  f1-score   support

           0       0.71      0.72      0.71      7616
           1       0.54      0.53      0.54      4810

    accuracy                           0.65     12426
   macro avg       0.63      0.62      0.62     12426
weighted avg       0.64      0.65      0.64     12426


Average ROUGE-1 (F1) for BBC sample (200 articles): 0.6494
Average ROUGE-2 (F1) for BBC sample (200 articles): 0.5573
Average ROUGE-L (F1) for BBC sample (200 articles): 0.6427

=== Running on IMDB Dataset ===
Train Accuracy for IMDB: 0.9709
Test Accuracy for IMDB: 0.9606

Classification Report for IMDB:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     11591
           1       0.11      0.47      0.18       109

    accuracy                           0.96     11700
   macro avg       0.55

In [34]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import time
from collections import defaultdict
from rouge import Rouge

# =====================
# Dataset Class
# =====================
class ExtractiveDataset(Dataset):
    def __init__(self, df):
        self.sentences = df['preprocessed_sentence'].values
        self.labels = df['label'].values

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.labels[idx]

# =====================
# Tokenizer + Vocab
# =====================
class Vocab:
    def __init__(self, texts, min_freq=1):
        from collections import Counter
        self.token2idx = {'<PAD>': 0, '<UNK>': 1}
        counter = Counter(word for text in texts for word in text.split())
        for word, freq in counter.items():
            if freq >= min_freq:
                self.token2idx[word] = len(self.token2idx)
        self.idx2token = {i: t for t, i in self.token2idx.items()}

    def encode(self, text, max_len=100):
        tokens = text.split()
        ids = [self.token2idx.get(t, self.token2idx['<UNK>']) for t in tokens]
        ids = ids[:max_len] + [0] * (max_len - len(ids))
        return torch.tensor(ids)

# =====================
# Additive Attention
# =====================
class AdditiveAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.W = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_out):
        energy = torch.tanh(self.W(lstm_out))
        scores = self.v(energy).squeeze(2)
        weights = torch.softmax(scores, dim=1)
        context = torch.sum(lstm_out * weights.unsqueeze(2), dim=1)
        return context, weights

# =====================
# BiLSTM + Attention Model
# =====================
class BiLSTMAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attn_layer = AdditiveAttention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.bilstm(embedded)
        context, _ = self.attn_layer(lstm_out)
        out = self.fc(context).squeeze(1)  # raw logits
        return out

# =====================
# Collate Function for Padding
# =====================
def collate_fn(batch):
    texts, labels = zip(*batch)
    inputs = torch.stack([vocab.encode(t, max_len=100) for t in texts])
    labels = torch.tensor(labels, dtype=torch.float32)
    return inputs, labels

# =====================
# Training / Evaluation
# =====================
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, device):
    model.eval()
    all_preds, all_labels, all_texts, all_probs = [], [], [], []
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            outputs = torch.sigmoid(model(inputs))  # Convert logits to probs
            all_probs.extend(outputs.cpu().tolist())
            all_preds.extend((outputs > 0.5).int().cpu().tolist())
            all_labels.extend(labels.tolist())
    print(classification_report(all_labels, all_preds))
    print("Accuracy:", accuracy_score(all_labels, all_preds))
    return all_probs, all_preds, all_labels

# =====================
# Run Pipeline
# =====================
def run_pipeline(name, df):
    print(f"\n=== Running Extractive Summarization on {name} Dataset ===")
    start_time = time.time()

    if 'reference_summary' not in df.columns:
        df['reference_summary'] = df.groupby('article_id')['preprocessed_sentence'].transform(
            lambda x: ' '.join(x[df.loc[x.index, 'label'] == 1])
        )

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    global vocab
    vocab = Vocab(train_df['preprocessed_sentence'].tolist())

    train_dataset = ExtractiveDataset(train_df)
    test_dataset = ExtractiveDataset(test_df)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BiLSTMAttention(len(vocab.token2idx)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    pos_weight = torch.tensor([3.0]).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    for epoch in range(5):
        loss = train_epoch(model, train_loader, optimizer, criterion, device)
        print(f"Epoch {epoch+1} Loss: {loss:.4f}")

    all_probs, all_preds, all_labels = evaluate(model, test_loader, device)

    # Find optimal threshold
    print("\nTuning threshold for best F1...")
    best_thresh, best_f1 = 0.5, 0
    for t in [i * 0.05 for i in range(1, 20)]:
        preds_t = [1 if p > t else 0 for p in all_probs]
        score = f1_score(all_labels, preds_t)
        if score > best_f1:
            best_f1, best_thresh = score, t
    print(f"Best Threshold: {best_thresh:.2f}, F1: {best_f1:.4f}")

    # Apply best threshold for prediction
    final_preds = [1 if p > best_thresh else 0 for p in all_probs]

    # ROUGE Evaluation
    print("\nComputing ROUGE Scores...")
    pred_by_article = defaultdict(list)
    for i, (row, pred) in enumerate(zip(test_df.itertuples(), final_preds)):
        if pred == 1:
            pred_by_article[row.article_id].append(row.preprocessed_sentence)

    rouge = Rouge()
    scores_1, scores_2, scores_l = [], [], []
    grouped_refs = test_df[['article_id', 'reference_summary']].drop_duplicates().set_index('article_id')['reference_summary']

    for aid, pred_sents in pred_by_article.items():
        if aid in grouped_refs:
            pred_summary = ' '.join(pred_sents)
            ref_summary = grouped_refs[aid]
            try:
                score = rouge.get_scores(pred_summary, ref_summary)[0]
                scores_1.append(score['rouge-1']['f'])
                scores_2.append(score['rouge-2']['f'])
                scores_l.append(score['rouge-l']['f'])
            except:
                continue

    if scores_1:
        print(f"\nROUGE-1 F1: {sum(scores_1)/len(scores_1):.4f}")
        print(f"ROUGE-2 F1: {sum(scores_2)/len(scores_2):.4f}")
        print(f"ROUGE-L F1: {sum(scores_l)/len(scores_l):.4f}")
    else:
        print("No valid ROUGE scores could be calculated.")

    print("Execution Time: {:.2f} seconds".format(time.time() - start_time))
    return final_preds

# Example call
preds_bbc = run_pipeline("BBC", bbc_processed_df)
preds_imdb = run_pipeline("IMDB", imdb_processed_df)


=== Running Extractive Summarization on BBC Dataset ===
Epoch 1 Loss: 1.0810
Epoch 2 Loss: 0.9956
Epoch 3 Loss: 0.8756
Epoch 4 Loss: 0.6977
Epoch 5 Loss: 0.4841
              precision    recall  f1-score   support

         0.0       0.75      0.62      0.68      5081
         1.0       0.53      0.68      0.60      3255

    accuracy                           0.64      8336
   macro avg       0.64      0.65      0.64      8336
weighted avg       0.66      0.64      0.64      8336

Accuracy: 0.6402351247600768

Tuning threshold for best F1...
Best Threshold: 0.30, F1: 0.6094

Computing ROUGE Scores...

ROUGE-1 F1: 0.3606
ROUGE-2 F1: 0.2528
ROUGE-L F1: 0.3061
Execution Time: 355.20 seconds

=== Running Extractive Summarization on IMDB Dataset ===
Epoch 1 Loss: 0.9255
Epoch 2 Loss: 0.8912
Epoch 3 Loss: 0.8349
Epoch 4 Loss: 0.7562
Epoch 5 Loss: 0.6411
              precision    recall  f1-score   support

         0.0       0.85      0.66      0.75      2021
         1.0       0.34     

In [32]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from collections import defaultdict
from rouge import Rouge
import time

# =====================
# Constants & Configs
# =====================
BATCH_SIZE = 32
EPOCHS = 5
LR = 1e-3
MAX_LEN = 100
POS_WEIGHT = 3.0

# =====================
# Dataset Class
# =====================
class ExtractiveDataset(Dataset):
    def __init__(self, df):
        self.sentences = df['preprocessed_sentence'].values
        self.labels = df['label'].values

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.labels[idx]

# =====================
# Tokenizer + Vocabulary
# =====================
class Vocab:
    def __init__(self, texts, min_freq=1):
        from collections import Counter
        self.token2idx = {'<PAD>': 0, '<UNK>': 1}
        counter = Counter(word for text in texts for word in text.split())
        for word, freq in counter.items():
            if freq >= min_freq:
                self.token2idx[word] = len(self.token2idx)
        self.idx2token = {i: t for t, i in self.token2idx.items()}

    def encode(self, text, max_len=MAX_LEN):
        ids = [self.token2idx.get(t, 1) for t in text.split()]
        ids = ids[:max_len] + [0] * (max_len - len(ids))
        return torch.tensor(ids)

# =====================
# Additive Attention Layer
# =====================
class AdditiveAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.W = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_out):
        energy = torch.tanh(self.W(lstm_out))
        scores = self.v(energy).squeeze(2)
        weights = torch.softmax(scores, dim=1)
        context = torch.sum(lstm_out * weights.unsqueeze(2), dim=1)
        return context, weights

# =====================
# BiLSTM + Attention Model
# =====================
class BiLSTMAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attn = AdditiveAttention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.bilstm(x)
        context, _ = self.attn(lstm_out)
        return self.fc(context).squeeze(1)  # logits

# =====================
# Collate Function
# =====================
def collate_fn(batch):
    texts, labels = zip(*batch)
    inputs = torch.stack([vocab.encode(t) for t in texts])
    labels = torch.tensor(labels, dtype=torch.float32)
    return inputs, labels

# =====================
# Training Function
# =====================
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# =====================
# Evaluation Function
# =====================
def evaluate(model, loader, device):
    model.eval()
    all_probs, all_preds, all_labels = [], [], []
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            outputs = torch.sigmoid(model(inputs))
            all_probs.extend(outputs.cpu().numpy())
            all_preds.extend((outputs > 0.5).int().cpu().numpy())
            all_labels.extend(labels.numpy())
    print(classification_report(all_labels, all_preds))
    print("Accuracy:", accuracy_score(all_labels, all_preds))
    return all_probs, all_preds, all_labels

# =====================
# Main Pipeline
# =====================
def run_pipeline(name, df):
    print(f"\n=== Running Extractive Summarization on {name} Dataset ===")
    start_time = time.time()

    if 'reference_summary' not in df.columns:
        df['reference_summary'] = df.groupby('article_id')['preprocessed_sentence'].transform(
            lambda x: ' '.join(x[df.loc[x.index, 'label'] == 1])
        )

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    global vocab
    vocab = Vocab(train_df['preprocessed_sentence'].tolist())

    train_dataset = ExtractiveDataset(train_df)
    test_dataset = ExtractiveDataset(test_df)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BiLSTMAttention(len(vocab.token2idx)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([POS_WEIGHT]).to(device))

    for epoch in range(EPOCHS):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)

        # Validation loss
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                val_loss += criterion(model(inputs), labels).item()
        val_loss /= len(test_loader)
        scheduler.step(val_loss)
        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")

    # Threshold tuning
    all_probs, _, all_labels = evaluate(model, test_loader, device)
    thresholds = [i/100 for i in range(10, 90, 2)]
    best_thresh, best_f1 = max(((t, f1_score(all_labels, [1 if p > t else 0 for p in all_probs]))
                                for t in thresholds), key=lambda x: x[1])
    print(f"\nBest Threshold: {best_thresh:.2f}, F1: {best_f1:.4f}")

    final_preds = [1 if p > best_thresh else 0 for p in all_probs]

    # ROUGE scoring
    print("\nComputing ROUGE Scores...")
    pred_by_article = defaultdict(list)
    for row, pred in zip(test_df.itertuples(), final_preds):
        if pred == 1:
            pred_by_article[row.article_id].append(row.preprocessed_sentence)

    grouped_refs = test_df[['article_id', 'reference_summary']].drop_duplicates().set_index('article_id')['reference_summary']
    rouge = Rouge()
    rouge_scores = {"rouge-1": [], "rouge-2": [], "rouge-l": []}

    for aid, preds in pred_by_article.items():
        ref = grouped_refs.get(aid)
        if ref:
            try:
                scores = rouge.get_scores(' '.join(preds), ref)[0]
                for key in rouge_scores:
                    rouge_scores[key].append(scores[key]['f'])
            except:
                continue

    if rouge_scores["rouge-1"]:
        print("\nROUGE-1 F1:", sum(rouge_scores["rouge-1"]) / len(rouge_scores["rouge-1"]))
        print("ROUGE-2 F1:", sum(rouge_scores["rouge-2"]) / len(rouge_scores["rouge-2"]))
        print("ROUGE-L F1:", sum(rouge_scores["rouge-l"]) / len(rouge_scores["rouge-l"]))
    else:
        print("No valid ROUGE scores could be calculated.")

    print("Execution Time: {:.2f} seconds".format(time.time() - start_time))
    return final_preds

# =====================
# Example Usage
# =====================
preds_bbc = run_pipeline("BBC", bbc_processed_df)
preds_imdb = run_pipeline("IMDB", imdb_processed_df)


=== Running Extractive Summarization on BBC Dataset ===
Epoch 1: Train Loss=1.0831, Val Loss=1.0590
Epoch 2: Train Loss=0.9987, Val Loss=1.0609
Epoch 3: Train Loss=0.8756, Val Loss=1.0986
Epoch 4: Train Loss=0.6644, Val Loss=1.2436
Epoch 5: Train Loss=0.3821, Val Loss=1.5655
              precision    recall  f1-score   support

         0.0       0.74      0.59      0.66      5081
         1.0       0.52      0.68      0.59      3255

    accuracy                           0.63      8336
   macro avg       0.63      0.64      0.63      8336
weighted avg       0.66      0.63      0.63      8336

Accuracy: 0.628478886756238

Best Threshold: 0.14, F1: 0.6003

Computing ROUGE Scores...

ROUGE-1 F1: 0.36842557298372297
ROUGE-2 F1: 0.2553879411912053
ROUGE-L F1: 0.30800864265444367
Execution Time: 364.21 seconds

=== Running Extractive Summarization on IMDB Dataset ===
Epoch 1: Train Loss=0.9281, Val Loss=0.9079
Epoch 2: Train Loss=0.8853, Val Loss=0.9044
Epoch 3: Train Loss=0.8220, Val Lo

In [29]:
import pandas as pd
from collections import defaultdict
import random
from sklearn.model_selection import train_test_split

random.seed(42)  # for reproducibility

# --- BBC ---

# Recreate test split for BBC
_, test_df_bbc = train_test_split(bbc_processed_df, test_size=0.2, random_state=42)

# Build predicted summaries dict for BBC
pred_by_article_bbc = defaultdict(list)
for pred, text, row in zip(preds_bbc, pred_texts_bbc, test_df_bbc.itertuples()):
    if pred == 1:
        pred_by_article_bbc[row.article_id].append(text)

predicted_summaries_bbc = {aid: ' '.join(sents) for aid, sents in pred_by_article_bbc.items()}

# Filter and sample article_ids for BBC
filtered_df_bbc = bbc_processed_df[bbc_processed_df['label'] == 1]
candidate_article_ids_bbc = filtered_df_bbc['article_id'].unique()
candidate_article_ids_bbc = [aid for aid in candidate_article_ids_bbc if aid in predicted_summaries_bbc]
sampled_article_ids_bbc = random.sample(list(candidate_article_ids_bbc), 5)

# Ensure article_id column in bbc_df
if 'article_id' not in bbc_df.columns:
    bbc_df = bbc_df.reset_index().rename(columns={'index': 'article_id'})

print("\n=== BBC Dataset Sample Summaries ===\n")
for aid in sampled_article_ids_bbc:
    pred_sum = predicted_summaries_bbc.get(aid, "[No predicted summary]")
    ref_sum_row = bbc_df.loc[bbc_df['article_id'] == aid, 'Summary']
    ref_sum = ref_sum_row.values[0] if len(ref_sum_row) > 0 else "[No original summary]"
    
    print(f"--- Article ID: {aid} ---")
    print("Predicted Summary:")
    print(pred_sum)
    print("\nReference Summary:")
    print(ref_sum)
    print("\n" + "="*60 + "\n")



# --- IMDB ---

# Recreate test split for IMDB
_, test_df_imdb = train_test_split(imdb_processed_df, test_size=0.2, random_state=42)

# Build predicted summaries dict for IMDB
pred_by_article_imdb = defaultdict(list)
for pred, text, row in zip(preds_imdb, pred_texts_imdb, test_df_imdb.itertuples()):
    if pred == 1:
        pred_by_article_imdb[row.article_id].append(text)

predicted_summaries_imdb = {aid: ' '.join(sents) for aid, sents in pred_by_article_imdb.items()}

# Filter and sample article_ids for IMDB
filtered_df_imdb = imdb_processed_df[imdb_processed_df['label'] == 1]
candidate_article_ids_imdb = filtered_df_imdb['article_id'].unique()
candidate_article_ids_imdb = [aid for aid in candidate_article_ids_imdb if aid in predicted_summaries_imdb]
sampled_article_ids_imdb = random.sample(list(candidate_article_ids_imdb), 5)

# Ensure article_id column in imdb_df
if 'article_id' not in imdb_df.columns:
    imdb_df = imdb_df.reset_index().rename(columns={'index': 'article_id'})

print("\n=== IMDB Dataset Sample Summaries ===\n")
for aid in sampled_article_ids_imdb:
    pred_sum = predicted_summaries_imdb.get(aid, "[No predicted summary]")
    ref_sum_row = imdb_df.loc[imdb_df['article_id'] == aid, 'Summary']
    ref_sum = ref_sum_row.values[0] if len(ref_sum_row) > 0 else "[No original summary]"
    
    print(f"--- Article ID: {aid} ---")
    print("Predicted Summary:")
    print(pred_sum)
    print("\nReference Summary:")
    print(ref_sum)
    print("\n" + "="*60 + "\n")


=== BBC Dataset Sample Summaries ===

--- Article ID: 1542 ---
Predicted Summary:
say minister would consult proposal could see father allow take partner maternity pay leave period extend right flexible work carers parent old child say party would boost maternity pay first six month allow woman stay home time hewitt also stress plan would pay taxpayer employer plan include let maternity pay give father extend right parent old child far detail government plan outline monday

Reference Summary:
She said her party would boost maternity pay in the first six months to allow more women to stay at home in that time.She said new mothers were already entitled to 12 months leave, but that many women could not take it as only six of those months were paid.The Tories dismissed the maternity pay plan as "desperate", while the Liberal Democrats said it was misdirected.She said ministers would consult on other proposals that could see fathers being allowed to take some of their partner's maternity p

In [37]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import sent_tokenize
from rouge import Rouge
import nltk
nltk.download('punkt')

# Prepare dataset into sentence-level samples
def prepare_dataset(df):
    sentences = []
    labels = []
    for _, row in df.iterrows():
        article = row['article']
        summary = row['summary']
        sents = sent_tokenize(article)
        for sent in sents:
            sentences.append(sent)
            labels.append(1 if sent in summary else 0)
    return sentences, labels

# Use the model to extract predicted summary from an article
def extract_summary(article, clf, vectorizer):
    sents = sent_tokenize(article)
    X_sents = vectorizer.transform(sents)
    preds = clf.predict(X_sents)
    extracted = [s for s, p in zip(sents, preds) if p == 1]
    return " ".join(extracted) if extracted else sents[0]

# Main training + evaluation function
def run_on_dataset(name, df):
    print(f"\n=== Running on {name} Dataset with Random Forest ===")

    sentences, labels = prepare_dataset(df)
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
    X = vectorizer.fit_transform(sentences)
    y = labels

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=30,
        min_samples_leaf=10,
        min_samples_split=10,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    clf.fit(X_train, y_train)

    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    print(f"Train Accuracy for {name}: {train_acc:.4f}")
    print(f"Test Accuracy for {name}: {test_acc:.4f}")

    y_pred = clf.predict(X_test)
    print(f"\nClassification Report for {name}:")
    print(classification_report(y_test, y_pred))

    sample_df = df.sample(n=200, random_state=42)
    rouge = Rouge()
    rouge_1_scores, rouge_2_scores, rouge_l_scores = [], [], []

    for _, row in sample_df.iterrows():
        pred_summary = extract_summary(row['article'], clf, vectorizer)
        true_summary = row['summary']
        try:
            scores = rouge.get_scores(pred_summary, true_summary)[0]
            rouge_1_scores.append(scores['rouge-1']['f'])
            rouge_2_scores.append(scores['rouge-2']['f'])
            rouge_l_scores.append(scores['rouge-l']['f'])
        except:
            continue

    print(f"\nAverage ROUGE-1 (F1) for {name} sample (200 articles): {sum(rouge_1_scores)/len(rouge_1_scores):.4f}")
    print(f"Average ROUGE-2 (F1) for {name} sample (200 articles): {sum(rouge_2_scores)/len(rouge_2_scores):.4f}")
    print(f"Average ROUGE-L (F1) for {name} sample (200 articles): {sum(rouge_l_scores)/len(rouge_l_scores):.4f}")

    return clf, vectorizer

# Function to display predicted vs reference summaries
def show_predictions(df, name, clf, vectorizer, n=5):
    print(f"\n=== Sample Predicted vs Reference Summaries for {name} ===")
    sample_df = df.sample(n=n, random_state=1).reset_index(drop=True)
    for i, row in sample_df.iterrows():
        article = row['article']
        true_summary = row['summary']
        pred_summary = extract_summary(article, clf, vectorizer)

        print(f"\n--- Article {i+1} ---")
        print(f"Predicted Summary:\n{pred_summary}\n")
        print(f"Reference Summary:\n{true_summary}\n")
        print("-" * 80)

# Ensure correct column names
bbc_df = bbc_df.rename(columns={"Article": "article", "Summary": "summary"})
imdb_df = imdb_df.rename(columns={"Article": "article", "Summary": "summary"})

# Run and capture models
bbc_clf, bbc_vectorizer = run_on_dataset("BBC", bbc_df)
imdb_clf, imdb_vectorizer = run_on_dataset("IMDB", imdb_df)

# Show sample predictions
show_predictions(bbc_df, "BBC", bbc_clf, bbc_vectorizer, n=5)
show_predictions(imdb_df, "IMDB", imdb_clf, imdb_vectorizer, n=5)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



=== Running on BBC Dataset with Random Forest ===
Train Accuracy for BBC: 0.7017
Test Accuracy for BBC: 0.6632

Classification Report for BBC:
              precision    recall  f1-score   support

           0       0.72      0.74      0.73      7616
           1       0.57      0.54      0.56      4810

    accuracy                           0.66     12426
   macro avg       0.64      0.64      0.64     12426
weighted avg       0.66      0.66      0.66     12426


Average ROUGE-1 (F1) for BBC sample (200 articles): 0.6474
Average ROUGE-2 (F1) for BBC sample (200 articles): 0.5601
Average ROUGE-L (F1) for BBC sample (200 articles): 0.6415

=== Running on IMDB Dataset with Random Forest ===
Train Accuracy for IMDB: 0.9892
Test Accuracy for IMDB: 0.9840

Classification Report for IMDB:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     11591
           1       0.29      0.50      0.37       109

    accuracy                           