In [5]:
import os
import pandas as pd
from utils import prepare_labeled_sentences, prepare_labeled_sentences_spacy
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mohamedkenya/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Read Datasets

In [6]:
# BBC Dataset
bbc_df = pd.read_csv("data/bbc/bbc_dataset.csv")

# CNN Datasets
# cnn_train_df = pd.read_csv("data/cnn/cnn_dailymail_train.csv")
# cnn_valid_df = pd.read_csv("data/cnn/cnn_dailymail_valid.csv")
# cnn_test_df = pd.read_csv("data/cnn/cnn_dailymail_test.csv")

imdb_df = pd.read_csv("data/imdb/imdb.csv")

In [7]:
# Preview to confirm structure
print("BBC Sample:")
display(bbc_df.head())

BBC Sample:


Unnamed: 0,Article,Summary
0,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...
1,"U2's desire to be number one\n\nU2, who have w...",But they still want more.They have to want to ...
2,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo..."
3,Snicket tops US box office chart\n\nThe film a...,A Series of Unfortunate Events also stars Scot...
4,Ocean's Twelve raids box office\n\nOcean's Twe...,"Ocean's Twelve, the crime caper sequel starrin..."


In [None]:
# print("CNN Sample:")
# display(cnn_train_df.head())

CNN Sample:


NameError: name 'cnn_train_df' is not defined

In [None]:
print("IMDB Sample:")
display(imdb_df.head())

IMDB Sample:


Unnamed: 0,Article,Summary
0,One of the other reviewers has mentioned that ...,One of the other reviewers has mentioned that ...
1,A wonderful little production The filming tech...,A wonderful little production The filming tech...
2,I thought this was wonderful way to spend time...,I thought it was proof that Woody Allen is sti...
3,Basically there a family where little boy Jake...,Basically there a family where little boy Jake...
4,Petter Mattei Love in the Time of Money is vis...,Petter Mattei Love in the Time of Money is vis...


Preprocess BBC Datasets

In [9]:
# Process the BBC dataset
bbc_labeled_data = prepare_labeled_sentences_spacy(bbc_df)

# Convert to DataFrame for modeling
bbc_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in bbc_labeled_data
    ]
)

Preprocessing articles: 100%|██████████| 2225/2225 [04:42<00:00,  7.87it/s]


In [10]:
bbc_processed_df.shape

(41677, 4)

In [11]:
# Count how many sentences are labeled as summary sentences
summary_count = bbc_processed_df['label'].sum()
total_count = len(bbc_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(bbc_processed_df[bbc_processed_df['label'] == 1].head(3))

Summary sentences: 16543 out of 41677 (39.69%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Musicians to tackle US red tape Musicians' gr...,musician tackle u red tape musician group tack...,1
1,0,A singer hoping to perform in the US can expec...,singer hop perform u expect pay simply obtain ...,1
4,0,Nigel McCune from the Musicians' Union said Br...,nigel mccune musician union say british musici...,1


In [12]:
bbc_processed_df.head(60)

Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Musicians to tackle US red tape Musicians' gr...,musician tackle u red tape musician group tack...,1
1,0,A singer hoping to perform in the US can expec...,singer hop perform u expect pay simply obtain ...,1
2,0,Groups including the Musicians' Union are call...,group include musician union call end raw deal...,0
3,0,US acts are not faced with comparable expense ...,u act face comparable expense bureaucracy visi...,0
4,0,Nigel McCune from the Musicians' Union said Br...,nigel mccune musician union say british musici...,1
5,0,A sponsor has to make a petition on their beha...,sponsor make petition behalf form amount nearl...,0
6,0,"""If you make a mistake on your form, you risk ...",make mistake form risk ban thus ability career...,0
7,0,"""The US is the world's biggest music market, w...",u world big music market mean something creaky...,1
8,0,"""The current situation is preventing British a...",current situation prevent british act maintain...,1
9,0,The Musicians' Union stance is being endorsed ...,musician union stance endorse music manager fo...,1


Preprocessed IMDB Dataset

In [13]:
# Process the BBC dataset
imdb_labeled_df = prepare_labeled_sentences_spacy(imdb_df[:4000])

# Convert to DataFrame for modeling
imdb_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in imdb_labeled_df
    ]
)

Preprocessing articles: 100%|██████████| 4000/4000 [03:21<00:00, 19.80it/s]


In [14]:
imdb_processed_df.shape

(13024, 4)

In [15]:
# Count how many sentences are labeled as summary sentences
summary_count = imdb_processed_df['label'].sum()
total_count = len(imdb_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(imdb_processed_df[imdb_processed_df['label'] == 1].head(3))

Summary sentences: 2934 out of 13024 (22.53%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1
11,4,Petter Mattei Love in the Time of Money is vis...,petter mattei love time money visually stunnin...,1


In [None]:
# print(imdb_processed_df["raw_sentence"][2])

KeyError: 'raw_sentence'

In [17]:
imdb_processed_df.head(60)

Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,One of the other reviewers has mentioned that ...,one reviewer mention watch oz episode hook rig...,0
1,0,This show pulls no punches with regards to dru...,show pull punch regard drug sex violence hardc...,0
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
3,1,but he has all the voices down pat too You can...,voice pat truly see seamless edit guide refere...,0
4,1,but it is terrificly written and performed pie...,terrificly write perform piece masterful produ...,0
5,1,The realism really comes home with the little ...,realism really come home little thing fantasy ...,0
6,2,I thought this was wonderful way to spend time...,think wonderful way spend time hot summer week...,0
7,2,The plot is simplistic but the dialogue is wit...,plot simplistic dialogue witty character likab...,0
8,2,While some may be disappointed when they reali...,may disappoint realize match point risk addict...,0
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1


BiLSTM + Attention


In [None]:
# Required imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
from rouge import Rouge

# Tokenizer utility
def tokenize(text):
    return text.lower().split()

# Create vocabulary
def build_vocab(texts, min_freq=2):
    counter = Counter(word for text in texts for word in tokenize(text))
    vocab = ['<PAD>', '<UNK>', '<SOS>', '<EOS>'] + [word for word, count in counter.items() if count >= min_freq]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

# Dataset class for summarization
class SummarizationDataset(Dataset):
    def __init__(self, articles, summaries, word2idx, max_len=100):
        self.articles = articles
        self.summaries = summaries
        self.word2idx = word2idx
        self.max_len = max_len

    def encode(self, text, add_sos_eos=False):
        tokens = tokenize(text)
        if add_sos_eos:
            tokens = ['<SOS>'] + tokens + ['<EOS>']
        ids = [self.word2idx.get(t, self.word2idx['<UNK>']) for t in tokens]
        padded = ids[:self.max_len] + [self.word2idx['<PAD>']] * (self.max_len - len(ids))
        return torch.tensor(padded)

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        article = self.encode(self.articles[idx])
        summary = self.encode(self.summaries[idx], add_sos_eos=True)
        return article, summary

# Bahdanau Attention
class BahdanauAttention(nn.Module):
    def __init__(self, enc_dim, dec_dim):
        super().__init__()
        self.attn = nn.Linear(enc_dim + dec_dim, dec_dim)
        self.v = nn.Linear(dec_dim, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        src_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((encoder_outputs, hidden), dim=2)))
        attn_weights = torch.softmax(self.v(energy).squeeze(2), dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context

# BiLSTM + Attention model
class Seq2SeqAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.dropout = nn.Dropout(0.3)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=2, dropout=0.3)
        self.decoder = nn.LSTMCell(embedding_dim + hidden_dim * 2, hidden_dim * 2)
        self.attn = BahdanauAttention(hidden_dim * 2, hidden_dim * 2)
        self.fc_out = nn.Linear(hidden_dim * 4, vocab_size)

    def forward(self, src, tgt):
        batch_size, tgt_len = tgt.shape
        embedded_src = self.dropout(self.embedding(src))
        encoder_outputs, _ = self.encoder(embedded_src)
        embedded_tgt = self.dropout(self.embedding(tgt))

        h = encoder_outputs.mean(dim=1)
        c = h
        outputs = []

        for t in range(tgt_len):
            context = self.attn(encoder_outputs, h)
            rnn_input = torch.cat((embedded_tgt[:, t], context), dim=1)
            h, c = self.decoder(rnn_input, (h, c))
            output = self.fc_out(torch.cat((h, context), dim=1))
            outputs.append(output.unsqueeze(1))

        return torch.cat(outputs, dim=1)

# Training function
def train(model, loader, criterion, optimizer, device, clip=1):
    model.train()
    total_loss = 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        loss = criterion(output.view(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Evaluation function
def evaluate(model, loader, idx2word, device):
    model.eval()
    rouge = Rouge()
    predictions = []
    references = []

    with torch.no_grad():
        for src, tgt in loader:
            src = src.to(device)
            output = model(src, tgt[:, :-1].to(device))
            pred_ids = output.argmax(dim=2).cpu().numpy()
            tgt_ids = tgt[:, 1:].cpu().numpy()

            for pred, ref in zip(pred_ids, tgt_ids):
                pred_words = [idx2word.get(i, '') for i in pred if i not in (0, 1, 0)]
                ref_words = [idx2word.get(i, '') for i in ref if i not in (0, 1, 0)]
                predictions.append(" ".join(pred_words))
                references.append(" ".join(ref_words))

    scores = rouge.get_scores(predictions, references, avg=True)
    return scores

# Pipeline
def run_summarization_pipeline(name, df):
    print(f"\n=== Training on {name} Dataset ===")
    word2idx, idx2word = build_vocab(df['article'].tolist() + df['summary'].tolist())
    pad_idx = word2idx['<PAD>']

    X_train, X_test, y_train, y_test = train_test_split(df['article'], df['summary'], test_size=0.2)

    train_dataset = SummarizationDataset(X_train.tolist(), y_train.tolist(), word2idx)
    test_dataset = SummarizationDataset(X_test.tolist(), y_test.tolist(), word2idx)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Seq2SeqAttention(vocab_size=len(word2idx), embedding_dim=256, hidden_dim=256, pad_idx=pad_idx).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(10):
        loss = train(model, train_loader, criterion, optimizer, device)
        print(f"Epoch {epoch+1} | Loss: {loss:.4f}")

    rouge_scores = evaluate(model, test_loader, idx2word, device)
    print(f"\nROUGE Scores on {name} Test Set:")
    for key, score in rouge_scores.items():
        print(f"{key}: {score}")

# Rename columns for compatibility
bbc_df = bbc_df.rename(columns={"Article": "article", "Summary": "summary"})
imdb_df = imdb_df.rename(columns={"Article": "article", "Summary": "summary"})

# Run the pipeline
run_summarization_pipeline("BBC", bbc_df)
run_summarization_pipeline("imdb", imdb_df)



=== Training on BBC Dataset ===
Epoch 1 | Loss: 8.0298


KeyboardInterrupt: 

In [29]:
print(bbc_processed_df.columns)
print(imdb_processed_df.columns)


Index(['article_id', 'article_sentences', 'preprocessed_sentence', 'label'], dtype='object')
Index(['article_id', 'article_sentences', 'preprocessed_sentence', 'label'], dtype='object')


In [None]:
# Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
from rouge import Rouge
import random
import numpy as np

# Tokenization
def tokenize(text):
    return text.lower().strip().split()

# Vocabulary
def build_vocab(texts, min_freq=2):
    counter = Counter(word for text in texts for word in tokenize(text))
    vocab = ['<PAD>', '<UNK>', '<SOS>', '<EOS>'] + [word for word, count in counter.items() if count >= min_freq]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

# Dataset
class SummarizationDataset(Dataset):
    def __init__(self, articles, summaries, word2idx, max_len=80):
        self.articles = articles
        self.summaries = summaries
        self.word2idx = word2idx
        self.max_len = max_len

    def encode(self, text, add_sos_eos=False):
        tokens = tokenize(text)
        if add_sos_eos:
            tokens = ['<SOS>'] + tokens + ['<EOS>']
        ids = [self.word2idx.get(t, self.word2idx['<UNK>']) for t in tokens]
        padded = ids[:self.max_len] + [self.word2idx['<PAD>']] * (self.max_len - len(ids))
        return torch.tensor(padded[:self.max_len])

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        article = self.encode(self.articles[idx])
        summary = self.encode(self.summaries[idx], add_sos_eos=True)
        return article, summary

# Attention
class BahdanauAttention(nn.Module):
    def __init__(self, enc_dim, dec_dim):
        super().__init__()
        self.attn = nn.Linear(enc_dim + dec_dim, dec_dim)
        self.v = nn.Linear(dec_dim, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        src_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((encoder_outputs, hidden), dim=2)))
        attn_weights = torch.softmax(self.v(energy).squeeze(2), dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context

# Seq2Seq Model
class Seq2SeqAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.decoder = nn.LSTMCell(embedding_dim + hidden_dim * 2, hidden_dim * 2)
        self.attn = BahdanauAttention(hidden_dim * 2, hidden_dim * 2)
        self.fc_out = nn.Linear(hidden_dim * 4, vocab_size)
        self.dropout = nn.Dropout(0.4)

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.shape
        embedded_src = self.dropout(self.embedding(src))
        encoder_outputs, _ = self.encoder(embedded_src)
        embedded_tgt = self.dropout(self.embedding(tgt))

        h = encoder_outputs.mean(dim=1)
        c = h
        inputs = embedded_tgt[:, 0]
        outputs = []

        for t in range(1, tgt_len):
            context = self.attn(encoder_outputs, h)
            rnn_input = torch.cat((inputs, context), dim=1)
            h, c = self.decoder(rnn_input, (h, c))
            output = self.fc_out(torch.cat((h, context), dim=1))
            outputs.append(output.unsqueeze(1))
            teacher_force = random.random() < teacher_forcing_ratio
            inputs = embedded_tgt[:, t] if teacher_force else self.embedding(output.argmax(dim=1))

        return torch.cat(outputs, dim=1)

# Training
def train(model, loader, criterion, optimizer, device, epoch):
    model.train()
    total_loss = 0
    tf_ratio = max(0.6 - 0.05 * epoch, 0.3)
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt, teacher_forcing_ratio=tf_ratio)
        loss = criterion(output.view(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Evaluation
def evaluate(model, loader, idx2word, device):
    model.eval()
    rouge = Rouge()
    predictions, references = [], []

    with torch.no_grad():
        for src, tgt in loader:
            src = src.to(device)
            batch_size = src.size(0)
            embedded = model.embedding(src)
            encoder_outputs, _ = model.encoder(embedded)
            h = encoder_outputs.mean(dim=1)
            c = h
            inputs = model.embedding(torch.tensor([2]*batch_size).to(device))  # <SOS>
            preds = [[] for _ in range(batch_size)]

            for _ in range(30):
                context = model.attn(encoder_outputs, h)
                rnn_input = torch.cat((inputs, context), dim=1)
                h, c = model.decoder(rnn_input, (h, c))
                output = model.fc_out(torch.cat((h, context), dim=1))
                top1 = output.argmax(dim=1)
                inputs = model.embedding(top1)
                for i, token in enumerate(top1.tolist()):
                    preds[i].append(token)

            for pred, ref in zip(preds, tgt[:, 1:].cpu().numpy()):
                pred_words = [idx2word.get(i, '') for i in pred if i > 3]
                ref_words = [idx2word.get(i, '') for i in ref if i > 3]
                predictions.append(" ".join(pred_words))
                references.append(" ".join(ref_words))

    return rouge.get_scores(predictions, references, avg=True)

# Main pipeline
def run_pipeline(name, df, max_epochs=8):
    print(f"\n=== Training on {name} Dataset ===")
    word2idx, idx2word = build_vocab(df['article'].tolist() + df['summary'].tolist())
    pad_idx = word2idx['<PAD>']

    X_train, X_test, y_train, y_test = train_test_split(df['article'], df['summary'], test_size=0.2)
    train_dataset = SummarizationDataset(X_train.tolist(), y_train.tolist(), word2idx)
    test_dataset = SummarizationDataset(X_test.tolist(), y_test.tolist(), word2idx)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)

    device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
    model = Seq2SeqAttention(len(word2idx), 128, 128, pad_idx).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        loss = train(model, train_loader, criterion, optimizer, device, epoch)
        print(f"Epoch {epoch+1} | Loss: {loss:.4f}")

    rouge_scores = evaluate(model, test_loader, idx2word, device)
    print(f"\nROUGE Scores on {name} Test Set:")
    for key, score in rouge_scores.items():
        print(f"{key}: {score}")

# === Load your data ===
# Rename your preprocessed DataFrames to match:
bbc_df = bbc_df.rename(columns={"Article": "article", "Summary": "summary"})
imdb_df = imdb_df.rename(columns={"Article": "article", "Summary": "summary"})

# Run pipeline
run_pipeline("BBC", bbc_df)
run_pipeline("IMDb", imdb_df)



=== Training on BBC Dataset ===
Epoch 1 | Loss: 8.2085
Epoch 2 | Loss: 7.4750
