In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os
import pandas as pd

In [6]:
import os
import re
import nltk
import spacy
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from difflib import SequenceMatcher

# Download NLTK resources
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
nlp = spacy.load("en_core_web_sm")


def to_lowercase(text: str) -> str:
    return text.lower()


def remove_control_characters(text: str) -> str:
    return re.sub(r"[\x00-\x1f\x7f-\x9f\u200b]", "", text)


def normalize_whitespace(text: str) -> str:
    return re.sub(r"\s+", " ", text).strip()


def tokenize_text(text: str) -> list:
    return word_tokenize(text)


def clean_tokens(tokens: list) -> list:
    return [
        t
        for t in tokens
        if t.isalpha() or ("." in t and len(t) > 1 and not t.strip(".") == "")
    ]


def remove_stopwords(tokens: list) -> list:
    stop_words = set(stopwords.words("english"))
    return [t for t in tokens if t.lower() not in stop_words]


def get_wordnet_pos(treebank_tag: str) -> str:
    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_tag.startswith("R"):
        return wordnet.ADV
    return wordnet.NOUN


def lemmatize_tokens(tokens: list) -> list:
    lemmatizer = WordNetLemmatizer()
    pos_tags = pos_tag(tokens)
    return [lemmatizer.lemmatize(t, get_wordnet_pos(tag)) for t, tag in pos_tags]


def join_tokens(tokens: list) -> str:
    return " ".join(tokens)


def basic_preprocess(text: str) -> str:
    if not isinstance(text, str):
        return ""

    text = to_lowercase(text)
    text = remove_control_characters(text)
    text = normalize_whitespace(text)
    tokens = tokenize_text(text)
    tokens = clean_tokens(tokens)
    tokens = lemmatize_tokens(tokens)
    tokens = remove_stopwords(tokens)

    return join_tokens(tokens)


def spacy_sent_tokenize(text: str) -> list:
    """
    Splits text into sentences using spaCy.
    Normalizes line breaks first.
    """
    cleaned_text = text.replace("\n", " ").strip()
    doc = nlp(cleaned_text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]


def is_summary_sentence(sent: str, summary_sents: list, threshold: float = 0.7) -> bool:
    """
    Checks whether a sentence closely matches any sentence in the summary using SequenceMatcher.
    """
    for summ_sent in summary_sents:
        similarity = SequenceMatcher(None, sent, summ_sent).ratio()
        if similarity >= threshold:
            return True
    return False


def prepare_labeled_sentences_spacy(df) -> list:
    all_data = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Preprocessing articles"):
        article = row["Article"]
        summary = row["Summary"]

        article_sents = spacy_sent_tokenize(article)
        summary_sents = spacy_sent_tokenize(summary)

        preprocessed_summary_sents = [basic_preprocess(s) for s in summary_sents]

        for sent in article_sents:
            raw_sentence = sent
            preprocessed = basic_preprocess(sent)

            label = int(
                is_summary_sentence(
                    preprocessed, preprocessed_summary_sents, threshold=0.6
                )
            )

            all_data.append(
                {
                    "article_id": idx,
                    "raw_sentence": raw_sentence,
                    "preprocessed_sentence": preprocessed,
                    "label": label,
                }
            )

    return all_data


def prepare_labeled_sentences(df) -> list:
    all_data = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Preprocessing articles"):
        article = row["Article"]
        summary = row["Summary"]

        article_sents = sent_tokenize(article)
        summary_sents = sent_tokenize(summary)

        preprocessed_summary_sents = [basic_preprocess(s) for s in summary_sents]

        for sent in article_sents:
            raw_sentence = sent
            preprocessed = basic_preprocess(sent)

            label = int(
                is_summary_sentence(
                    preprocessed, preprocessed_summary_sents, threshold=0.6
                )
            )

            all_data.append(
                {
                    "article_id": idx,
                    "raw_sentence": raw_sentence,
                    "preprocessed_sentence": preprocessed,
                    "label": label,
                }
            )

    return all_data


# ========== FILE HANDLING ==========


def preprocess_document(input_path: str, output_path: str) -> bool:
    if not os.path.exists(input_path):
        print(f"[Error] File not found at: {os.path.abspath(input_path)}")
        return False

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            raw_text = f.read()

        clean_text = basic_preprocess(raw_text)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(clean_text)

        print(f"[Success] Output saved to: {output_path}")
        return True
    except Exception as e:
        print(f"[Error] {e}")
        return False


In [7]:
# BBC Dataset
bbc_df = pd.read_csv("/content/drive/MyDrive/data/bbc/bbc_dataset.csv")

#IMDB Dataset
imdb_df = pd.read_csv("/content/drive/MyDrive/data/imdb/imdb.csv")

In [8]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |  

True

In [9]:
# Then process the BBC dataset
bbc_labeled_data = prepare_labeled_sentences_spacy(bbc_df)

# Convert to DataFrame for modeling
bbc_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in bbc_labeled_data
    ]
)

Preprocessing articles: 100%|██████████| 2225/2225 [05:54<00:00,  6.27it/s]


In [10]:
# Count how many sentences are labeled as summary sentences
summary_count = bbc_processed_df['label'].sum()
total_count = len(bbc_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(bbc_processed_df[bbc_processed_df['label'] == 1].head(3))

Summary sentences: 16543 out of 41677 (39.69%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Ad sales boost Time Warner profit Quarterly p...,ad sale boost time warner profit quarterly pro...,1
2,0,TimeWarner said fourth quarter sales rose 2% t...,timewarner say fourth quarter sale rise 11.1bn...,1
6,0,"It lost 464,000 subscribers in the fourth quar...",lose subscriber fourth quarter profit low prec...,1


In [11]:
# Process the BBC dataset
imdb_labeled_df = prepare_labeled_sentences_spacy(imdb_df[:4000])

# Convert to DataFrame for modeling
imdb_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in imdb_labeled_df
    ]
)

Preprocessing articles: 100%|██████████| 4000/4000 [04:11<00:00, 15.94it/s]


In [12]:
# Count how many sentences are labeled as summary sentences
summary_count = imdb_processed_df['label'].sum()
total_count = len(imdb_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(imdb_processed_df[imdb_processed_df['label'] == 1].head(3))

Summary sentences: 2934 out of 13024 (22.53%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1
11,4,Petter Mattei Love in the Time of Money is vis...,petter mattei love time money visually stunnin...,1


In [13]:
from sklearn.utils import resample

def balance_dataset(df):
    df_majority = df[df.label == 0]
    df_minority = df[df.label == 1]

    df_minority_upsampled = resample(
        df_minority, replace=True, n_samples=len(df_majority), random_state=42
    )

    return pd.concat([df_majority, df_minority_upsampled])


# Balance both datasets
bbc_balanced = balance_dataset(bbc_processed_df)
imdb_balanced = balance_dataset(imdb_processed_df)


In [None]:
# !pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=eab35eb9216ce3b377d87f6996f278360813d68cd61d00104467e0de9ca2956f
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [15]:
# bertsum.py

import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from rouge_score import rouge_scorer

class BertSumDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.sentences = df['preprocessed_sentence'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.float)

class BertSumModel(nn.Module):
    def __init__(self, dropout=0.3):
        super(BertSumModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        x = self.classifier(x)
        return self.sigmoid(x)

def train_bertsum(train_df, val_df, batch_size=4, epochs=2):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_dataset = BertSumDataset(train_df, tokenizer)
    val_dataset = BertSumDataset(val_df, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    model = BertSumModel()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    loss_fn = nn.BCELoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device).unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")

    return model, tokenizer

class BertSumTrainer:
    def __init__(self, model, tokenizer, threshold=0.5):
        self.model = model
        self.tokenizer = tokenizer
        self.threshold = threshold
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

    def summarize(self, sentences):
        inputs = self.tokenizer(
            sentences,
            padding=True,
            truncation=True,
            return_tensors='pt'
        )
        input_ids = inputs['input_ids'].to(self.device)
        attention_mask = inputs['attention_mask'].to(self.device)

        with torch.no_grad():
            probs = self.model(input_ids, attention_mask).squeeze(1)

        selected_indices = (probs >= self.threshold).nonzero(as_tuple=True)[0].tolist()
        summary_sentences = [sentences[i] for i in selected_indices]
        return " ".join(summary_sentences), probs.cpu().numpy()

def evaluate_model(model, data_loader, tokenizer, original_sentences, threshold=0.5):
    model.eval()
    device = next(model.parameters()).device
    correct = 0
    total = 0
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_total = 0
    rouge2_total = 0
    rougeL_total = 0

    idx = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device).unsqueeze(1)

            outputs = model(input_ids, attention_mask)
            preds = (outputs >= threshold).float()
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            for i in range(len(labels)):
                if idx >= len(original_sentences):
                    break
                original = original_sentences[idx]
                predicted = original if preds[i].item() == 1.0 else ""
                reference = original if labels[i].item() == 1.0 else ""
                scores = scorer.score(reference, predicted)
                rouge1_total += scores['rouge1'].fmeasure
                rouge2_total += scores['rouge2'].fmeasure
                rougeL_total += scores['rougeL'].fmeasure
                idx += 1

    accuracy = correct / total
    n = min(idx, len(original_sentences))
    avg_rouge1 = rouge1_total / n if n > 0 else 0
    avg_rouge2 = rouge2_total / n if n > 0 else 0
    avg_rougeL = rougeL_total / n if n > 0 else 0

    return {
        "accuracy": accuracy,
        "rouge1": avg_rouge1,
        "rouge2": avg_rouge2,
        "rougeL": avg_rougeL
    }

def save_model(model, tokenizer, path_prefix="bertsum_model"):
    model_path = f"{path_prefix}.pt"
    tokenizer_path = f"{path_prefix}_tokenizer"
    torch.save(model.state_dict(), model_path)
    tokenizer.save_pretrained(tokenizer_path)
    print(f"Model saved to {model_path}")
    print(f"Tokenizer saved to {tokenizer_path}")

In [17]:
# === Split BBC into train, val, and test sets (80/10/10) ===
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# First: split into train (80%) and temp (20%)
bbc_train, bbc_temp = train_test_split(bbc_balanced, test_size=0.2, random_state=42)

# Then: split temp into val (10%) and test (10%)
bbc_val, bbc_test = train_test_split(bbc_temp, test_size=0.5, random_state=42)

# === Train the model ===
bbc_model, bbc_tokenizer = train_bertsum(bbc_train, bbc_val, batch_size=8, epochs=5)

# === Wrap in a trainer (optional for inference) ===
bbc_trainer = BertSumTrainer(bbc_model, bbc_tokenizer)

# === Create Datasets & Loaders ===
train_dataset = BertSumDataset(bbc_train, bbc_tokenizer)
val_dataset = BertSumDataset(bbc_val, bbc_tokenizer)
test_dataset = BertSumDataset(bbc_test, bbc_tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

# === Evaluate ===
train_metrics = evaluate_model(bbc_model, train_loader, bbc_tokenizer, bbc_train['preprocessed_sentence'].tolist())
val_metrics = evaluate_model(bbc_model, val_loader, bbc_tokenizer, bbc_val['preprocessed_sentence'].tolist())
test_metrics = evaluate_model(bbc_model, test_loader, bbc_tokenizer, bbc_test['preprocessed_sentence'].tolist())

# === Print Results ===
print(f"Train Accuracy: {train_metrics['accuracy']:.4f}, ROUGE-1: {train_metrics['rouge1']:.4f}, ROUGE-L: {train_metrics['rougeL']:.4f}")
print(f"Val Accuracy:   {val_metrics['accuracy']:.4f}, ROUGE-1: {val_metrics['rouge1']:.4f}, ROUGE-L: {val_metrics['rougeL']:.4f}")
print(f"Test Accuracy:  {test_metrics['accuracy']:.4f}, ROUGE-1: {test_metrics['rouge1']:.4f}, ROUGE-L: {test_metrics['rougeL']:.4f}")

# === Save model and tokenizer ===
save_model(bbc_model, bbc_tokenizer, path_prefix="bertsum_bbc")

Epoch 1: 100%|██████████| 5027/5027 [16:12<00:00,  5.17it/s]


Epoch 1 Loss: 3102.8417


Epoch 2: 100%|██████████| 5027/5027 [16:12<00:00,  5.17it/s]


Epoch 2 Loss: 2139.4816


Epoch 3: 100%|██████████| 5027/5027 [16:12<00:00,  5.17it/s]


Epoch 3 Loss: 1090.1288


Epoch 4: 100%|██████████| 5027/5027 [16:11<00:00,  5.18it/s]


Epoch 4 Loss: 597.3660


Epoch 5: 100%|██████████| 5027/5027 [16:11<00:00,  5.18it/s]


Epoch 5 Loss: 404.3644
Train Accuracy: 0.9948, ROUGE-1: 0.4974, ROUGE-L: 0.4974
Val Accuracy:   0.8297, ROUGE-1: 0.4358, ROUGE-L: 0.4358
Test Accuracy:  0.8251, ROUGE-1: 0.4219, ROUGE-L: 0.4219


In [18]:
# Alternative using your preprocessing pipeline
sample_row = bbc_df.iloc[[1]]  # Get as DataFrame to match your processing function
processed = prepare_labeled_sentences_spacy(sample_row)
sample_sentences = [item['raw_sentence'] for item in processed]

summary, probs = bbc_trainer.summarize(sample_sentences)

Preprocessing articles: 100%|██████████| 1/1 [00:00<00:00,  7.47it/s]


In [19]:
# Alternative using your preprocessing pipeline
sample_row = bbc_df.iloc[[1]]  # Get as DataFrame to match your processing function
processed = prepare_labeled_sentences_spacy(sample_row)
sample_sentences = [item['raw_sentence'] for item in processed]

# Generate summary
summary, probs = bbc_trainer.summarize(sample_sentences)

# Display results
print("=== ORIGINAL ARTICLE SENTENCES ===")
for i, sent in enumerate(sample_sentences):
    print(f"\nSentence {i+1} (Score: {probs[i]:.3f}):")
    print(sent)

print("\n=== GENERATED SUMMARY ===")
print(summary)

print("\n=== SUMMARY SENTENCES SELECTED ===")
for i, (sent, prob) in enumerate(zip(sample_sentences, probs)):
    if prob >= bbc_trainer.threshold:  # Default threshold is 0.5
        print(f"\nSentence {i+1} (Score: {prob:.3f}):")
        print(sent)

Preprocessing articles: 100%|██████████| 1/1 [00:00<00:00,  6.54it/s]

=== ORIGINAL ARTICLE SENTENCES ===

Sentence 1 (Score: 0.001):
Dollar gains on Greenspan speech

Sentence 2 (Score: 0.997):
The dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.

Sentence 3 (Score: 0.010):
And Alan Greenspan highlighted the US government's willingness to curb spending and rising household savings as factors which may help to reduce it.

Sentence 4 (Score: 0.037):
In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday.

Sentence 5 (Score: 0.295):
Market concerns about the deficit has hit the greenback in recent months.

Sentence 6 (Score: 0.631):
On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data.

Sentence 7 (Score: 0.030):
"I think the chairman's taking a much more s




In [20]:
# Get the original article and process it
sample_row = bbc_df.iloc[[1]]  # Get second article as DataFrame
original_text = sample_row['Article'].values[0]  # Changed from 'text' to 'Article'
processed = prepare_labeled_sentences_spacy(sample_row)
sample_sentences = [item['raw_sentence'] for item in processed]

# Generate summary
summary, probs = bbc_trainer.summarize(sample_sentences)

# Display the original article
print("="*80)
print("ORIGINAL ARTICLE:")
print("="*80)
print(original_text)
print("\n" + "="*80)

# Display the generated summary
print("GENERATED SUMMARY:")
print("="*80)
print(summary)
print("\n" + "="*80)

# Display sentence scores
print("SENTENCE SCORES:")
print("="*80)
for i, (sent, score) in enumerate(zip(sample_sentences, probs)):
    included = "[INCLUDED]" if score >= bbc_trainer.threshold else "[EXCLUDED]"
    print(f"\n{included} Sentence {i+1} (Score: {score:.3f}):")
    print(sent)

Preprocessing articles: 100%|██████████| 1/1 [00:00<00:00,  8.18it/s]


ORIGINAL ARTICLE:
Dollar gains on Greenspan speech

The dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.

And Alan Greenspan highlighted the US government's willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan's speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data. "I think the chairman's taking a much more sanguine view on the current account deficit than he's taken for some time," said Robert Sinche, head of currency strategy at Bank of America in New York. "He's taking a longer-term view, laying out a set of cond