## Import Modules

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


In [13]:
!pip install rouge



## Load Data

In [14]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import os
print("Current working directory:", os.getcwd())


Current working directory: /content


In [16]:
import os
import pandas as pd

base_path = r"/content/drive/My Drive/archive/BBC News Summary"
articles_path = os.path.join(base_path, "News Articles")
summaries_path = os.path.join(base_path, "Summaries")

categories = ['business', 'entertainment', 'politics', 'sport', 'tech']

data = []
print("Contents of News Articles folder:")
print(os.listdir(articles_path))

print("Folders in base_path:", os.listdir(base_path))
print("Folders in articles_path:", os.listdir(articles_path))

for category in categories:
    article_dir = os.path.join(articles_path, category)
    summary_dir = os.path.join(summaries_path, category)

    if not os.path.isdir(article_dir):
        print(f"Warning: {article_dir} does not exist. Skipping.")
        continue

    for filename in os.listdir(article_dir):
        article_file = os.path.join(article_dir, filename)
        summary_file = os.path.join(summary_dir, filename)

        if os.path.exists(article_file) and os.path.exists(summary_file):
            # Try reading with latin-1 encoding
            with open(article_file, 'r', encoding='latin-1') as af:
                article = af.read().strip()
            # Try reading with latin-1 encoding
            with open(summary_file, 'r', encoding='latin-1') as sf:
                summary = sf.read().strip()

            data.append({
                'category': category,
                'filename': filename,
                'article': article,
                'summary': summary
            })

# Convert to DataFrame
df = pd.DataFrame(data)
print(df.head())

Contents of News Articles folder:
['politics', 'tech', 'business', 'sport', 'entertainment']
Folders in base_path: ['News Articles', 'BBC News Summary', 'Summaries']
Folders in articles_path: ['politics', 'tech', 'business', 'sport', 'entertainment']
   category filename                                            article  \
0  business  008.txt  India calls for fair trade rules\n\nIndia, whi...   
1  business  005.txt  Pernod takeover talk lifts Domecq\n\nShares in...   
2  business  006.txt  Japan narrowly escapes recession\n\nJapan's ec...   
3  business  007.txt  Jobs growth still slow in the US\n\nThe US cre...   
4  business  003.txt  Yukos unit buyer faces loan claim\n\nThe owner...   

                                             summary  
0  At a conference on developing enterprise hoste...  
1  Pernod has reduced the debt it took on to fund...  
2  On an annual basis, the data suggests annual g...  
3  The job gains mean that President Bush can cel...  
4  Yukos' owner Menatep

## Text Preprocessing

In [17]:
def clean_text(text):
    return text.replace('\n', ' ').replace('\r', ' ').strip()

# Apply to both article and summary columns
df['article_clean'] = df['article'].apply(clean_text)
df['summary_clean'] = df['summary'].apply(clean_text)

# Preview cleaned text
print(df[['article_clean', 'summary_clean']].head())


                                       article_clean  \
0  India calls for fair trade rules  India, which...   
1  Pernod takeover talk lifts Domecq  Shares in U...   
2  Japan narrowly escapes recession  Japan's econ...   
3  Jobs growth still slow in the US  The US creat...   
4  Yukos unit buyer faces loan claim  The owners ...   

                                       summary_clean  
0  At a conference on developing enterprise hoste...  
1  Pernod has reduced the debt it took on to fund...  
2  On an annual basis, the data suggests annual g...  
3  The job gains mean that President Bush can cel...  
4  Yukos' owner Menatep Group says it will ask Ro...  


## Summarization

In [18]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge

# Download both 'punkt' and 'punkt_tab' just to be sure
nltk.download('punkt')
nltk.download('punkt_tab')

# Select a sample article
text = df['article_clean'].iloc[0]
reference_summary = df['summary_clean'].iloc[0]

# Split into sentences
sentences = nltk.sent_tokenize(text)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)

# Cosine similarity matrix
similarity_matrix = cosine_similarity(X)

# Rank sentences by their "importance" (sum of similarities)
sentence_scores = similarity_matrix.sum(axis=1)

# Select top N sentences
top_n = 3
top_sentence_indices = sentence_scores.argsort()[-top_n:][::-1]
top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]

generated_summary_tf_idf = " ".join(top_sentences)

# Step 8: Print summary
print("Original Article:\n", text[:500], "\n...")
print("\nTraditional Extractive Summary (TF-IDF):")
print(generated_summary_tf_idf)

# Step 9: Evaluate with ROUGE
rouge = Rouge()
scores = rouge.get_scores(generated_summary_tf_idf, reference_summary)

# Step 10: Print ROUGE scores
print("\n🔍 ROUGE Evaluation:")
for metric, score in scores[0].items():
    print(f"{metric}:")
    for submetric, val in score.items():
        print(f"  {submetric}: {val:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Original Article:
 India calls for fair trade rules  India, which attends the G7 meeting of seven leading industrialised nations on Friday, is unlikely to be cowed by its newcomer status.  In London on Thursday ahead of the meeting, India's finance minister, lashed out at the restrictive trade policies of the G7 nations. He objected to subsidies on agriculture that make it hard for developing nations like India to compete. He also called for reform of the United Nations, the World Bank and the IMF.  Palaniappan Ch 
...

Traditional Extractive Summary (TF-IDF):
In London on Thursday ahead of the meeting, India's finance minister, lashed out at the restrictive trade policies of the G7 nations. Palaniappan Chidambaram, India's finance minister, argued that these organisations need to take into account the changing world order, given India and China's integration into the global economy. Mr Chidambaram is attending the G7 meeting as part of the G20 group of nations, which account for two t

## Text Summarization using sumy.TextRank

In [19]:
pip install sumy



In [20]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Use the same article
text = df['article_clean'].iloc[0]
reference_summary = df['summary_clean'].iloc[0]

# Set up parser and summarizer
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = TextRankSummarizer()

# Generate summary with N sentences
summary = summarizer(parser.document, sentences_count=3)

# Print result
print("TextRank Summary:")
generated_summary_sumy = ""
for sentence in summary:
    print("-", sentence)
    generated_summary_sumy += str(sentence) + " "

rouge = Rouge()
scores = rouge.get_scores(generated_summary_sumy.strip(), reference_summary)

print("\n🔍 ROUGE Evaluation (TextRank):")
for metric, score in scores[0].items():
    print(f"{metric}:")
    for submetric, val in score.items():
        print(f"  {submetric}: {val:.4f}")


TextRank Summary:
- In London on Thursday ahead of the meeting, India's finance minister, lashed out at the restrictive trade policies of the G7 nations.
- Mr Chidambaram is attending the G7 meeting as part of the G20 group of nations, which account for two thirds of the world's population.
- Separately, the IMF warned on Thursday that India's budget deficit was too large and would hamper the country's economic growth, which it forecast to be around 6.5% in the year to March 2005.

🔍 ROUGE Evaluation (TextRank):
rouge-1:
  r: 0.3652
  p: 0.6774
  f: 0.4746
rouge-2:
  r: 0.2763
  p: 0.5676
  f: 0.3717
rouge-l:
  r: 0.3565
  p: 0.6613
  f: 0.4633


## Abstractive Summarization

In [21]:
!pip install transformers
!pip install sentencepiece




In [22]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
from rouge import Rouge  # <-- ROUGE added

# Load the pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Select your article text
text = df['article_clean'].iloc[0]
reference_summary = df['summary_clean'].iloc[0]

# Tokenize and encode the input text
inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)

# Generate summary (you can tweak length and decoding strategy)
summary_ids = model.generate(
    inputs,
    max_length=130,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)

# Decode and print the summary
summary_bart = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("BART Summary:\n", summary_bart)

# ROUGE Evaluation
rouge = Rouge()
scores = rouge.get_scores(summary_bart, reference_summary)

print("\n🔍 ROUGE Evaluation (BART):")
for metric, score in scores[0].items():
    print(f"{metric}:")
    for submetric, val in score.items():
        print(f"  {submetric}: {val:.4f}")


BART Summary:
 India's finance minister lashes out at restrictive trade policies of the G7 nations. Palaniappan Chidambaram calls for reform of the United Nations, the World Bank and the IMF. IMF warns India's budget deficit is too large and will hamper economic growth.

🔍 ROUGE Evaluation (BART):
rouge-1:
  r: 0.1565
  p: 0.5294
  f: 0.2416
rouge-2:
  r: 0.0592
  p: 0.2308
  f: 0.0942
rouge-l:
  r: 0.1565
  p: 0.5294
  f: 0.2416


## Result Evaluation

In [23]:
from IPython.display import display, Markdown

# Assume these variables contain your generated summaries
original_text = df['article_clean'].iloc[0]
reference_summary = df['summary_clean'].iloc[0]
tfidf_summary = generated_summary_tf_idf  # from your TF-IDF code
textrank_summary = generated_summary_sumy  # from your Sumy/TextRank code
bart_summary = summary_bart  # from your BART code

# Optional: Truncate summaries for display (e.g., first 300 chars)
def truncate(text, n=300):
    return text[:n] + ('...' if len(text) > n else '')

display(Markdown(
f"""
| **Type**               | **Text** |
|------------------------|----------|
| **Original Article**   | {truncate(original_text)} |
| **Reference Summary**  | {truncate(reference_summary)} |
| **TF-IDF Summary**     | {truncate(tfidf_summary)} |
| **TextRank Summary**   | {truncate(textrank_summary)} |
| **BART Summary**       | {truncate(bart_summary)} |
"""
))



| **Type**               | **Text** |
|------------------------|----------|
| **Original Article**   | India calls for fair trade rules  India, which attends the G7 meeting of seven leading industrialised nations on Friday, is unlikely to be cowed by its newcomer status.  In London on Thursday ahead of the meeting, India's finance minister, lashed out at the restrictive trade policies of the G7 natio... |
| **Reference Summary**  | At a conference on developing enterprise hosted by UK finance minister Gordon Brown on Friday, he said that he was in favour of floating exchange rates because they help countries cope with economic shocks.In London on Thursday ahead of the meeting, India's finance minister, lashed out at the restri... |
| **TF-IDF Summary**     | In London on Thursday ahead of the meeting, India's finance minister, lashed out at the restrictive trade policies of the G7 nations. Palaniappan Chidambaram, India's finance minister, argued that these organisations need to take into account the changing world order, given India and China's integra... |
| **TextRank Summary**   | In London on Thursday ahead of the meeting, India's finance minister, lashed out at the restrictive trade policies of the G7 nations. Mr Chidambaram is attending the G7 meeting as part of the G20 group of nations, which account for two thirds of the world's population. Separately, the IMF warned on ... |
| **BART Summary**       | India's finance minister lashes out at restrictive trade policies of the G7 nations. Palaniappan Chidambaram calls for reform of the United Nations, the World Bank and the IMF. IMF warns India's budget deficit is too large and will hamper economic growth. |
