<a href="https://colab.research.google.com/github/CoiferousYogi/Webpage_Article_Summarizer_Using_BERT/blob/main/GenAI_Practical3_Abstractive_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install transformers torch
!pip install requests bs4
!pip install trafilatura
!pip install --upgrade gradio
!pip install bert-extractive-summarizer
!pip install sumy
import requests
from transformers import pipeline
from transformers import BartTokenizer
import trafilatura
import gradio as gr
from summarizer import Summarizer



In [7]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
sentiment_pipeline = pipeline("sentiment-analysis")

# extractive summary
bert_model = Summarizer()

def extractive_summary(text):
    return bert_model(text, min_length=60)


# sentiment analysis
def get_sentiment(text):
    results = sentiment_pipeline(text[:1000])  # Trim if article is long
    return results[0]['label'] + f" ({results[0]['score']:.2f})"



# chunking function
def chunk_text(text, max_tokens=1024, overlap=100):
    input_ids = tokenizer.encode(text, truncation=False)
    chunks = []
    i = 0
    while i < len(input_ids):
      chunk_ids = input_ids[i:min(i + max_tokens, len(input_ids))]
      chunk = tokenizer.decode(chunk_ids, skip_special_tokens=True)
      chunks.append((chunk, chunk_ids))  # store both text and original IDs
      i += max_tokens - overlap
    return chunks


def extract_article(url):
    try:
        response = trafilatura.fetch_url(url)
        if response is None:
            return "Failed to retrieve article.", "", None
        full_text = trafilatura.extract(response)
        if not full_text or not full_text.strip():
            return "Could not extract meaningful content.", "", None
        return "Article successfully extracted.", full_text, full_text
    except Exception as e:
        return (f"Error: {str(e)}", "", None)


def summarize_article(article_text, summary_type, perform_sentiment_analysis):
    try:
        if not article_text:
            return "No article loaded yet.", ""


        # Apply summarization based on user choice
        if summary_type == "Extractive":
            summary = extractive_summary(article_text)
        elif summary_type == "TL;DR":
            chunked = chunk_text(article_text)
            summaries = []
            for chunk, _ in chunked:
                if chunk.strip():
                    out = summarizer(chunk, max_new_tokens=100, min_length=15, do_sample=False)
                    summaries.append(out[0]['summary_text'])
            summary = "\n".join(summaries)
        elif summary_type == "Bullet Points":
            chunked = chunk_text(article_text)
            summaries = []
            for chunk, _ in chunked:
                if chunk.strip():
                    out = summarizer(chunk, max_new_tokens=500, min_length=50, do_sample=False)
                    summaries.append("- " + out[0]['summary_text'])
            summary = "\n".join(summaries)
        else:  # Abstractive(default)
            chunked = chunk_text(article_text)
            summaries = []
            for chunk, _ in chunked:
                if chunk.strip():
                    out = summarizer(chunk, max_new_tokens=100, min_length=100, do_sample=False)
                    summaries.append(out[0]['summary_text'])
            summary = "\n".join(summaries)

        # Sentiment analysis if selected
        sentiment = ""
        if perform_sentiment_analysis:
            sentiment = get_sentiment(article_text)

        return summary, sentiment

    except Exception as e:
        return f"An error occurred: {str(e)}", ""



# Gradio Interface
# Interface
with gr.Blocks() as demo:
    gr.Markdown("# Article Summarizer with Multiple Options")

    url_input = gr.Textbox(label="Enter Article URL")
    load_btn = gr.Button("Load Article")

    article_state = gr.State()

    status = gr.Textbox(label="Status", interactive=False)
    full_text_display = gr.Textbox(label="Extracted Article (hidden)", visible=False)

    summary_type = gr.Dropdown(
        choices=["Abstractive", "Bullet Points", "TL;DR", "Extractive"],
        label="Summary Type",
        value="Abstractive"
    )
    sentiment_check = gr.Checkbox(label="Perform Sentiment Analysis")
    summarize_btn = gr.Button("Summarize")

    summary_output = gr.Textbox(label="Summary Output")
    sentiment_output = gr.Textbox(label="Sentiment Output", interactive=False)

    # Button Logic
    load_btn.click(
        extract_article,
        inputs=[url_input],
        outputs=[status, full_text_display, article_state]
    )

    summarize_btn.click(
        summarize_article,
        inputs=[article_state, summary_type, sentiment_check],
        outputs=[summary_output, sentiment_output]
    )

if __name__ == "__main__":
    demo.launch()


Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://822517d9e35382db2d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [8]:
summaries

NameError: name 'summaries' is not defined