<a href="https://colab.research.google.com/github/DenniseMc/news_summary/blob/main/news_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install "lxml[html_clean]"

Collecting lxml-html-clean (from lxml[html_clean])
  Downloading lxml_html_clean-0.3.1-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.3.1-py3-none-any.whl (13 kB)
Installing collected packages: lxml-html-clean
Successfully installed lxml-html-clean-0.3.1


In [4]:
!pip install readability-lxml



In [13]:
!pip install ipywidgets

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.1


# Get news API key

In [6]:
# Read the NewsAPI key from this Colab runtime's secrets (secret name 'newsApiKey')
# so the key itself never appears in the notebook source.
from google.colab import userdata
api_key = userdata.get('newsApiKey')

# Import libraries

In [18]:
import html
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from readability import Document
from transformers import pipeline

import ipywidgets as widgets
from IPython.display import display, HTML, clear_output


# Initialize the summarizer

In [8]:
# Build the summarization pipeline once; get_summary() calls this shared object.
# The cell output below shows the facebook/bart-large-cnn files being downloaded.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



# Functions

## Get article content

In [40]:
def get_article_content(url, timeout=10):
    """Download a web page and return the plain text of its main article.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The extracted article text, with text nodes separated by newlines.
        An empty string is returned when the page cannot be retrieved
        (network error, non-200 status) or the response body is blank, so
        callers can detect failure with a simple emptiness check instead of
        mistaking an error message for article text.
    """
    try:
        article_response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Unreachable host, timeout, invalid URL, ...: treat as "no content".
        return ""

    if article_response.status_code != 200 or not article_response.text.strip():
        return ""

    # Readability isolates the main article markup from the surrounding page.
    main_html = Document(article_response.text).summary()

    # BeautifulSoup strips the remaining tags and normalizes whitespace.
    soup = BeautifulSoup(main_html, 'html.parser')
    return soup.get_text(separator='\n', strip=True)



## Split the content

In [27]:
def split_content(article, max_words=500):
    """Split article text into segments of at most ``max_words`` words.

    Sentences (text ending in '.', '!' or '?' followed by spaces) are kept
    whole and packed greedily into segments. A single sentence longer than
    ``max_words`` is cut into consecutive word chunks so that no segment
    exceeds the limit.

    Args:
        article: Text to split.
        max_words: Maximum number of whitespace-separated words per segment.

    Returns:
        A list of non-empty segment strings; an empty list when the article
        contains no words.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    # Split after sentence-ending punctuation while keeping the punctuation
    sentences = re.split(r'(?<=[.!?]) +', article)

    segments = []
    current_segment = []
    current_words = 0  # running word count, avoids re-joining on every step

    for sentence in sentences:
        words = sentence.split()
        sentence_words = len(words)
        if sentence_words == 0:
            # Blank fragments (e.g. from an empty article) carry no content
            continue

        if sentence_words > max_words:
            # Flush what has been collected, then hard-split the long sentence
            if current_segment:
                segments.append(" ".join(current_segment))
                current_segment, current_words = [], 0
            for start in range(0, sentence_words, max_words):
                segments.append(" ".join(words[start:start + max_words]))
            continue

        if current_words + sentence_words > max_words:
            # current_segment is non-empty here because the sentence alone fits
            segments.append(" ".join(current_segment))
            current_segment, current_words = [], 0

        current_segment.append(sentence)
        current_words += sentence_words

    # Add the last segment if any content remains
    if current_segment:
        segments.append(" ".join(current_segment))

    return segments


## Get summary

In [28]:
def get_summary(article):
    """Summarize one text segment with the shared ``summarizer`` pipeline.

    The length limits passed to the pipeline are derived from the segment's
    word count: at most 100 and at least 10, each capped at the word count
    for shorter segments.

    Args:
        article: Text segment to summarize.

    Returns:
        A list with one dict holding the summary under 'summary_text', which
        is the shape callers index as ``result[0]['summary_text']``. A
        segment with no words yields an empty summary without invoking the
        model.
    """
    word_count = len(article.split())

    if word_count == 0:
        # Length limits of 0 are meaningless for generation; skip the model.
        return [{'summary_text': ''}]

    # Cap both limits at the word count so short segments are not padded out
    max_summary_length = min(100, word_count)
    min_summary_length = min(10, word_count)

    # truncation=True asks the pipeline to cut inputs that exceed the model's
    # maximum input size instead of failing on them.
    return summarizer(
        article,
        max_length=max_summary_length,
        min_length=min_summary_length,
        do_sample=False,
        truncation=True,
    )




## Get article summary

In [42]:
def get_article_summary(url):
    """Fetch an article and return a summary built from its segments.

    The article text is split with split_content(), each segment is
    summarized with get_summary(), and the partial summaries are joined with
    each one followed by a newline.

    Args:
        url: Address of the article to summarize.

    Returns:
        The combined summary text, or an empty string when no article text
        could be obtained.
    """
    article = get_article_content(url)

    # Treat blank text and the literal failure message as "no article" so an
    # error notice is never summarized as if it were content.
    if not article.strip() or article == "Failed to get the article content":
        return ""

    summaries = []
    for segment in split_content(article):
        if not segment.strip():
            # Nothing to summarize in a blank segment
            continue
        summaries.append(get_summary(segment)[0]['summary_text'])

    return "".join(summary + "\n" for summary in summaries)



## UI

In [43]:

# NewsAPI settings read by fetch_and_display_articles().
# NOTE(review): `api_key = api_key` only re-binds the value loaded from the Colab
# secrets in an earlier cell; it raises NameError if that cell has not been run.
api_key = api_key
language = "en"  # sent as the `language` query parameter

# Fetch NewsAPI results for a query and render them as HTML cards
def _render_article_card(article):
    """Return the HTML card for one NewsAPI article.

    Args:
        article: Article dict from the NewsAPI response; the 'title', 'url'
            and 'urlToImage' keys are read.

    Returns:
        Markup with a circular image (or placeholder icon), linked title,
        generated summary and a "Read full article" link. Every
        article-supplied value is HTML-escaped because it originates from
        external sites.
    """
    article_url = article['url']
    title = html.escape(article['title'] or "")
    # Summarize using the raw URL; only escaped copies go into the markup.
    summary = html.escape(get_article_summary(article_url))
    link = html.escape(article_url, quote=True)
    image_url = article['urlToImage']

    # Check if there is an image URL; if not, use placeholder with icon
    if image_url:
        image_src = html.escape(image_url, quote=True)
        image_html = f"""
                        <img src="{image_src}" alt="Article Image" style="width: 150px; height: 150px; border-radius: 50%; object-fit: cover; margin-bottom: 10px;"
                        onerror="this.onerror=null;this.style.display='none';this.insertAdjacentHTML('afterend', `<div style='width: 150px; height: 150px; border-radius: 50%; background-color: #ddd; display: flex; align-items: center; justify-content: center;'>
                        <i class='fa fa-newspaper-o' style='font-size: 50px; color: #091F46;'></i></div>`)">
                    """
    else:
        image_html = """
                        <div style="width: 150px; height: 150px; border-radius: 50%; background-color: #ddd; display: flex; align-items: center; justify-content: center;">
                            <i class="fa fa-newspaper-o" style="font-size: 50px; color: #091F46;"></i>
                        </div>
                    """

    return f"""
                    <div style="width: 30%; text-align: center; border: 1px solid #ddd; padding: 10px; border-radius: 10px;">
                        {image_html}
                        <h4 style='margin: 10px 0;'><a href="{link}" target="_blank" style='text-decoration: none; color: #091F46;'>{title}</a></h4>
                        <p style='margin: 5px 0; color: #444;'>{summary}</p>
                        <a href="{link}" target="_blank" style='color: #A55D35;'>Read full article</a>
                    </div>
                """


def _build_results_html(query, from_date):
    """Query NewsAPI and return the HTML to show: article cards or an error message."""
    # Let requests encode the parameters so spaces, '&', '#', ... in the
    # query cannot corrupt the URL.
    params = {
        "q": query,
        "from": from_date,
        "sortBy": "publishedAt",
        "apiKey": api_key,
        "language": language,
    }
    try:
        response = requests.get("https://newsapi.org/v2/everything", params=params, timeout=10)
    except requests.RequestException:
        # The exception text may contain the request URL (and with it the API
        # key), so it is deliberately not echoed into the notebook output.
        return "<p style='color: red;'>Error: Could not reach NewsAPI.</p>"

    if response.status_code != 200:
        return f"<p style='color: red;'>Error: Status code {response.status_code} - Failed to fetch data.</p>"

    data = response.json()
    if data.get('status') != 'ok':
        return "<p style='color: red;'>Error: Failed to retrieve articles.</p>"

    # Only the first three articles are summarized and shown
    top_articles = data.get('articles', [])[:3]
    safe_query = html.escape(query)
    if not top_articles:
        return f"<p>No articles found for '{safe_query}'.</p>"

    html_content = f"<h3>Top 3 Articles for '{safe_query}'</h3>"
    html_content += """
                <div style="display: flex; justify-content: space-around; font-family: Arial, sans-serif; padding: 10px 0;">
            """
    html_content += "".join(_render_article_card(article) for article in top_articles)
    html_content += "</div>"
    return html_content


def fetch_and_display_articles(query):
    """Search NewsAPI for ``query`` and display summarized article cards.

    Articles published within the last month are requested, sorted by
    publication date. The cell output is cleared, the search panel is shown
    again, and a loading message stays visible while the articles are
    fetched and summarized. The loading message is hidden on every path,
    including errors.

    Args:
        query: Search text entered by the user.
    """
    # Restrict the search to articles from the last month
    one_month_ago = datetime.today() - relativedelta(months=1)
    from_date = one_month_ago.strftime('%Y-%m-%d')

    # Clear the previous output, then show the input widgets again
    clear_output(wait=True)
    display(search_container)

    # Show a loading message while the request and summarization run
    loading_spinner = widgets.HTML(value='<div style="display: flex; justify-content: center; align-items: center; height: 100px;">Loading...</div>')
    display(loading_spinner)

    try:
        result_html = _build_results_html(query, from_date)
    finally:
        # Hide the loading message whether the lookup succeeded or not
        loading_spinner.layout.display = 'none'

    display(HTML(result_html))

# Search controls: a free-text box for the topic and the button that starts the search
query_input = widgets.Text(
    value='',
    placeholder='Enter the topic here'
)
button = widgets.Button(
    description="Search",
    button_style='primary',
    # NOTE(review): confirm the installed ipywidgets Layout supports `border_radius`
    layout=widgets.Layout(border_radius='15px')
)

# Event handler for button click to fetch and display articles
def on_button_click(b):
    """Handle a click on the Search button.

    Reads the text box, trims surrounding whitespace, and either starts a
    search or shows a red hint when nothing was entered.

    Args:
        b: The widget that fired the event (unused).
    """
    search_text = query_input.value.strip()

    # Guard clause: nothing to search for
    if not search_text:
        display(HTML("<p style='color: red;'>Please enter a query to search.</p>"))
        return

    fetch_and_display_articles(search_text)

# Register the handler so clicking "Search" runs on_button_click
button.on_click(on_button_click)

# Place the text box and button side by side, centered horizontally
centered_hbox = widgets.HBox([query_input, button], layout=widgets.Layout(justify_content='center'))

# Bordered panel with the heading and the search row. fetch_and_display_articles()
# displays this same object again after it clears the cell output.
search_container = widgets.VBox([
    widgets.HTML("<h2 style='text-align: center; font-family: Arial, sans-serif;'>Search for News Articles</h2>"),
    centered_hbox,
    widgets.HTML("<div id='results_container'></div>")  # Empty placeholder div; the code shown here never writes into it
], layout=widgets.Layout(border='2px solid #ddd', padding='20px', border_radius='10px', margin='10px 0', align_items='center'))

# Render the search panel in the cell output
display(search_container)


VBox(children=(HTML(value="<h2 style='text-align: center; font-family: Arial, sans-serif;'>Search for News Art…

HTML(value='<div style="display: flex; justify-content: center; align-items: center; height: 100px;">Loading..…