In [1]:
import requests
from bs4 import BeautifulSoup
import json

In [2]:
def save_function(article_list):
    with open('articles.txt', 'w') as outfile:
        json.dump(article_list, outfile)

In [3]:
def hackernews_rss(hacker_news):
    article_list = []
    try:
        r = requests.get(hacker_news)
        soup = BeautifulSoup(r.content, features='xml')
        articles = soup.findAll('item')
        for a in articles:
            title = a.find('title').text
            link = a.find('link').text
            published = a.find('pubDate').text
            article = {
                'title': title,
                'link': link,
                'published': published
                }
            article_list.append(article)
            save_function(article_list)
        return article_list
    except Exception as e:
        print('The scraping job failed. See exception: ')
        print(e)

In [5]:
def contentScrapping(web_url):
    try:
        r = requests.get(web_url)
        soup = BeautifulSoup(r.content, 'html.parser')
        text = soup.find_all(text=True)

        output = ''
        blacklist = [
            '[document]',
            'noscript',
            'header',
            'html',
            'meta',
            'head', 
            'input',
            'script',
            # there may be more elements you don't want, such as "style", etc.
            ]

        for t in text:
            if t.parent.name not in blacklist:
                output += '{} '.format(t)
        return output
    except Exception as e:
        print('The scraping job failed. See exception: ')
        print(e)


In [6]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [7]:
def textSummarizer(text, percentage):
    
    # load the model into spaCy
    nlp = spacy.load('en_core_web_sm')
    
    # pass the text into the nlp function
    doc= nlp(text)
    
    ## The score of each word is kept in a frequency table
    tokens=[token.text for token in doc]
    freq_of_word=dict()
    
    # Text cleaning and vectorization 
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                if word.text not in freq_of_word.keys():
                    freq_of_word[word.text] = 1
                else:
                    freq_of_word[word.text] += 1
                    
    # Maximum frequency of word
    max_freq=max(freq_of_word.values())
    
    # Normalization of word frequency
    for word in freq_of_word.keys():
        freq_of_word[word]=freq_of_word[word]/max_freq
        
    # In this part, each sentence is weighed based on how often it contains the token.
    sent_tokens= [sent for sent in doc.sents]
    sent_scores = dict()
    for sent in sent_tokens:
        for word in sent:
            if word.text.lower() in freq_of_word.keys():
                if sent not in sent_scores.keys():                            
                    sent_scores[sent]=freq_of_word[word.text.lower()]
                else:
                    sent_scores[sent]+=freq_of_word[word.text.lower()]
    
    
    len_tokens=int(len(sent_tokens)*percentage)
    
    # Summary for the sentences with maximum score. Here, each sentence in the list is of spacy.span type
    summary = nlargest(n = len_tokens, iterable = sent_scores,key=sent_scores.get)
    
    # Prepare for final summary
    final_summary=[word.text for word in summary]
    
    #convert to a string
    summary=" ".join(final_summary)
    
    # Return final summary
    return summary

In [11]:
hackerNews = 'https://news.ycombinator.com/rss'
print('Starting scraping')
article_list = hackernews_rss(hackerNews)
print('Finished scraping')
raw_content = contentScrapping('https://www.catsuka.com/breves/2023-09-21/nippon-television-rachete-le-studio-ghibli')
summary = textSummarizer(raw_content, 0.9)
print(summary)


Starting scraping
Finished scraping


  text = soup.find_all(text=True)


CATEGORIES : 
   Court-métrages 
   Long-métrages 
   Séries 
   Clips et pubs 
   Direct-to-video 
   Sur le web 
   Evénements 
   Diffusions 
   Édition 
   BD 
   Sakuga 
   Divers 
  ARCHIVES : 
 2023 :    Jan   Fev   Mar   Avr   Mai   Jun   Jul   Aou   Sep  
 2022 :    Jan   Fev   Mar   Avr   Mai   Jun   Jul   Aou   Sep   Oct   Nov   Dec  
 2021 :    Jan   Fev   Mar   Avr   Mai   Jun   Jul   Aou   Sep   Oct   Nov   Dec  
 2020 :    Jan   Fev   Mar   Avr   Mai   Jun   Jul   Aou   Sep   Oct   Nov   Dec  
 2019 :    Jan   Fev   Mar   Avr   Mai   Jun   Jul   Aou   Sep   Oct   Nov   Dec  
 2018 :    Jan   Fev   Mar   Avr   Mai   Jun   Jul   Aou   Sep   Oct   Nov   Dec  
 2017 :    Jan   Fev   Mar   Avr   Mai   Jun   Jul   Aou   Sep   Oct   Nov   Dec  
 2016 :    Jan   Fev   Mar   Avr   Mai   Jun   Jul   Aou   Sep   Oct   Nov   Dec  
 2015 :    Jan   Fev   Mar   Avr   Mai   Jun   Jul   Aou   Sep   Oct   Nov   Dec  
 2014 :    Jan   Fev   Mar   Avr   Mai   Jun   Jul   Aou   Sep   Oct   

In [19]:
for article in article_list:
    print(article['title'])
    print(article['link'])



Cisco Acquires Splunk
https://www.splunk.com/en_us/blog/leadership/splunk-and-cisco-unite-to-accelerate-digital-resilience-as-one-of-the-leading-global-software-companies.html
Nippon Television has just acquired Studio Ghibli
https://www.catsuka.com/breves/2023-09-21/nippon-television-rachete-le-studio-ghibli
Sunken temple and sanctuary from ancient Egypt found
https://www.livescience.com/archaeology/ancient-egyptians/sunken-temple-and-sanctuary-from-ancient-egypt-found-brimming-with-treasures-and-secrets
Why Kakoune
https://andreyor.st/posts/2023-09-20-why-kakoune/
They have genetic ALS. What should clinicians do?
https://www.statnews.com/2023/09/21/als-gene-carrier-riluzole-toferson/
Launch HN: Loops (YC W22) – Email for SaaS Companies
https://news.ycombinator.com/item?id=37596253
Show HN: Odin – the integration of LLMs with Obsidian note taking
https://github.com/memgraph/odin
OpenBSD/ARM64 on Hetzner Cloud
https://www.undeadly.org/cgi?action=article;sid=20230921073556
An INI Critiq