# Script to recolect news and send the info

In [16]:
import re
import spacy
import nltk
import openai
import requests
from typing import List
from colorama import Fore
from telegram import Update
from datetime import datetime
from bs4 import BeautifulSoup
from htmldate import find_date
from dataclasses import dataclass
from transformers import pipeline
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from deep_translator import GoogleTranslator
from sumy.summarizers.text_rank import TextRankSummarizer
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, MessageHandler, filters


#verifiying dependencies
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/revientaelp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Extracting the information of the web page

In [2]:
"""
Args: 
    html: html content with scratch information
Returns:
    Clean text
"""
def cleanHtml(html):
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style", "head"]):
        script.decompose()
    text = soup.get_text()
    text = re.sub('\s+', ' ', text).strip()
    return text

def getWebContent(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        cleaned_content = cleanHtml(response.text)
        return cleaned_content
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

In [3]:
"""
Using a ML model to check the relation between the post and the
news searching
Args:
    txt (str): clean post plain text
    topic: tuple with the key words
Returns:
    a bool if the text is related 
"""
def isRelated(text, topic):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    for i in topic:
        val = i.lower() in [token.text.lower() for token in doc]
        if val: return True
    return False

### Searching, extracting web pages links and summarization objects

In [4]:
'''
The web page object needs some special urls
and we are using the useBase to simplify the 
searching
'''

@dataclass
class webPage:
    baseUrl: str
    date: str
    topicUrl: List[str]
    keyWord: List[str]
    useBase: bool = False

    """
    Search links in a web page
    Args:
        url (string): link of the web page
    Returns:
        List of links
    """
    def searchLinks(self, url):
        allLinks = []
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            links = soup.find_all('a')
            for link in links:
                allLinks.append(link.get("href"))
        else:
            print(Fore.RED + f"Bad status code from: {url}")
        return allLinks

    """
    Args:
        url (string): link of the web page
        currentDate: date to search the news
    Returns:
        List of input links
    """
    def searchUrl(self, url, currentDate):
        self.saveLinks = []
        tmp = []
        links = self.searchLinks(url)
        for href in links:
            if href and self.keyWord in href and href not in tmp:
                fullUrl = self.baseUrl + href if self.useBase else href
                if self.dateChecker(fullUrl) == currentDate:
                    self.saveLinks.append(fullUrl)
                tmp.append(href)

    """
    Check the date of the publication
    Args:
        url: URL of the publicationq
    Returns:
        Date of the publication
    """
    def dateChecker(self, url):
        try:
            response = requests.get(url)
            date = find_date(response.content.decode('utf-8'))
            return date
        except Exception as e:
            pass

    """
    Save all the links in a set
    Args:
        url: URL of the publicationq
    Returns:
        All the links founded
    """
    def obtainAllLinks(self):
        self.allLinks = set()
        for j in self.topicUrl:
            self.searchUrl(j, self.date)
            for i in self.saveLinks:
                self.allLinks.add(i)

In [5]:
"""
Model used to summarize the text with a Bart model
this approach is used when the resume is not enough good
using extracting summarization
"""
class sumObj():
    def __init__(self):
        self.summarizer = pipeline("summarization", model = "facebook/bart-large-cnn")
    
    def summarize(self, input):
        return self.summarizer(input, max_length=700, min_length=200, do_sample=False)[0]['summary_text']
    
#article = sumObj()

### Applying the searching and summarization

In [6]:
"""Translator could be another one, for example deepl needs to make a payment"""
translator = GoogleTranslator(source='en', target='es')

In [7]:
dateIn = datetime.now().strftime("%Y-%m-%d")

mitPage = webPage("https://news.mit.edu",
                  dateIn,
                  ["https://news.mit.edu/topic/artificial-intelligence2",
                   "https://news.mit.edu/topic/computers",
                   "https://news.mit.edu/topic/computer-vision"], 
                   "2023",
                   True)
guardPage = webPage("https://www.theguardian.com/",
                    dateIn,
                    ["https://www.theguardian.com/technology/artificialintelligenceai"],
                    "2023")
bbcPage = webPage("https://www.bbc.com",
                  dateIn,
                  ["https://www.bbc.com/mundo/topics/cwr9j26ddr5t?page=1"],
                  "articles")
wiredPage = webPage("https://www.wired.com",
                    dateIn,
                    ["https://www.wired.com/tag/artificial-intelligence/"],
                    "story",
                    True)
naturePage = webPage("https://www.nature.com/",
                     dateIn,
                    ["https://www.nature.com/"],
                    "articles",
                    True)

In [8]:
mitPage.obtainAllLinks()
guardPage.obtainAllLinks()
bbcPage.obtainAllLinks()
wiredPage.obtainAllLinks()
naturePage.obtainAllLinks()

In [9]:
listPages = (mitPage, guardPage, bbcPage, wiredPage, naturePage)

In [10]:
print(mitPage.saveLinks)
print(guardPage.saveLinks)
print(bbcPage.saveLinks)
print(wiredPage.saveLinks)
print(naturePage.saveLinks)

[]
['https://www.theguardian.com/technology/2023/dec/28/how-one-of-the-worlds-oldest-newspapers-is-using-ai-to-reinvent-journalism', 'https://www.theguardian.com/media/2023/dec/27/new-york-times-openai-microsoft-lawsuit']
[]
['https://www.wired.com/story/generative-ai-web-2-mistakes/']
[]


In [11]:

def extSum(link, length = 22):
    text = ""
    parser = HtmlParser.from_url(link, Tokenizer('english'))
    summarizer = TextRankSummarizer()
    for sentences in summarizer(parser.document, length):
        text = text + str(sentences) + "\n"
    return text

In [17]:
def resumeInfo():
    topicRelated = ("Artificial intelligence", "Deep learning", "Gan", "Gans", "Machine learning",
                    "LLM", "AI", "Generative adversial neuronal network", "Neuronal network",
                    "Convolutional neuronal network", "OpenIA", "Large lengual model", "Computer Vision",
                    "Computer Science")
    resume = []
    for page in listPages:
        try:
            print(Fore.BLUE + f'Revieweing page: {page.baseUrl}')
            for link in page.saveLinks:
                if isRelated(getWebContent(link), topicRelated):
                    print(Fore.BLUE + f"Link: {link}")
                    #outputOrg = article.summarize(getWebContent(link)[:4500])
                    outputOrg = extSum(link)
                    outputEsp = translator.translate(outputOrg)
                    resume.append(outputEsp)
        except Exception as e:
            print(Fore.RED + f"Error at: {e}")
    return resume

In [11]:
'''
Section to translate the information
Args:
    article txt
'''
for i in mitPage.saveLinks:
    if isRelated(getWebContent(mitPage.saveLinks[0]), topicRelated):
        outputOrg = article.summarize(getWebContent(mitPage.saveLinks[0])[:4500])
        outputEsp = translator.translate(outputOrg)
    print(outputEsp)
    print('--------------')

In [18]:
async def ChatQuestion(update: Update, context: ContextTypes.DEFAULT_TYPE):
    resume = resumeInfo()
    for i in resume:
        await context.bot.send_message(chat_id=update.effective_chat.id, text= i)

async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    startString = 'Hola soy tu bot encargado de enviarte \n \
                   las noticias mas importantes sobre IA'
    await context.bot.send_message(chat_id  = update.effective_chat.id, text = startString)

def main():
    application = ApplicationBuilder().token('6890506436:AAFD-tjp93gLwZlIDaEnHqD8qSXVfFRLrvQ').build()
    application.add_handler(CommandHandler('start', start))
    application.add_handler(MessageHandler(filters.TEXT, ChatQuestion))

    application.run_polling()


In [20]:
main()

RuntimeError: Cannot close a running event loop