In [1]:
"""Grabs dangerous information on road closures through NewsAPI."""

'Grabs dangerous information on road closures through NewsAPI.'

In [2]:
import requests
import re
import InfoAPI
from datetime import datetime
from bs4 import BeautifulSoup

# NewsAPI credential object; News.main reads its API_KEY attribute when building the request.
key = InfoAPI.NewsAPI()

In [31]:
class News:
    """Collects road-related text snippets for a city from articles listed by NewsAPI.

    Parameters
    ----------
    key : object
        Any object exposing an ``API_KEY`` attribute that holds the NewsAPI key.

    """

    # NewsAPI endpoint queried by ``main``.
    BASE_URL = "https://newsapi.org/v2/everything/"

    # Seconds to wait on any single HTTP request before giving up, so one
    # unresponsive site cannot hang the whole run.
    REQUEST_TIMEOUT = 10

    # Matches "road " (any letter case) plus the remainder of that line.
    # NOTE(review): there is no word boundary, so "broad " or "railroad " also
    # match from their "road " part onwards -- confirm whether that is wanted.
    _ROAD_PATTERN = re.compile(r"road .*", re.IGNORECASE)

    def __init__(self, key):
        self.key = key

    def main(self, city_name='Austin', show_errors=False):
        """Engine to gather road-related texts from today's news about a city.

        Parameters
        ----------
        city_name : string
            The city used as the NewsAPI search query; defaults to Austin.
        show_errors : bool
            Forwarded to ``use_urls``; when True, failing urls and an error
            count are printed.

        Returns
        -------
        texts : list
            One list of matching snippets per article that had at least one
            match.

        """
        # NewsAPI expects ISO 8601 dates, so month and day must be zero-padded.
        today = datetime.now().strftime("%Y-%m-%d")

        params = {
            'q': city_name,
            'apiKey': self.key.API_KEY,
            'from': today,
            'to': today,
        }

        response = requests.get(
            self.BASE_URL, params=params, timeout=self.REQUEST_TIMEOUT
        )

        articles = response.json()['articles']
        urls = self.get_urls(articles)
        return self.use_urls(urls, show_errors=show_errors)

    def get_urls(self, articles):
        """Simply extracts the urls.

        Parameters
        ----------
        articles : list
            List of mappings that each contain a ``'url'`` key.

        Returns
        -------
        urls : list
            List of extracted urls, in the same order as ``articles``.

        """
        return [article['url'] for article in articles]

    def use_urls(self, urls, show_errors=False):
        """Downloads each url and extracts its road-related text.

        Parameters
        ----------
        urls : list
            A list of urls.
        show_errors : bool
            When True, print every url that could not be fetched (followed by
            the response or the raised exception) and a final error count.

        Returns
        -------
        texts : list
            One non-empty list of snippets per url that returned HTTP 200 and
            contained at least one match.

        """
        error_count = 0
        texts = []
        for url in urls:
            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # A single unreachable site is counted as an error instead of
                # aborting the crawl of the remaining urls.
                error_count += 1
                if show_errors:
                    print(url)
                    print(exc)
                continue

            if response.status_code == 200:
                text = self.get_texts(response)
                if text:
                    texts.append(text)
            else:
                error_count += 1
                if show_errors:
                    print(url)
                    print(response)

        if show_errors:
            print(f"There was {error_count} error(s).")
        return texts

    def get_texts(self, response):
        """Extracts the matching text from every p element of a page.

        Parameters
        ----------
        response : Response object
            e.g. response = requests.get(url)

        Returns
        -------
        texts : list
            The non-empty results of ``process`` for each paragraph, in
            document order.

        """
        soup = BeautifulSoup(response.content, 'lxml')
        snippets = (self.process(p.text) for p in soup.find_all('p'))
        return [snippet for snippet in snippets if snippet != '']

    def process(self, text):
        """Grabs everything from the first "road " to the end of that line.

        Parameters
        ----------
        text : string
            A single string of text.

        Returns
        -------
        match : string
            The matched text beginning with "road " (case-insensitive), or an
            empty string if there is no match.

        """
        match = self._ROAD_PATTERN.search(text)
        return match[0] if match else ''

In [28]:
# Build the scraper around the credential object created alongside the imports.
test = News(key)

In [29]:
# Run the scraper with the default city ('Austin'); the result holds one list of snippets per article.
# NOTE(review): the News cell shows execution count 31, later than this cell's 29, so the
# recorded output below may come from an earlier definition -- confirm with Restart & Run All.
text = test.main()

http://www.livescience.com/65220-fossils-show-texas-serengeti.html
There was 1 error(s).
