In [2]:
import os
import csv
import time
import json
from datetime import timedelta, date
from config import newsapi_key
from pathlib import Path
from newsapi import NewsApiClient

In [3]:
# print(newsapi_key)

In [4]:
# Install newsou-python module into environment
# $ pip install newsapi-python

# Usage
# from newsapi import NewsApiClient

In [5]:
# Init newspai client
newsapi = NewsApiClient(api_key=newsapi_key)

In [6]:
print(newsapi)

<newsapi.newsapi_client.NewsApiClient object at 0x00000180F4123710>


In [7]:
sources_to_check = [
                    'bbc-news', 
                    'bloomberg', 
                    'breitbart-news', 
                    'cnn', 
                    'cbs-news',
                    "the-washington-post", 
                    'the-wall-street-journal',
                    'the-huffington-post',
                    'reuters',
                    'politico', 
                    'newsweek',
                    'new-scientist',
                    'national-geographic',
                    'medical-news-today',
                    'business-insider',
                    'al-jazeera-english',
#                     'abc-news',
#                     'nbc-news',
#                     'national-review',
#                     'the-hill',
#                     "the-globe-and-mail",
#                     'fox-news',
#                     'msnbc',
#                     'Fortune',
#                     'cnbc',
#                     'google-news' 
                    ]

In [12]:
# create a range of dates to query for news.
def daterange(date1, date2):
    for n in range(int ((date2 - date1).days)+1):
        yield date1 + timedelta(n)

start_dt = date(2020, 2, 16)
end_dt = date(2020, 2, 24)
dateList = []
for dt in daterange(start_dt, end_dt):
    dateList.append(dt.strftime("%Y-%m-%d"))
    
print(dateList)

['2020-02-16', '2020-02-17', '2020-02-18', '2020-02-19', '2020-02-20', '2020-02-21', '2020-02-22', '2020-02-23', '2020-02-24']


In [13]:
# mySources = sources_to_check[2:]
# print(mySources)
mySources = sources_to_check
print(mySources)

['bbc-news', 'bloomberg', 'breitbart-news', 'cnn', 'cbs-news', 'the-washington-post', 'the-wall-street-journal', 'the-huffington-post', 'reuters', 'politico', 'newsweek', 'new-scientist', 'national-geographic', 'medical-news-today', 'business-insider', 'al-jazeera-english']


In [14]:
# Iterate News API by Date and then by News Source.  
# Create News files for newsSource + date
basePath = "./static/news"
Path(basePath).mkdir(parents=True, exist_ok=True)

for myDate in dateList:
    dirPath = basePath + "/" + myDate
    Path(dirPath).mkdir(parents=True, exist_ok=True)
    
    for source in mySources:
        filename = f"{source}_{myDate}.json"
        filepath = dirPath + "/" + filename
        print(f"Scraping news from {filename}")
        
        scraped_news = newsapi.get_everything(q='coronavirus',
                                          sources=source,
                                          from_param=myDate,
                                          to=myDate,
                                          language='en',
                                          sort_by='popularity',
                                          page_size=100)


        with open(filepath, "w") as outfile:
            json.dump(scraped_news, outfile, indent=4)


Scraping news from bbc-news_2020-02-16.json
Scraping news from bloomberg_2020-02-16.json
Scraping news from breitbart-news_2020-02-16.json
Scraping news from cnn_2020-02-16.json
Scraping news from cbs-news_2020-02-16.json
Scraping news from the-washington-post_2020-02-16.json
Scraping news from the-wall-street-journal_2020-02-16.json
Scraping news from the-huffington-post_2020-02-16.json
Scraping news from reuters_2020-02-16.json
Scraping news from politico_2020-02-16.json
Scraping news from newsweek_2020-02-16.json
Scraping news from new-scientist_2020-02-16.json
Scraping news from national-geographic_2020-02-16.json
Scraping news from medical-news-today_2020-02-16.json
Scraping news from business-insider_2020-02-16.json
Scraping news from al-jazeera-english_2020-02-16.json
Scraping news from bbc-news_2020-02-17.json
Scraping news from bloomberg_2020-02-17.json
Scraping news from breitbart-news_2020-02-17.json
Scraping news from cnn_2020-02-17.json
Scraping news from cbs-news_2020-02-

In [25]:
# First Attempt
# Iterate newsapi by news Source and then by date
# grab news files for news source _ date

# basePath = "./static/news"
# Path(basePath).mkdir(parents=True, exist_ok=True)

# for source in mySources:
#     dirPath = basePath + "/" + source
#     Path(dirPath).mkdir(parents=True, exist_ok=True)
    
#     for myDate in dateList:
#         filename = f"{source}_{myDate}.json"
#         filepath = dirPath + "/" + filename
#         print(f"Scraping news from {filename}")
        
#         scraped_news = newsapi.get_everything(q='coronavirus',
#                                           sources=source,
#                                           from_param=myDate,
#                                           to=myDate,
#                                           language='en',
#                                           sort_by='publishedAt',
#                                           page_size=100)


#         with open(filepath, "w") as outfile:
#             json.dump(filepath, outfile, indent=4)


In [47]:
# Test the API before running
myDate = '2020-01-24'
mySource = 'bbc-news'
scraped_news = newsapi.get_everything(q='coronavirus',
                                      sources=mySource,
                                      from_param=myDate,
                                      to=myDate,
                                      language='en',
                                      sort_by='publishedAt',
                                      page_size=100)

print(scraped_news)

with open("test.json", "w") as outfile:
    json.dump(scraped_news, outfile, indent=4)

{'status': 'ok', 'totalResults': 20, 'articles': [{'source': {'id': 'bbc-news', 'name': 'BBC News'}, 'author': 'https://www.facebook.com/bbcnews', 'title': 'Chinese diasporas on edge over coronavirus', 'description': 'Chinese overseas are concerned for their own health and that of their families back home.', 'url': 'https://www.bbc.co.uk/news/51245373', 'urlToImage': 'https://ichef.bbci.co.uk/news/1024/branded_news/17B00/production/_110642079_gettyimages-1195417045.jpg', 'publishedAt': '2020-01-24T23:33:38Z', 'content': 'Image copyrightGetty ImagesImage caption\r\n China marks lunar New Year in the shadow of virus outbreak\r\nHours after the first case of coronavirus was confirmed in the US, surgical masks began selling out at the pharmacies in Seattle, where a resident recently … [+4129 chars]'}, {'source': {'id': 'bbc-news', 'name': 'BBC News'}, 'author': 'https://www.facebook.com/bbcnews', 'title': 'Six maps and graphics to explain the outbreak', 'description': 'Six maps and graphic

In [14]:
myDate = '2020-02-24'
mySource = 'bbc-news'
bbc_news = newsapi.get_everything(q='coronavirus',
                                      sources=mySource,
                                      from_param=myDate,
                                      to=myDate,
                                      language='en',
                                      sort_by='popularity',
                                      page_size=100)

print(scraped_news)

with open("test.json", "w") as outfile:
    json.dump(scraped_news, outfile, indent=4)

{'status': 'ok', 'totalResults': 8, 'articles': [{'source': {'id': 'al-jazeera-english', 'name': 'Al Jazeera English'}, 'author': 'Al Jazeera', 'title': 'China coronavirus outbreak: All the latest updates', 'description': 'WHO urges governments to step up efforts to prepare for coronavirus as toll from epidemic jumps past 1,600 in China.', 'url': 'https://www.aljazeera.com/news/2020/02/200215224437270.html', 'urlToImage': 'https://www.aljazeera.com/mritems/Images/2020/2/15/1b2b2d6abecd46ff9d9230e80adcd3a4_18.jpg', 'publishedAt': '2020-02-15T23:40:25Z', 'content': 'The death toll from China\'s coronavirus epidemic jumped past 1,600 on Sunday, as the World Health Organization (WHO) praised the country\'s efforts to contain the new disease, saying they have "bought the world time" and that other nations must make the most o… [+2235 chars]'}, {'source': {'id': 'al-jazeera-english', 'name': 'Al Jazeera English'}, 'author': 'Al Jazeera', 'title': "Chinese tourist in France is Europe's first 

In [15]:
mySource


'bbc-news'