In [3]:
import os
import csv
import time
import json
from datetime import timedelta, date
from config import newsapi_key
from pathlib import Path
from newsapi import NewsApiClient

In [4]:
# print(newsapi_key)

In [5]:
# Install newsou-python module into environment
# $ pip install newsapi-python

# Usage
# from newsapi import NewsApiClient

In [6]:
# Init newspai client
newsapi = NewsApiClient(api_key=newsapi_key)

In [7]:
print(newsapi)

<newsapi.newsapi_client.NewsApiClient object at 0x00000229C94F8320>


In [8]:
sources_to_check = [
                    'bbc-news', 
                    'bloomberg', 
                    'breitbart-news', 
                    'cnn', 
                    'cbs-news',
                    "the-washington-post", 
                    'the-wall-street-journal',
                    'the-huffington-post',
                    'reuters',
                    'politico', 
                    'newsweek',
                    'new-scientist',
                    'national-geographic',
                    'medical-news-today',
                    'business-insider',
                    'al-jazeera-english',
#                     'abc-news',
#                     'nbc-news',
#                     'national-review',
#                     'the-hill',
#                     "the-globe-and-mail",
#                     'fox-news',
#                     'msnbc',
#                     'Fortune',
#                     'cnbc',
#                     'google-news' 
                    ]

In [13]:
# create a range of dates to query for news.
def daterange(date1, date2):
    for n in range(int ((date2 - date1).days)+1):
        yield date1 + timedelta(n)

start_dt = date(2020, 2, 24)
end_dt = date(2020, 2, 27)
dateList = []
for dt in daterange(start_dt, end_dt):
    dateList.append(dt.strftime("%Y-%m-%d"))
    
print(dateList)

['2020-02-24', '2020-02-25', '2020-02-26', '2020-02-27']


In [10]:
# mySources = sources_to_check[2:]
# print(mySources)
mySources = sources_to_check
print(mySources)

['bbc-news', 'bloomberg', 'breitbart-news', 'cnn', 'cbs-news', 'the-washington-post', 'the-wall-street-journal', 'the-huffington-post', 'reuters', 'politico', 'newsweek', 'new-scientist', 'national-geographic', 'medical-news-today', 'business-insider', 'al-jazeera-english']


In [12]:
# Iterate News API by Date and then by News Source.  
# Create News files for newsSource + date
basePath = "../static/newsdata"
Path(basePath).mkdir(parents=True, exist_ok=True)

for myDate in dateList:
    dirPath = basePath + "/" + myDate
    Path(dirPath).mkdir(parents=True, exist_ok=True)
    
    for source in mySources:
        filename = f"{source}_{myDate}.json"
        filepath = dirPath + "/" + filename
        print(f"Scraping news from {filename}")

#        scraped_news = filepath
        
        scraped_news = newsapi.get_everything(q='coronavirus',
                                          sources=source,
                                          from_param=myDate,
                                          to=myDate,
                                          language='en',
                                          sort_by='relevancy',
                                          page_size=100)


        with open(filepath, "w") as outfile:
            json.dump(scraped_news, outfile, indent=4)


Scraping news from bbc-news_9999-02-24.json
Scraping news from bloomberg_9999-02-24.json
Scraping news from breitbart-news_9999-02-24.json
Scraping news from cnn_9999-02-24.json
Scraping news from cbs-news_9999-02-24.json
Scraping news from the-washington-post_9999-02-24.json
Scraping news from the-wall-street-journal_9999-02-24.json
Scraping news from the-huffington-post_9999-02-24.json
Scraping news from reuters_9999-02-24.json
Scraping news from politico_9999-02-24.json
Scraping news from newsweek_9999-02-24.json
Scraping news from new-scientist_9999-02-24.json
Scraping news from national-geographic_9999-02-24.json
Scraping news from medical-news-today_9999-02-24.json
Scraping news from business-insider_9999-02-24.json
Scraping news from al-jazeera-english_9999-02-24.json
Scraping news from bbc-news_9999-02-25.json
Scraping news from bloomberg_9999-02-25.json
Scraping news from breitbart-news_9999-02-25.json
Scraping news from cnn_9999-02-25.json
Scraping news from cbs-news_9999-02-

In [25]:
# First Attempt
# Iterate newsapi by news Source and then by date
# grab news files for news source _ date

# basePath = "./static/news"
# Path(basePath).mkdir(parents=True, exist_ok=True)

# for source in mySources:
#     dirPath = basePath + "/" + source
#     Path(dirPath).mkdir(parents=True, exist_ok=True)
    
#     for myDate in dateList:
#         filename = f"{source}_{myDate}.json"
#         filepath = dirPath + "/" + filename
#         print(f"Scraping news from {filename}")
        
#         scraped_news = newsapi.get_everything(q='coronavirus',
#                                           sources=source,
#                                           from_param=myDate,
#                                           to=myDate,
#                                           language='en',
#                                           sort_by='publishedAt',
#                                           page_size=100)


#         with open(filepath, "w") as outfile:
#             json.dump(filepath, outfile, indent=4)


In [12]:
# Test the API before running
myDate = '2020-02-24'
mySource = 'bbc-news'
scraped_news = newsapi.get_everything(q='coronavirus',
                                      sources=mySource,
                                      from_param=myDate,
                                      to=myDate,
                                      language='en',
                                      sort_by='publishedAt',
                                      page_size=100)

print(scraped_news)

with open("test.json", "w") as outfile:
    json.dump(scraped_news, outfile, indent=4)

{'status': 'ok', 'totalResults': 20, 'articles': [{'source': {'id': 'bbc-news', 'name': 'BBC News'}, 'author': None, 'title': 'Harvey Weinstein convicted of rape and sexual assault', 'description': 'The film producer was found guilty of two of the five charges he faced. Also, the WHO urges countries to prepare for a possible coronavirus pandemic, and a man is arrested in Germany after a car drove into people at a carnival.', 'url': 'https://www.bbc.co.uk/programmes/p084nd3v', 'urlToImage': 'https://ichef.bbci.co.uk/images/ic/1200x675/p07zqpb8.jpg', 'publishedAt': '2020-02-24T23:38:00Z', 'content': 'The film producer was found guilty of two of the five charges he faced. Also, the WHO urges countries to prepare for a possible coronavirus pandemic, and a man is arrested in Germany after a car drove into people at a carnival.'}, {'source': {'id': 'bbc-news', 'name': 'BBC News'}, 'author': 'https://www.facebook.com/bbcnews', 'title': "The Papers: Harvey Weinstein 'locked up at last'", 'desc

In [13]:
myDate = '2020-02-24'
mySource = 'bbc-news'
bbc_news = newsapi.get_everything(q='coronavirus',
                                      sources=mySource,
                                      from_param=myDate,
                                      to=myDate,
                                      language='en',
                                      sort_by='relevancy',
                                      page_size=100)

print(scraped_news)

with open("test.json", "w") as outfile:
    json.dump(scraped_news, outfile, indent=4)

{'status': 'ok', 'totalResults': 20, 'articles': [{'source': {'id': 'bbc-news', 'name': 'BBC News'}, 'author': None, 'title': 'Harvey Weinstein convicted of rape and sexual assault', 'description': 'The film producer was found guilty of two of the five charges he faced. Also, the WHO urges countries to prepare for a possible coronavirus pandemic, and a man is arrested in Germany after a car drove into people at a carnival.', 'url': 'https://www.bbc.co.uk/programmes/p084nd3v', 'urlToImage': 'https://ichef.bbci.co.uk/images/ic/1200x675/p07zqpb8.jpg', 'publishedAt': '2020-02-24T23:38:00Z', 'content': 'The film producer was found guilty of two of the five charges he faced. Also, the WHO urges countries to prepare for a possible coronavirus pandemic, and a man is arrested in Germany after a car drove into people at a carnival.'}, {'source': {'id': 'bbc-news', 'name': 'BBC News'}, 'author': 'https://www.facebook.com/bbcnews', 'title': "The Papers: Harvey Weinstein 'locked up at last'", 'desc

In [15]:
myDate = '2020-02-24'
mySource = 'bbc-news'
bbc_news = newsapi.get_everything(q='coronavirus',
                                      sources=mySource,
                                      from_param=myDate,
                                      to=myDate,
                                      language='en',
                                      sort_by='popularity',
                                      page_size=100)

print(scraped_news)

with open("test.json", "w") as outfile:
    json.dump(scraped_news, outfile, indent=4)

{'status': 'ok', 'totalResults': 20, 'articles': [{'source': {'id': 'bbc-news', 'name': 'BBC News'}, 'author': None, 'title': 'Harvey Weinstein convicted of rape and sexual assault', 'description': 'The film producer was found guilty of two of the five charges he faced. Also, the WHO urges countries to prepare for a possible coronavirus pandemic, and a man is arrested in Germany after a car drove into people at a carnival.', 'url': 'https://www.bbc.co.uk/programmes/p084nd3v', 'urlToImage': 'https://ichef.bbci.co.uk/images/ic/1200x675/p07zqpb8.jpg', 'publishedAt': '2020-02-24T23:38:00Z', 'content': 'The film producer was found guilty of two of the five charges he faced. Also, the WHO urges countries to prepare for a possible coronavirus pandemic, and a man is arrested in Germany after a car drove into people at a carnival.'}, {'source': {'id': 'bbc-news', 'name': 'BBC News'}, 'author': 'https://www.facebook.com/bbcnews', 'title': "The Papers: Harvey Weinstein 'locked up at last'", 'desc

In [15]:
mySource


'bbc-news'