In [None]:
# In case you haven't installed the API
! pip install nytimesarticle

In [3]:
from nytimesarticle import articleAPI

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import datetime
import csv
import math
import time
from ProgressBar import ProgressBar



In [9]:
api = articleAPI('ef0f07b0a98f450c9a11d3c2f25f4b67') #Kevin
#api = articleAPI('9f6355bf925a4af9b5d296791a35863e') #Kevin
#api = articleAPI('7b0535e75077457b97eabb75f52e2a5b') #Kevin
#api = articleAPI('4303822ccc8249a38913e858ec549574') #Kevin
#api = articleAPI('440fbcc705364b82b35b52cd5f4b4979') #Kevin




# Saving NY Times data to file
It is essential that we save the news data to file before experimenting with feature extraction and modeling so we don't flood the NY Times servers with requests. This way, we only need to make calls to get the data once. Furthermore, we make the function sleep for 1 second between calls so as to not stress the server.

There are some peculiarities about using the NY Times Article Search API that were found in experimentation. The first is that only the first 100 pages from a given search are callable. This means that a particularly general search or a long search window will lead to inaccessible results. Therefore, the function breaks the search window up into single weeks so that there are never over 100 pages of results.

After each week is extracted, the data is zipped together and appended to the output csv file.

In [7]:
def downloadToFile(startdate, enddate, filename):
    """
    Makes API calls to extract id, publication date, headline, and lead paragraph from NY Times articles in the date range.
    Then, saves the data to a local file in csv format.
    startdate: start of date range to extract (yyyymmdd)
    enddate: end of date range to extract (yyyymmdd)
    filename: csv file to append to
    """
    
    startdate = datetime.datetime.strptime(str(startdate), '%Y%m%d')
    enddate = datetime.datetime.strptime(str(enddate), '%Y%m%d')

    sliceStart = startdate

    while (sliceStart<enddate):
        leads = []
        ids = []
        dates = []
        headlines = []
        
        sliceEnd = min(sliceStart + datetime.timedelta(weeks=1), enddate)

        sliceStartInt = int(sliceStart.strftime('%Y%m%d'))
        sliceEndInt = int(sliceEnd.strftime('%Y%m%d'))
        print 'Downloading from {} to {}'.format(sliceStartInt, sliceEndInt)
        while True:
            try:
                numhits = api.search(fl = ['_id'],begin_date = sliceStartInt, end_date=sliceEndInt,fq = {'section_name':'Business'}, page=1)['response']['meta']['hits']
                break
            except:
                print 'JSON error avoided'
        pages = int(math.ceil(float(numhits)/10))
        time.sleep(1)
        pbar2 = ProgressBar(pages)
        print '{} pages to download'.format(pages) # Note that you can't download past page number 100
        for page in range(1,pages+1):
            while True:
                try:
                    articles = api.search(fl= ['_id','headline','lead_paragraph','pub_date'], begin_date = sliceStartInt, end_date=sliceEndInt,fq = {'section_name':'Business'}, page=page)
                    break
                except:
                    print 'JSON error avoided'
            time.sleep(1)
            pbar2.increment()
            for i in articles['response']['docs']:
                if (i['lead_paragraph'] is not None) and (i['headline'] != []):
                    headlines.append(i['headline']['main'])
                    leads.append(i['lead_paragraph'])
                    ids.append(i['_id'])
                    dates.append(i['pub_date'])

        pbar2.finish()
        sliceStart = sliceEnd

        zipped = zip(ids, dates, headlines, leads)
        if zipped:
            with open(filename, "a") as f:
                writer = csv.writer(f)
                for line in zipped: 
                    writer.writerow([unicode(s).encode("utf-8") for s in line])




In [None]:
downloadToFile(19980212, 19981231, '1998_Output.csv')

Downloading from 19980212 to 19980219
28 pages to download
Complete! Total Elapsed time: 39.0 seconds                        
Downloading from 19980219 to 19980226
31 pages to download
Loading: |>-------------------|  9%  Elapsed time: 4.1 seconds