In [1]:
import os
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
from dateutil.relativedelta import relativedelta

In [13]:
end = datetime.date(2020,12,31)
start = end - relativedelta(years=1)

In [14]:
pd.date_range(start, end, freq='MS').strftime("%Y %m").tolist()

['2020 01',
 '2020 02',
 '2020 03',
 '2020 04',
 '2020 05',
 '2020 06',
 '2020 07',
 '2020 08',
 '2020 09',
 '2020 10',
 '2020 11',
 '2020 12']

In [15]:
months_in_range = [x.split(' ') for x in pd.date_range(start, end, freq='MS').strftime("%Y %#m").tolist()]

In [16]:
months_in_range

[['2020', '1'],
 ['2020', '2'],
 ['2020', '3'],
 ['2020', '4'],
 ['2020', '5'],
 ['2020', '6'],
 ['2020', '7'],
 ['2020', '8'],
 ['2020', '9'],
 ['2020', '10'],
 ['2020', '11'],
 ['2020', '12']]

In [17]:
def send_request(date):
    '''Sends a request to the NYT Archive API for given date.'''
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    url = base_url + '/' + date[0] + '/' + date[1] + '.json?api-key=' + 'IXgpV4G3MtKtTgzm7L54bXITJ2egifLm'
    response = requests.get(url).json()
    time.sleep(6)
    return response


def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.'''
    is_in_range = date > start and date < end
    has_headline = type(article['headline']) == dict and 'main' in article['headline'].keys()
    return is_in_range and has_headline


def parse_response(response):
    '''Parses and returns response as pandas data frame.'''
    data = {'headline': [],  
        'date': [], 
        'doc_type': [],
        'type_of_material': [],
        'snippet': [],
        'source': [],
        'news_desk': [],
        'section_name': [],
        'keywords': []}
    
    articles = response['response']['docs']
    optional_features = ['section_name', 'type_of_material', 'snippet', 'source', 'news_desk']

    for article in articles: # For each article, make sure it falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
        if is_valid(article, date):
            data['date'].append(date)
            data['headline'].append(article['headline']['main'])
            data['doc_type'].append(article['document_type'])
            data['keywords'].append(keywords)
            for feature in optional_features:
                if feature in article:
                    data[feature].append(article[feature])
                else:
                    data[feature].append(None)
            
    return pd.DataFrame(data) 

excluded_sections = ['Style','The Learning Network', 'Arts', 
                     'Opinion', 'Books', 'Corrections','Food',
                     'T Magazine', 'Times Insider', 'Magazine',
                     'The Upshot', 'Crosswords & Games', 'Reader Center',
                     'Fashion & Style', 'Podcasts', 'Sports', 'Theater', 
                     'Parenting','Movies']

excluded_news = ['Podcasts', 'Summary']

def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.'''
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    if not os.path.exists('headlines'):
        os.mkdir('headlines')
    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        df_sorted = df[(df['doc_type'] == 'article') & (df['type_of_material'] == 'News')]
        df_sorted = df_sorted[~df_sorted['section_name'].isin(excluded_sections)]
        df_sorted = df_sorted[~df_sorted['news_desk'].isin(excluded_news)]
        total += len(df_sorted)
        df_sorted.to_csv('headlines/' + date[0] + '-' + date[1] + '.csv', index=False)
        print('Saving headlines/' + date[0] + '-' + date[1] + '.csv...')
    print('Number of articles collected: ' + str(total))

In [18]:
get_data(months_in_range)

Date range: ['2020', '1'] to ['2020', '12']
Saving headlines/2020-1.csv...
Saving headlines/2020-2.csv...
Saving headlines/2020-3.csv...
Saving headlines/2020-4.csv...
Saving headlines/2020-5.csv...
Saving headlines/2020-6.csv...
Saving headlines/2020-7.csv...
Saving headlines/2020-8.csv...
Saving headlines/2020-9.csv...
Saving headlines/2020-10.csv...
Saving headlines/2020-11.csv...
Saving headlines/2020-12.csv...
Number of articles collected: 24864
