# NYTimes API

In [1]:
import pandas as pd
import requests, math, calendar, time

## Set up URL

In [2]:
# Article-search endpoint and the query-string fields sent with every request.
# NOTE(review): the API key is hard-coded in the notebook; consider loading it
# from the environment before sharing this file.
base_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?'
params = {
    'fq': 'section_name:("Business Day" "U.S." "World")',
    'fl': 'web_url,snippet,lead_paragraph,abstract,print_page,source,headline,keywords,pub_date,document_type,news_desk,section_name,byline,type_of_material,word_count',
    'api-key': 'TKYuzVXE26OPAYgLQCx7w4nzBWRyHIsV',
}

# Glue the fields together as key=value pairs separated by '&'.
# Values are inserted verbatim (no URL-encoding is applied here).
url = base_url + '&'.join(f'{name}={value}' for name, value in params.items())

## Function to add a page of documents to the DataFrame

In [3]:
# add list of documents
def add_documents(df, documents):
    """Append one row per document to ``df`` and return the resulting frame.

    Each document's nested ``headline`` dict is flattened: its ``main`` entry
    becomes the ``headline`` column and the other entries become
    ``headline_<name>`` columns. ``keywords`` and ``byline`` are stored as
    their ``str()`` representation.

    Args:
        df: DataFrame holding the rows collected so far.
        documents: iterable of dicts, one per article.

    Returns:
        A DataFrame with the rows of ``df`` followed by one row per document.
        When ``documents`` is empty, ``df`` itself is returned.
    """
    headline_parts = ('kicker', 'content_kicker', 'print_headline',
                      'name', 'seo', 'sub')

    rows = []
    for doc in documents:
        # Work on a shallow copy so the caller's dicts are left untouched.
        row = dict(doc)

        # A missing headline (or missing sub-key) yields None instead of KeyError.
        headline = doc.get('headline') or {}
        for part in headline_parts:
            row[f'headline_{part}'] = headline.get(part)
        row['headline'] = headline.get('main')

        for key in ('keywords', 'byline'):
            if key in row:
                row[key] = str(row[key])

        rows.append(row)

    if not rows:
        return df

    # Build all new rows at once: DataFrame.append was removed in pandas 2.0,
    # and appending row by row copies the whole frame on every call.
    new_rows = pd.DataFrame(rows)
    if len(df) == 0:
        # Nothing to concatenate with; keep df's column order, then any new columns.
        extra = [col for col in new_rows.columns if col not in df.columns]
        return new_rows.reindex(columns=[*df.columns, *extra])
    return pd.concat([df, new_rows], ignore_index=True)

In [4]:
# Empty frame that will collect the articles. The nested headline is spread
# over the headline_* columns; keywords and byline are kept as strings.
df = pd.DataFrame(columns=(
    'web_url snippet lead_paragraph abstract print_page source '
    'headline headline_kicker headline_content_kicker headline_print_headline '
    'headline_name headline_seo headline_sub '
    'keywords pub_date document_type news_desk section_name '
    'byline type_of_material word_count'
).split())

## Loop through all dates and pages

In [None]:
# Download every result page for each half-month window of 2015 and 2016.
# Each month is queried as two windows (1st-14th and 15th-end of month);
# a window with more than 200 pages is reported and skipped.
for year in range(2015, 2017):
    for month in range(1, 13):
        # Zero-padded dates, matching the 'YYYY-MM-DD' form used by the
        # full-year query further down in this notebook.
        last_day = calendar.monthrange(year, month)[1]
        date1 = f'&begin_date={year}-{month:02d}-01&end_date={year}-{month:02d}-14'
        date2 = f'&begin_date={year}-{month:02d}-15&end_date={year}-{month:02d}-{last_day}'

        for half, window in (('first', date1), ('second', date2)):
            page = 0        # bound up front so the error report never hits a NameError
            payload = None  # most recently decoded API reply, printed on failure
            try:
                prev_time = time.time()
                payload = requests.get(f'{url}{window}').json()
                response = payload['response']
                pages = math.ceil(response['meta']['hits'] / 10)
                if pages <= 200:
                    df = add_documents(df, response['docs'])
                    # Space requests 6 seconds apart; clamp at zero because
                    # time.sleep() raises ValueError for a negative duration.
                    time.sleep(max(0, 6 - (time.time() - prev_time)))

                    for page in range(1, pages):
                        prev_time = time.time()
                        payload = requests.get(f'{url}{window}&page={page}').json()
                        response = payload['response']
                        df = add_documents(df, response['docs'])
                        time.sleep(max(0, 6 - (time.time() - prev_time)))
                    print(f'Finished {half} half of {month}/{year}')
                else:
                    print(f'----- {half.capitalize()} half of {month}/{year} has {pages} pages -----')
            except Exception:
                # Report where the run stopped, show the last reply received
                # (None if nothing was decoded), then propagate the error.
                print()
                print(f'Date: {half} half of {month}/{year}')
                print(f'Page: {page}')
                print(payload)
                raise


## Other Time Frames

In [None]:
# Resume paging from page 13 for the `date2` window, reusing `date2` and
# `pages` left over from a previously executed cell.
for page in range(13, pages):
    prev_time = time.time()
    response = requests.get(f'{url}{date2}&page={page}').json()['response']
    df = add_documents(df, response['docs'])
    # Space requests 6 seconds apart; clamp at zero because time.sleep()
    # raises ValueError for a negative duration (request slower than 6 s).
    time.sleep(max(0, 6 - (time.time() - prev_time)))

In [None]:
# Download both half-month windows of December for the `year` value left
# over from a previously executed cell. A window with more than 200 pages
# is reported and skipped.
for month in range(12, 13):
    last_day = calendar.monthrange(year, month)[1]
    date1 = f'&begin_date={year}-{month:02d}-01&end_date={year}-{month:02d}-14'
    date2 = f'&begin_date={year}-{month:02d}-15&end_date={year}-{month:02d}-{last_day}'

    for half, window in (('first', date1), ('second', date2)):
        page = 0        # bound up front so the error report never hits a NameError
        payload = None  # most recently decoded API reply, printed on failure
        try:
            prev_time = time.time()
            payload = requests.get(f'{url}{window}').json()
            response = payload['response']
            pages = math.ceil(response['meta']['hits'] / 10)
            if pages <= 200:
                df = add_documents(df, response['docs'])
                # Space requests 6 seconds apart; clamp at zero because
                # time.sleep() raises ValueError for a negative duration.
                time.sleep(max(0, 6 - (time.time() - prev_time)))

                for page in range(1, pages):
                    prev_time = time.time()
                    payload = requests.get(f'{url}{window}&page={page}').json()
                    response = payload['response']
                    df = add_documents(df, response['docs'])
                    time.sleep(max(0, 6 - (time.time() - prev_time)))
                print(f'Finished {half} half of {month}/{year}')
            else:
                print(f'----- {half.capitalize()} half of {month}/{year} has {pages} pages -----')
        except Exception:
            # Report where the run stopped, show the last reply received
            # (None if nothing was decoded), then propagate the error.
            print()
            print(f'Date: {half} half of {month}/{year}')
            print(f'Page: {page}')
            print(payload)
            raise


In [None]:
# Download every result page for each half-month window of 2016.
# Each month is queried as two windows (1st-14th and 15th-end of month);
# a window with more than 200 pages is reported and skipped.
year = 2016
for month in range(1, 13):
    # Zero-padded dates, matching the 'YYYY-MM-DD' form used by the
    # full-year query further down in this notebook.
    last_day = calendar.monthrange(year, month)[1]
    date1 = f'&begin_date={year}-{month:02d}-01&end_date={year}-{month:02d}-14'
    date2 = f'&begin_date={year}-{month:02d}-15&end_date={year}-{month:02d}-{last_day}'

    for half, window in (('first', date1), ('second', date2)):
        page = 0        # bound up front so the error report never hits a NameError
        payload = None  # most recently decoded API reply, printed on failure
        try:
            prev_time = time.time()
            payload = requests.get(f'{url}{window}').json()
            response = payload['response']
            pages = math.ceil(response['meta']['hits'] / 10)
            if pages <= 200:
                df = add_documents(df, response['docs'])
                # Space requests 6 seconds apart; clamp at zero because
                # time.sleep() raises ValueError for a negative duration.
                time.sleep(max(0, 6 - (time.time() - prev_time)))

                for page in range(1, pages):
                    prev_time = time.time()
                    payload = requests.get(f'{url}{window}&page={page}').json()
                    response = payload['response']
                    df = add_documents(df, response['docs'])
                    time.sleep(max(0, 6 - (time.time() - prev_time)))
                print(f'Finished {half} half of {month}/{year}')
            else:
                print(f'----- {half.capitalize()} half of {month}/{year} has {pages} pages -----')
        except Exception:
            # Report where the run stopped, show the last reply received
            # (None if nothing was decoded), then propagate the error.
            print()
            print(f'Date: {half} half of {month}/{year}')
            print(f'Page: {page}')
            print(payload)
            raise


## Check and save results

In [None]:
# Preview the first rows of the collected articles.
df.head()

In [None]:
# Number of rows collected so far.
len(df)

In [None]:
# Write the collected rows to CSV, leaving out the index column.
# NOTE(review): the file name says 2016, while the main download loop covers
# 2015-2016 — confirm which run this file is meant to hold.
df.to_csv('nytimes_2016.csv', index=False)

## Size of the full dataset

In [6]:
# Rebuild the query for one fixed date range (all of 2014) and report how
# many hits, and how many 10-result pages, it returns.
base_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?'
params = {
    'fq': 'section_name:("Business Day" "U.S." "World")',
    'begin_date': '2014-01-01',
    'end_date': '2014-12-31',
    'fl': 'web_url,snippet,lead_paragraph,abstract,print_page,source,headline,keywords,pub_date,document_type,news_desk,section_name,byline,type_of_material,word_count',
    'api-key': 'TKYuzVXE26OPAYgLQCx7w4nzBWRyHIsV',
}

# key=value pairs joined with '&'; values are inserted verbatim.
url = base_url + '&'.join(f'{name}={value}' for name, value in params.items())

response = requests.get(url).json()['response']
hits = response['meta']['hits']
print('Hits: {}'.format(math.ceil(hits)))
print('Pages: {}'.format(math.ceil(hits / 10)))

Hits: 54529
Pages: 5453


In [None]:
# Display the parsed 'response' object from the most recent query.
response

In [None]:
# Issue the request for `url` again and display the full decoded JSON reply.
requests.get(url).json()