# Classifying Web Articles using NYTime API

In this lecture we will retrieve 100 articles from Art & Sports (50 documents in each category) and we will train a classifier to identify specific lexicon in each category so to automatically categorize new documents.

1. Generate Training Set

In [1]:
import time
import urllib.request
import json
from nytimesarticle import articleAPI

In [2]:
api_key='65e6563256a340f5842cf5b6af85c8d5'
api = articleAPI(api_key)
trainingFolder = 'C:\\tmp\\'
sampleSize = 5 #Number of pages included. Normally each page contains 10 articles.

#Functions are a modification of code made available by Rochelle Terman
#http://dlab.berkeley.edu/blog/scraping-new-york-times-articles-python-tutorial

def get_nytarticles(sampleSize,query,category,year):
    all_articles = []

    #Set date range (all year)
    dt_from = year+"0101"
    dt_to = year+"1231"
    
    #Set basic URL
    url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"
    
    #Adding parameters
    url=url+"api-key="+api_key
    url=url+"&fq=news_desk:'"+category+"'"
    url=url+"&begin_date="
    url=url+dt_from
    url=url+"&end_date="+dt_to
    url=url+"&sort:newest"

    #NYT limits pager to first 100 pages. But rarely will you find over 100 pages of results anyway.
    for i in range(1,sampleSize+1):
        print ('generating %s batch of articles [%s] ...' % (category,i))
        
        #Adding paging information
        response = urllib.request.urlopen(url+"&page="+str(i))
        articles = json.loads(response.read())
        articles = parse_articles(articles)
        all_articles = all_articles + articles
        
        # Sleep 1 second to avoid "Exceeded Request Quota" error
        time.sleep(1)
    
    return(all_articles)


def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses
    the articles into a list of dictionaries
    '''
    news = []
    #print(articles)
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if 'abstract' in i and i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode("utf8")
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['desk'] = i['new_desk']
        dic['date'] = i['pub_date'][0:10] # cutting time of day.
        if ('section_name' in i):
            dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode("utf8")
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        if ('word_count' in i):
            dic['word_count'] = i['word_count']
        # locations
        locations = []
        for x in range(0,len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        # subject
        subjects = []
        for x in range(0,len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects   
        news.append(dic)
    return(news) 

def get_articles(sampleSize,query,category,year):
    '''
    This function accepts a year in string format (e.g.'1980')
    and a query (e.g.'Amnesty International') and it will 
    return a list of parsed articles (in dictionaries)
    for that year.
    '''
    all_articles = []
    for i in range(0,sampleSize): #NYT limits pager to first 100 pages. But rarely will you find over 100 pages of results anyway.
        print ('generating %s block ...' % i)
        articles = api.search(q = query,
                              fq = {'news_desk':category},
                              begin_date = year + '0101',
                              end_date = year + '1231',
                              sort ='newest',
                              page = str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
        # Sleep 1 second to avoid "Exceeded Request Quota" error
        time.sleep(1)
    return(all_articles)

def testAPI(query,category):
    content = []
    #arts =  get_articles(1,query,category,'2016')
    arts =  get_nytarticles(1,query,category,'2016')
    print (arts)

Testing the API

In [3]:
testAPI("the","ARTS")

generating ARTS batch of articles [1] ...
[{'id': '586562d695d0e039260783cf', 'headline': b'4 Trailers That Have Us Excited for 2017', 'desk': 'Arts&Leisure', 'date': '2016-12-29', 'snippet': b'The films \xe2\x80\x9cFifty Shades Darker,\xe2\x80\x9d \xe2\x80\x9cLogan,\xe2\x80\x9d \xe2\x80\x9cThe Circle\xe2\x80\x9d and \xe2\x80\x9cSnatched\xe2\x80\x9d have notable previews.', 'source': 'The New York Times', 'type': 'News', 'url': 'https://www.nytimes.com/2016/12/29/movies/trailers-fifty-shades-logan-the-circle-snatched.html', 'word_count': 1039, 'locations': [], 'subjects': ['Movies']}, {'id': '58653c9a95d0e03926078372', 'headline': b'August Wilson\xe2\x80\x99s \xe2\x80\x98Jitney\xe2\x80\x99 at Manhattan Theater Club', 'desk': 'Arts&Leisure', 'date': '2016-12-29', 'snippet': b'The play, directed by Ruben Santiago-Hudson, focuses on a father-son relationship in a story about gypsy-cab drivers.', 'source': 'The New York Times', 'type': 'News', 'url': 'https://www.nytimes.com/2016/12/29/the

Utility Methods to generate training sets

In [4]:
def generateTrainingSet(size,category,year):
    articles = get_nytarticles(size,"the",category,year)
    fileName = "C:\\tmp\\training_"+category.lower()
    f = open(fileName, 'w')
    text = ''
    
    for doc in articles:
        try:
            if 'abstract' in doc:
                # Get the abstract
                text = doc['abstract']
            elif 'snippet' in doc:
                text = doc['snippet']

            #Remove tabs
            text = str(text,'utf-8')
            text=text.replace('\t',' ')
            f.write('%s\n' % text)
        except:
            print("Error Occured: ignoring ...")
    
    f.close()

Generate Training sets for ARTS and SPORTS

In [5]:
generateTrainingSet(20,"Arts","2017")
generateTrainingSet(20,"Sports","2017")

generating Arts batch of articles [1] ...
generating Arts batch of articles [2] ...
generating Arts batch of articles [3] ...
generating Arts batch of articles [4] ...
generating Arts batch of articles [5] ...
generating Arts batch of articles [6] ...
generating Arts batch of articles [7] ...
generating Arts batch of articles [8] ...
generating Arts batch of articles [9] ...
generating Arts batch of articles [10] ...
generating Arts batch of articles [11] ...
generating Arts batch of articles [12] ...
generating Arts batch of articles [13] ...
generating Arts batch of articles [14] ...
generating Arts batch of articles [15] ...
generating Arts batch of articles [16] ...
generating Arts batch of articles [17] ...
generating Arts batch of articles [18] ...
generating Arts batch of articles [19] ...
generating Arts batch of articles [20] ...
Error Occured: ignoring ...
generating Sports batch of articles [1] ...
generating Sports batch of articles [2] ...
generating Sports batch of articl