In [2]:
import requests
from pymongo import MongoClient
import pprint

In [8]:
def download_data_from_one_month(year, month, api_key=None):
    """
    Download one month of articles from the NYT Archive API.

    Every returned document is the JSON object sent by the API with two extra
    string fields added from its 'pub_date':
        - 'year':  the first four characters (e.g. '2016')
        - 'month': characters 5-6 with leading zeros removed (e.g. '1', '11')

    Params:
        - year: an int in {1851, 1852, ... , 2018}
        - month: an int in {1, 2, 3, ... , 12}
        - api_key: optional NYT API key. When omitted, the NYT_API_KEY
          environment variable is used; only if that is unset does the
          function fall back to the key that was previously embedded here.

    Returns:
        A list of dicts, one per article.

    Raises:
        requests.HTTPError: if the API answers with an error status
        (bad key, rate limit, unknown year/month, ...).
    """
    import os  # local import so this cell does not depend on another cell's imports

    if api_key is None:
        # FIXME: this fallback key is committed with the notebook; rotate it and
        # drop the default once NYT_API_KEY is set wherever the notebook runs.
        api_key = os.environ.get("NYT_API_KEY", "EarvxAz31SAtamu5RGpqSfKOaT0Dhr59")

    response = requests.get(
        f"https://api.nytimes.com/svc/archive/v1/{year}/{month}.json",
        params={"api-key": api_key},
        timeout=60,
    )
    # Fail loudly on HTTP errors instead of hitting a KeyError on the error payload.
    response.raise_for_status()
    docs = response.json()['response']['docs']

    data = []
    for doc in docs:
        raw_date = doc['pub_date']
        doc['year'] = raw_date[0:4]
        # Store '1'..'12' rather than '01'..'12'; a separate expression keeps the
        # `month` argument from being overwritten inside the loop.
        doc['month'] = raw_date[5:7].lstrip('0')
        data.append(doc)

    return data

In [4]:
def download_and_insert_articles(db, year, month):
    """
    Download one month of articles and cache them in the database.

    Params:
        - db: a database handle exposing an `articles` collection with `insert_many`
        - year, month: forwarded unchanged to download_data_from_one_month

    Nothing is written when the download yields no documents.
    """
    data = download_data_from_one_month(year, month)
    # insert_many rejects an empty document list, so skip the write for an empty month.
    if data:
        db.articles.insert_many(data)

In [5]:
def get_articles_from_one_month(db, year, month):
    """
    Return all articles stored for the given year and month, downloading them first if needed.

    If no document for that exact year/month pair is in `db.articles`, the month
    is downloaded and cached via download_and_insert_articles before reading.

    Params:
        - db: a database handle exposing an `articles` collection
        - year, month: ints; they are compared against the stored string fields
          'year' and 'month' after conversion with str()

    Returns:
        A list with every matching document from `db.articles`.
    """
    query = {'year': str(year), 'month': str(month)}

    # Both conditions belong in the filter document: a second positional argument
    # to find_one is a projection, so passing the month there only checked the year.
    if db.articles.find_one(query) is None:
        print("the data is not in the base")
        download_and_insert_articles(db, year, month)
        print("the data has been downloaded")

    return list(db.articles.find(query))

In [6]:
def get_document_keywords_list(data):
    """
    Pair every document id with the values of its keywords.

    Returns a list shaped like
        [[document_id_1, [keyword, ...]], ..., [document_id_n, [keyword, ...]]]
    where each id is the document's '_id' and each keyword is the 'value'
    field of an entry in the document's 'keywords' list.
    """
    return [
        [article['_id'], [entry['value'] for entry in article['keywords']]]
        for article in data
    ]

In [64]:
# Fetch November 2016 directly from the API; this call does not go through the
# database-backed get_articles_from_one_month helper.
sample_articles = download_data_from_one_month(2016,11)
# Reduce each article to a [document_id, [keyword values]] pair.
sample_kws = get_document_keywords_list(sample_articles)

In [65]:
import pandas as pd
import numpy  as np
import re
import nltk
import sklearn
import sklearn.feature_extraction.text as sk

In [66]:
def wm2df(wm, feat_names):
    """
    Convert a word matrix into a labelled DataFrame.

    Rows are named 'Doc0', 'Doc1', ... in iteration order of `wm`; columns are
    named after `feat_names`; the cell values come from `wm.toarray()`.
    """
    row_labels = [f'Doc{position:d}' for position, _row in enumerate(wm)]
    frame = pd.DataFrame(data=wm.toarray(), index=row_labels, columns=feat_names)
    return frame

In [67]:
import collections

# Corpus fed to the keyword counter below: the full list of
# [document_id, keywords] pairs.
output = sample_kws
# For a quicker run, switch to a small subset instead:
# output = sample_kws[:5]

def get_word_to_count_dict(corpus):
    """
    Count how often each keyword occurs across the corpus.

    `corpus` is an iterable of entries whose element at index 1 is a list of
    keywords. Despite the function's name, the result is a list of
    [keyword, count] pairs (not a dict), in order of first appearance.
    """
    tally = collections.Counter(
        keyword
        for entry in corpus
        for keyword in entry[1]
    )
    return [[keyword, occurrences] for keyword, occurrences in tally.items()]

# List of [keyword, count] pairs for the selected corpus.
count_list=get_word_to_count_dict(output)
# Uncomment to inspect the raw counts:
# count_list

In [68]:
# Rank the [keyword, count] pairs from most to least frequent; pairs with equal
# counts keep their original relative order.
sorted_list = sorted(count_list, key=lambda pair: pair[1], reverse=True)
# sorted_list

In [69]:
# Keywords only, in the same most-frequent-first order as sorted_list.
col_names = [pair[0] for pair in sorted_list]

In [70]:
# Tabulate the ranked [keyword, count] pairs for display.
s1=pd.DataFrame(sorted_list, columns=['Keyword', 'Count'])
s1

Unnamed: 0,Keyword,Count
0,"Trump, Donald J",951
1,Presidential Election of 2016,852
2,United States Politics and Government,348
3,"Clinton, Hillary Rodham",323
4,Books and Literature,226
5,Movies,198
6,New York City,198
7,Television,153
8,Republican Party,140
9,Art,136
