In [34]:
import requests
from pprint import pprint
import numpy as np
import pandas as pd
import json
import time
from datetime import datetime
from dateutil.parser import parse
from newspaper import Article

In [35]:
#Set Up PyMongo
# db.times_articles.drop()
from pymongo import MongoClient

client = MongoClient()
db = client.ny_times
col = db.times_articles

In [36]:
def date_clean(row):
    row['clean_pubdate'] = datetime.isoformat(parse(row['pub_date']))
    return row

In [37]:
data = pd.DataFrame(list(col.find()))

In [38]:
data = data.apply(date_clean,axis=1)

In [39]:
data['year'] = pd.DatetimeIndex(data['clean_pubdate']).year
data['month'] = pd.DatetimeIndex(data['clean_pubdate']).month

I needed to use these masks as the NYTimes API search isn't as exclusive as we'd have liked. I figured it was better to grab more articles than was needed and then clean up rather than risk being too narrow on first pass and having to return.

In [40]:
data_1 = data[data.snippet.str.contains('New York State Senate',case=False)]
data_2 = data[(data.snippet.str.contains('Albany'))&(data.new_desk !='Sports')]
data_3 = data[(data.snippet.str.contains("lawmakers",case=False))\
          &~(data.snippet.str.contains("New York Times"))&(data.snippet.str.contains("New York"))]
data_4 = data[(data.snippet.str.contains("senators",case=False))\
          &~(data.snippet.str.contains("New York Times"))&(data.snippet.str.contains("New York"))]

data_5 = data[(data.snippet.str.contains("senate",case=False))\
          &~(data.snippet.str.contains("New York Times"))&(data.snippet.str.contains("New York"))]

data_6 = data[(data.snippet.str.contains("legislature",case=False))\
          &~(data.snippet.str.contains("New York Times"))&(data.snippet.str.contains("New York"))]
data_7 = data[(data.snippet.str.contains('New York'))&(data.snippet.str.contains('Senate',case=False))]

In [41]:
data_full = pd.concat([data_1,data_2,data_3,data_4,data_5,data_6,data_7])

I then used the newspaper library to get summaries and full text via the URL given from NYT (The NYT API doesnt give full article text, just a snippet and the URL)

In [42]:
def get_article_details(row):
    url = row['web_url']
#     print(url)
#     print(f"Type: {type(url)}")
    try:
        article = Article(url)
        article.download()
        article.parse()
    except:
        print(f"Parse Broken")
    
    try:
        article.nlp()
    except:
        print("NLP Broken")
        
    try:
        row['full_text']=article.text
    except:
        row['full_text']="no_text_available"

    try:
        row['summary'] = article.summary
    except:
        row['summary'] = "no_summary_available"

    try:
        row['keywords'] = article.keywords
    except:
        row['keywords'] = "no_keywords_available"

    return row

In [43]:
df_full = data_full.apply(get_article_details,axis=1)

Article `download()` failed with 410 Client Error: Gone for url: https://www.nytimes.com/aponline/2018/06/06/us/ap-us-verrazano-bridge-spelling-.html on URL https://www.nytimes.com/aponline/2018/06/06/us/ap-us-verrazano-bridge-spelling-.html
Parse Broken
Article `download()` failed with 410 Client Error: Gone for url: https://www.nytimes.com/aponline/2018/06/06/us/ap-us-verrazano-bridge-spelling-.html on URL https://www.nytimes.com/aponline/2018/06/06/us/ap-us-verrazano-bridge-spelling-.html
NLP Broken
Article `download()` failed with 410 Client Error: Gone for url: https://www.nytimes.com/aponline/2018/06/04/us/ap-us-sports-betting-ny.html on URL https://www.nytimes.com/aponline/2018/06/04/us/ap-us-sports-betting-ny.html
Parse Broken
Article `download()` failed with 410 Client Error: Gone for url: https://www.nytimes.com/aponline/2018/06/04/us/ap-us-sports-betting-ny.html on URL https://www.nytimes.com/aponline/2018/06/04/us/ap-us-sports-betting-ny.html
NLP Broken
Article `download()`

Article `download()` failed with 410 Client Error: Gone for url: https://www.nytimes.com/reuters/2018/06/04/technology/04reuters-facebook-privacy.html on URL https://www.nytimes.com/reuters/2018/06/04/technology/04reuters-facebook-privacy.html
Parse Broken
Article `download()` failed with 410 Client Error: Gone for url: https://www.nytimes.com/reuters/2018/06/04/technology/04reuters-facebook-privacy.html on URL https://www.nytimes.com/reuters/2018/06/04/technology/04reuters-facebook-privacy.html
NLP Broken
Article `download()` failed with 404 Client Error: Not Found for url: https://www.nytimes.com/reuters/2016/08/04/us/04reuters-new-york-corruption-skelos.html on URL https://www.nytimes.com/reuters/2016/08/04/us/04reuters-new-york-corruption-skelos.html
Parse Broken
Article `download()` failed with 404 Client Error: Not Found for url: https://www.nytimes.com/reuters/2016/08/04/us/04reuters-new-york-corruption-skelos.html on URL https://www.nytimes.com/reuters/2016/08/04/us/04reuters-n

In [44]:
pd.to_pickle(df_full,"nyt_midway")