# Extract New York Times articles related to bitcoin

In [1]:
import pandas as pd
import requests
import pprint
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
import time
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
pp = pprint.PrettyPrinter(indent=4)
from key import app_key
import tqdm
import json

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/shujinkou/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /Users/shujinkou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
api_key = app_key

In [62]:
# initiate list
articles = []

In [59]:

def get_articles(articles, params):
    """
    Retrieves article information related to bitcoin from NYT through several api requests.
    
    Parameters:
    articles (list): A list to hold the dictionaries/json response
    params (dictionary): The api request params. This is modified within the function to accomedate pagenation
    
    returns: None
    effects: appends articles list with new json responses

    """

    # i runs through the response "pages". each requests gives 10 articles
    for i in tqdm.tqdm(range(99)):
        params["page"] = i
        endpoint = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
        response = requests.get(endpoint, params=params).json()
        # loop through each article in an individual api response
        for article_num in range(len(response['response']['docs'])):
            pub_date = response['response']['docs'][article_num]['pub_date']
            headline = response['response']['docs'][article_num]['headline']['main']
            abstract = response['response']['docs'][article_num]['abstract']
            article_url = response['response']['docs'][article_num]['web_url']
            # append the dictionary to the list
            articles.append({"date": pub_date, "headline": headline, "abstract": abstract,
                             "url": article_url})
        time.sleep(7)

In [63]:
# retreive the first ~1000 results
params = {"q":"bitcoin", "api-key": api_key, "sort": "newest"}
get_articles(articles, params)

100%|██████████| 99/99 [13:16<00:00,  7.75s/it]


In [65]:
# retreive the last ~500 results
end = datetime.datetime.strptime(articles[-1]['date'], "%Y-%m-%dT%H:%M:%S+0000").strftime('%Y%m%d')
params['end_date'] = '20141031'
get_articles(articles, params)

100%|██████████| 99/99 [12:34<00:00,  7.64s/it]


In [66]:
# I used this block to check the oldest date requested. This date 
# is then used to make a new request with a end_date param
# it also sorts the entries by date
articles_temp = articles
articles_temp.sort(key=lambda item:item['date'], reverse=True)
articles_temp[-1]['date']

'2011-05-30T00:37:00+0000'

In [67]:
# example document
articles[45]

{'date': '2019-07-07T07:00:07+0000',
 'headline': 'A City Paid a Hefty Ransom to Hackers. But Its Pains Are Far From Over.',
 'abstract': 'Weeks after Lake City, Fla., was hit by a cyberattack, the phones are back on and email is working, but the city has not yet recovered all its files.',
 'url': 'https://www.nytimes.com/2019/07/07/us/florida-ransom-hack.html'}

In [68]:
len(articles)

1525

In [71]:
# convert date in json to unix timesamp
for article in articles:
    article['date'] = datetime.datetime \
    .strptime(article['date'],
              "%Y-%m-%dT%H:%M:%S+0000") \
    .timestamp()

In [73]:
# save as json
with open("articles.json", "w") as json_data:
    json.dump(articles, json_data)
df = pd.read_json("articles.json")

In [74]:
# save as csv
df.to_csv("articles.csv")

### Natural Language Processing: Sentiment Analysis

In [138]:
def get_sentiment(article):
    """
    Determines the sentiment of a NYT headline
    
    Parameters:
    article (dict): an element of the articles list
    
    returns: returns the article dictionary with the ['sentiment'] key included.
    sentiment is the probability that the headline is a "positive" opinion. 
    
    """
    article['sentiment'] = TextBlob(article['headline'], analyzer=NaiveBayesAnalyzer()).sentiment.p_pos
    return article

In [133]:
# an example sentiment anlysis. This has a .89% chance of being positive.
TextBlob(articles[0]['headline'], analyzer=NaiveBayesAnalyzer()).sentiment.p_pos

0.8942141448021957

In [142]:
# sentiment anlysis was done in parallel, since the naivebayesanalyzer is slow.
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool()
# utalize all cores
results = pool.map(get_sentiment, articles)
#close the pool and wait for the work to finish
pool.close()
pool.join()

In [145]:
results[-1]

{'date': '2011-05-30T00:37:00+0000',
 'headline': 'Some Faint Praise for Mr. Ballmer',
 'abstract': 'David Einhorn’s critique of Steve Ballmer is on target even if he’s far from the only chief executive who has struggled to make the most of a franchise.',
 'lead_paragraph': 'Microsoft has missed too many opportunities under Steve Ballmer’s stewardship. It’s a fair criticism reignited last week by the hedge fund boss, David Einhorn, who called for the software giant’s chief executive to step down. But that doesn’t necessarily mean Mr. Ballmer will go. Based on total return to shareholders, other long-time company bosses, including Jeff Immelt at General Electric, have fared worse.',
 'url': 'https://www.nytimes.com/2011/05/30/business/economy/30views.html',
 'sentiment': 0.6809889669735206}

In [144]:
# save as json with sentiment anlysis completed
with open("articles_with_sentiment.json", "w") as json_data:
    json.dump(results, json_data)

In [10]:
TextBlob("BULLISH").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [6]:
params = {"q":"bitcoin", "api-key": api_key, "sort": "newest", "page": 1}
endpoint = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
result = requests.get(endpoint, params).json()

In [70]:
import datetime
import time
print(result['response']['docs'][0]['pub_date'])
datetime.datetime.strptime(result['response']['docs'][-1]['pub_date'], "%Y-%m-%dT%H:%M:%S+0000").timestamp()

2019-09-12T20:28:03+0000


1566960204.0