In [1]:
import newspaper

In [2]:
from newspaper import Article

In [3]:
import re

In [4]:
import collections

In [5]:
from collections import Counter

In [6]:
import csv

In [7]:
import os

from NYTimesArticleAPInew import articleAPI

# SECURITY: an API key should not live in a shared notebook. Export NYT_API_KEY in the
# environment to supply your own key. The embedded value is kept only as a fallback so
# existing runs keep working; it should be rotated and then removed from this cell.
NYT_API_KEY = os.environ.get('NYT_API_KEY', '8ea7f91a8e294b51b12f3f69849fef95')
api = articleAPI(NYT_API_KEY) # client used to query the NYT article archive for analysis

In [8]:
import time

In [9]:
# Target words whose occurrences counter() adds to an article's 'ap_tally'.
apathetic = [
    "prison",
    "crime",
    "criminal",
    "police",
    "justice",
    "arrest",
    "threat",
    "violence",
    "habit",
]

In [10]:
# Target words whose occurrences counter() adds to an article's 'emp_tally'.
empathetic = [
    "treatment",
    "disease",
    "sick",
    "health",
    "therapy",
    "insurance",
    "patient",
    "overdose",
    "misuse",
]

In [11]:
def counter(link):
    """Download one article and tally how often the target words appear in it.

    Parameters
    ----------
    link : str
        URL of the article to download and parse.

    Returns
    -------
    list
        ``[cnt, year]``. ``cnt`` is a ``Counter`` mapping every target word found in the
        article body to its frequency, plus the keys ``'emp_tally'`` and ``'ap_tally'``
        holding the total number of hits from ``empathetic`` and ``apathetic``.
        ``year`` is the publication year as an int.

    Raises
    ------
    ValueError
        If the parsed article carries no publication date and the URL contains no
        ``/YYYY/MM/DD/`` segment to fall back on.
    """
    article = Article(url = link, language = 'en')
    article.download()
    article.parse()

    publish_date = article.publish_date
    if publish_date is not None:
        year = publish_date.year
    else:
        # The parser found no date in the page; fall back on a dated URL path
        # (/YYYY/MM/DD/) instead of failing with an opaque AttributeError.
        dated_path = re.search(r'/(\d{4})/\d{2}/\d{2}/', link)
        if dated_path is None:
            raise ValueError('could not determine publication year for ' + link)
        year = int(dated_path.group(1))

    cnt = Counter()
    emp_tally = 0
    ap_tally = 0
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    for word in re.findall(r'\w+', article.text.lower()): # every word in the article body
        if word in apathetic:
            cnt[word] += 1
            ap_tally += 1 # total number of apathetic words used in the article
        if word in empathetic:
            cnt[word] += 1
            emp_tally += 1 # total number of empathetic words used in the article
    # The tallies are always stored, even when zero, so callers can index them directly.
    cnt['emp_tally'] = emp_tally
    cnt['ap_tally'] = ap_tally
    return [cnt, year]

In [12]:
def cocaine_scrub(link_list, output_path='crack_epidemic_data.csv'):
    """Scrape every URL in ``link_list`` and write its word tallies to a CSV file.

    After the ``Year, Word, Frequency`` header, one row is written per entry of the
    counter returned by ``counter`` for each article. Whenever the publication year
    differs from the previous article's, and once more after the last article, two
    rows (``emp_total`` and ``ap_total``) record the totals accumulated for the year
    just finished. Totals are reset at every change of year, so the URLs are expected
    to arrive grouped by year.

    Parameters
    ----------
    link_list : iterable of str
        Article URLs to process, in order.
    output_path : str, optional
        Destination CSV file; defaults to ``'crack_epidemic_data.csv'``.

    Returns
    -------
    None
    """
    totals_dict = {'emp_total': 0, 'ap_total': 0}
    prev_year = None # no article processed yet
    # newline='' is required by the csv module; without it every row is followed by a
    # blank line on platforms that translate '\n' on write.
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Year", "Word", "Frequency"]) # write csv file headers
        for link in link_list: # cycle through list of urls
            word_counts, publication_year = counter(link)
            year = (publication_year, ) # 1-tuple so it can be concatenated with (key, value) rows
            if prev_year is not None and year != prev_year:
                # the year changed: flush the totals of the year that just ended
                for row in totals_dict.items():
                    writer.writerow(prev_year + row)
                totals_dict["emp_total"] = 0
                totals_dict["ap_total"] = 0
            prev_year = year
            totals_dict["emp_total"] += word_counts["emp_tally"] # empathetic words used in given year
            totals_dict["ap_total"] += word_counts["ap_tally"] # apathetic words used in given year
            for row in word_counts.items():
                writer.writerow(year + row) # word frequencies and emp/ap tallies for this article
        if prev_year is not None:
            # flush the totals of the last year (skipped when there were no articles)
            for row in totals_dict.items():
                writer.writerow(prev_year + row)

In [13]:
def heroin_scrub(link_list, output_path='heroin_epidemic_data.csv'):
    """Scrape every URL in ``link_list`` and write its word tallies to a CSV file.

    After the ``Year, Word, Frequency`` header, one row is written per entry of the
    counter returned by ``counter`` for each article. Whenever the publication year
    differs from the previous article's, and once more after the last article, two
    rows (``emp_total`` and ``ap_total``) record the totals accumulated for the year
    just finished. Totals are reset at every change of year, so the URLs are expected
    to arrive grouped by year.

    Parameters
    ----------
    link_list : iterable of str
        Article URLs to process, in order.
    output_path : str, optional
        Destination CSV file; defaults to ``'heroin_epidemic_data.csv'``.

    Returns
    -------
    None
    """
    totals_dict = {'emp_total': 0, 'ap_total': 0}
    prev_year = None # no article processed yet
    # newline='' is required by the csv module; without it every row is followed by a
    # blank line on platforms that translate '\n' on write.
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Year", "Word", "Frequency"]) # write csv file headers
        for link in link_list: # cycle through list of urls
            word_counts, publication_year = counter(link)
            year = (publication_year, ) # 1-tuple so it can be concatenated with (key, value) rows
            if prev_year is not None and year != prev_year:
                # the year changed: flush the totals of the year that just ended
                for row in totals_dict.items():
                    writer.writerow(prev_year + row)
                totals_dict["emp_total"] = 0
                totals_dict["ap_total"] = 0
            prev_year = year
            totals_dict["emp_total"] += word_counts["emp_tally"] # empathetic words used in given year
            totals_dict["ap_total"] += word_counts["ap_tally"] # apathetic words used in given year
            for row in word_counts.items():
                writer.writerow(year + row) # word frequencies and emp/ap tallies for this article
        if prev_year is not None:
            # flush the totals of the last year (skipped when there were no articles)
            for row in totals_dict.items():
                writer.writerow(prev_year + row)

In [14]:
def parse_articles(articles):
    """Extract the article URLs from a response of the NYT api.

    Reads ``articles['response']['docs']`` and returns, in the same order, one
    ``{'url': <web_url>}`` dict per doc.
    """
    docs = articles['response']['docs']
    return [{'url': doc['web_url']} for doc in docs]

In [15]:
def get_articles(date, query, max_pages=25):
    """Collect the parsed search results of one calendar year from the NYT api.

    Parameters
    ----------
    date : str
        Four-digit year (e.g. ``'1980'``); the search runs from January 1st to
        December 31st of that year.
    query : str
        Search term (e.g. ``'Amnesty International'``).
    max_pages : int, optional
        Upper bound on the number of result pages requested (default 25).

    Returns
    -------
    list of dict
        The output of ``parse_articles`` for every page fetched, concatenated in
        page order.
    """
    all_articles = []
    for page_number in range(max_pages):
        articles = api.search(q = query,
                fq = {'source':['Reuters','AP', 'The New York Times'], 'section_name': ['Health',
                      'N.Y. / Region', 'N.Y./Region', 'NYRegion', 'National', 'New York', 'New York and Region',
                      'U.S.', 'Washington']}, # limits the sections searched for the analysis
                begin_date = date + '0101', # delimits the start date of the search
                end_date = date + '1231', # delimits the end date of the search
                fl = 'web_url', # specifies the information to pull from the search results: the URL
                page = page_number)
        time.sleep(0.9) # a time delay to circumvent the NYT time-sensitive pull-request limit
        parsed_articles = parse_articles(articles)
        if not parsed_articles:
            # An empty page means the results are exhausted; requesting the remaining
            # pages would only spend rate-limited calls on more empty responses.
            break
        all_articles.extend(parsed_articles) # in-place, avoids re-copying the list every page
    return all_articles

In [17]:
# Gather the URL dicts of all crack-related search results, one year at a time
# (1985 through 1998), into a single list.
cocaine_all = []
for year in range(1985, 1999):
    print ('Processing {}...'.format(year))
    time.sleep(0.8) # a time delay to prevent an error that kept interrupting the analysis
    cocaine_year = get_articles(str(year), 'crack epidemic') # sets the search query term to use
    cocaine_all += cocaine_year

Processing 1985...
Processing 1986...
Processing 1987...
Processing 1988...
Processing 1989...
Processing 1990...
Processing 1991...
Processing 1992...
Processing 1993...
Processing 1994...
Processing 1995...
Processing 1996...
Processing 1997...
Processing 1998...


In [18]:
# Flatten the list of URL dicts into a plain list of URLs (every value of every dict, in order)
cocaine_list = [url for entry in cocaine_all for url in entry.values()]

In [19]:
# Gather the URL dicts of all opioid-related search results, one year at a time
# (2004 through 2017), into a single list.
heroin_all = []
for year in range(2004, 2018):
    print ('Processing {}...'.format(year))
    time.sleep(0.8) # a time delay to prevent an error that kept interrupting the analysis
    heroin_year = get_articles(str(year), 'opioid epidemic') # sets the search query term to use
    heroin_all += heroin_year

Processing 2004...
Processing 2005...
Processing 2006...
Processing 2007...
Processing 2008...
Processing 2009...
Processing 2010...
Processing 2011...
Processing 2012...
Processing 2013...
Processing 2014...
Processing 2015...
Processing 2016...
Processing 2017...


In [20]:
# Flatten the list of URL dicts into a plain list of URLs (every value of every dict, in order)
heroin_list = [url for entry in heroin_all for url in entry.values()]

In [21]:
# Display the collected crack-era URLs to check for any that throw a wrench in the final scrub
cocaine_list

['https://www.nytimes.com/1985/11/29/nyregion/a-new-purified-form-of-cocaine-causes-alarm-as-abuse-increases.html',
 'https://www.nytimes.com/1985/10/24/us/bathhouse-curbs-called-help-in-coast-aids-fight.html',
 'https://www.nytimes.com/1986/12/20/nyregion/force-seller-to-upgrade-buildings-koch-urges.html',
 'https://www.nytimes.com/1986/11/17/us/anatomy-of-the-drug-issue-how-after-years-it-erupted.html',
 'https://www.nytimes.com/1986/11/08/opinion/the-crack-judges-by-merit.html',
 'https://www.nytimes.com/1986/10/31/arts/pop-jazz-concerts-with-a-cause-fight-against-crack.html',
 'https://www.nytimes.com/1986/10/30/nyregion/2-suffolk-officers-indicted-on-charges-of-cocaine-use.html',
 'https://www.nytimes.com/1986/10/03/nyregion/news-summary-friday-october-3-1986.html',
 'https://www.nytimes.com/1986/10/03/us/24-task-forces-sought-by-meese-to-fight-crack.html',
 'https://www.nytimes.com/1986/09/26/nyregion/drug-influx-a-strain-on-the-beat.html',
 'https://www.nytimes.com/1986/09/22/ny

In [22]:
# URLs to exclude from the URL lists before scrubbing. The list was accumulated from
# the URLs removed over several runs of the analysis, so a few entries are repeated;
# that is harmless because it is only used for membership tests.
removal_list = [
 'https://learning.blogs.nytimes.com/1999/08/23/american-media-addicted-to-scandal/',
 'https://learning.blogs.nytimes.com/1999/06/08/fighting-fire-with-fire/', 
 'https://well.blogs.nytimes.com/2012/10/01/addicted-to-painkillers-but-not-ready-for-help/',
 'https://www.nytimes.com/video/health/policy/100000005515818/the-opioid-epidemic-what-you-need-to-know.html',
 'https://learning.blogs.nytimes.com/2000/08/30/children-on-the-street/',
 'https://learning.blogs.nytimes.com/2000/08/23/seeking-a-new-life/', 
 'https://query.nytimes.com/gst/fullpage.html?res=9C05E1DF113FF93AA15752C0A9609C8B63',
 'https://www.nytimes.com/video/us/100000002340382/a-deadly-dance.html',
 'https://www.nytimes.com/2013/08/18/nyregion/talking-bloomberg.html',
 'https://learning.blogs.nytimes.com/2013/06/17/word-of-the-day-insolence/',
 'https://query.nytimes.com/gst/fullpage.html?res=9B0DE2D61530F934A1575AC0A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=940DE6D8103FF932A05754C0A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9F00E7DD123FF93AA25753C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9F04EED61430F936A1575AC0A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9D04EFDF1131F930A2575AC0A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9F0CE0DF173CF937A35752C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9F00E3D8173CF937A35752C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9805E0DE103AF933A15753C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9803E1DA173AF936A25753C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9D01E5D9153BF93BA35753C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9D0CE3DE1731F936A15752C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9802EFD61E3FF93AA15753C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9A01E2DE103FF930A15753C1A9639C8B63',
 'https://learning.blogs.nytimes.com/2014/12/16/special-news-quiz-farewell-2014/',
 'https://query.nytimes.com/gst/fullpage.html?res=9D03E1D7173EF932A25750C0A9679D8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=950DE7D7103BF931A25754C0A9669D8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9404E3DB1E30F93AA35756C0A96F9C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C03E1D8143EF931A25757C0A96F9C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9A05E7DD1E3BF93BA35750C0A96F9C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C0CE5D91030F936A25756C0A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9D05E5DA103EF934A25757C0A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9502E0D7123FF930A15753C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9A04EFDE173FF935A25753C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C0CE1D91331F934A2575AC0A9609C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C07E7D91739F933A05751C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9404E0D91F30F93BA15751C1A9629C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9D03E4D71130F936A15751C1A9629C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9802E3DB1430F93AA25751C1A9629C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9F0CE1D91530F93BA25751C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9E06E6DD1530F934A25751C1A9629C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9B06E0DA1031F931A25751C1A9629C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9A0DE7D71031F933A25751C1A9629C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9904E1DF1230F936A15751C1A9639C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9B07E2DA1430F931A15751C1A9639C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9402E2D61730F932A15751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9B07E1DF1730F93BA25751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9503E2DA1630F93BA25751C1A9639C8B63', 
 'https://learning.blogs.nytimes.com/2003/05/20/deep-thoughts/', 
 'https://learning.blogs.nytimes.com/2002/11/11/closing-the-gaps/', 
 'https://learning.blogs.nytimes.com/2001/06/26/the-wind-up-and-the-pitch/',
 'https://learning.blogs.nytimes.com/2001/04/06/when-arts-a-craft/', 
 'https://learning.blogs.nytimes.com/2000/06/19/true-crime/', 
 'https://learning.blogs.nytimes.com/2000/11/14/constant-craving/',
 'https://learning.blogs.nytimes.com/1999/12/10/begging-for-another-chance/', 
 'https://learning.blogs.nytimes.com/2000/06/19/true-crime/', 
 'https://learning.blogs.nytimes.com/2001/03/13/substance-use-or-abuse/',
 'https://learning.blogs.nytimes.com/2002/04/30/this-is-your-brain-on-pot/',
 'https://learning.blogs.nytimes.com/2003/12/03/neighborly-interests/',
 'https://query.nytimes.com/gst/fullpage.html?res=9F0CE1D91530F93BA25751C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9E06E6DD1530F934A25751C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9400EFDE1430F935A25751C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9401E1DA1031F931A25751C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9B0CE5DE1731F937A35751C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C05E0D8103EF933A05752C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C07EFDF103EF93AA15752C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9801E7D9143EF93BA15752C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C02EED9153EF936A15752C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9402E2D61730F932A15751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9503E2DF1531F93AA25751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C0DE6DF1730F93BA25751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9D04E6DF1F31F937A25751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9900E2DD1131F93AA35751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9D05E2DE1231F931A35751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9A00E2DF1231F931A35751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C03E2DA1731F935A15752C1A9639C8B63',
 'https://www.nytimes.com/roomfordebate/2011/12/29/why-we-collect-stuff/',
 'https://www.nytimes.com/roomfordebate/2011/12/19/should-teenagers-get-high-instead-of-drunk/',
 'https://www.nytimes.com/roomfordebate/2011/12/18/are-presidential-pardons-fair/',
 'https://well.blogs.nytimes.com/2011/12/14/marijuana-growing-in-popularity-among-teenagers/',
 'https://latitude.blogs.nytimes.com/2011/12/02/taking-back-the-favelas/',
 'https://www.nytimes.com/roomfordebate/2012/10/18/shrink-inequality-to-grow-the-economy/',
 'https://learning.blogs.nytimes.com/2013/12/19/what-musician-actor-or-author-should-be-a-superstar-but-hasnt-quite-made-it-yet/',
 'https://learning.blogs.nytimes.com/2000/04/26/high-risk-areas/',
 'https://learning.blogs.nytimes.com/2003/09/30/redefining-addiction/', 
 'https://query.nytimes.com/gst/fullpage.html?res=9403E4DE1F30F935A15751C1A9629C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9802E5D61131F932A25751C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C06E4DE1431F935A35751C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C0DEFDB1F3EF936A35751C1A9629C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C05E0D8103EF933A05752C1A9629C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9A02E6DB113FF93AA25752C1A9629C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9402E2D61730F932A15751C1A9639C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9900E2DD1131F93AA35751C1A9639C8B63', 
 'https://www.nytimes.com/2005/12/04/books/review/04martin.html',
 'https://query.nytimes.com/gst/fullpage.html?res=9B04EFD61531F931A35751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9B07EEDA1531F932A35751C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9C0DE5DB103EF936A25752C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9804E4D8133EF930A25752C1A9639C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=950DE4D6123EF932A25752C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=950DEEDF173EF935A35752C1A9639C8B63', 
 'https://query.nytimes.com/gst/fullpage.html?res=9803E7DA173FF930A25753C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9800E6DD173FF930A25753C1A9639C8B63',
 'https://query.nytimes.com/gst/fullpage.html?res=9806E1DB1F30F93BA35753C1A9639C8B63', 
 'https://roomfordebate.blogs.nytimes.com/2009/11/30/obamas-surge-strategy-in-afghanistan/', 
 'https://learning.blogs.nytimes.com/2010/11/02/which-pop-music-stars-fascinate-you/', 
 'https://www.nytimes.com/roomfordebate/2013/11/28/how-will-aids-be-eradicated/', 
 'https://www.nytimes.com/video/us/100000001970447/jumping-for-show.html', 
 'https://www.nytimes.com/video/us/100000002614644/a-familys-sentence.html', 
 'https://www.nytimes.com/video/opinion/100000002602665/naturally-jj-cale.html', 
 'https://www.nytimes.com/roomfordebate/2014/11/18/constitutional-limits-of-presidential-action-on-immigration-12',
 'https://www.nytimes.com/video/us/100000004010136/clinton-on-jail-time-for-cocaine-users.html', 
 'https://www.nytimes.com/roomfordebate/2015/11/11/should-drug-addicts-be-forced-into-treatment',
 ]

In [23]:
# Drop every process-interrupting URL (those in removal_list) from the cocaine list,
# keeping the remaining URLs in their original order.
new_list = [url for url in cocaine_list if url not in removal_list]
cocaine_list = new_list

In [24]:
# Number of URLs currently in the cocaine list
len(cocaine_list)

361

In [25]:
# Drop every process-interrupting URL (those in removal_list) from the heroin list,
# keeping the remaining URLs in their original order.
new_list = [url for url in heroin_list if url not in removal_list]
heroin_list = new_list

In [26]:
# Number of URLs currently in the heroin list
len(heroin_list)

190

In [115]:
# Scrape every URL in the cocaine list and write the tallies to crack_epidemic_data.csv
cocaine_scrub(cocaine_list)

In [139]:
# Scrape every URL in the heroin list and write the tallies to heroin_epidemic_data.csv
heroin_scrub(heroin_list)