# Scraping news from real news websites

In [5]:
import newspaper
import pandas as pd
import numpy as np
import os

In [6]:
# collect all or most recent 100 articles from a website
def generate_data(news_url, num_news, print_control):
    col_names = ["url","source", "title", "author", "text"]
    article_df = pd.DataFrame(columns = col_names)
    
    print(news_url)
    news_articles = newspaper.build(news_url, memoize_articles=False)
    
    size_articles = news_articles.size()
    if size_articles < num_news:
        num_news = size_articles
     
    news_brand = news_articles.brand
    count = 0
    for i in range(0,num_news):
        article = news_articles.articles[i];
        try:
            article.download()
            article.parse()
            if print_control:
                print(article.url)
            
            entry = pd.DataFrame([[article.url, news_brand, \
                                   article.title, article.authors, article.text]], columns=col_names)
            article_df = article_df.append(entry)
            count += 1
        except:
            pass
        
    print("The total number of " + str(news_brand) + " articles is ", count) 
        
    article_df.set_index('url', inplace=True)
    return article_df    

In [8]:
website_list = ['http://www.nbcnews.com/', 'http://www.msnbc.com/', 'http://www.cbsnews.com/', 
                'http://abcnews.go.com/', 'http://www.newyorker.com/', 
                'http://www.huffingtonpost.com/', 'http://www.bloomberg.com/']

## 1. nbcnews

In [9]:
url_nbcnews = website_list[0]
df_nbcnews = generate_data(url_nbcnews, 200, True)

http://www.nbcnews.com/
Article `download()` failed with 404 Client Error: Not Found for url: http://www.nbcnews.com/tv/shows/responding-by-storm/news/dramatic-storm-rescue-photos-2017 on URL http://www.nbcnews.com/tv/shows/responding-by-storm/news/dramatic-storm-rescue-photos-2017
http://www.nbcnews.com/news/us-news
http://www.nbcnews.com/news/us-news/trump-says-repeal-replace-not-dead-unless-gop-are-quitters-n787796
http://www.nbcnews.com/news/africa
http://www.nbcnews.com/better/health/how-chill-bottle-wine-fast-ncna787426
http://www.nbcnews.com/tech/gift-guide/tech-gift-guide-what-buy-pet-lover-n686836
http://www.today.com/video/utah-man-killed-his-wife-during-alaska-cruise-fbi-says-1012044355665
http://www.nbcnews.com/feature/cold-case-spotlight/mother-hopes-answers-2003-murder-joshua-wayne-crawford-n778621
http://www.nbcnews.com/politics/white-house/after-healthcare-defeat-trump-pushes-obamacare-implosion-n787591
http://www.nbcnews.com/feature/nbc-out/same-sex-domestic-violence-c

In [12]:
df_nbcnews.head()

Unnamed: 0_level_0,source,title,author,text
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
http://www.nbcnews.com/news/us-news,nbcnews,"U.S. News: Breaking News Photos, & Videos on t...",[],Justin Lane / EPA\n\nTrump Says Trumpcare Is N...
http://www.nbcnews.com/news/us-news/trump-says-repeal-replace-not-dead-unless-gop-are-quitters-n787796,nbcnews,"Trump Says Repeal and Replace is Not Dead, Unl...",[Phil Mccausland],After a series of GOP failures to pass any hea...
http://www.nbcnews.com/news/africa,nbcnews,"Africa News: Breaking News, Photos & Videos on...",[],NBC News works best with JavaScript turned on
http://www.nbcnews.com/better/health/how-chill-bottle-wine-fast-ncna787426,nbcnews,How to chill a bottle of wine in just minutes ...,[],Subscribe Let our news meet your inbox. SIGN U...
http://www.nbcnews.com/tech/gift-guide/tech-gift-guide-what-buy-pet-lover-n686836,nbcnews,Tech Gift Guide: What to Buy for the Pet Lover,[Andrea Smith],When it comes to giving holiday gifts to your ...


In [20]:
path = os.path.join('data','indv_file', 'nbcnews.csv')
df_nbcnews.to_csv(path)  

In [None]:
# test
# df_nbcnews[df_nbcnews['url'] == '']

In [None]:
articles_list = newspaper.build(url_nbcnews, memoize_articles=False)

for category in articles_list.category_urls(): 
    print(category)

## 2. msnbc news

In [21]:
url_msnbc = website_list[1]
df_msnbc = generate_data(url_msnbc, 200, True)

http://www.msnbc.com/
http://www.msnbc.com/the-beat-with-ari-melber/watch/dem-sen-on-gop-skinny-repeal-plan-it-s-just-a-ploy-1010759235544
http://www.msnbc.com/morning-joe/watch/don-t-underestimate-trump-s-raw-power-says-legal-scholar-1011139651847
http://www.msnbc.com/weekends-with-alex-witt/watch/with-reince-out-can-a-general-bring-unit-cohesion-to-w-h-1012962371941
http://www.msnbc.com/morning-joe/watch/i-have-a-desire-to-compromise-on-health-bill-senator-1009930307518
http://www.msnbc.com/velshi-ruhle/watch/for-facts-sake-trumpcare-can-t-keep-its-promises-1012418627769
http://www.msnbc.com/the-beat-with-ari-melber/watch/jill-abramson-trump-craves-approval-of-ny-times-1010767939785
http://www.msnbc.com/the-last-word/watch/lawrence-priebus-exit-ends-president-trump-s-worst-week-yet-1012681795510
http://www.msnbc.com/msnbc-news/watch/trump-announces-thousands-of-new-manufacturing-jobs-headed-to-u-s-1010609731907
http://www.msnbc.com/morning-joe/watch/climate-expert-says-he-was-demoted

In [24]:
df_msnbc.shape

(114, 4)

In [25]:
path = os.path.join('data', 'indv_file','msnbc.csv')
df_msnbc.to_csv(path)  

## 3. cbsnews

In [26]:
url_cbsnews = website_list[2]
df_cbsnews = generate_data(url_cbsnews, 200, True)

http://www.cbsnews.com/
http://www.cbsnews.com/news/political-podcast-the-takeout/
http://www.cbsnews.com/news/hamburg-germany-knife-attack-market/
http://www.cbsnews.com/news/trump-tweets-about-senate-after-failure-of-health-care-vote/
http://www.cbsnews.com/news/officials-escaped-prisoner-abducts-kills-assistant-wardens-teen-stepdaughter/
http://www.cbsnews.com/pictures/live-through-this-telling-the-stories-of-suicide-survivors/
http://www.cbsnews.com/news/amazon-hiring-job-fair-thousands-of-offers/
http://www.cbsnews.com/videos/why-bipartisanship-on-the-health-care-bill-is-unlikely/
http://www.cbsnews.com/pictures/g-20-summit-protesters-police-clash-hamburg-germany/
http://www.cbsnews.com/news/murder-on-the-orient-express-trailer-debuts/
http://www.cbsnews.com/pictures/the-cast-of-twin-peaks-then-and-now/
http://www.cbsnews.com/videos/rare-bookstore-still-thriving-in-new-york-city/
http://www.cbsnews.com/news/comedian-hannibal-buress-chicago-roots-stand-up/
http://www.cbsnews.com/ne

In [27]:
df_cbsnews.shape

(200, 4)

In [28]:
path = os.path.join('data','indv_file', 'cbsnews.csv')
df_cbsnews.to_csv(path)  

## 4. abcnews

In [29]:
url_abcnews = website_list[3]
df_abcnews = generate_data(url_abcnews, 200, True)

http://abcnews.go.com/
http://abcnews.go.com/Lifestyle/husband-makes-diy-enchanted-rose-wife-responds-star/story?id=48913278
http://abcnews.go.com/International/wireStory/spokesman-palestinian-leader-hospital-routine-tests-48922673
http://abcnews.go.com/Lifestyle/experts-warn-parents-snapchat-hook-teens-streaks/story?id=48778296
http://abcnews.go.com/US/wireStory/tree-tents-campers-off-ground-air-48915107
http://abcnews.go.com/Entertainment/tom-brady-make-make-pats-fans-dream-true/story?id=48882600
http://abcnews.go.com/Politics/analysis-front-center-mccain-shows-power-allies-dc/story?id=48912107
http://abcnews.go.com/International/stake-venezuelas-election/story?id=48905914
http://liveblog.abcnews.go.com/Event/Live_Blog_The_Latest_on_Hurricane_Matthew
http://liveblog.abcnews.go.com/Event/2014_FIFA_World_Cup_Live_Updates_Matchday_17
http://liveblog.abcnews.go.com/Event/2014_World_Cup_Live_Updates_Quarterfinals_Day_2
http://abcnews.go.com/Politics/wireStory/us-hits-iran-sanctions-respon

In [30]:
df_abcnews.shape

(185, 4)

In [31]:
path = os.path.join('data', 'indv_file','abcnews.csv')
df_abcnews.to_csv(path)  

## 5. newyorker

In [33]:
url_newyorker = website_list[4]
df_newyorker = generate_data(url_newyorker, 200, True)

http://www.newyorker.com/
http://www.newyorker.com/news/sporting-scene/a-chess-master-with-an-unpredictable-style-and-the-hopes-of-a-nation
http://www.newyorker.com/culture/listening-booth/tyler-the-creator-grows-on-where-this-flower-blooms
http://www.newyorker.com/magazine/2017/07/31/the-meaning-of-the-gops-health-care-fiasco
http://www.newyorker.com/news/sporting-scene/kyrie-irvings-anxiety-of-influence
http://www.newyorker.com/magazine/2017/07/31/can-poetry-change-your-life
http://www.newyorker.com/news/news-desk/trumps-tweeted-transgender-ban-is-not-a-law
http://www.newyorker.com/humor/daily-shouts/newly-discovered-very-illegal-thing-will-finally-bring-down-trump-or-be-another-cute-tile-in-the-mosaic-of-democracys-demise-i-guess
http://www.newyorker.com/magazine/2017/06/05/the-countess-private-secretary
http://www.newyorker.com/magazine/1966/01/22/among-the-wild-things
http://www.newyorker.com/magazine/2017/07/31/panoramas-contemporary-scope
http://www.newyorker.com/news/news-desk/

In [40]:
df_newyorker.head()

Unnamed: 0_level_0,source,title,author,text
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
http://www.newyorker.com/news/sporting-scene/a-chess-master-with-an-unpredictable-style-and-the-hopes-of-a-nation,newyorker,A Chess Master with an Unpredictable Style and...,[],Aronian has won dozens of tournaments and glob...
http://www.newyorker.com/culture/listening-booth/tyler-the-creator-grows-on-where-this-flower-blooms,newyorker,"Tyler, the Creator Grows on “Where This Flower...",[],A lot has changed since the twenty-six-year-ol...
http://www.newyorker.com/magazine/2017/07/31/the-meaning-of-the-gops-health-care-fiasco,newyorker,The Meaning of the G.O.P.’s Health-Care Fiasco,[],Anyone still inclined to pity the Senate Repub...
http://www.newyorker.com/news/sporting-scene/kyrie-irvings-anxiety-of-influence,newyorker,Kyrie Irving’s Anxiety of Influence,[],"When it was reported, last week, that Kyrie Ir..."
http://www.newyorker.com/magazine/2017/07/31/can-poetry-change-your-life,newyorker,Can Poetry Change Your Life?,[],The first eight pages of Michael Robbins’s new...


In [36]:
path = os.path.join('data', 'indv_file','newyorker.csv')
df_newyorker.to_csv(path)  

## Merge data

In [42]:
merge_df = pd.concat([df_nbcnews, df_msnbc, df_cbsnews, df_abcnews, df_newyorker])
merge_df.tail()

Unnamed: 0_level_0,source,title,author,text
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
http://www.newyorker.com/news/news-desk,newyorker,"News Desk: Breaking News, Reporting, and Polit...",[],
http://www.newyorker.com/cartoon/dernavich-2011-04-25,newyorker,A Cartoon from The New Yorker,[],
http://tunein.com/radio/New-Yorker-Poetry-p803807/,newyorker,New Yorker Poetry,[],Yusef Komunyakaa reads a poem by Marilyn Hacke...
http://video.newyorker.com/watch/shorts-murmurs-the-startup-to-end-all-startups,newyorker,The Startup to End All Startups,[],"The Startup to End All Startups\n\nMeet uBox, ..."
http://www.newyorker.com/humor/borowitz-report/cruz-the-dream-of-keeping-poor-people-from-seeing-a-doctor-must-never-die,newyorker,Cruz: “The Dream of Keeping Poor People from S...,[],WASHINGTON ( The Borowitz Report )—Acknowledgi...


In [38]:
merge_df.shape

(836, 4)

In [None]:
merge_df = merge_df[merge_df.url.str.contains("video") == False]
merge_df.shape

In [None]:
merge_df = merge_df[merge_df.url.str.contains("picture") == False]
merge_df.shape

In [45]:
path = os.path.join('data', 'realnews_jz.csv')
merge_df.to_csv(path)
# test: read data
data_realnews = pd.read_csv(path)
data_realnews.tail()

Unnamed: 0,url,source,title,author,text
831,http://www.newyorker.com/news/news-desk,newyorker,"News Desk: Breaking News, Reporting, and Polit...",[],
832,http://www.newyorker.com/cartoon/dernavich-201...,newyorker,A Cartoon from The New Yorker,[],
833,http://tunein.com/radio/New-Yorker-Poetry-p803...,newyorker,New Yorker Poetry,[],Yusef Komunyakaa reads a poem by Marilyn Hacke...
834,http://video.newyorker.com/watch/shorts-murmur...,newyorker,The Startup to End All Startups,[],"The Startup to End All Startups\n\nMeet uBox, ..."
835,http://www.newyorker.com/humor/borowitz-report...,newyorker,Cruz: “The Dream of Keeping Poor People from S...,[],WASHINGTON ( The Borowitz Report )—Acknowledgi...
