# Scraping news from real news websites

In [1]:
import newspaper
import pandas as pd
import numpy as np
import os

In [2]:
# collect all or most recent 100 articles from a website
def generate_data(news_url):
    col_names = ["url","source", "title", "author", "text"]
    article_df = pd.DataFrame(columns = col_names)
    
    print(news_url)
    news_articles = newspaper.build(news_url, memoize_articles=False)
    
    size_articles = news_articles.size()
    num_news = 100
    if size_articles < num_news:
        num_news = size_articles
     
    news_brand = news_articles.brand
    count = 0
    for i in range(0,num_news):
        article = news_articles.articles[i];
        try:
            article.download()
            article.parse()
            print(article.url)
            
            entry = pd.DataFrame([[article.url, news_brand, \
                                   article.title, article.authors, article.text]], columns=col_names)
            article_df = article_df.append(entry)
            count += 1
        except:
            pass
        
    print("The total number of " + str(news_brand) + " articles is ", count) 
        
    article_df.set_index('url', inplace=True)
    return article_df    

In [9]:
website_list = ['http://www.nbcnews.com/', 'http://www.msnbc.com/', 'http://www.cbsnews.com/', \
                'http://www.huffingtonpost.com/', 'http://www.bloomberg.com/', 'http://abcnews.go.com/', \
                'http://www.aljazeera.com/news/', 'https://www.afp.com/en/news-hub', 'http://www.newyorker.com/',\
                'https://www.theguardian.com/']
print(website_list)

['http://www.nbcnews.com/', 'http://www.msnbc.com/', 'http://www.cbsnews.com/', 'http://www.huffingtonpost.com/', 'http://www.bloomberg.com/', 'http://abcnews.go.com/', 'http://www.aljazeera.com/news/', 'https://www.afp.com/en/news-hub', 'http://www.newyorker.com/', 'https://www.theguardian.com/']


## nbcnews

In [10]:
url_nbcnews = website_list[0]
df_nbcnews = generate_data(url_nbcnews)

http://www.nbcnews.com/
http://www.nbcnews.com/video/parasitic-twin-removed-from-baby-girl-903135299656
http://www.nbcnews.com/better/business/how-college-students-can-avoid-overdraft-late-fees-ncna774051
http://www.nbcnews.com/news/us-news/epa-chief-taps-taxpayer-dollars-weekend-flights-home-n786471
http://www.nbcnews.com/dateline/video/bitter-pill-kate-snow-s-perspective-706022979680
http://www.nbcnews.com/storyline/immigration-border-crisis/tenth-suspected-migrant-dies-human-smuggling-operation-driver-due-court-n785921
http://www.nbcnewyork.com/news/local/Turkey-Hill-Ice-Cream-Dutch-Chocolate-Rocky-Road-Recall--396525521.html
http://www.nbcnews.com/news/us-news/california-woman-livestreamed-dying-teen-sister-instagram-after-car-crash-n785846
http://www.nbcnews.com/video/hive-installation-at-london-s-kew-gardens-interprets-beehive-activity-1004238915723
http://www.nbcnews.com/tech/gift-guide/tech-gift-guide-best-gadgets-foodies-n693171
http://www.today.com/video/chrissy-teigen-on-mot

In [11]:
path = os.path.join('data', 'nbcnews.csv')
df_nbcnews.to_csv(path)  
# test: read data
data_nbcnews = pd.read_csv(path)

In [12]:
data_nbcnews.tail()

Unnamed: 0,url,source,title,author,text
91,http://www.nbcnews.com/feature/making-a-differ...,nbcnews,Making a Difference,[],"""It makes my heart feel good to do it, and hel..."
92,http://www.nbcnews.com/news/us-news/jewel-thie...,nbcnews,Jewel Thief Doris Payne Charged With Walmart T...,['Tracy Connor'],"Doris Payne, the notorious 86-year-old jewel t..."
93,http://www.nbcnews.com/dateline/video/troubled...,nbcnews,Troubled Waters Part 4,[],Troubled Waters Part 4\n\nA California couple’...
94,http://www.nbcnews.com/dateline/video/troubled...,nbcnews,Troubled Waters Part 5,[],Troubled Waters Part 5\n\nA California couple’...
95,http://www.nbcnews.com/feature/nbc-out/meet-co...,nbcnews,Meet Colombia’s First Legally Recognized ‘Thro...,"[""Dimitri O'Donnell""]",Tucked away in a quiet corner of a community i...


In [None]:
data_nbcnews[data_nbcnews['url'] == 'http://www.nbcnews.com/dateline/video/on-assignment-chief-rehab-705280579921']

In [None]:
data_nbcnews.text[60]

In [None]:
articles_list = newspaper.build(url_nbcnews, memoize_articles=False)

for category in articles_list.category_urls(): 
    print(category)

In [None]:
url = 'http://www.nbcnews.com/news/us-news'
articles_list = newspaper.build(url,memoize_articles=False)
for i in range(0,100):  
    print(articles_list.articles[i].url)

## msnbc news

In [13]:
url_msnbc = website_list[1]
df_msnbc = generate_data(url_msnbc)

http://www.msnbc.com/
http://www.msnbc.com/morning-joe/watch/kushner-was-very-forthcoming-says-house-intel-member-1010139715764
http://www.msnbc.com/rachel-maddow-show/trump-takes-aim-support-historically-black-colleges
http://www.msnbc.com/rachel-maddow-show/devos-picking-schools-should-be-picking-uber-lyft-or-taxis
http://www.msnbc.com/morning-joe/watch/i-have-a-desire-to-compromise-on-health-bill-senator-1009930307518
http://www.msnbc.com/msnbc-news/watch/senate-rejects-obamacare-repeal-only-bill-1010519619712
http://www.msnbc.com/all-in/watch/watch-president-obama-own-anthony-scaramucci-1006059075579
http://www.msnbc.com/the-last-word/watch/franken-trump-firing-sessions-would-be-a-constitutional-crisis-1009189955929
http://www.msnbc.com/am-joy/watch/report-r-kelly-accused-of-allegedly-holding-women-against-their-will-1006554691856
http://www.msnbc.com/morning-joe/watch/climate-expert-says-he-was-demoted-after-speaking-out-1007517251553
http://www.msnbc.com/rachel-maddow-show/why-it

In [14]:
path = os.path.join('data', 'msnbc.csv')
df_msnbc.to_csv(path)  
# test: read data
data_msnbc = pd.read_csv(path)

In [15]:
data_msnbc.tail()

Unnamed: 0,url,source,title,author,text
95,http://www.msnbc.com/the-beat-with-ari-melber/...,msnbc,Schiff: It Is Constitutional to Indict a Sitti...,"['The Beat With Ari Melber', 'The Hour With Br...",Rep. Adam Schiff (D-CA) joins Ari Melber to di...
96,http://msnbc.tumblr.com/post/148790390931/trum...,msnbc,MSNBC,[],Donald Trump on Wednesday night appeared to bl...
97,http://www.msnbc.com/msnbc-news/watch/mccain-r...,msnbc,McCain Rips Trump for Tweeting Ban on Transgen...,"['Andrea Mitchell Reports', 'Msnbc Live', 'Mor...",Sen. John McCain says the president's tweet is...
98,http://www.msnbc.com/rachel-maddow-show/repeal...,msnbc,‘Repeal and replace’ plan’s defeat spells trou...,[],Yesterday’s developments on the Senate floor o...
99,http://www.msnbc.com/rachel-maddow-show/trump-...,msnbc,Trump announcement on transgender troops surpr...,[],"Ordinarily, when pundits talk about the White ..."


## cbsnews

In [16]:
url_cbsnews = website_list[2]
df_cbsnews = generate_data(url_cbsnews)

http://www.cbsnews.com/
http://www.cbsnews.com/news/refrigeration-not-essential-for-all-perishable-foods/
http://www.cbsnews.com/news/new-jersey-police-grant-wish-of-9-year-old-boy-with-rare-disease/
http://www.cbsnews.com/news/poll-americans-say-u-s-political-debate-is-increasingly-uncivil/
http://www.cbsnews.com/news/irs-video-chat-skype-about-your-tax-appeal/
http://www.cbsnews.com/news/cell-phone-spam-and-scam-calls-are-spreading/
http://www.cbsnews.com/news/chicago-pokemon-go-festival-kicks-off-with-fans-unable-to-log-in/
http://www.cbsnews.com/news/mickey-mouse-surprises-brother-and-sister-with-adoption-news/
http://www.cbsnews.com/news/report-brain-training-games-cognitive-benefits-claims/
http://www.cbsnews.com/news/top-alzheimers-researcher-on-how-you-can-help-protect-you-brain/
http://www.cbsnews.com/news/charlie-gard-parents-urged-make-end-of-life-plan-great-ormond-street-hospital/
http://www.cbsnews.com/videos/anthony-scaramucci-says-the-white-house-is-as-strong-as-our-weak

In [17]:
path = os.path.join('data', 'cbsnews.csv')
df_cbsnews.to_csv(path)  
# test: read data
data_cbsnews = pd.read_csv(path)

In [18]:
data_cbsnews.tail()

Unnamed: 0,url,source,title,author,text
95,http://www.cbsnews.com/pictures/adam-west-1928...,cbsnews,Adam West 1928-2017,[],
96,http://www.cbsnews.com/news/elon-musk-mark-zuc...,cbsnews,Elon Musk and Mark Zuckerberg clash over risks...,[],Two tech billionaires are clashing over the fu...
97,http://www.cbsnews.com/videos/confusion-and-my...,cbsnews,Confusion and mystery shroud health care bill ...,[],"July 25, 2017, 7:03 AM | The Republican vow to..."
98,http://www.cbsnews.com/news/pork-falls-from-sk...,cbsnews,15-pound bag of frozen pork lands on family's ...,[],"FORT LAUDERDALE, Fla. -- Meat falling from the..."
99,http://www.cbsnews.com/news/south-carolina-dom...,cbsnews,South Carolina domestic violence law unfair to...,[],"COLUMBIA, S.C. -- People in same-sex relations..."


In [20]:
merge_df = pd.concat([data_nbcnews, data_msnbc, data_cbsnews], ignore_index=True)
merge_df.tail()

Unnamed: 0,url,source,title,author,text
291,http://www.cbsnews.com/pictures/adam-west-1928...,cbsnews,Adam West 1928-2017,[],
292,http://www.cbsnews.com/news/elon-musk-mark-zuc...,cbsnews,Elon Musk and Mark Zuckerberg clash over risks...,[],Two tech billionaires are clashing over the fu...
293,http://www.cbsnews.com/videos/confusion-and-my...,cbsnews,Confusion and mystery shroud health care bill ...,[],"July 25, 2017, 7:03 AM | The Republican vow to..."
294,http://www.cbsnews.com/news/pork-falls-from-sk...,cbsnews,15-pound bag of frozen pork lands on family's ...,[],"FORT LAUDERDALE, Fla. -- Meat falling from the..."
295,http://www.cbsnews.com/news/south-carolina-dom...,cbsnews,South Carolina domestic violence law unfair to...,[],"COLUMBIA, S.C. -- People in same-sex relations..."


In [23]:
path = os.path.join('data', 'realnews_jz.csv')
merge_df.to_csv(path)
# test: read data
data_realnews = pd.read_csv(path)
data_realnews.tail()

Unnamed: 0.1,Unnamed: 0,url,source,title,author,text
291,291,http://www.cbsnews.com/pictures/adam-west-1928...,cbsnews,Adam West 1928-2017,[],
292,292,http://www.cbsnews.com/news/elon-musk-mark-zuc...,cbsnews,Elon Musk and Mark Zuckerberg clash over risks...,[],Two tech billionaires are clashing over the fu...
293,293,http://www.cbsnews.com/videos/confusion-and-my...,cbsnews,Confusion and mystery shroud health care bill ...,[],"July 25, 2017, 7:03 AM | The Republican vow to..."
294,294,http://www.cbsnews.com/news/pork-falls-from-sk...,cbsnews,15-pound bag of frozen pork lands on family's ...,[],"FORT LAUDERDALE, Fla. -- Meat falling from the..."
295,295,http://www.cbsnews.com/news/south-carolina-dom...,cbsnews,South Carolina domestic violence law unfair to...,[],"COLUMBIA, S.C. -- People in same-sex relations..."
