# Scraping news from fake news websites

Fake news websites are taken from
https://github.com/BigMcLargeHuge/opensources/blob/master/sources/sources.csv

## Fake news website list

In [32]:
url_interestingdailynews = 'https://www.interestingdailynews.com'
url_nbccomco = 'http://nbc.com.co'
url_newsbbcnet = 'http://www.newsbbc.net'
url_madworldnews = 'http://madworldnews.com'
url_majorthoughts = 'http://majorthoughts.com'
url_nationonenews = 'http://nationonenews.com'

In [13]:
import newspaper
import pandas as pd
import numpy as np
import os

In [24]:
# collect all or most recent 100 articles from a website
def generate_data(news_url):
    col_names = ["url","source", "title", "author", "text"]
    article_df = pd.DataFrame(columns = col_names)
    
    print(news_url)
    news_articles = newspaper.build(news_url, memoize_articles=False)
    
    size_articles = news_articles.size()
    num_news = 100
    if size_articles < num_news:
        num_news = size_articles
     
    news_brand = news_articles.brand
    count = 0
    for i in range(0,num_news):
        article = news_articles.articles[i];
        try:
            article.download()
            article.parse()
            print(article.url)
            
            entry = pd.DataFrame([[article.url, news_brand, \
                                   article.title, article.authors, article.text]], columns=col_names)
            article_df = article_df.append(entry)
            count += 1
        except:
            pass
        
    print("The total number of " + str(news_brand) + " articles is ", count) 
        
    article_df.set_index('url', inplace=True)
    return article_df    

In [63]:
# check the number of articles in the website before scraping 
# some websites do not have many articles
url = 'http://nationonenews.com'
articles_list = newspaper.build(url,memoize_articles=False)
articles_list.size()

62

## 1. Obtain news from 'https://www.interestingdailynews.com'

In [33]:
df_interestingdailynews = generate_data(url_interestingdailynews)

https://www.interestingdailynews.com
You must `download()` an article first!
You must `download()` an article first!
https://www.interestingdailynews.com/not-just-cnn-griffin-dumped-advertisers-career-self-decapitation/
https://www.interestingdailynews.com/man-rapes-infant-leaves-something-remember/
https://www.interestingdailynews.com/pro-trump-billboard-causes-huge-controversy-small-southern-town/
https://www.interestingdailynews.com/huge-melee-breaks-out-trump-rally-attacked-by-thugs-but-supporters-were-prepared-for-war-video/
https://www.interestingdailynews.com/2017-chevrolet-cruze-diesel-epa-rated-3052-mpg-with-manual/
https://www.interestingdailynews.com/benellis-upcoming-750cc-parallel-twin-naked-spotted/
https://www.interestingdailynews.com/rachel-maddow-exposes-mike-pences-4-biggest-lies-must-watch/
https://www.interestingdailynews.com/alert-1-million-pounds-of-americas-favorite-frozen-food-recalled-throw-it-away-now-it-could-kill-your-family/
https://www.interestingdailynews

In [34]:
output_path = os.path.join('data', 'interestingdailynews.csv')
df_interestingdailynews.to_csv(output_path)

In [35]:
# read data
input_path = output_path
data_interestingdailynews = pd.read_csv(input_path)

In [37]:
data_interestingdailynews.head()

Unnamed: 0,url,source,title,author,text
0,https://www.interestingdailynews.com/not-just-...,interestingdailynews,Not Just CNN: Griffin Dumped By Other Advertis...,['Interesting Daily News'],Not Just CNN: Griffin Dumped By Other Advertis...
1,https://www.interestingdailynews.com/man-rapes...,interestingdailynews,"Man Rapes Infant, Leaves Her With Something To...",['Interesting Daily News'],"Man Rapes Infant, Leaves Her With Something To..."
2,https://www.interestingdailynews.com/pro-trump...,interestingdailynews,Pro-Trump Billboard Causes ‘HUGE’ Controversy ...,['Interesting Daily News'],"It’s a stark, simple message on a black billbo..."
3,https://www.interestingdailynews.com/huge-mele...,interestingdailynews,HUGE MELEE BREAKS OUT! Trump Rally Attacked By...,['Interesting Daily News'],HUGE MELEE BREAKS OUT! Trump Rally Attacked By...
4,https://www.interestingdailynews.com/2017-chev...,interestingdailynews,2017 CHEVROLET CRUZE DIESEL EPA-RATED 30/52 MP...,['Interesting Daily News'],Nine-speed auto rated 31/47 mpg\n\nThe 2017 Ch...


## 2. Obtain news from 'http://nbc.com.co'

In [38]:
df_nbccomco = generate_data(url_nbccomco)

http://nbc.com.co
http://nbc.com.co/suicide-will-affect-aaron-hernandezs-estate-payout/
http://nbc.com.co/News/entertainment/music/
http://nbc.com.co/lifeguards-lose-protection-personal-injury-liability/
http://nbc.com.co/vince-gilligan-announces-breaking-bad-season-6-begins-shooting-august-2015/
http://nbc.com.co/best-places-find-free-roulette-game-web/
http://nbc.com.co/bill-murray-is-a-hero-in-london-after-saving-the-life-of-a-child-from-oncoming-traffic/
http://nbc.com.co/News/global/criminal/national/
http://nbc.com.co/News/news/
http://nbc.com.co/trump-to-build-wall-out-of-homes-for-veterans/
http://nbc.com.co/horror-movies-themes-online-games-popularity/
http://nbc.com.co/News/nbc-news/
http://nbc.com.co/News/global/criminal/regional/
http://nbc.com.co/News/global/criminal/
http://nbc.com.co/jonathan-gregory-the-creepiest-comedian-in-america/
http://nbc.com.co/News/politics/
http://nbc.com.co/personal-injury-claims-ford-explorer-rise/
http://nbc.com.co/News/lifestyle/
http://nbc

In [40]:
path = os.path.join('data', 'nbccomco.csv')
df_nbccomco.to_csv(path)  

# test: read data
data_nbccomco = pd.read_csv(path)
data_nbccomco.tail()

Unnamed: 0,url,source,title,author,text
22,http://nbc.com.co/News/entertainment/,nbc,Entertainment Archives,"['Almira Anke', 'Hanif Ibrahim', 'Jacob Morgan']",People are somehow wired to love scary stuff. ...
23,http://nbc.com.co/top-5-large-tablets-buy-righ...,nbc,Top 5 Large Tablets To Buy Right Now,"['Almira Anke', 'Hanif Ibrahim', 'Jacob Morgan']",Full-size tablets have been popular ever since...
24,http://nbc.com.co/News/technology/,nbc,Technology Archives,"['Almira Anke', 'Hanif Ibrahim', 'Jacob Morgan']",Counter-Strike: Global Offensive is one of the...
25,http://nbc.com.co/hate-crimes-rise-since-trump...,nbc,Are Hate Crimes On The Rise Since Trump Took O...,"['Almira Anke', 'Hanif Ibrahim', 'Jacob Morgan']",Whether you are in favor of or against the Tru...
26,http://nbc.com.co/push-cap-damages-medical-mal...,nbc,Push To Cap Damages For Medical Malpractice On...,"['Almira Anke', 'Hanif Ibrahim', 'Jacob Morgan']",Big changes may be on the horizon for the stat...


## 3. Obtain news from 'http://www.newsbbc.net'

In [41]:
df_newsbbcnet = generate_data(url_newsbbcnet)

http://www.newsbbc.net
http://yournewswire.com/john-kerry-calls-for-russia-and-syria-war-crimes-investigation/
http://www.globalhealingcenter.com/natural-health/what-is-a-raw-vegan-diet/?a_aid=54e3fadc2316d
http://www.zerohedge.com/news/2016-10-06/leaked-memo-confirms-hillary-was-given-questions-ahead-interview
http://www.globalhealingcenter.com/natural-health/healthy-lifestyle-changes-to-make-today/
http://inhabitat.com/trump-sons-selling-post-inauguration-access-and-private-hunting-trip-for-1m/
http://www.dailymail.co.uk/news/article-3827732/U-S-accuses-Russia-hacking-attempts-political-groups.html
http://freebeacon.com/politics/clinton-campaign-helped-script-steve-harvey-interview/
http://time.com/4634078/rex-tillerson-south-china-sea-donald-trump/
http://www.newsbbc.net/2017/04/tribute-for-family-of-soldier-killed.html
http://yournewswire.com/scientific-study-towers-collapsed-due-to-controlled-demolition/
http://www.newsbbc.net/2017/07/cia-agent-confesses-on-deathbed-we-blew.html
h

In [42]:
path = os.path.join('data', 'newsbbcnet.csv')
df_newsbbcnet.to_csv(path)  

# test: read data
data_newsbbcnet = pd.read_csv(path)
data_newsbbcnet.tail()  

Unnamed: 0,url,source,title,author,text
82,http://townhall.com/tipsheet/guybenson/2015/10...,newsbbc,Humiliation: Iran Test Fires Long-Range Missil...,"['Guy Benson', ""Cortney O'Brien"", 'Katie Pavli...","Oh, Here We Go: Left Wingers Say Trump's Speec..."
83,http://www.newsbbc.net/2016/10/us-officially-a...,newsbbc,U.S. Officially Accuses Russia Of Political Hacks,['Rossy Smith'],"Comprehensive up-to-date online news coverage,..."
84,http://www.newsbbc.net/2016/10/kanye-west-says...,newsbbc,Kanye West Says Illuminati Behind Paris Robbery,['Rossy Smith'],"Comprehensive up-to-date online news coverage,..."
85,http://www.nbcnews.com/meet-the-press/video/vp...,newsbbc,VP Biden on Russia and Cyber Warfare,[],MTP Exclusive: VP Biden Promises Response to R...
86,http://www.cbsnews.com/news/report-cia-lost-of...,newsbbc,Report: CIA Lost Office In WTC,['Cbsnews.Com Staff Cbsnews.Com Staff'],A secret office operated by the CIA was destro...


## 4. Obtain news from 'http://madworldnews.net'

In [50]:
df_madworldnews = generate_data(url_madworldnews) 

http://madworldnews.com
http://madworldnews.tumblr.com/post/163459150247/as-snowflakes-melt-over-trumps-transgender#notes
http://madworldnews.tumblr.com/post/163456261437/ben-jerrys-force-liberal-agenda-on-customers
http://madworldnews.tumblr.com/post/163455894382/teen-killed-dismembered-after-discovering-dads
http://madworldnews.tumblr.com/post/163458421947/what-dirty-dem-quietly-did-after-corrupt-it
http://madworldnews.tumblr.com/post/163456537032/melania-gets-revenge-on-michelle-obama-at-ohio
http://madworldnews.tumblr.com/post/163455658862/homesick-florida-woman-decides-to-check-google
http://madworldnews.tumblr.com/post/163457493467/us-senate-front-runner-declares-2-words-at-pork
http://madworldnews.tumblr.com/post/163459150247/as-snowflakes-melt-over-trumps-transgender
http://madworldnews.tumblr.com/post/163459516997/florida-thug-shoots-cop-in-the-face-gets-instant
http://madworldnews.tumblr.com/post/163455055437/ohio-pervert-raped-girlfriends-toddler-to-death#notes
http://madwor

In [51]:
path = os.path.join('data', 'madworldnews.csv')
df_madworldnews.to_csv(path)  

# test: read data
data_madworldnews = pd.read_csv(path)
data_madworldnews.tail() 

Unnamed: 0,url,source,title,author,text
10,http://madworldnews.com/2017/07/,madworldnews,Home • Mad World News,[],Melania Trump is ruthlessly scrutinized by the...
11,http://madworldnews.tumblr.com/post/1634565370...,madworldnews,MadWorldNews.com • Melania Gets Revenge On Mic...,[],The Voice of Reason in an Insane World - Mad W...
12,http://madworldnews.tumblr.com/post/1634550554...,madworldnews,MadWorldNews.com • Ohio Pervert Raped Girlfrie...,[],The Voice of Reason in an Insane World - Mad W...
13,http://madworldnews.tumblr.com/post/1634584219...,madworldnews,MadWorldNews.com • What Dirty Dem Quietly Did ...,[],The Voice of Reason in an Insane World - Mad W...
14,http://madworldnews.tumblr.com/post/1634597849...,madworldnews,MadWorldNews.com • Liberals Freak Out As Trump...,[],The Voice of Reason in an Insane World - Mad W...


## 5. Obtain news from 'http://majorthoughts.com'

In [56]:
df_majorthoughts = generate_data(url_majorthoughts)

http://majorthoughts.com
http://www.majorthoughts.com/gata-top-demais-representando-muito-linda-twerk-brazil-twerk-shake_024c27997.html
http://www.majorthoughts.com/list-of-rappers-currently-in-jail_aefd81c8d.html
http://www.majorthoughts.com/one-thot-puts-deadly-hands-on-other-thot-ghetto-fight-kings-2k15-1_dccd92699.html
http://www.majorthoughts.com/meek-mill-tells-nicki-minaj-to-give-his-35million-or-see-smoke-pull-over_b73a0872d.html
http://www.majorthoughts.com/she-should-have-never-put-hands-on-her-baby-people-roll-up-to-two-women-having-a-cat_a9c75ea67.html
http://www.majorthoughts.com/boosie-badazz-dice-game-with-meek-mill-lucci_6be2ddb97.html
http://www.majorthoughts.com/5-moments-you-wouldn%e2%80%99t-believe-if-they-weren%e2%80%99t-recorded_aa6ced657.html
http://www.majorthoughts.com/meek-mill-when-she-give-my-30million-i-will-tell-her-who-robbed-her-house_b6f55583b.html
http://www.majorthoughts.com/eastside-ghetto-fights-i-got-it_519a869e2.html
http://www.majorthoughts.com/b

In [60]:
path = os.path.join('data', 'majorthoughts.csv')
df_majorthoughts.to_csv(path)
# test: read data
data_majorthoughts = pd.read_csv(path)
data_majorthoughts.tail()  

Unnamed: 0,url,source,title,author,text
10,http://www.majorthoughts.com/how-to-get-a-bad-...,majorthoughts,HOW TO GET A BAD COP FIRED,[],Download your free copy of RISE OF THE WARRIOR...
11,http://www.majorthoughts.com/dbz-ghetto-fights...,majorthoughts,Dbz ghetto fights in the hood,[],"crazy shit goes on in the hood, enter if you d..."
12,http://www.majorthoughts.com/nicki-minaj-pulls...,majorthoughts,Nicki Minaj Pulls Up On Meek Mill At CVS Pharm...,[],Meek Mill in LA at CVS with Nicki Minaj\n\nSub...
13,http://www.majorthoughts.com/50-cent-clowns-bo...,majorthoughts,50 Cent Clowns Bow Wow Nails Bow Wow Challenge...,[],Thanks! Share it with your friends!\n\n×\n\nYo...
14,http://www.majorthoughts.com/uploads/thumbs/20...,majorthoughts,Major Thoughts,[],


## 6. Obtain news from 'http://nationonenews.com'

In [64]:
df_nationonenews = generate_data(url_nationonenews)

http://nationonenews.com
https://nationonenews.com/2017/07/23/breaking-san-antonio-human-smuggling-8-dead-28-serious-condition.html
https://nationonenews.com/2017/07/22/stein-recount-questions.html
https://nationonenews.com/2017/07/22/leaks-trump-going-money.html
https://nationonenews.com/2017/07/26/president-trump-just-started-war-can-win.html/feed
https://nationonenews.com/2017/07/14/trump-took-time-recognize-real-heros-france.html/feed
https://nationonenews.com/2017/03/06/porn-stars-get-offended.html
https://nationonenews.com/2016/11/18/video-high-school-takes-mannequin-challenge-new-extreme-raises-bar.html
https://nationonenews.com/2017/07/09/trump-helps-marine-everyone-missed.html
https://nationonenews.com/2017/05/16/immediately-abc-stabbed-tim-allen-back-learned-will-last-man-standing-not-abc.html
https://nationonenews.com/2017/07/10/trump-responds-report-says-james-comey-leaked-classified-information.html
https://nationonenews.com/2017/07/09/trump-helps-marine-everyone-missed.ht

In [65]:
path = os.path.join('data', 'nationonenews.csv')
df_nationonenews.to_csv(path)
# test: read data
data_nationonenews = pd.read_csv(path)
data_nationonenews.tail() 

Unnamed: 0,url,source,title,author,text
57,https://nationonenews.com/c/news,nationonenews,Nation One News,[],We use cookies to give you the best possible e...
58,https://nationonenews.com/2017/07/22/stein-rec...,nationonenews,"Comments on: Jill Steins still has $1,361,834....",[],
59,https://nationonenews.com/2017/07/21/not-fast-...,nationonenews,"Not so fast, Scaramucci is not replacing Spice...",[],Some news organizations are clamoring to disto...
60,https://nationonenews.com/2017/07/05/watch-fox...,nationonenews,"[WATCH] Fox News Put The Washington ""Hurt"" On ...",[],The Democrats in Congress will stop at nothing...
61,https://nationonenews.com/2017/07/22/viral-vid...,nationonenews,Comments on: [Viral Video] Trump appears on Ga...,[],


## Merge all data

In [66]:
merge_df = pd.concat([data_interestingdailynews, data_nbccomco, data_newsbbcnet, \
                      data_madworldnews, data_majorthoughts, data_nationonenews], ignore_index=True)
merge_df.tail()

Unnamed: 0,url,source,title,author,text
222,https://nationonenews.com/c/news,nationonenews,Nation One News,[],We use cookies to give you the best possible e...
223,https://nationonenews.com/2017/07/22/stein-rec...,nationonenews,"Comments on: Jill Steins still has $1,361,834....",[],
224,https://nationonenews.com/2017/07/21/not-fast-...,nationonenews,"Not so fast, Scaramucci is not replacing Spice...",[],Some news organizations are clamoring to disto...
225,https://nationonenews.com/2017/07/05/watch-fox...,nationonenews,"[WATCH] Fox News Put The Washington ""Hurt"" On ...",[],The Democrats in Congress will stop at nothing...
226,https://nationonenews.com/2017/07/22/viral-vid...,nationonenews,Comments on: [Viral Video] Trump appears on Ga...,[],


In [69]:
path = os.path.join('data', 'fakenews_jz.csv')
merge_df.to_csv(path)
# test: read data
data_fakenews = pd.read_csv(path)
data_fakenews.tail()

Unnamed: 0.1,Unnamed: 0,url,source,title,author,text
222,222,https://nationonenews.com/c/news,nationonenews,Nation One News,[],We use cookies to give you the best possible e...
223,223,https://nationonenews.com/2017/07/22/stein-rec...,nationonenews,"Comments on: Jill Steins still has $1,361,834....",[],
224,224,https://nationonenews.com/2017/07/21/not-fast-...,nationonenews,"Not so fast, Scaramucci is not replacing Spice...",[],Some news organizations are clamoring to disto...
225,225,https://nationonenews.com/2017/07/05/watch-fox...,nationonenews,"[WATCH] Fox News Put The Washington ""Hurt"" On ...",[],The Democrats in Congress will stop at nothing...
226,226,https://nationonenews.com/2017/07/22/viral-vid...,nationonenews,Comments on: [Viral Video] Trump appears on Ga...,[],
