# Web Scraping Raw Article Data for the Fake News Detection Project
### This script is for the step no.1 of 3 steps process: <br>
<font color = red>1. Web-scrap raw article data and re-format it into a pandas data structure </font><br>
<font color = black>2. Extracting useful features from the raw data<br>
3. Using the features, try different supervised learning to detect fake news</font>

<br>
### 30 'less reputable' and also known to be 'fake /extremly-biased' news organizations:
* 'http://ABCnews.com.co'
* 'http://bizstandardnews.com'
* 'http://Bloomberg.ma'
* 'http://70news.wordpress.com'
* 'http://beforeitsnews.com'
* 'http://ddsnewstrend.com'
* 'http://thebostontribune.com/'
* 'http://americanfreepress.net/'
* 'http://www.bipartisanreport.com/'
* 'http://aurora-news.us/'
* 'http://conservativefighters.com/'
* 'http://conservativespirit.com/'
* 'http://conservative101.com/'
* 'http://DrudgeReport.com.co'
* 'http://NBCNews.com.co'
* 'http://TrueTrumpers.com'
* 'http://UndergroundNewsReport.com'
* 'http://washingtonpost.com.co'
* 'http://YourNewsWire.com'
* 'http://cnn.com.de/'
* 'http://rickwells.us/'
* 'http://thedcgazette.com/'
* 'http://donaldtrumppotus45.com/'
* 'http://24wpn.com'
* 'http://AmericanFlavor.news'
* 'http://AmericanPresident.co'
* 'http://AMPosts.com'
* 'http://BB4SP.com'
* 'http://BlueVisionPost.com'
* 'http://CivicTribune.com'

### 30 'reputable' and 'trusted' news organizations:
* 'https://www.wsj.com/'
* 'https://www.nytimes.com/'
* 'http://www.bbc.com/news'
* 'http://www.npr.org/sections/news/'
* 'http://www.reuters.com/'
* 'https://www.economist.com/'
* 'https://www.apnews.com/'
* 'http://www.cnn.com'
* 'http://www.foxnews.com/'
* 'http://www.politico.com/'
* 'http://www.nbcnews.com/'
* 'http://www.msnbc.com/'
* 'http://www.cbsnews.com/'
* 'http://www.huffingtonpost.com/'
* 'http://www.bloomberg.com/'
* 'http://abcnews.go.com/'
* 'http://www.aljazeera.com/news/'
* 'https://www.afp.com/en/news-hub'
* 'http://www.newyorker.com/'
* 'https://www.theguardian.com/'
* 'http://www.telegraph.co.uk/'
* 'http://www.zeit.de/english/index',
* 'http://www.chicagotribune.com/'
* 'http://www.vox.com/'
* 'http://www.bostonherald.com/'
* 'http://www.dailypress.com/'
* 'http://www.detroitnews.com/'
* 'https://www.ft.com/'
* 'http://www.ibtimes.com/'
* 'http://www.voanews.com/'



### Reference:
1. 'http://www.politifact.com/punditfact/article/2017/apr/20/politifacts-guide-fake-news-websites-and-what-they/'
2. 'https://mediabiasfactcheck.com/'
3. 'https://en.wikipedia.org/wiki/List_of_fake_news_websites'
4. 'http://www.fakenewsai.com/'
5. 'https://morningconsult.com/2016/12/07/poll-majority-find-major-media-outlets-credible/'
6. 'http://www.pewresearch.org/fact-tank/2014/10/30/which-news-organization-is-the-most-trusted-the-answer-is-complicated/'

#### Setting the fake news and true news list

In [1]:
hyesoo_fake_news = ['http://ABCnews.com.co','http://Americannews.com','http://Americanoverlook.com', 
                    'http://Bighairynews.com','http://bizstandardnews.com','http://Bloomberg.ma',
                    'http://70news.wordpress.com','http://beforeitsnews.com','http://Cap-news.com',
                    'http://ddsnewstrend.com', 'http://thebostontribune.com/','http://americanfreepress.net/',
                    'http://www.bipartisanreport.com/','http://aurora-news.us/', 'http://Clashdaily.com',
                    'http://Conservativedailypost.com', 'http://Conservativeinfidel.com','http://Dailyheadlines.com',
                    'http://DeadlyClear.wordpress.com', 'http://Donaldtrumpnews.co', 'http://Freedomdaily.com']


hyesoo_true_news = ['https://www.nytimes.com/','http://www.bbc.com/news',
             'http://www.npr.org/sections/news/', 'http://www.reuters.com/',
             'https://www.apnews.com/', 'http://www.cnn.com', 'http://www.foxnews.com/', 
             'http://www.politico.com/'] 

print("We have {} fake news and {} true news organizations".format(len(hyesoo_fake_news), len(hyesoo_true_news)))


We have 21 fake news and 8 true news organizations


#### import libraries and tools
* downloading nltk can take some time
* alternative of 'nltk spell check' is to use a libary called enchanted, but I failed to install it for some reasons 

In [2]:
import pandas as pd
import newspaper 
from itertools import islice
import os

In [7]:
def generate_raw_true_data(news_list, data_name):

    col_names = ["url","source", "title", "author", "text"]
    article_df = pd.DataFrame(columns = col_names)
    final_news_list = []
    final_article_number = {}
    total_count = 0
    for news in news_list:
        try:
            news_articles = newspaper.build(news, memoize_articles=False)
            final_news_list += [news_articles]
        except:
            pass
    print ([a.brand for a in final_news_list], len(final_news_list))


    for news_articles in final_news_list:
        count = 0
        num = len([x for x in news_articles.articles])
        if num >= 250:
            news_articles_articles = news_articles.articles[:250]
        else:
            news_articles_articles = news_articles.articles
        for article in news_articles_articles:
            try:
                article.download()
                article.parse()
                entry = pd.DataFrame([[article.url, news_articles.brand, article.title, article.authors, article.text]], columns=col_names)                    
                article_df = article_df.append(entry)
                count += 1
                total_count += 1
                print(article.url)
            except:
                pass
        print("The total number of " + str(news_articles.brand) + " articles is ", count) 
        final_article_number[news_articles.brand] = count

    print(total_count)
    path = os.path.join('data', data_name+'.csv')
    article_df.to_csv(path)
    return final_article_number


In [8]:
# true
generate_raw_true_data(hyesoo_true_news, 'hyesoo_true_news_rawdata')

['nytimes', 'bbc', 'npr', 'reuters', 'apnews', 'cnn', 'foxnews', 'politico'] 8
https://www.nytimes.com/2017/07/27/us/politics/senate-health-care-vote.html?partner=rss&amp;emc=rss
https://www.nytimes.com/2017/07/27/us/politics/senate-health-care-vote.html
https://www.nytimes.com/2017/07/27/us/politics/obamacare-partial-repeal-senate-republicans-revolt.html?partner=rss&amp;emc=rss
https://www.nytimes.com/2017/07/27/us/politics/obamacare-partial-repeal-senate-republicans-revolt.html
https://www.nytimes.com/video/us/politics/100000005314709/health-bill-vote-senate.html?partner=rss&amp;emc=rss
https://www.nytimes.com/video/us/politics/100000005314709/health-bill-vote-senate.html
https://www.nytimes.com/2017/07/27/health/obamacare-repeal-healthcare-anxiety.html?partner=rss&amp;emc=rss
https://www.nytimes.com/2017/07/27/health/obamacare-repeal-healthcare-anxiety.html
https://www.nytimes.com/2017/07/27/us/politics/scaramucci-priebus-leaks.html?partner=rss&amp;emc=rss
https://www.nytimes.com/20

{'apnews': 0,
 'bbc': 161,
 'cnn': 249,
 'foxnews': 247,
 'npr': 250,
 'nytimes': 245,
 'politico': 217,
 'reuters': 215}

In [5]:
def generate_raw_fake_data(news_list, data_name):
    col_names = ["url", "source", "title", "author", "text"]
    article_df = pd.DataFrame(columns = col_names)
    final_news_list = []
    final_article_number = {}
    total_count = 0
    for news in news_list:
        try:
            news_articles = newspaper.build(news, memoize_articles=False)
            final_news_list += [news_articles]
        except:
            pass
    print (final_news_list, len(final_news_list))
    for news_articles in final_news_list:
        if total_count < 1200:
            count = 0
            for article in news_articles.articles:
                try:
                    article.download()
                    article.parse()
                    entry = pd.DataFrame([[article.url, news_articles.brand, article.title, article.authors, article.text]], columns=col_names)
                    article_df = article_df.append(entry)
                    count += 1
                    total_count += 1
                    print(article.url)
                except:
                    pass
            print("The total number of " + str(news_articles.brand) + " articles is ", count) 
            final_article_number[news_articles.brand] = count
        else:
            pass
    print(total_count)
    article_df.to_csv(data_name+".csv")
    return final_article_number

In [6]:
# fake
generate_raw_fake_data(hyesoo_fake_news, 'hyesoo_fake_news_rawdata')

[Source parse ERR] http://Bloomberg.ma
[Source parse ERR] http://thebostontribune.com/
[<newspaper.source.Source object at 0x12e45feb8>, <newspaper.source.Source object at 0x12e45f1d0>, <newspaper.source.Source object at 0x114aba438>, <newspaper.source.Source object at 0x12bf08f60>, <newspaper.source.Source object at 0x12e9c90f0>, <newspaper.source.Source object at 0x11bef9128>, <newspaper.source.Source object at 0x1185a06d8>, <newspaper.source.Source object at 0x1111e56d8>, <newspaper.source.Source object at 0x12e45f390>, <newspaper.source.Source object at 0x11690b908>, <newspaper.source.Source object at 0x10bf3e358>, <newspaper.source.Source object at 0x1223e0f28>, <newspaper.source.Source object at 0x11721beb8>, <newspaper.source.Source object at 0x114deb588>, <newspaper.source.Source object at 0x11c6e69e8>, <newspaper.source.Source object at 0x1212c5c50>, <newspaper.source.Source object at 0x115c17eb8>, <newspaper.source.Source object at 0x11734a208>, <newspaper.source.Source objec

{'ABCnews': 20,
 'Americannews': 11,
 'Americanoverlook': 16,
 'Bighairynews': 59,
 'Bloomberg': 0,
 'Cap-news': 0,
 'Clashdaily': 132,
 'Conservativedailypost': 62,
 'americanfreepress': 177,
 'aurora-news': 26,
 'beforeitsnews': 283,
 'bipartisanreport': 42,
 'bizstandardnews': 59,
 'ddsnewstrend': 47,
 'thebostontribune': 0,
 'wordpress': 212}