## Download articles 

In [34]:
import feedparser as fp
import json 
import newspaper
from newspaper import Article 
from time import mktime
from datetime import datetime


In [35]:
#set the limit from number of articles to download 
LIMIT=10000

data={}
data['newspapers']={}

In [36]:
#links of different sites to download articles from them 
dic={
  "cnn": {
    "link": "http://edition.cnn.com/"
  },
  "bbc": {
    "rss": "http://feeds.bbci.co.uk/news/rss.xml",
    "link": "http://www.bbc.com/"
  },
  "breitbart": {
    "link": "http://www.breitbart.com/"
  },
  "infowars": {
    "link": "https://www.infowars.com/"
  },
  "washingtonpost": {
    "rss": "http://feeds.washingtonpost.com/rss/world",
    "link": "https://www.washingtonpost.com/"
  } 
}

In [37]:
json_object=json.dumps(dic,indent=4)

In [38]:
with open("NewsPapers.json", "w") as outfile: 
    outfile.write(json_object)

In [39]:
# Loads the JSON files with news sites
with open('NewsPapers.json') as data_file:
    companies = json.load(data_file)

In [40]:
companies

{'cnn': {'link': 'http://edition.cnn.com/'},
 'bbc': {'rss': 'http://feeds.bbci.co.uk/news/rss.xml',
  'link': 'http://www.bbc.com/'},
 'breitbart': {'link': 'http://www.breitbart.com/'},
 'infowars': {'link': 'https://www.infowars.com/'},
 'washingtonpost': {'rss': 'http://feeds.washingtonpost.com/rss/world',
  'link': 'https://www.washingtonpost.com/'}}

In [41]:
count=1
#iterate through each news company 
for company, value in companies.items():
    # If a RSS link is provided in the JSON file, this will be the first choice.
    # Reason for this is that, RSS feeds often give more consistent and correct data.
    # If you do not want to scrape from the RSS-feed, just leave the RSS attr empty in the JSON file.
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            "link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            # Check if publish date is provided, if no the article is skipped.
            # This is done to keep consistency in the data and to keep the script from crashing.
            # we will use hasattr() to know if entry has published attribute
            if hasattr(entry, 'published'):
                if count > LIMIT:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
                article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
                try:
                    content = Article(entry.link,language="en")
                    content.download()
                    content.parse()
                except Exception as e:
                    # If the download for some reason fails (ex. 404) the script will continue downloading
                    # the next article.
                    print(e)
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                newsPaper['articles'].append(article)
                print(count, "articles downloaded from", company, ", url: ", entry.link)
                count = count + 1
    else:
        # This is the fallback method if a RSS-feed link is not provided.
        # It uses the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False,language="en")
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        noneTypeCount = 0
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Again, for consistency, if there is no found publish date the article will be skipped.
            # After 100 downloaded articles from the same newspaper without publish date, the company will be skipped.
            if content.publish_date is None:
                #print(count, " Article has date of type None...")
                noneTypeCount = noneTypeCount + 1
                if noneTypeCount > 100:
                    print("Too many noneType dates, aborting...")
                    noneTypeCount = 0
                    break
                count = count + 1
                continue
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            article['published'] = content.publish_date.isoformat()
            newsPaper['articles'].append(article)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
            noneTypeCount = 0
    count = 1
    data['newspapers'][company] = newsPaper

# Finally it saves the articles as a JSON-file.
try:
    with open('scraped_articles.json', 'w') as outfile:
        json.dump(data, outfile)
except Exception as e:
    print(e)

Building site for  cnn
3 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/2020/08/19/uk/police-officer-handcuffs-scli-gbr-intl/index.html
4 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/2020/08/21/business/brexit-trade-negotiations/index.html
5 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/2019/11/21/us/2019-in-review-fast-facts/index.html
6 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/2020/08/12/business/ben-jerrys-uk-refugees/index.html
7 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/2020/08/23/tech/algorithms-bias-inequality-intl-gbr/index.html
8 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/2020/08/21/entertainment/taylor-swift-fan-donation-scli-intl-gbr/index.html
9 articles downloaded from cnn  using newspaper, url:  http://edition.cnn.com/2018/11/19/us/2018-in-review-fast-facts/index.html
10 articles dow

In [23]:
with open('scraped_articles.json') as json_data:
    d = json.load(json_data)

In [24]:
for i, site in enumerate((list(d['newspapers']))):
    print(i, site)

0 cnn
1 bbc
2 breitbart
3 infowars
4 washingtonpost


In [25]:
import pandas as pd
for i, site in enumerate((list(d['newspapers']))):
    articles = list(d['newspapers'][site]['articles'])
    if i == 0:
        df = pd.DataFrame.from_dict(articles)
        df["site"] = site
    else:
        new_df = pd.DataFrame.from_dict(articles)
        new_df["site"] = site
        df = pd.concat([df, new_df], ignore_index = True)

In [26]:
df.shape

(1606, 5)

In [27]:
df

Unnamed: 0,link,published,text,title,site
0,http://edition.cnn.com/2020/08/19/uk/police-of...,2020-08-19T00:00:00,London (CNN) A British police officer needed t...,Police officer freed by firefighters after get...,cnn
1,http://edition.cnn.com/2020/08/21/business/bre...,2020-08-21T00:00:00,London (CNN Business) The latest round of trad...,UK-EU trade talks are going backward and time ...,cnn
2,http://edition.cnn.com/2019/11/21/us/2019-in-r...,2019-11-21T00:00:00,(CNN) Here is a look back at the events of 201...,2019 In Review Fast Facts,cnn
3,http://edition.cnn.com/2020/08/12/business/ben...,2020-08-12T00:00:00,"In a series of tweets Tuesday, the politically...",Ben & Jerry's takes on UK government's handlin...,cnn
4,http://edition.cnn.com/2020/08/21/entertainmen...,2020-08-21T00:00:00,London (CNN) Musician Taylor Swift has donated...,Taylor Swift donates to London student Vitoria...,cnn
5,http://edition.cnn.com/2018/11/19/us/2018-in-r...,2018-11-19T00:00:00,(CNN) Here is a look back at the events of 201...,2018 In Review Fast Facts,cnn
6,http://edition.cnn.com/2020/08/20/europe/manch...,2020-08-20T00:00:00,"(CNN) Hashem Abedi, the brother of the Manches...","Hashem Abedi, brother of Ariana Grande concert...",cnn
7,http://edition.cnn.com/2020/08/20/football/liv...,2020-08-20T00:00:00,(CNN) Leeds United's long-awaited return to th...,Liverpool to face newly-promoted Leeds in firs...,cnn
8,http://edition.cnn.com/2020/08/19/asia/us-hong...,2020-08-19T00:00:00,Hong Kong (CNN) The United States government h...,US suspends Hong Kong extradition treaty over ...,cnn
9,http://edition.cnn.com/2020/08/19/politics/us-...,2020-08-19T00:00:00,(CNN) The Trump administration has informed th...,US tells UK it won't seek death penalty for IS...,cnn


In [30]:
#convert it to csv file 
df.to_csv(r'E:\Machine_learning_projects\fake_news_detection.csv', index = False, header=True)