<a href="https://colab.research.google.com/github/Abkhenaten/Web_Scraping/blob/main/Web_Scraping_Fake_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The scraper is based on the code from https://holwech.github.io/blog/Automatic-news-scraper/, with some modifications so that it scrapes more articles.

In [None]:
!pip install feedparser
!pip install newspaper3k

import feedparser as fp
import json
import newspaper
import os
import pandas as pd
from newspaper import Article
from time import mktime
from datetime import datetime



In [None]:
from google.colab import drive

# Make Google Drive reachable from inside the Colab VM.
DRIVE_MOUNT = '/content/gdrive'
drive.mount(DRIVE_MOUNT)

# Ensure the project output folder exists on the mounted drive.
CIS545_FOLDER = os.path.join(DRIVE_MOUNT, 'My Drive', 'CIS545_2020')
HOMEWORK_FOLDER = os.path.join(CIS545_FOLDER, 'Project')
os.makedirs(HOMEWORK_FOLDER, exist_ok=True)

Mounted at /content/gdrive


In [None]:
# News sites to scrape, keyed by a short site name.
# Every entry has a "link" (the site's front page); entries that also carry an
# "rss" URL are scraped through that feed, the rest through newspaper.build().
dictionary = {
    "cnn": {"link": "http://edition.cnn.com/"},
    "bbc": {
        "rss": "http://feeds.bbci.co.uk/news/rss.xml",
        "link": "http://www.bbc.com/",
    },
    "theguardian": {
        "rss": "https://www.theguardian.com/uk/rss",
        "link": "https://www.theguardian.com/international",
    },
    "breitbart": {"link": "http://www.breitbart.com/"},
    "infowars": {"link": "https://www.infowars.com/"},
    "foxnews": {"link": "http://www.foxnews.com/"},
    "nbcnews": {"link": "http://www.nbcnews.com/"},
    "washingtonpost": {
        "rss": "http://feeds.washingtonpost.com/rss/world",
        "link": "https://www.washingtonpost.com/",
    },
    "theonion": {"link": "http://www.theonion.com/"},
}

In [None]:
# Serialise the site configuration as pretty-printed JSON text.
json_object = json.dumps(dictionary, indent=4)

In [None]:
# Persist the configuration text to NewsPapers.json in the working directory.
with open('NewsPapers.json', 'w') as config_file:
    config_file.write(json_object)

In [None]:
# Maximum number of articles to keep per news site.
LIMIT = 14500
# Abandon a site after this many consecutive articles without a publish date.
MAX_UNDATED_STREAK = 100


def _scrape_rss(company, rss_url, limit):
    """Download the articles listed in a site's RSS feed.

    Entries without a parsed publish date are skipped, to keep the data
    consistent and to keep the script from crashing on them.

    Args:
        company: Site name, used only in progress messages.
        rss_url: URL of the RSS feed to read.
        limit: Maximum number of articles to return.

    Returns:
        A list of dicts with the keys 'link', 'published', 'title', 'text'.
    """
    articles = []
    for entry in fp.parse(rss_url).entries:
        if len(articles) >= limit:
            break
        # 'published' may be present while 'published_parsed' is missing/None
        # (unparseable date), so test the value that is actually used.
        published = getattr(entry, 'published_parsed', None)
        if published is None:
            continue
        try:
            # feedparser reports dates as UTC struct_time; build the datetime
            # from its fields directly so the result does not depend on the
            # local timezone/DST of the machine running the notebook.
            published_iso = datetime(*published[:6]).isoformat()
            content = Article(entry.link)
            content.download()
            content.parse()
        except Exception as e:
            # If the download for some reason fails (ex. 404) the script will
            # continue with the next article.
            print(e)
            print("continuing...")
            continue
        articles.append({
            'link': entry.link,
            'published': published_iso,
            'title': content.title,
            'text': content.text,
        })
        print(len(articles), "articles downloaded from", company, ", url: ", entry.link)
    return articles


def _scrape_site(company, site_url, limit, max_undated_streak):
    """Download articles discovered by newspaper.build() on a site's front page.

    Articles without a publish date are skipped and do not count toward
    `limit`; after more than `max_undated_streak` such articles in a row the
    site is abandoned.

    Args:
        company: Site name, used only in progress messages.
        site_url: Front-page URL handed to newspaper.build().
        limit: Maximum number of articles to return.
        max_undated_streak: Consecutive undated articles tolerated.

    Returns:
        A list of dicts with the keys 'title', 'text', 'link', 'published'.
    """
    articles = []
    undated_streak = 0
    paper = newspaper.build(site_url, memoize_articles=False)
    for content in paper.articles:
        if len(articles) >= limit:
            break
        try:
            content.download()
            content.parse()
        except Exception as e:
            print(e)
            print("continuing...")
            continue
        if content.publish_date is None:
            undated_streak += 1
            print(undated_streak, " Article has date of type None...")
            if undated_streak > max_undated_streak:
                print("Too many noneType dates, aborting...")
                break
            continue
        undated_streak = 0
        articles.append({
            'title': content.title,
            'text': content.text,
            'link': content.url,
            'published': content.publish_date.isoformat(),
        })
        print(len(articles), "articles downloaded from", company, " using newspaper, url: ", content.url)
    return articles


data = {}
data['newspapers'] = {}

# Loads the JSON file with news sites
with open('NewsPapers.json') as data_file:
    companies = json.load(data_file)

# Iterate through each news company
for company, value in companies.items():
    # If a RSS link is provided in the JSON file, this will be the first choice.
    # Reason for this is that RSS feeds often give more consistent and correct data.
    # If you do not want to scrape from the RSS feed, leave the RSS attr empty
    # (or omit it) in the JSON file.
    rss_url = value.get('rss')
    if rss_url:
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": rss_url,
            "link": value['link'],
            "articles": _scrape_rss(company, rss_url, LIMIT)
        }
    else:
        # Fallback when no RSS feed link is provided: let the newspaper
        # library discover the articles.
        print("Building site for ", company)
        newsPaper = {
            "link": value['link'],
            "articles": _scrape_site(company, value['link'], LIMIT, MAX_UNDATED_STREAK)
        }
    data['newspapers'][company] = newsPaper

# Saves the articles as a JSON-file. A failure here is not swallowed on
# purpose: the following cells read this file back, and `data` is still in
# memory for a retry.
with open('scraped_articles.json', 'w') as outfile:
    json.dump(data, outfile)

Building site for  cnn
1  Article has date of type None...
2  Article has date of type None...
3  Article has date of type None...
4  Article has date of type None...
5  Article has date of type None...
6  Article has date of type None...
7  Article has date of type None...
8  Article has date of type None...
9  Article has date of type None...
10  Article has date of type None...
11  Article has date of type None...
12  Article has date of type None...
13  Article has date of type None...
14  Article has date of type None...
15  Article has date of type None...
16  Article has date of type None...
17  Article has date of type None...
18  Article has date of type None...
19  Article has date of type None...
20  Article has date of type None...
21  Article has date of type None...
22  Article has date of type None...
23  Article has date of type None...
24  Article has date of type None...
25  Article has date of type None...
26  Article has date of type None...
27  Article has date of 

In [None]:
# Read the scraped articles back from disk into `d`.
with open('scraped_articles.json') as scraped_file:
    d = json.load(scraped_file)

In [None]:
# Print each scraped site with its position in the file.
for i, site in enumerate(d['newspapers']):
    print(i, site)

0 cnn
1 bbc
2 theguardian
3 breitbart
4 infowars
5 foxnews
6 nbcnews
7 washingtonpost
8 theonion


In [None]:
# Flatten every site's articles into one row per article, tagging each row
# with the site it came from. The frame is built once from the full list of
# rows instead of being re-copied by pd.concat on every loop iteration, and
# `df` is defined even when no site was scraped.
article_rows = [
    {**article, "site": site}
    for site, paper in d['newspapers'].items()
    for article in paper['articles']
]
df = pd.DataFrame(article_rows)


In [None]:
# (rows, columns) of the combined article table.
df.shape

(1543, 5)

In [None]:
# Show the combined article table as this cell's output.
df

Unnamed: 0,title,text,link,published,site
0,'WandaVision' caps off Marvel's version of a l...,"The following contains spoilers about the ""Wan...",http://edition.cnn.com/2021/03/05/entertainmen...,2021-03-05T00:00:00,cnn
1,'Raya and the Last Dragon' mixes a serious mes...,(CNN) The latest Disney animated adventure can...,http://edition.cnn.com/2021/03/05/entertainmen...,2021-03-05T00:00:00,cnn
2,Lamar Odom gets emotional watching 'Khloe & La...,(CNN) Lamar Odom let the world in on his remin...,http://edition.cnn.com/2021/03/05/entertainmen...,2021-03-05T00:00:00,cnn
3,Paris Hilton got an apology from Sarah Silverm...,(CNN) All is well now between Paris Hilton and...,http://edition.cnn.com/2021/03/05/entertainmen...,2021-03-05T00:00:00,cnn
4,Bruno Mars is having quite the week,(CNN) Bruno Mars really does have the Midas to...,http://edition.cnn.com/2021/03/05/entertainmen...,2021-03-05T00:00:00,cnn
...,...,...,...,...,...
1538,US gives hope to previously denied asylum seek...,"Late Friday night, an official with Mexico’s F...",https://www.washingtonpost.com/world/the_ameri...,2021-03-06T07:14:11,washingtonpost
1539,Pro-democracy protest in Thailand passes witho...,"There were several protest marches Saturday, b...",https://www.washingtonpost.com/world/asia_paci...,2021-03-06T16:06:41,washingtonpost
1540,Egypt’s president el-Sissi visits Sudan amid r...,“This visit comes within the framework of clos...,https://www.washingtonpost.com/world/middle_ea...,2021-03-06T10:53:13,washingtonpost
1541,Pakistan’s PM wins vote of confidence after Se...,Khan needed 172 votes to show a simple majorit...,https://www.washingtonpost.com/world/asia_paci...,2021-03-06T10:12:18,washingtonpost


In [None]:

!cp scraped_articles.json '/content/drive/My Drive/CIS545_2020/Project/'

cp: cannot create regular file '/content/drive/My Drive/CIS545_2020/Project/': No such file or directory
