# Scrape main news sources looking for speakers

In [None]:
import sys
from __future__ import unicode_literals, print_function
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import pickle
from datetime import datetime
from datetime import timedelta
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import newspaper

In [None]:
# Display the path of the Python interpreter that is executing this code
# (useful to confirm which environment's packages are being used).
sys.executable

In [None]:
# Important variables
# Absolute path of the YAML configuration file that is opened below.
# NOTE: this is a machine-specific path; adapt it when running elsewhere.
config_yaml_dir = "/Users/AndreCNF/OneDrive/TEDxULisboa/SpeakersScrap/configs/andreferreira_yaml_0.1.yaml"
# Moment at which this cell is executed. datetime.now() without arguments
# returns a naive datetime (no timezone information) in local time.
current_date = datetime.now()

### Fetch news articles

In [None]:
# Read yaml configuration file, with the requested news sources
with open(config_yaml_dir, 'r', encoding='utf-8') as stream:
    try:
        # safe_load only builds plain Python objects (dicts, lists, strings...).
        # The bare yaml.load(stream) call is deprecated since PyYAML 5.1 and
        # fails on PyYAML 6, where the Loader argument is mandatory.
        config_yaml = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
        # Re-raise: continuing would leave config_yaml undefined (or stale
        # from a previous run), which only fails later in a confusing way.
        raise

In [None]:
# Container for the scraping results; the 'newspapers' entry is filled in
# with one item per news source.
data = {'newspapers': {}}

In [None]:
def _as_naive_local(moment):
    """Return *moment* as a naive datetime expressed in local time.

    Naive datetimes are returned unchanged. Timezone-aware datetimes are
    converted to the local timezone and stripped of their tzinfo, so that
    they can be compared with the naive ``current_date`` without raising
    ``TypeError`` (naive and aware datetimes cannot be ordered).
    """
    if moment.tzinfo is None or moment.utcoffset() is None:
        return moment
    return moment.astimezone().replace(tzinfo=None)


# Articles published before this instant are considered too old.
# Computed once, since it does not change while scraping.
oldest_allowed = current_date - timedelta(days=1)

# NOTE(review): LIMIT (maximum number of articles handled per source) is not
# defined in the visible cells - confirm it is set before this loop runs.

# Iterate through each news company
for company, value in tqdm.tqdm(config_yaml['news_sources'].items()):
    print("Building site for ", company)

    # Counting the number of articles read from a news source company
    count = 1

    paper = newspaper.build(value['link'], memoize_articles=False)
    newsPaper = {
        "link": value['link'],
        "articles": [],
    }

    # Counting the articles without a readable publish date seen since the
    # last successfully stored article
    noneTypeCount = 0

    for content in paper.articles:
        if count > LIMIT:
            break
        try:
            content.download()
            content.parse()
        except Exception as e:
            # Best effort: a failing article is reported and skipped
            print(e)
            print("continuing...")
            continue

        # For consistency, an article without a publish date is skipped.
        # Once more than 10 such articles pile up without a stored article
        # in between, the rest of this company is skipped.
        if content.publish_date is None:
            print(count, " Article has date of type None...")
            noneTypeCount += 1
            if noneTypeCount > 10:
                print("Too many noneType dates, aborting...")
                break
            count += 1
            continue

        # Stop at the first article older than a day ago.
        # publish_date may carry a timezone, so it is normalised first.
        # NOTE(review): `break` assumes paper.articles is ordered newest
        # first - confirm; otherwise newer articles further down are missed.
        if _as_naive_local(content.publish_date) < oldest_allowed:
            break

        article = {}
        article['title'] = content.title
        article['text'] = content.text
        article['link'] = content.url
        # The original date (including any UTC offset) is what gets stored
        article['published'] = content.publish_date.isoformat()
        # nlp() must run before keywords and summary are read
        content.nlp()
        article['keywords'] = content.keywords
        article['summary'] = content.summary
        newsPaper['articles'].append(article)
        print(count, "articles downloaded from", company, "using newspaper, url:", content.url)
        count += 1
        noneTypeCount = 0

        # Registered inside the loop, i.e. only once at least one article was
        # stored: sources without any recent article are left out of `data`.
        data['newspapers'][company] = newsPaper
        
# Save the scraped articles to a YAML file. This is best effort: a failure is
# printed but does not interrupt the rest of the notebook.
try:
    # Make sure the output folder exists, so the scraped data is not lost
    # just because 'results/' has not been created yet.
    Path('results').mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: allow_unicode=True writes non-ASCII characters as they
    # are, which fails if the platform default encoding cannot represent them.
    with open('results/scraped_articles.yaml', 'w', encoding='utf-8') as outfile:
        yaml.dump(data, outfile, allow_unicode=True)
except Exception as e:
    print(e)

### Add to Google Spreadsheet