# Scrape main news sources looking for speakers

In [None]:
from __future__ import unicode_literals, print_function

import pickle
import random
import sys
from datetime import datetime
from datetime import timedelta
from pathlib import Path

import gspread
import newspaper
import pendulum
import spacy
import tqdm
import yaml
from langdetect import detect
from oauth2client.service_account import ServiceAccountCredentials
from spacy.util import minibatch, compounding

In [None]:
# Show the path of the Python interpreter that is running this code
sys.executable

In [None]:
# List the contents of the saved model directory (IPython shell shortcut, not plain Python)
ls NLP/models/speakers_model_2018_09_23_02_50_37

In [None]:
# Important variables
# Path of the YAML configuration file; its 'news_sources' entry lists the sites to scrape.
# NOTE(review): absolute, user-specific path - has to be edited to run on another machine.
config_yaml_dir = "/Users/AndreCNF/OneDrive/TEDxULisboa/SpeakersScrap/configs/andreferreira_yaml_0.1.yaml"
# Current moment in the Europe/Lisbon timezone; used for the one-day age cut-off and
# to name the results file
current_date = pendulum.now('Europe/Lisbon')
# Directory of the custom spaCy model, passed to spacy.load
model_dir = 'NLP/models/speakers_model_2018_09_23_02_50_37'

### Load NLP model

In [None]:
# Load custom NLP model trained to find articles about good speakers.
# Its text-category output (`.cats`) is what the scraping loop uses to rate each article.
nlp = spacy.load(model_dir)

In [None]:
# Stock spaCy pipelines (tokenizer, tagger, parser, NER and word vectors):
# English first, then Portuguese - same loading order as before.
nlp_en, nlp_pt = [spacy.load(lang_code) for lang_code in ('en', 'pt')]

### Fetch news articles

In [None]:
# Read yaml configuration file, with the requested news sources.
# safe_load builds only plain Python objects (dicts, lists, strings, numbers), so the
# file cannot trigger arbitrary object construction, and it avoids the Loader-less
# yaml.load() call that newer PyYAML versions warn about or reject.
with open(config_yaml_dir, 'r') as stream:
    try:
        config_yaml = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parse problem, then stop: everything below needs config_yaml, and
        # continuing would only fail later with a less helpful NameError.
        print(exc)
        raise

In [None]:
# Maximum number of articles to go through for each news source
LIMIT = 30

# Container for the scraping results; news sources are stored under 'newspapers'
data = {'newspapers': {}}

In [None]:
# Iterate through each news company, keeping only recent articles written in
# Portuguese or English that mention at least one person and that the custom
# model rates as likely to feature a good speaker.

# Language code returned by langdetect -> spaCy pipeline used to find people
lang_models = {'pt': nlp_pt, 'en': nlp_en}

# Minimum score the custom model must give an article for it to be kept
MIN_NLP_SCORE = 0.7

# Cut-off moment: yesterday, at the same time (hours, minutes, seconds) as now
yesterday = current_date - timedelta(days=1)

for company, value in tqdm.tqdm(config_yaml['news_sources'].items()):
    print("Building site for ", company)

    # Number of articles counted against LIMIT for this source: kept articles plus
    # the ones rejected because of their publish date
    count = 1

    paper = newspaper.build(value['link'], memoize_articles=False)
    newsPaper = {
        "link": value['link'],
        "articles": []
    }

    # Number of articles without a readable publish date since the last kept article
    noneTypeCount = 0

    for content in paper.articles:
        if count > LIMIT:
            break

        try:
            content.download()
            content.parse()
        except Exception as e:
            # A single unreachable or unparsable article must not stop the whole run
            print(e)
            print("continuing...")
            continue

        # Ignore short texts
        if len(content.text) < 280:
            print("Skipping text of length " + str(len(content.text)))
            continue

        lang = detect(content.text)

        # Ignore texts written in a language that's not portuguese or english
        if lang not in lang_models:
            print("Ignoring text that is written in " + lang + " language.")
            continue

        # Get the list of people mentioned in the text, using the pipeline that
        # matches the text's language
        people_list = [
            entity.text
            for entity in lang_models[lang](content.text).ents
            if 'PER' in entity.label_
        ]

        if not people_list:
            print("Ignoring text as no mention to people was found.")
            continue

        # Articles without a publish date are skipped, since their age cannot be checked.
        # After more than 10 such articles in a row from the same newspaper, the
        # company is abandoned.
        if content.publish_date is None:
            print(count, " Article has date of type None...")
            noneTypeCount += 1
            if noneTypeCount > 10:
                print("Too many noneType dates, aborting...")
                break
            count += 1
            continue

        # Naive and timezone-aware datetimes cannot be compared with each other, so
        # when the article has no timezone our own timezone info is dropped too.
        # NOTE(review): this treats a naive publish date as Lisbon wall-clock time - confirm.
        if content.publish_date.tzinfo is None:
            cutoff = yesterday.replace(tzinfo=None)
        else:
            cutoff = yesterday

        # Ignore news articles older than a day ago. This check has to run for naive
        # and aware dates alike (it used to be skipped for naive ones).
        if content.publish_date < cutoff:
            print("Skipping article from " + str(content.publish_date))
            count += 1
            continue

        # Score given by the NLP model, indicating the probability that it thinks that the
        # article mentions a good speaker. `.cats` is a dict of category -> score, so it
        # has to be reduced to one number before it can be compared to the threshold.
        # NOTE(review): the highest category score is used, which is right if every
        # category of the model means "good speaker" (e.g. a single-label model);
        # confirm against the labels the model was trained with.
        cats = nlp(content.text).cats
        nlp_score = float(max(cats.values())) if cats else 0.0

        # Ignore news articles with a bad NLP score
        if nlp_score < MIN_NLP_SCORE:
            print("Ignoring article with an NLP score of " + str(nlp_score))
            continue

        article = {}
        article['title'] = content.title
        article['text'] = content.text
        article['link'] = content.url
        article['published'] = content.publish_date.isoformat()
        # newspaper fills in keywords and summary only after its own nlp() step has run
        content.nlp()
        article['keywords'] = content.keywords
        article['summary'] = content.summary

        # Add the names of people mentioned in the text
        article['people'] = people_list

        # Score computed above by the custom model
        article['nlp_score'] = nlp_score

        # Add article data to the news source's list
        newsPaper['articles'].append(article)
        print(count, "articles downloaded from", company, "using newspaper, previous article's date: " + 
              content.publish_date.isoformat() + ", url:", content.url)
        count += 1
        noneTypeCount = 0

        # Add the current news source's articles data to the whole news list.
        # Done after each kept article, so a source with no kept article is left out.
        data['newspapers'][company] = newsPaper
        
# Write everything collected in `data` to a YAML file whose name carries the run's date.
# Best effort: any failure while building the name or writing is printed, not raised.
try:
    date_stamp = str(current_date).replace(' ', '_').replace('-', '_')
    articles_dir = 'results/scraped_articles_' + date_stamp + '.yaml'

    with open(articles_dir, 'w') as outfile:
        yaml.dump(data, outfile, allow_unicode=True)
except Exception as e:
    print(e)

In [None]:
# Debug aid: publish date of the last article the scraping loop looked at
print(content.publish_date)

In [None]:
# Display the collected results
data

### Add to Google Spreadsheet