## Code reference:
* News scraping
    - http://theautomatic.net/2020/08/05/how-to-scrape-news-articles-with-python/
    - https://newspaper.readthedocs.io/en/latest/
    - https://github.com/miguelfzafra/Latest-News-Classifier
* VADER code reference: 
    - https://towardsdatascience.com/sentimental-analysis-using-vader-a3415fef7664

In [5]:
# Import modules
import newspaper
from newspaper import Article
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

from __future__ import (absolute_import, division, print_function, unicode_literals)
import backtrader as bt
import backtrader.indicators as btind
import datetime
import os.path
import sys

import time
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Module-level VADER analyzer, reused by the scoring cell further down.
# NOTE(review): presumably requires the NLTK "vader_lexicon" resource to be
# downloaded beforehand (nltk.download) — confirm on a fresh environment.
sid = SentimentIntensityAnalyzer()

# News scraping

In [6]:
# Per-field accumulators that the scraping loop fills, one entry per article
titles, authors, dates, texts = [], [], [], []

In [7]:
# Build a newspaper Source for the CNN front page; the bare `source`
# expression on the last line makes its repr the cell output.
source_url = "https://www.cnn.com"
source = newspaper.build(source_url)
source

<newspaper.source.Source at 0x1423010e460>

In [8]:
# Take the first article listed on the source and fetch its HTML
article = source.articles[0]
article.download()

In [9]:
# Parse the downloaded HTML; the last expression shows the article's URL
article.parse()
article.url

'https://cnnespanol.cnn.com/video/cheque-de-estimulo-cuando-lo-recibire-quienes-recibiran-primero-pagos-stimulus-lklv-cnn-dinero/'

In [10]:
# Download and parse every article listed on `source`, one entry per article.
# The accumulators are rebuilt here (instead of relying on lists created in an
# earlier cell) so that re-running this cell cannot append the same articles a
# second time and duplicate rows in `df`.
titles, authors, dates, texts = [], [], [], []
for article in source.articles:
    # Fetch the HTML, then extract title / authors / publish date / body text
    article.download()
    article.parse()
    titles.append(article.title)
    authors.append(article.authors)
    dates.append(article.publish_date)
    texts.append(article.text)
    # Pause between requests so the site is not hit in a tight loop
    time.sleep(3)
df = pd.DataFrame({
    'Title': titles,
    'Authors': authors,
    'dates': dates,
    'Text': texts,
})
df

Unnamed: 0,Title,Authors,dates,Text
0,Nuevos cheques de estímulo empezarán a entrega...,[],2021-03-12,Los estadounidenses podrían recibir sus nuevos...
1,Italia tendrá viajes en tren «libres de covid»...,[],2021-03-12,Trenes libres de coronavirus es el nuevo conce...


In [26]:
# Request the Bing News front page.  The rest of this section looks for
# `<a class="title">` anchors and the recorded links carry `ocid=BingNews`,
# so the page being scraped is Bing News rather than cnn.com.
source_url = "https://www.bing.com/news"
# A timeout keeps the cell from hanging forever on a stalled connection
req = requests.get(source_url, timeout=10)
req.status_code

200

In [27]:
# Raw body of the HTTP response obtained above
page = req.content
# Parse the page with BeautifulSoup, using the 'html5lib' parser
soup = BeautifulSoup(page, 'html5lib')

In [28]:
# Headline links: all <a> elements carrying the CSS class "title"
news = soup.find_all('a', class_='title')
# How many of those headlines to follow
num_articles = 10

# Fresh, empty containers for the scraped results
articles = []
titles, authors, dates, texts = [], [], [], []

In [15]:
# Walk the first `num_articles` headline anchors and keep the last non-empty
# link.  Slicing (instead of indexing 0..num_articles-1) avoids an IndexError
# when the page yields fewer anchors than requested, and `.get` tolerates an
# anchor that has no href attribute at all.
article_list = []
for anchor in news[:num_articles]:
    href = anchor.get('href')
    if href:
        link = href
print(link)


https://www.msn.com/en-us/news/us/probe-faults-mayor-officials-for-keeping-prude-death-secret/ar-BB1ewLKk?ocid=BingNews


In [16]:
# Build one Article per headline link.  Wrapping only the last `link` would
# leave `article_list` with a single entry, while the download loop and the
# resulting DataFrame are meant to cover every selected headline.  Rebuilding
# the list (rather than appending) also keeps this cell safe to re-run.
article_list = [
    Article(anchor['href'])
    for anchor in news[:num_articles]
    if anchor.get('href')
]
# Keep `article` bound to the last link's Article, which the next cells use
article = article_list[-1]

In [19]:
# Fetch the HTML of the currently selected article
article.download()

In [22]:
# Parse the downloaded HTML; the last expression evaluates the extracted publish date
article.parse()
article.publish_date

In [21]:


# Download and parse every queued article.  The accumulators are reset first so
# that re-running this cell cannot append the same articles twice (which would
# duplicate rows in the DataFrame built from these lists).
titles, authors, dates, texts = [], [], [], []
for article in article_list:
    # Short pause before each request
    time.sleep(2)
    article.download()
    # Parse the HTML and append the article's fields to the lists
    article.parse()
    titles.append(article.title)
    authors.append(article.authors)
    dates.append(article.publish_date)
    texts.append(article.text)

In [22]:
# Assemble the scraped fields into a single table, one row per article
df = pd.DataFrame(dict(Title=titles, Authors=authors, dates=dates, Text=texts))

In [23]:
# Display the assembled articles table
df

Unnamed: 0,Title,Authors,dates,Text
0,Chuck Schumer joins congressional Democrats' c...,[],,"© Chip Somodevilla/Getty Images WASHINGTON, DC..."
1,U.S. grants Myanmar nationals relief from depo...,[],,By Simon Lewis\n\nWASHINGTON (Reuters) - The B...
2,Harry and Meghan's seismic interview will be f...,[],,© Harpo Productions/Joe Pugliese/Getty Images ...
3,Georgetown law professor terminated after rema...,[],,A Georgetown Law School professor has been ter...
4,"After Biden stimulus, US economic growth could...",[],,© Chris Kleponis/CNP/Bloomberg/Getty Images U....
5,The stimulus bill's target: Working and middle...,[],,© Getty Images/iStockphoto Stimulus Check in t...
6,"Probe faults mayor, officials for keeping Prud...",[],,NEW YORK (AP) — An investigation into the offi...
7,Farmers of color receive billions in latest st...,[],,© ANGELA WEISS US-RACISM-MINORITIES-FARMS\n\nT...


# VADER analysis

## Score the articles with VADER

In [24]:
# Run the VADER analyzer on each article body; every result is a dict of scores
df['scores'] = df['Text'].map(sid.polarity_scores)
# Pull the 'compound' entry out of each score dict
df['compound'] = [scores['compound'] for scores in df['scores']]
# Label each article: a compound of 0 or more is 'pos', anything below is 'neg'
# NOTE(review): VADER is commonly used with a neutral band around 0 — confirm
# that labelling exactly-zero scores as 'pos' is intended.
df['comp_score'] = ['pos' if value >= 0 else 'neg' for value in df['compound']]

df.head()

Unnamed: 0,Title,Authors,dates,Text,scores,compound,comp_score
0,Chuck Schumer joins congressional Democrats' c...,[],,"© Chip Somodevilla/Getty Images WASHINGTON, DC...","{'neg': 0.114, 'neu': 0.803, 'pos': 0.083, 'co...",-0.9939,neg
1,U.S. grants Myanmar nationals relief from depo...,[],,By Simon Lewis\n\nWASHINGTON (Reuters) - The B...,"{'neg': 0.084, 'neu': 0.798, 'pos': 0.118, 'co...",0.8488,pos
2,Harry and Meghan's seismic interview will be f...,[],,© Harpo Productions/Joe Pugliese/Getty Images ...,"{'neg': 0.063, 'neu': 0.869, 'pos': 0.068, 'co...",0.8333,pos
3,Georgetown law professor terminated after rema...,[],,A Georgetown Law School professor has been ter...,"{'neg': 0.105, 'neu': 0.836, 'pos': 0.06, 'com...",-0.98,neg
4,"After Biden stimulus, US economic growth could...",[],,© Chris Kleponis/CNP/Bloomberg/Getty Images U....,"{'neg': 0.04, 'neu': 0.839, 'pos': 0.121, 'com...",0.9976,pos


# Use the model

def _scrape_articles(articles):
    """Download and parse each article and collect its fields into a DataFrame.

    Returns a DataFrame with the columns 'Title', 'Authors', 'dates' and
    'Text', one row per article, in the order the articles were given.
    Errors raised by ``download()`` / ``parse()`` are not caught here.
    """
    titles, authors, dates, texts = [], [], [], []
    for article in articles:
        # Fetch the HTML, then extract the fields from it
        article.download()
        article.parse()
        titles.append(article.title)
        authors.append(article.authors)
        dates.append(article.publish_date)
        texts.append(article.text)
    return pd.DataFrame({
        'Title': titles,
        'Authors': authors,
        'dates': dates,
        'Text': texts,
    })


def news_scraping(source, num_articles=10):
    """Scrape articles from a named news source into a DataFrame.

    Parameters
    ----------
    source : str
        "Bing news" to follow the headline links on the Bing News front page,
        or "CNN" to scrape every article that newspaper finds on cnn.com.
    num_articles : int, optional
        Maximum number of headline anchors to consider for "Bing news"
        (default 10).  Not used for "CNN".

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns 'Title', 'Authors', 'dates' and 'Text' for a
        recognized source.  For any other value a message is printed and
        nothing is returned.
    """
    if source == "Bing news":
        # Request the front page; the timeout prevents hanging on a stalled connection
        source_url = "https://www.bing.com/news"
        req = requests.get(source_url, timeout=10)

        # Parse the page and pick out the headline anchors
        soup = BeautifulSoup(req.content, 'html5lib')
        news = soup.find_all('a', class_='title')

        # Slice instead of indexing 0..num_articles-1 so a page with fewer
        # anchors than requested does not raise IndexError; anchors with a
        # missing or empty href are skipped.
        article_list = [
            Article(anchor['href'])
            for anchor in news[:num_articles]
            if anchor.get('href')
        ]

        df = _scrape_articles(article_list)
        return df
    elif source == "CNN":
        # Use a separate name so the `source` argument is not overwritten
        paper = newspaper.build("http://cnn.com")
        df = _scrape_articles(paper.articles)
        return df
    else:
        print("Unrecognized news source")