## Code reference:
* News scraping
    - http://theautomatic.net/2020/08/05/how-to-scrape-news-articles-with-python/
    - https://newspaper.readthedocs.io/en/latest/
    - https://github.com/miguelfzafra/Latest-News-Classifier
* VADER code reference: 
    - https://towardsdatascience.com/sentimental-analysis-using-vader-a3415fef7664

In [1]:
# Import modules
# `from __future__` must be the first statement: the notebook tolerates it
# anywhere in a cell, but a script exported from this notebook would fail with
# a SyntaxError if it appeared after other imports.
from __future__ import (absolute_import, division, print_function, unicode_literals)

# Standard library
import datetime
import os.path
import sys

# News scraping
import newspaper
from newspaper import Article
import requests
from bs4 import BeautifulSoup

# Data handling
import numpy as np
import pandas as pd

# Backtesting
import backtrader as bt
import backtrader.indicators as btind

# Sentiment analysis: fetch the VADER lexicon before importing/building the analyzer
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared analyzer instance used by the VADER analysis cells below
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lemon\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# News scraping

In [2]:
# News source to scrape. news_scraping() recognizes exactly two values:
# "Bing news" and "CNN"; swap the commented line to switch sources.
#source = "Bing news"
source = "CNN"

In [5]:
def _articles_to_frame(articles):
    """Download and parse each article, returning one DataFrame row per article.

    Articles whose download/parse raises ``newspaper.ArticleException`` are
    skipped so that a single dead or blocked link does not abort the scrape.

    Parameters
    ----------
    articles : iterable
        Objects exposing ``download()``, ``parse()`` and the attributes
        ``title``, ``authors``, ``publish_date`` and ``text``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Authors``, ``dates`` and ``Text`` (empty frame
        with these columns when nothing could be parsed).
    """
    records = []
    for article in articles:
        try:
            article.download()
            article.parse()
        except newspaper.ArticleException:
            # Skip articles that cannot be fetched or parsed.
            continue
        records.append({
            'Title': article.title,
            'Authors': article.authors,
            'dates': article.publish_date,
            'Text': article.text,
        })
    # Passing `columns` keeps the schema stable even when `records` is empty.
    return pd.DataFrame(records, columns=['Title', 'Authors', 'dates', 'Text'])


def news_scraping(source, num_articles=10):
    """Scrape news articles from the named source into a DataFrame.

    Parameters
    ----------
    source : str
        ``"Bing news"`` reads the headline links from https://www.bing.com/news;
        ``"CNN"`` scrapes every article newspaper discovers on http://cnn.com.
    num_articles : int, optional
        Maximum number of Bing headline links to consider (default 10).
        Not used for CNN.

    Returns
    -------
    pandas.DataFrame or None
        One row per successfully parsed article with columns ``Title``,
        ``Authors``, ``dates`` and ``Text``. For an unrecognized source a
        message is printed and ``None`` is returned.
    """
    if source == "Bing news":
        # A timeout prevents the cell from hanging forever on a stalled connection.
        response = requests.get("https://www.bing.com/news", timeout=30)
        soup = BeautifulSoup(response.content, 'html5lib')

        # Headline anchors; the page may offer fewer than `num_articles`, and
        # some anchors may lack an href, so slice and filter instead of indexing.
        headline_links = soup.find_all('a', class_='title')
        bing_articles = [
            Article(tag['href'])
            for tag in headline_links[:num_articles]
            if tag.get('href')
        ]
        return _articles_to_frame(bing_articles)

    if source == "CNN":
        # Use a separate name so the `source` argument is not overwritten.
        paper = newspaper.build("http://cnn.com")
        return _articles_to_frame(paper.articles)

    print("Unrecognized news source")
    return None

In [7]:
# Inline CNN scrape (same steps as the "CNN" branch of news_scraping()):
# build the site with newspaper, then collect one entry per article.
# NOTE(review): `source` was the string "CNN" in an earlier cell and is rebound
# here to the object returned by newspaper.build() -- confirm that no later
# cell still expects the string.
source_url = "http://cnn.com"
source = newspaper.build(source_url)
titles, authors, dates, texts = [], [], [], []
for article in source.articles:
    # Fetch the page, then parse it so the fields below are populated
    article.download()
    article.parse()
    titles += [article.title]
    authors += [article.authors]
    dates += [article.publish_date]
    texts += [article.text]
# The four lists are parallel: position i in each describes the same article
df = pd.DataFrame({'Title': titles, 'Authors': authors, 'dates': dates, 'Text': texts})

In [8]:
# Bare last expression: renders the current contents of `df` as the cell output
df

Unnamed: 0,Title,Authors,dates,Text


# VADER analysis

## Score the articles (VADER is applied as-is; no training step is run here)

In [None]:
# Score every article body with the shared VADER analyzer `sid`.
# Missing bodies (NaN/None, e.g. after reloading the frame from a CSV) are
# scored as empty text rather than being handed to VADER as non-strings.
df['scores'] = df['Text'].fillna('').apply(sid.polarity_scores)

# Keep only the 'compound' entry of each score dict as a numeric column
df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])

# Binary label: any compound score >= 0 counts as 'pos', so an exactly-zero
# score is labelled 'pos' as well.
df['comp_score'] = df['compound'].apply(lambda c: 'pos' if c >= 0 else 'neg')

df.head()

# Use the model