In [31]:
#import libraries
import os
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
from newsapi.newsapi_client import NewsApiClient
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [10]:
# Set News API key from the environment (e.g. a local .env file) instead of
# hardcoding it: a key committed with the notebook is readable by anyone who
# can see the file. Any key that was previously committed should be rotated.
load_dotenv()
news_api_key = os.getenv("NEWS_API_KEY")
if not news_api_key:
    # Fail early with a clear message rather than sending unauthenticated requests.
    raise RuntimeError("NEWS_API_KEY is not set; add it to your environment or .env file")
newsapi = NewsApiClient(api_key=news_api_key)

In [11]:
# Bounds of the news query window as ISO-format strings in New York time:
# the current date first, then the date from one month ago.
current_date, past_date = (
    pd.Timestamp(day, tz="America/New_York").isoformat()
    for day in ("2021-01-11", "2020-12-10")
)


In [17]:
# Inspect the ISO-formatted string built for the current date
print(current_date)


2021-01-11T00:00:00-05:00


In [22]:
# Check the type of current_date: isoformat() yields a plain str, not a Timestamp
type(current_date)

str

In [25]:
def get_headlines(keyword, start=None, end=None, client=None):
    """Collect one page of article titles per day for a keyword, newest day first.

    Walks backwards one day at a time from ``start`` down to, but not
    including, ``end`` and calls ``get_everything`` once per day with that
    day as both ``from_param`` and ``to``.

    Parameters
    ----------
    keyword : str
        Search term passed to the API as ``q``.
    start : str or date-like, optional
        Most recent day to fetch. A string must begin with ``YYYY-MM-DD``;
        anything after the first 10 characters is ignored. Defaults to the
        notebook-level ``current_date``.
    end : str or date-like, optional
        Exclusive lower bound of the walk, in the same formats as ``start``.
        Defaults to the notebook-level ``past_date``.
    client : object, optional
        Object exposing ``get_everything(...)``. Defaults to the
        notebook-level ``newsapi`` client.

    Returns
    -------
    tuple
        ``(all_headlines, all_dates)``: parallel lists holding, for each day
        visited, the list of titles returned for that day (passed through
        unmodified) and the day itself as a naive midnight ``datetime``.
    """

    def _to_day(value):
        # Normalise an ISO string or a date-like object to a naive midnight datetime.
        if isinstance(value, str):
            return datetime.strptime(value[:10], "%Y-%m-%d")
        return datetime(value.year, value.month, value.day)

    # Defaults are resolved at call time, so the notebook globals are only
    # needed when the caller does not pass explicit values.
    date = _to_day(current_date if start is None else start)
    end_date = _to_day(past_date if end is None else end)
    api = newsapi if client is None else client

    all_headlines = []
    all_dates = []
    print(f"Fetching news about '{keyword}'")
    print("*" * 30)
    # Strict comparison: the day equal to end_date is not fetched.
    while date > end_date:
        print(f"retrieving news from: {date}")
        articles = api.get_everything(
            q=keyword,
            from_param=date,
            to=date,
            language="en",
            sort_by="relevancy",
            page=1,
        )
        all_headlines.append([article["title"] for article in articles["articles"]])
        all_dates.append(date)
        date = date - timedelta(days=1)
    return all_headlines, all_dates

In [26]:
# Fetch the per-day headlines for the "election" keyword, plus the matching dates
election_headlines, dates = get_headlines("election")

Fetching news about 'election'
******************************
retrieving news from: 2021-01-11 00:00:00
retrieving news from: 2021-01-10 00:00:00
retrieving news from: 2021-01-09 00:00:00
retrieving news from: 2021-01-08 00:00:00
retrieving news from: 2021-01-07 00:00:00
retrieving news from: 2021-01-06 00:00:00
retrieving news from: 2021-01-05 00:00:00
retrieving news from: 2021-01-04 00:00:00
retrieving news from: 2021-01-03 00:00:00
retrieving news from: 2021-01-02 00:00:00
retrieving news from: 2021-01-01 00:00:00
retrieving news from: 2020-12-31 00:00:00
retrieving news from: 2020-12-30 00:00:00
retrieving news from: 2020-12-29 00:00:00
retrieving news from: 2020-12-28 00:00:00
retrieving news from: 2020-12-27 00:00:00
retrieving news from: 2020-12-26 00:00:00
retrieving news from: 2020-12-25 00:00:00
retrieving news from: 2020-12-24 00:00:00
retrieving news from: 2020-12-23 00:00:00
retrieving news from: 2020-12-22 00:00:00
retrieving news from: 2020-12-21 00:00:00
retrieving new

In [27]:
# Notebook-level sentiment analyzer instance, read as `sid` by headline_sentiment_summarizer_avg
sid = SentimentIntensityAnalyzer()

# Create function that computes average compound sentiment of headlines for each day
def headline_sentiment_summarizer_avg(headlines, analyzer=None):
    """Average the compound sentiment score of each day's headlines.

    Parameters
    ----------
    headlines : iterable of iterables
        One inner collection of headline strings per day. ``None`` entries
        are skipped.
    analyzer : object, optional
        Object exposing ``polarity_scores(text)`` that returns a mapping with
        a ``"compound"`` key. Defaults to the notebook-level ``sid``.

    Returns
    -------
    list of float
        One mean compound score per input day, in input order. A day with no
        usable headlines yields ``nan`` so the result stays aligned with the
        input days.
    """
    scorer = sid if analyzer is None else analyzer
    sentiment = []
    for day in headlines:
        day_score = [
            scorer.polarity_scores(h)["compound"] for h in day if h is not None
        ]
        if day_score:
            sentiment.append(sum(day_score) / len(day_score))
        else:
            # No scores to average: avoid ZeroDivisionError and mark the day as missing.
            sentiment.append(float("nan"))
    return sentiment

In [28]:
# Compute the per-day average compound sentiment of the election headlines
election_avg = headline_sentiment_summarizer_avg(election_headlines)


In [66]:
# Put the daily election sentiment averages into a one-column DataFrame
# whose index is the corresponding article dates.
election_sentiments = pd.DataFrame(
    {"election_avg": election_avg},
    index=pd.to_datetime(dates),
)

display(election_sentiments)

Unnamed: 0,election_avg
2021-01-11,0.021515
2021-01-10,-0.04739
2021-01-09,-0.18235
2021-01-08,-0.153045
2021-01-07,-0.238405
2021-01-06,-0.05462
2021-01-05,-0.004395
2021-01-04,-0.06446
2021-01-03,-0.173055
2021-01-02,-0.141885


In [64]:
# Load the vote/protest sentiment averages from CSV. parse_dates=True converts
# the date index to a DatetimeIndex, so it has the same dtype as the index of
# election_sentiments and the join between the two frames matches on datetime
# values instead of relying on implicit string-to-datetime coercion.
data = Path('protest_vote_sentiment.csv')
protest_vote_df = pd.read_csv(data, index_col=0, parse_dates=True)
protest_vote_df.head()

Unnamed: 0,vote_avg,protest_avg
2021-01-11,-0.042335,-0.03945
2021-01-10,-0.061795,-0.258725
2021-01-09,-0.178465,-0.14398
2021-01-08,-0.147315,-0.36207
2021-01-07,-0.33845,-0.361915


In [71]:
# Compare the dimensions of the two frames before joining them
protest_vote_df.shape, election_sentiments.shape

((32, 2), (31, 1))

In [72]:
# Inner join on the index: keep only the rows whose index label appears in both frames
news_sentiment = protest_vote_df.join(election_sentiments, how='inner')

In [76]:
# Display the combined vote / protest / election sentiment table
news_sentiment

Unnamed: 0,vote_avg,protest_avg,election_avg
2021-01-11,-0.042335,-0.03945,0.021515
2021-01-10,-0.061795,-0.258725,-0.04739
2021-01-09,-0.178465,-0.14398,-0.18235
2021-01-08,-0.147315,-0.36207,-0.153045
2021-01-07,-0.33845,-0.361915,-0.238405
2021-01-06,0.058345,-0.27163,-0.05462
2021-01-05,-0.09968,-0.30757,-0.004395
2021-01-04,0.07661,-0.164505,-0.06446
2021-01-03,0.04914,-0.46155,-0.173055
2021-01-02,-0.15559,-0.20167,-0.141885


In [None]:
#put it in a dataframe

In [None]:
#segment/cut dataframe by date of article.  Drop unnecessary data columns 

In [None]:
#combine or classify articles by date

In [None]:
#Data cleaning of news content as needed: lowercase, lemmatize

In [None]:
#TF-IDF word counts by date