In [1]:
# Initial imports
import os
import pandas as pd
from datetime import datetime, timedelta
from pathlib import Path
from newsapi import NewsApiClient
from dotenv import load_dotenv
from nltk.corpus import stopwords, reuters
load_dotenv()
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/samuelarciniega/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
api_key = os.getenv("news_api")
#print(api_key)

In [3]:
newsapi = NewsApiClient(api_key=api_key)

In [4]:
tech_news = newsapi.get_everything(q = "Inflation", language="en", page_size=100, sort_by="relevancy")
print(f"Total articles about Inflation: {tech_news['totalResults']}")
tech_news["articles"][0]

Total articles about Inflation: 14917


{'source': {'id': None, 'name': 'New York Times'},
 'author': 'David Leonhardt',
 'title': 'The Specter of Inflation',
 'description': 'Is it haunting the U.S. economy?',
 'url': 'https://www.nytimes.com/2021/06/17/briefing/inflation-us-economy-covid-pandemic.html',
 'urlToImage': 'https://static01.nyt.com/images/2021/06/17/lens/17ambriefing-promo/17ambriefing-inflation-facebookJumbo-v2.jpg',
 'publishedAt': '2021-06-17T10:30:22Z',
 'content': 'With excess saving they can afford more of everything, Jason Furman, a Harvard economist and former Obama administration official, wrote this week.\r\nAlthough companies are increasing the supply of ma… [+2099 chars]'}

In [5]:
inflation_news_df = pd.DataFrame(inflation_news)

In [6]:
inflation_news_df.to_csv("inflation_news.csv")

In [13]:
sentiments = []
for article in tech_news["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]

        sentiments.append({
            "Text": text,
            "Date": date,
            "Compound":compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu
        })
    except AttributeError:
        pass
tech_df = pd.DataFrame(sentiments)

columns = ["Date", "Text", "Compound", "Positive", "Negative", "Neutral"]
tech_df = tech_df[columns]

tech_df.head(100)

Unnamed: 0,Date,Text,Compound,Positive,Negative,Neutral
0,2021-06-17,With excess saving they can afford more of eve...,0.0000,0.000,0.000,1.000
1,2021-06-21,"10. And finally, figuring out how to be a supe...",0.0000,0.000,0.000,1.000
2,2021-06-10,"4. As the U.S. withdraws from Afghanistan, int...",-0.4939,0.000,0.091,0.909
3,2021-06-16,"A day after ousting two top executives, the el...",0.2023,0.050,0.000,0.950
4,2021-06-10,,0.0000,0.000,0.000,0.000
...,...,...,...,...,...,...
95,2021-06-18,By Reuters Staff\r\nFILE PHOTO: A man points a...,0.0000,0.000,0.000,1.000
96,2021-07-01,El Salvador's President Nayib Bukele speaks at...,0.0000,0.000,0.000,1.000
97,2021-06-10,"By Reuters Staff\r\n* KOSPI rises, foreigners ...",0.7717,0.193,0.000,0.807
98,2021-06-08,"BENGALURU, June 8 (Reuters) - Indian shares re...",-0.2500,0.114,0.141,0.745


In [31]:
df = pd.DataFrame()
df["Compound"] = tech_df["Compound"]
df["Positive"] = tech_df["Positive"]
df["Negative"] = tech_df["Negative"]
df["Neutral"] = tech_df["Neutral"]
df["Date"] = tech_df["Date"]
#df.index = df.index.Date
df.head(500)

Unnamed: 0,Compound,Positive,Negative,Neutral,Date
0,0.0000,0.000,0.000,1.000,2021-06-17
1,0.0000,0.000,0.000,1.000,2021-06-21
2,-0.4939,0.000,0.091,0.909,2021-06-10
3,0.2023,0.050,0.000,0.950,2021-06-16
4,0.0000,0.000,0.000,0.000,2021-06-10
...,...,...,...,...,...
95,0.0000,0.000,0.000,1.000,2021-06-18
96,0.0000,0.000,0.000,1.000,2021-07-01
97,0.7717,0.193,0.000,0.807,2021-06-10
98,-0.2500,0.114,0.141,0.745,2021-06-08


In [32]:
df.set_index("Date")

Unnamed: 0_level_0,Compound,Positive,Negative,Neutral
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-06-17,0.0000,0.000,0.000,1.000
2021-06-21,0.0000,0.000,0.000,1.000
2021-06-10,-0.4939,0.000,0.091,0.909
2021-06-16,0.2023,0.050,0.000,0.950
2021-06-10,0.0000,0.000,0.000,0.000
...,...,...,...,...
2021-06-18,0.0000,0.000,0.000,1.000
2021-07-01,0.0000,0.000,0.000,1.000
2021-06-10,0.7717,0.193,0.000,0.807
2021-06-08,-0.2500,0.114,0.141,0.745


In [22]:
tech_df.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,100.0,100.0,100.0,100.0
mean,0.093097,0.05973,0.0351,0.90516
std,0.394231,0.063182,0.051089,0.070182
min,-0.7579,0.0,0.0,0.751
25%,-0.064375,0.0,0.0,0.85875
50%,0.0,0.0595,0.0,0.912
75%,0.4068,0.088,0.0675,0.953
max,0.7964,0.239,0.217,1.0


In [10]:
inflation_df.to_csv("inflation_polarity.csv")

In [11]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [12]:
lemmatizer = WordNetLemmatizer()
sw = set(stopwords.words('english'))

In [14]:
def tokenizer(text):
    regex = re.compile("[^a-zA-Z]")
    re_clean = regex.sub(' ', text)
    words = word_tokenize(re_clean.lower())
    lem = [lemmatizer.lemmatize(word) for word in words]
    tokens = [word.lower() for word in lem if word.lower() not in sw]
    return tokens
tokenizer(inflation_df.iloc[0]["Text"])

['excess',
 'saving',
 'afford',
 'everything',
 'jason',
 'furman',
 'harvard',
 'economist',
 'former',
 'obama',
 'administration',
 'official',
 'wrote',
 'week',
 'although',
 'company',
 'increasing',
 'supply',
 'char']

In [15]:
inflation_df["tokens"] = inflation_df["Text"].apply(tokenizer)
inflation_df.head(20)

Unnamed: 0,Date,Text,Compound,Positive,Negative,Neutral,tokens
0,2021-06-17,With excess saving they can afford more of eve...,0.0,0.0,0.0,1.0,"[excess, saving, afford, everything, jason, fu..."
1,2021-06-21,"10. And finally, figuring out how to be a supe...",0.0,0.0,0.0,1.0,"[finally, figuring, super, ager, maybe, know, ..."
2,2021-06-10,"4. As the U.S. withdraws from Afghanistan, int...",-0.4939,0.0,0.091,0.909,"[u, withdraws, afghanistan, interpreter, fear,..."
3,2021-06-16,"A day after ousting two top executives, the el...",0.2023,0.05,0.0,0.95,"[day, ousting, two, top, executive, electric, ..."
4,2021-06-10,,0.0,0.0,0.0,0.0,[]
5,2021-06-14,This story originally appeared on StockMarketA...,0.4404,0.079,0.0,0.921,"[story, originally, appeared, stockmarketare, ..."
6,2021-06-07,"By Reuters Staff\r\nBRASILIA, June 7 (Reuters)...",0.3818,0.075,0.0,0.925,"[reuters, staff, brasilia, june, reuters, fore..."
7,2021-06-25,"MEXICO CITY, June 25 (Reuters) - Mexico is fac...",0.1027,0.08,0.069,0.851,"[mexico, city, june, reuters, mexico, facing, ..."
8,2021-06-25,"By Reuters Staff\r\nBRASILIA, June 25 (Reuters...",0.0,0.0,0.0,1.0,"[reuters, staff, brasilia, june, reuters, braz..."
9,2021-06-23,U.S. Treasury Secretary Janet Yellen testifies...,0.3818,0.114,0.0,0.886,"[u, treasury, secretary, janet, yellen, testif..."


In [5]:
current_date = pd.Timestamp(datetime.now(), tz="America/New_York").isoformat()
past_date = pd.Timestamp(datetime.now()- timedelta(30), tz="America/New_York").isoformat()

In [6]:
def get_headlines(keyword):
    all_headlines = []
    all_dates = []    
    date = datetime.strptime(current_date[:10], "%Y-%m-%d")
    end_date = datetime.strptime(past_date[:10], "%Y-%m-%d")
    print(f"Fetching news about '{keyword}'")
    print("*" * 30)
    while date > end_date:
        print(f"retrieving news from: {date}")
        articles = newsapi.get_everything(
            q=keyword,
            from_param=str(date)[:10],
            to=str(date)[:10],
            language="en",
            sort_by="relevancy",
            page=1,
        )
        headlines = []
        for i in range(0, len(articles["articles"])):
            headlines.append(articles["articles"][i]["title"])
        all_headlines.append(headlines)
        all_dates.append(date)
        date = date - timedelta(days=1)
    return all_headlines, all_dates

In [37]:
inflation_headlines, dates = get_headlines("Inflation")

Fetching news about 'Inflation'
******************************
retrieving news from: 2021-06-28 00:00:00
retrieving news from: 2021-06-27 00:00:00
retrieving news from: 2021-06-26 00:00:00
retrieving news from: 2021-06-25 00:00:00
retrieving news from: 2021-06-24 00:00:00
retrieving news from: 2021-06-23 00:00:00
retrieving news from: 2021-06-22 00:00:00
retrieving news from: 2021-06-21 00:00:00
retrieving news from: 2021-06-20 00:00:00
retrieving news from: 2021-06-19 00:00:00
retrieving news from: 2021-06-18 00:00:00
retrieving news from: 2021-06-17 00:00:00
retrieving news from: 2021-06-16 00:00:00
retrieving news from: 2021-06-15 00:00:00
retrieving news from: 2021-06-14 00:00:00
retrieving news from: 2021-06-13 00:00:00
retrieving news from: 2021-06-12 00:00:00
retrieving news from: 2021-06-11 00:00:00
retrieving news from: 2021-06-10 00:00:00
retrieving news from: 2021-06-09 00:00:00
retrieving news from: 2021-06-08 00:00:00
retrieving news from: 2021-06-07 00:00:00
retrieving ne

In [7]:
tesla_headlines, dates = get_headlines("tesla")

Fetching news about 'tesla'
******************************
retrieving news from: 2021-07-04 00:00:00
retrieving news from: 2021-07-03 00:00:00
retrieving news from: 2021-07-02 00:00:00
retrieving news from: 2021-07-01 00:00:00
retrieving news from: 2021-06-30 00:00:00
retrieving news from: 2021-06-29 00:00:00
retrieving news from: 2021-06-28 00:00:00
retrieving news from: 2021-06-27 00:00:00
retrieving news from: 2021-06-26 00:00:00
retrieving news from: 2021-06-25 00:00:00


NewsAPIException: {'status': 'error', 'code': 'rateLimited', 'message': 'You have made too many requests recently. Developer accounts are limited to 100 requests over a 24 hour period (50 requests available every 12 hours). Please upgrade to a paid plan if you need more requests.'}

In [8]:
tesla_headlines.to_csv("tesla.csv")

NameError: name 'tesla_headlines' is not defined