In [22]:
# Initial imports
import os
import pandas as pd
from datetime import datetime, timedelta
from pathlib import Path
from newsapi import NewsApiClient
from dotenv import load_dotenv
from nltk.corpus import stopwords, reuters
load_dotenv()
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/samuelarciniega/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [23]:
api_key = os.getenv("NEWS_API_KEY")
print(api_key)

6dde5b0486654982bdc6ebe38ab427a4


In [24]:
newsapi = NewsApiClient(api_key=api_key)

In [25]:
tech_news = newsapi.get_everything(q = "big tech regulation", language="en", page_size=100, sort_by="relevancy")
print(f"Total articles about Tech & Regulation: {tech_news['totalResults']}")
tech_news["articles"][0]

Total articles about Tech & Regulation: 860


{'source': {'id': 'techcrunch', 'name': 'TechCrunch'},
 'author': 'Natasha Lomas',
 'title': 'CJEU ruling could open big tech to more privacy litigation in Europe',
 'description': 'A long running privacy fight between Belgium’s data protection authority and Facebook — over the latter’s use of online trackers like pixels and social plug-ins to snoop on web users — has culminated in a ruling by Europe’s top court today that could have wid…',
 'url': 'http://techcrunch.com/2021/06/15/cjeu-ruling-could-open-big-tech-to-more-privacy-litigation-in-europe/',
 'urlToImage': 'https://techcrunch.com/wp-content/uploads/2015/11/data-privacy-law.png?w=711',
 'publishedAt': '2021-06-15T12:57:31Z',
 'content': 'A long running privacy fight between Belgium’s data protection authority and Facebook — over the latter’s use of online trackers like pixels and social plug-ins to snoop on web users — has culminated… [+6936 chars]'}

In [33]:
sentiments = []
for article in tech_news["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]

        sentiments.append({
            "Text": text,
            "Date": date,
            "Compound":compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu
        })
    except AttributeError:
        pass
tech_df = pd.DataFrame(sentiments)

columns = ["Date", "Text", "Compound", "Positive", "Negative", "Neutral"]
tech_df = tech_df[columns]

tech_df.head(20)

Unnamed: 0,Date,Text,Compound,Positive,Negative,Neutral
0,2021-06-15,A long running privacy fight between Belgium’s...,0.0516,0.107,0.073,0.819
1,2021-06-14,Decision comes after company investigated by t...,0.1531,0.088,0.0,0.912
2,2021-06-17,Democratic Senator Kirsten Gillibrand has revi...,0.0,0.0,0.0,1.0
3,2021-06-25,To get a roundup of TechCrunchs biggest and mo...,0.624,0.13,0.0,0.87
4,2021-06-18,The UK’s chief data protection regulator has w...,-0.5859,0.0,0.13,0.87
5,2021-06-28,The UK’s digital businesses can breathe a sign...,0.4767,0.086,0.0,0.914
6,2021-06-21,Lina Khan has been one of Big Tech's biggest c...,-0.296,0.0,0.059,0.941
7,2021-05-31,The Station is a weekly newsletter dedicated t...,0.7184,0.154,0.0,0.846
8,2021-06-15,By Reuters Staff\r\nFILE PHOTO: A 3D printed G...,0.0,0.0,0.0,1.0
9,2021-06-15,If the purpose is to investigate if Google or ...,-0.2732,0.106,0.13,0.764


In [27]:
tech_df.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,100.0,100.0,100.0,100.0
mean,0.116337,0.07188,0.0381,0.88002
std,0.417127,0.071691,0.059306,0.121786
min,-0.8634,0.0,0.0,0.0
25%,-0.1027,0.0,0.0,0.832
50%,0.0516,0.062,0.0,0.8985
75%,0.430825,0.10625,0.07075,0.94525
max,0.9153,0.33,0.285,1.0


In [28]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [29]:
lemmatizer = WordNetLemmatizer()
sw = set(stopwords.words('english'))

In [31]:
def tokenizer(text):
    regex = re.compile("[^a-zA-Z]")
    re_clean = regex.sub(' ', text)
    words = word_tokenize(re_clean.lower())
    lem = [lemmatizer.lemmatize(word) for word in words]
    tokens = [word.lower() for word in lem if word.lower() not in sw]
    return tokens
tokenizer(tech_df.iloc[0]["Text"])

['long',
 'running',
 'privacy',
 'fight',
 'belgium',
 'data',
 'protection',
 'authority',
 'facebook',
 'latter',
 'use',
 'online',
 'tracker',
 'like',
 'pixel',
 'social',
 'plug',
 'snoop',
 'web',
 'user',
 'ha',
 'culminated',
 'char']

In [34]:
tech_df["tokens"] = tech_df["Text"].apply(tokenizer)
tech_df.head(20)

Unnamed: 0,Date,Text,Compound,Positive,Negative,Neutral,tokens
0,2021-06-15,A long running privacy fight between Belgium’s...,0.0516,0.107,0.073,0.819,"[long, running, privacy, fight, belgium, data,..."
1,2021-06-14,Decision comes after company investigated by t...,0.1531,0.088,0.0,0.912,"[decision, come, company, investigated, uk, co..."
2,2021-06-17,Democratic Senator Kirsten Gillibrand has revi...,0.0,0.0,0.0,1.0,"[democratic, senator, kirsten, gillibrand, ha,..."
3,2021-06-25,To get a roundup of TechCrunchs biggest and mo...,0.624,0.13,0.0,0.87,"[get, roundup, techcrunchs, biggest, important..."
4,2021-06-18,The UK’s chief data protection regulator has w...,-0.5859,0.0,0.13,0.87,"[uk, chief, data, protection, regulator, ha, w..."
5,2021-06-28,The UK’s digital businesses can breathe a sign...,0.4767,0.086,0.0,0.914,"[uk, digital, business, breathe, sign, relief,..."
6,2021-06-21,Lina Khan has been one of Big Tech's biggest c...,-0.296,0.0,0.059,0.941,"[lina, khan, ha, one, big, tech, biggest, crit..."
7,2021-05-31,The Station is a weekly newsletter dedicated t...,0.7184,0.154,0.0,0.846,"[station, weekly, newsletter, dedicated, thing..."
8,2021-06-15,By Reuters Staff\r\nFILE PHOTO: A 3D printed G...,0.0,0.0,0.0,1.0,"[reuters, staff, file, photo, printed, google,..."
9,2021-06-15,If the purpose is to investigate if Google or ...,-0.2732,0.106,0.13,0.764,"[purpose, investigate, google, apple, anything..."


In [35]:
current_date = pd.Timestamp(datetime.now(), tz="America/New_York").isoformat()
past_date = pd.Timestamp(datetime.now()- timedelta(30), tz="America/New_York").isoformat()

In [36]:
def get_headlines(keyword):
    all_headlines = []
    all_dates = []    
    date = datetime.strptime(current_date[:10], "%Y-%m-%d")
    end_date = datetime.strptime(past_date[:10], "%Y-%m-%d")
    print(f"Fetching news about '{keyword}'")
    print("*" * 30)
    while date > end_date:
        print(f"retrieving news from: {date}")
        articles = newsapi.get_everything(
            q=keyword,
            from_param=str(date)[:10],
            to=str(date)[:10],
            language="en",
            sort_by="relevancy",
            page=1,
        )
        headlines = []
        for i in range(0, len(articles["articles"])):
            headlines.append(articles["articles"][i]["title"])
        all_headlines.append(headlines)
        all_dates.append(date)
        date = date - timedelta(days=1)
    return all_headlines, all_dates

In [37]:
inflation_headlines, dates = get_headlines("Inflation")

Fetching news about 'Inflation'
******************************
retrieving news from: 2021-06-28 00:00:00
retrieving news from: 2021-06-27 00:00:00
retrieving news from: 2021-06-26 00:00:00
retrieving news from: 2021-06-25 00:00:00
retrieving news from: 2021-06-24 00:00:00
retrieving news from: 2021-06-23 00:00:00
retrieving news from: 2021-06-22 00:00:00
retrieving news from: 2021-06-21 00:00:00
retrieving news from: 2021-06-20 00:00:00
retrieving news from: 2021-06-19 00:00:00
retrieving news from: 2021-06-18 00:00:00
retrieving news from: 2021-06-17 00:00:00
retrieving news from: 2021-06-16 00:00:00
retrieving news from: 2021-06-15 00:00:00
retrieving news from: 2021-06-14 00:00:00
retrieving news from: 2021-06-13 00:00:00
retrieving news from: 2021-06-12 00:00:00
retrieving news from: 2021-06-11 00:00:00
retrieving news from: 2021-06-10 00:00:00
retrieving news from: 2021-06-09 00:00:00
retrieving news from: 2021-06-08 00:00:00
retrieving news from: 2021-06-07 00:00:00
retrieving ne

In [40]:
tech_headlines, dates = get_headlines("tech regulation")

Fetching news about 'tech regulation'
******************************
retrieving news from: 2021-06-28 00:00:00
retrieving news from: 2021-06-27 00:00:00
retrieving news from: 2021-06-26 00:00:00
retrieving news from: 2021-06-25 00:00:00
retrieving news from: 2021-06-24 00:00:00
retrieving news from: 2021-06-23 00:00:00
retrieving news from: 2021-06-22 00:00:00
retrieving news from: 2021-06-21 00:00:00
retrieving news from: 2021-06-20 00:00:00
retrieving news from: 2021-06-19 00:00:00
retrieving news from: 2021-06-18 00:00:00
retrieving news from: 2021-06-17 00:00:00
retrieving news from: 2021-06-16 00:00:00
retrieving news from: 2021-06-15 00:00:00
retrieving news from: 2021-06-14 00:00:00
retrieving news from: 2021-06-13 00:00:00
retrieving news from: 2021-06-12 00:00:00
retrieving news from: 2021-06-11 00:00:00
retrieving news from: 2021-06-10 00:00:00
retrieving news from: 2021-06-09 00:00:00
retrieving news from: 2021-06-08 00:00:00
retrieving news from: 2021-06-07 00:00:00
retriev