In [1]:
import os
import json
import requests
import pandas as pd

# Import VADER Dependencies
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon; NLTK reports "already up-to-date" when a current copy exists
nltk.download("vader_lexicon")

# Single shared VADER scorer used by the sentiment cell further down
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Gallo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [38]:
# Guardian Content API search terms (stored URL-encoded); one request is sent per term.
searches = ['American%20Politics','US%20economy', 'US%20stock%20market', 'US%20President','Cryptocurrency%20OR%20Bitcoin%20OR%20Ethereum']
size = 200

# The API key is a credential: read it from the environment instead of committing it
# to the notebook.
guardian_api_key = os.environ.get("GUARDIAN_API_KEY")
if not guardian_api_key:
    raise RuntimeError("Set the GUARDIAN_API_KEY environment variable before running this cell.")

guardian_url = "https://content.guardianapis.com/search"

# Column-oriented accumulator consumed by the DataFrame cell below.
dictionary = {'Date': [], 'Category': [], 'Title': []}
for s in searches:
    # Human-readable search term; doubles as the Category label used for filtering later.
    category = s.replace("%20", " ")

    # NOTE: no `page` parameter is sent, so at most one page of `size` results
    # (the oldest ones, given order-by=oldest) is collected per search term.
    guardian_response = requests.get(
        guardian_url,
        params={
            "q": category,  # requests handles the URL encoding
            "to-date": "2020-12-31",
            "from-date": "2020-01-01",
            "order-by": "oldest",
            "page-size": size,
            "api-key": guardian_api_key,
        },
        timeout=30,
    )
    # Fail loudly on HTTP errors (bad key, rate limit, ...) rather than hitting a
    # confusing KeyError on the JSON payload below.
    guardian_response.raise_for_status()
    guardian_response_json = guardian_response.json()
    list1 = guardian_response_json['response']['results']

    for article in list1:
        dictionary["Date"].append(article['webPublicationDate'])
        dictionary["Title"].append(article['webTitle'])
        dictionary["Category"].append(category)

In [3]:
# Build the headline table and parse the publication timestamps in one chained step.
clickbait_df = pd.DataFrame(dictionary).assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)
clickbait_df.head()

Unnamed: 0,Date,Category,Title
0,2020-01-01 08:30:00+00:00,American Politics,The final sprint: will any of the Democratic c...
1,2020-01-01 09:01:00+00:00,American Politics,The Power of Bad and How to Overcome It review...
2,2020-01-01 10:00:02+00:00,American Politics,'I'm on the hunt for humour and hope': what wi...
3,2020-01-01 11:15:02+00:00,American Politics,There is no such thing as 'authentic' food. Ig...
4,2020-01-01 16:29:23+00:00,American Politics,Pete Buttigieg fundraising surges amid attacks...


In [4]:
# Score every headline with VADER and keep the compound score.
#
# Exactly one value is recorded per headline so that the scores stay aligned
# row-for-row with `clickbait_df`. Titles that are not strings get NaN instead of
# being skipped; skipping would shift every later score onto the wrong headline
# when the two frames are concatenated by position.
Sentiment = []

for title in clickbait_df["Title"]:
    if isinstance(title, str):
        Sentiment.append(analyzer.polarity_scores(title)["compound"])
    else:
        Sentiment.append(float("nan"))

# Reuse the headline index so a column-wise concat pairs each score with its title.
# The explicit dtype keeps the column numeric even when there are no headlines.
sentiment_df = pd.DataFrame(
    {"Compound": pd.Series(Sentiment, index=clickbait_df.index, dtype="float64")}
)
sentiment_df.head()

Unnamed: 0,Compound
0,0.4767
1,-0.5423
2,0.4767
3,-0.5719
4,-0.4404


In [5]:
# Join headlines with their sentiment scores, then re-key the rows by calendar date
# (the time-of-day part of the publication timestamp is discarded).
combined_df = (
    pd.concat([clickbait_df, sentiment_df], axis="columns", join="inner")
    .assign(**{"Date Only": lambda frame: frame["Date"].dt.date})
    .drop(columns="Date")
    .set_index("Date Only")
)
combined_df.tail()

Unnamed: 0_level_0,Category,Title,Compound
Date Only,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-27,Cryptocurrency OR Bitcoin OR Ethereum,Far-right Israeli football fans rebel over Bei...,-0.1531
2020-12-28,Cryptocurrency OR Bitcoin OR Ethereum,Seabird patrols to self-healing buildings: the...,0.0
2020-12-30,Cryptocurrency OR Bitcoin OR Ethereum,'I've never seen anything like it': 2020 smash...,-0.2755
2020-12-30,Cryptocurrency OR Bitcoin OR Ethereum,"Bitcoin surges to record $28,500, quadrupling ...",0.34
2020-12-30,Cryptocurrency OR Bitcoin OR Ethereum,Bitcoin hits record high as US dollar slides –...,0.0


In [6]:
# Load the index levels from the CSV and convert them to row-over-row percentage changes.
# `infer_datetime_format` was dropped: it has no effect without `parse_dates` and is
# deprecated in pandas 2.x. Dates are parsed explicitly with pd.to_datetime instead.
stock_market_df = pd.read_csv('SPY_DJI_RUT.csv')

# Key the rows by calendar date so they can be matched against the headline table.
stock_market_df["Date Only"] = pd.to_datetime(stock_market_df["Date"]).dt.date
stock_market_df = stock_market_df.drop(columns="Date").set_index("Date Only")

# pct_change() leaves NaN in the first row; store the cleaned frame so that what is
# displayed here is the same frame later cells work with.
stock_market_df = stock_market_df.pct_change().dropna()
stock_market_df

Unnamed: 0_level_0,DOW JONES,S&P 500,RUSSELL 2000
Date Only,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-03,-0.008103,-0.007060,-0.003540
2020-01-06,0.002392,0.003533,0.001439
2020-01-07,-0.004170,-0.002803,-0.002976
2020-01-08,0.005647,0.004902,0.003184
2020-01-09,0.007369,0.006655,0.000842
...,...,...,...
2020-12-23,0.003809,0.000746,0.008654
2020-12-24,0.002325,0.003537,-0.001569
2020-12-28,0.006758,0.008723,-0.003842
2020-12-29,-0.002246,-0.002227,-0.018480


In [18]:
# Attach the market returns of the same calendar date to every headline, then
# discard any row that is missing a value on either side.
super_df = pd.merge(
    combined_df,
    stock_market_df,
    on="Date Only",
    how="right",
).dropna()
super_df

Unnamed: 0_level_0,Category,Title,Compound,DOW JONES,S&P 500,RUSSELL 2000
Date Only,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-03,American Politics,Impeachment: newly unredacted emails a 'devast...,-0.6486,-0.008103,-0.007060,-0.003540
2020-01-03,American Politics,Who is Qassem Suleimani? Iran farm boy who bec...,0.4754,-0.008103,-0.007060,-0.003540
2020-01-03,American Politics,High fashion: the rise of heels for men,0.0000,-0.008103,-0.007060,-0.003540
2020-01-03,American Politics,Abu Mahdi al-Muhandis: Iraqi killed in US stri...,-0.7184,-0.008103,-0.007060,-0.003540
2020-01-03,American Politics,What drives the ‘moral grandstanding’ that has...,-0.4939,-0.008103,-0.007060,-0.003540
...,...,...,...,...,...,...
2020-12-24,Cryptocurrency OR Bitcoin OR Ethereum,"'What am I about?' Ben Lee takes on QAnon, the...",0.4404,0.002325,0.003537,-0.001569
2020-12-28,Cryptocurrency OR Bitcoin OR Ethereum,Seabird patrols to self-healing buildings: the...,0.0000,0.006758,0.008723,-0.003842
2020-12-30,Cryptocurrency OR Bitcoin OR Ethereum,'I've never seen anything like it': 2020 smash...,-0.2755,0.002436,0.001342,0.010529
2020-12-30,Cryptocurrency OR Bitcoin OR Ethereum,"Bitcoin surges to record $28,500, quadrupling ...",0.3400,0.002436,0.001342,0.010529


In [36]:
# Headlines collected for the 'American Politics' search, with same-day index returns.
american_politics_df = super_df[super_df["Category"] == 'American Politics']

# Correlate the numeric columns only: Category and Title are strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
american_politics_df.select_dtypes(include="number").corr()

Unnamed: 0,Compound,DOW JONES,S&P 500,RUSSELL 2000
Compound,1.0,0.130094,0.093592,0.128523
DOW JONES,0.130094,1.0,0.909324,0.742464
S&P 500,0.093592,0.909324,1.0,0.675777
RUSSELL 2000,0.128523,0.742464,0.675777,1.0


In [35]:
# Headlines collected for the 'US economy' search, with same-day index returns.
us_economy_df = super_df[super_df["Category"] == 'US economy']

# Correlate the numeric columns only: Category and Title are strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
us_economy_df.select_dtypes(include="number").corr()

Unnamed: 0,Compound,DOW JONES,S&P 500,RUSSELL 2000
Compound,1.0,0.147449,0.100819,0.172123
DOW JONES,0.147449,1.0,0.916128,0.750258
S&P 500,0.100819,0.916128,1.0,0.699037
RUSSELL 2000,0.172123,0.750258,0.699037,1.0


In [33]:
# Headlines collected for the 'US stock market' search, with same-day index returns.
us_stockmarket_df = super_df[super_df["Category"] == 'US stock market']

# Correlate the numeric columns only: Category and Title are strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
us_stockmarket_df.select_dtypes(include="number").corr()

Unnamed: 0,Compound,DOW JONES,S&P 500,RUSSELL 2000
Compound,1.0,-0.017151,-0.016975,0.005601
DOW JONES,-0.017151,1.0,0.928252,0.782477
S&P 500,-0.016975,0.928252,1.0,0.702112
RUSSELL 2000,0.005601,0.782477,0.702112,1.0


In [34]:
# Headlines collected for the 'US President' search, with same-day index returns.
us_president_df = super_df[super_df["Category"] == 'US President']

# Correlate the numeric columns only: Category and Title are strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
us_president_df.select_dtypes(include="number").corr()

Unnamed: 0,Compound,DOW JONES,S&P 500,RUSSELL 2000
Compound,1.0,0.042979,0.020847,0.102471
DOW JONES,0.042979,1.0,0.993823,0.920134
S&P 500,0.020847,0.993823,1.0,0.911958
RUSSELL 2000,0.102471,0.920134,0.911958,1.0


In [37]:
# Headlines collected for the 'Cryptocurrency OR Bitcoin OR Ethereum' search,
# with same-day index returns.
bitcoin_ethereum_df = super_df[super_df["Category"] == 'Cryptocurrency OR Bitcoin OR Ethereum']

# Correlate the numeric columns only: Category and Title are strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
bitcoin_ethereum_df.select_dtypes(include="number").corr()

Unnamed: 0,Compound,DOW JONES,S&P 500,RUSSELL 2000
Compound,1.0,-0.05891,-0.03101,-0.026914
DOW JONES,-0.05891,1.0,0.980357,0.928672
S&P 500,-0.03101,0.980357,1.0,0.913261
RUSSELL 2000,-0.026914,0.928672,0.913261,1.0


In [28]:
# Size of the merged headline/returns table as a (rows, columns) tuple
super_df.shape

(631, 6)

In [29]:
# Print a per-column summary (non-null counts, dtypes) plus memory usage,
# to confirm that no missing values remain after the merge
super_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 631 entries, 2020-01-03 to 2020-12-30
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Category      631 non-null    object 
 1   Title         631 non-null    object 
 2   Compound      631 non-null    float64
 3   DOW JONES     631 non-null    float64
 4   S&P 500       631 non-null    float64
 5   RUSSELL 2000  631 non-null    float64
dtypes: float64(4), object(2)
memory usage: 34.5+ KB


In [15]:
# import spacy
# nlp = spacy.load("en_core_web_lg")

In [16]:
# for Title, rows in clickbait_df.iterrows():
#    doc = nlp(rows['Title'])
#    print(doc)