In [1]:
# Initial imports
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\bobo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Read your api key env variable
load_dotenv()
api_key = os.getenv("API_KEY")

In [3]:
# Create a newsapi client
from newsapi.newsapi_client import NewsApiClient

In [4]:
newsapi = NewsApiClient(api_key="API_KEY")

In [6]:
# Fetch the Bitcoin news articles
btc_headlines = newsapi.get_everything(q='bitcoin',
                                      from_param='2022-01-01',
                                      language='en',
                                      sort_by='relevancy',
                                      page=2)

In [7]:
# Fetch the Ethereum news articles
eth_headlines = newsapi.get_everything(q='ethereum',
                                      from_param='2022-01-01',
                                      language='en',
                                      sort_by='relevancy',
                                      page=2)

In [8]:
# Create the Bitcoin sentiment scores DataFrame
sentiments = []

for articles in btc_headlines["articles"]:
    try:
        text = articles["content"]
        results = analyzer.polarity_scores(text)
        compound = results["compound"]
        pos = results["pos"]
        neu = results["neu"]
        neg = results["neg"]

        sentiments.append({
            "text": text,
            "Compound": compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu,
        })
    except AttributeError:
        pass
    
btc  = pd.DataFrame(sentiments)
btc.head()

Unnamed: 0,text,Compound,Positive,Negative,Neutral
0,While the nation was in lockdown mode in the s...,0.624,0.107,0.0,0.893
1,Bitcoin is likely to take more market share fr...,0.8473,0.278,0.0,0.722
2,"LONDON, Jan 3 (Reuters) - The dollar ticked up...",-0.4215,0.0,0.074,0.926
3,Believers in the transformative power of the b...,0.0,0.0,0.0,1.0
4,"Kosovo has banned Bitcoin mining, alongside ot...",-0.8126,0.164,0.305,0.531


In [9]:
# Create the ethereum sentiment scores DataFrame
sentiments = []

for articles in eth_headlines["articles"]:
    try:
        text = articles["content"]
        results = analyzer.polarity_scores(text)
        compound = results["compound"]
        pos = results["pos"]
        neu = results["neu"]
        neg = results["neg"]

        sentiments.append({
            "text": text,
            "Compound": compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu,
        })
    except AttributeError:
        pass
    
eth  = pd.DataFrame(sentiments)
eth.head()

Unnamed: 0,text,Compound,Positive,Negative,Neutral
0,A group of academics from the University of Ca...,0.4019,0.091,0.0,0.909
1,Crypto.com coin price prediction\r\nThe Crypto...,0.2023,0.055,0.0,0.945
2,Big Technology— by Alex Kantrowitz\r\nRevealin...,0.1779,0.049,0.0,0.951
3,Lately I’m paying more attention to the crypto...,-0.34,0.0,0.062,0.938
4,Cryptocurrency was one of the hottest investme...,0.3612,0.077,0.0,0.923


In [10]:
# Describe the Bitcoin Sentiment
btc.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,20.0,20.0,20.0,20.0
mean,0.063885,0.05445,0.03495,0.9106
std,0.382502,0.072785,0.072692,0.115925
min,-0.8126,0.0,0.0,0.531
25%,0.0,0.0,0.0,0.89375
50%,0.0,0.0245,0.0,0.933
75%,0.260025,0.09,0.065,1.0
max,0.8473,0.278,0.305,1.0


In [11]:
# Describe the Ethereum Sentiment
eth.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,20.0,20.0,20.0,20.0
mean,0.394625,0.115,0.0139,0.8711
std,0.379987,0.075164,0.036625,0.07423
min,-0.4588,0.0,0.0,0.732
25%,0.1962,0.05725,0.0,0.83175
50%,0.4393,0.1005,0.0,0.8655
75%,0.6833,0.167,0.0,0.939
max,0.9349,0.268,0.141,1.0


## Natural Language Processing

In [15]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [16]:
# Instantiate the lemmatizer
wnl = WordNetLemmatizer() 

# Create a list of stopwords
stop = stopwords.words('english')

# Expand the default stopwords list if necessary
stop.append("u")
stop.append("it'")
stop.append("'s")
stop.append("n't")
stop.append('…')
stop.append("\`")
stop.append('``')
stop.append('char')
stop.append("''")
stop = set(stop)

In [17]:
# Complete the tokenizer function
def tokenizer(text):
    """Tokenizes text."""
    
    # Create a list of the words
    words = word_tokenize(text)

    # Convert the words to lowercase
    words = list(filter(lambda w: w.lower(), words))
    
    # Remove the punctuation
    words = list(filter(lambda t: t not in punctuation, words))
    
    # Remove the stopwords
    words = list(filter(lambda t: t.lower() not in stop, words))
    
    # Lemmatize Words into root words
    tokens = [wnl.lemmatize(word) for word in words]
    
    return tokens

In [18]:
# Create a new tokens column for Bitcoin
btc["tokens"] = btc.text.apply(tokenizer)
btc.head()

Unnamed: 0,text,Compound,Positive,Negative,Neutral,tokens
0,While the nation was in lockdown mode in the s...,0.624,0.107,0.0,0.893,"[nation, lockdown, mode, spring, 2020, nation,..."
1,Bitcoin is likely to take more market share fr...,0.8473,0.278,0.0,0.722,"[Bitcoin, likely, take, market, share, gold, s..."
2,"LONDON, Jan 3 (Reuters) - The dollar ticked up...",-0.4215,0.0,0.074,0.926,"[LONDON, Jan, 3, Reuters, dollar, ticked, majo..."
3,Believers in the transformative power of the b...,0.0,0.0,0.0,1.0,"[Believers, transformative, power, blockchain,..."
4,"Kosovo has banned Bitcoin mining, alongside ot...",-0.8126,0.164,0.305,0.531,"[Kosovo, banned, Bitcoin, mining, alongside, e..."


In [19]:
# Create a new tokens column for Ethereum
eth["tokens"] = eth.text.apply(tokenizer)
eth.head()

Unnamed: 0,text,Compound,Positive,Negative,Neutral,tokens
0,A group of academics from the University of Ca...,0.4019,0.091,0.0,0.909,"[group, academic, University, California, Sant..."
1,Crypto.com coin price prediction\r\nThe Crypto...,0.2023,0.055,0.0,0.945,"[Crypto.com, coin, price, prediction, Crypto.c..."
2,Big Technology— by Alex Kantrowitz\r\nRevealin...,0.1779,0.049,0.0,0.951,"[Big, Technology—, Alex, Kantrowitz, Revealing..."
3,Lately I’m paying more attention to the crypto...,-0.34,0.0,0.062,0.938,"[Lately, ’, paying, attention, crypto, currenc..."
4,Cryptocurrency was one of the hottest investme...,0.3612,0.077,0.0,0.923,"[Cryptocurrency, one, hottest, investment, the..."


## NGrams and Frequency Analysis

In [21]:
from collections import Counter
from nltk import ngrams

In [22]:
# Generate the Bitcoin N-grams where N=2
N = 2
grams = ngrams(tokenizer(btc.text.str.cat()), N)
Counter(grams).most_common(20)

[(('char', 'NEW'), 5),
 (('NEW', 'YORK'), 5),
 (('YORK', 'Jan'), 5),
 (('Reuters', 'U.S.'), 5),
 (('U.S.', 'dollar'), 5),
 (('dollar', 'rose'), 5),
 (('Jan', '4'), 4),
 (('4', 'Reuters'), 4),
 (('char', 'LONDON'), 3),
 (('LONDON', 'Jan'), 3),
 (('Jan', '3'), 3),
 (('3', 'Reuters'), 3),
 (('government', 'bond'), 3),
 (('bond', 'yield'), 3),
 (('first', 'trading'), 3),
 (('trading', 'day'), 3),
 (('five-year', 'high'), 3),
 (('yen', 'Tuesday'), 3),
 (('minute', 'Federal'), 2),
 (('December', 'meeting'), 2)]

In [23]:
# Generate the Ethereum N-grams where N=2
N = 2
grams = ngrams(tokenizer(eth.text.str.cat()), N)
Counter(grams).most_common(20)

[(('char', 'Bitcoin'), 5),
 (('crypto', 'market'), 4),
 (('like', 'Bitcoin'), 2),
 (('Bitcoin', 'CRYPTO'), 2),
 (('CRYPTO', 'BTC'), 2),
 (('group', 'academic'), 1),
 (('academic', 'University'), 1),
 (('University', 'California'), 1),
 (('California', 'Santa'), 1),
 (('Santa', 'Barbara'), 1),
 (('Barbara', 'demonstrated'), 1),
 (('demonstrated', 'call'), 1),
 (('call', 'scalable'), 1),
 (('scalable', 'technique'), 1),
 (('technique', 'vet'), 1),
 (('vet', 'smart'), 1),
 (('smart', 'contract'), 1),
 (('contract', 'mitigate'), 1),
 (('mitigate', 'state-inconsistency'), 1),
 (('state-inconsistency', 'bug'), 1)]

In [24]:
# Function token_count generates the top 10 words for a given coin
def token_count(tokens, N=3):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [25]:
# Use token_count to get the top 10 words for Bitcoin
all_tokens = tokenizer(btc.text.str.cat())
token_count(all_tokens, 10)

[('char', 20),
 ('Reuters', 9),
 ('Jan', 8),
 ('investor', 7),
 ('day', 7),
 ('rose', 7),
 ('dollar', 6),
 ('U.S.', 6),
 ('Tuesday', 5),
 ('trading', 5)]

In [26]:
# Use token_count to get the top 10 words for Ethereum
all_tokens = tokenizer(eth.text.str.cat())
token_count(all_tokens, 10)

[('char', 20),
 ('year', 10),
 ('crypto', 8),
 ('Bitcoin', 7),
 ('market', 6),
 ('cryptocurrency', 5),
 ('price', 4),
 ('high', 4),
 ('2021', 4),
 ('new', 4)]

## Named Entity Recognition

In [None]:
# Download the language model for SpaCy if needed
# !python -m spacy download en_core_web_sm

In [35]:
import spacy
from spacy import displacy

In [36]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

In [37]:
# Concatenate all of the bitcoin text together
all_btc_text = btc.text.str.cat()
all_btc_text

"While the nation was in lockdown mode in the spring of 2020, the nations most successful private equity software investor found himself in Puerto Rico with nothing to do. Orlando Bravo, the billionai… [+17516 chars]Bitcoin is likely to take more market share from gold as a store of value thanks to broader adoption of digital assets and bitcoins scaling solutions, according to a Tuesday report by Goldman Sachs.\xa0… [+995 chars]LONDON, Jan 3 (Reuters) - The dollar ticked up against its major rivals as an upbeat market mood on Monday lifted European equities and government bond yields for the first day of trading of 2022.\r\nB… [+2388 chars]Believers in the transformative power of the blockchain have labeled the rise of bitcoin \r\n BTCUSD,\r\n +0.06%\r\nand other cryptocurrencies a revolution, implicitly placing financial industry incumbents… [+2762 chars]Kosovo has banned Bitcoin mining, alongside other emergency measures, to save energy.\r\nThe Balkan territory is facing its worst en

In [38]:
# Run the NER processor on all of the text
doc = nlp(all_btc_text)

# Add a title to the document
doc.user_data["title"] = "Bitcoin NER"

In [39]:
# Render the visualization
displacy.render(doc, style='ent', jupyter=True)

In [40]:
# List all Entities
for ent in doc.ents:
    print(ent.text, ent.label_)

the spring of 2020 DATE
Puerto Rico GPE
Orlando Bravo PERSON
Tuesday DATE
Goldman Sachs ORG
Jan 3 DATE
Reuters ORG
Monday DATE
European NORP
the first day DATE
2022 DATE
BTCUSD WORK_OF_ART
Bitcoin PERSON
Balkan NORP
a decade DATE
Matthew Sparkes PERSON
2017 DATE
Getty Images ORG
UK GPE
almost a third CARDINAL
$1.15 billion MONEY
$750 million MONEY
January 2020 DATE
Wednesday DATE
minutes TIME
the Federal Open Market Committees LAW
December DATE
Jan 3 DATE
Reuters ORG
U.S. GPE
Monday DATE
the first trading day DATE
the new year DATE
Fed ORG
Jan 4 DATE
Reuters ORG
U.S. GPE
five-year DATE
Tuesday DATE
U.S. Federal Reserve ORG
Wednesday DATE
minutes TIME
the Federal Reserve's ORG
December DATE
chars]NEW YORK GPE
Jan 4 DATE
Reuters Breakingviews ORG
2021 DATE
Breakingviews ORG
Jan 4 DATE
Reuters ORG
U.S. GPE
a fifth straight day DATE
Japanese NORP
Tuesday DATE
five-year DATE
Omicron ORG
Jan 3 DATE
Reuters ORG
U.S. GPE
Monday DATE
the first trading day DATE
the new year DATE
Jan 5 DATE
Reute

## Ethereum NER

In [42]:
# Concatenate all of the bitcoin text together
all_eth_text = eth.text.str.cat()
all_eth_text

'A group of academics from the University of California, Santa Barbara, has demonstrated what it calls a "scalable technique" to vet smart contracts and mitigate state-inconsistency bugs, discovering … [+2700 chars]Crypto.com coin price prediction\r\nThe Crypto.com Coin cryptocurrency hit a record high on November 15, reaching $0.52 at its peak, according to CoinMarketCap.\r\nIn comparison, it was sitting at just $… [+725 chars]Big Technology— by Alex Kantrowitz\r\nRevealing the systems in the tech world that drive what we see in the headlines, focusing primarily on the tech giants: Amazon, Apple, Facebook, Google, and Micros… [+3 chars]Lately I’m paying more attention to the crypto currency and “DeFi” spaces, again.\r\nOne discussion that comes up regularly: are miners to blame for the high GPU prices?\r\nTo add some facts to this disc… [+4052 chars]Cryptocurrency was one of the hottest investment themes of last year. Leaders like Bitcoin soared past $60,000. And meme tokens skyrocket

In [43]:
# Run the NER processor on all of the text
eth_doc = nlp(all_eth_text)

# Add a title to the document
eth_doc.user_data["title"] = "Ethereum NER"

In [44]:
# Render the visualization
displacy.render(eth_doc, style='ent', jupyter=True)

In [45]:
# List all Entities
for ent in eth_doc.ents:
    print(ent.text, ent.label_)

the University of California ORG
Santa Barbara GPE
The Crypto.com Coin ORG
November 15 DATE
0.52 MONEY
CoinMarketCap ORG
Alex Kantrowitz PERSON
Amazon ORG
Apple ORG
Micros ORG
’m CARDINAL
DeFi GPE
One CARDINAL
GPU ORG
one CARDINAL
last year DATE
60,000 MONEY
Shiba Inu ORG
45,000,000% PERCENT
a big year DATE
2021 DATE
2022 DATE
2021 DATE
the year DATE
2022 DATE
crypto GPE
Crypto PERSON
39.4% PERCENT
Tuesday DATE
May 2018 DATE
recent years DATE
Subscribe ORG
Forbes ORG
CryptoAsset & ORG
Blockchain Advisor PERSON
NFT ORG
$300 million MONEY
Paradigm PERSON
Coatue ORG
$13.3 billion US MONEY
one CARDINAL
one CARDINAL
just over $100 MONEY
2014 DATE
nearly $65,000 MONEY
mid-2021 DATE
the past year DATE
close to $70,000 MONEY
November DATE
Norton ORG
Symantec NORP
2021 DATE
token ORG
more than 60% PERCENT
the year DATE
roughly $100 trillion MONEY
pri ORG
Tuesday DATE
1.6% PERCENT
a good year DATE
a fantastic year DATE
the end of the year DATE
