Sentiment Analysis Using nltk

In [1]:
#imports
import os
import pandas as pd
from dotenv import load_dotenv
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from newsapi import NewsApiClient

%matplotlib inline

In [2]:
#read in api
load_dotenv()
api_key = os.getenv('NEWS_API_KEY')

In [3]:
#newsapi client
newsapi = NewsApiClient(api_key = api_key)

In [4]:
#bitcoin headlines
btc_headlines = newsapi.get_everything(
    q = 'bitcoin',
    language = 'en',
    sort_by = 'relevancy'
)

In [5]:
#ethereum headlines
eth_headlines = newsapi.get_everything(
    q = 'ethereum',
    language = 'en',
    sort_by = 'relevancy'
)

In [6]:
sentiments = []

for articles in btc_headlines['articles']:
    try:
        text = articles['content']
        results = analyzer.polarity_scores(text)
        compound = results['compound']
        pos = results['pos']
        neu = results['neu']
        neg = results['neg']
        
        sentiments.append({
            'Compound': compound,
            'Positive': pos,
            'Negative': neg,
            'Neutral': neu,
            'text': text,
        })
    except AttributeError:
        pass
    
btc = pd.DataFrame(sentiments)
btc.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text
0,0.3612,0.077,0.0,0.923,Musk.MARK RALSTON/AFP via Getty Images\r\nElon...
1,0.0,0.0,0.0,1.0,When Denis Rusinovich set up cryptocurrency mi...
2,0.3182,0.105,0.0,0.895,El Salvador introduced Bitcoin as a legal tend...
3,-0.4404,0.0,0.083,0.917,Were officially building an open Bitcoin minin...
4,-0.3182,0.045,0.084,0.871,Israeli national Tal Prihar pled guilty to rou...


In [7]:
sentiments = []

for articles in eth_headlines['articles']:
    try:
        text = articles['content']
        results = analyzer.polarity_scores(text)
        compound = results['compound']
        pos = results['pos']
        neu = results['neu']
        neg = results['neg']
        
        sentiments.append({
            'Compound': compound,
            'Positive': pos,
            'Negative': neg,
            'Neutral': neu,
            'text': text,
        })
    except AttributeError:
        pass
    
eth = pd.DataFrame(sentiments)
eth.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text
0,0.0,0.0,0.0,1.0,"In a new blog post the company said that 4,836..."
1,0.0,0.0,0.0,1.0,Hackers who made off with roughly $15 million ...
2,0.1779,0.052,0.0,0.948,"On some level, the new mayor is simply employi..."
3,0.0772,0.038,0.0,0.962,"Back in September\r\n, Robinhood announced pla..."
4,-0.1027,0.056,0.067,0.877,Trading platform Crypto.com lost about $34 mil...


In [8]:
#summary statistics btc
btc.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,20.0,20.0,20.0,20.0
mean,0.041505,0.0582,0.0379,0.9039
std,0.366363,0.057018,0.043444,0.062064
min,-0.4404,0.0,0.0,0.765
25%,-0.33155,0.0,0.0,0.8755
50%,0.0386,0.054,0.0,0.917
75%,0.32895,0.084,0.08325,0.934
max,0.6808,0.185,0.101,1.0


In [9]:
#summary statistics eth
eth.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,20.0,20.0,20.0,20.0
mean,0.164365,0.05305,0.0087,0.9382
std,0.272452,0.065873,0.021502,0.066563
min,-0.1531,0.0,0.0,0.783
25%,0.0,0.0,0.0,0.894
50%,0.0,0.0395,0.0,0.951
75%,0.4068,0.09025,0.0,1.0
max,0.7579,0.217,0.067,1.0


Tokenization 

In [10]:
#additional imports for tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [11]:
#lemmatization
wnl = WordNetLemmatizer()

stop = stopwords.words('english')
stop.append("u")
stop.append("it'")
stop.append("'s")
stop.append("n't")
stop.append('…')
stop.append("\`")
stop.append('``')
stop.append('char')
stop.append("''")
stop = set(stop)

In [12]:
#tokenizer function
def tokenizer(text):
    words = word_tokenize(text)
    #convert all to lowercase
    words = list(filter(lambda w: w.lower(), words))
     # Remove the punctuation
    words = list(filter(lambda t: t not in punctuation, words))
    # Remove the stopwords
    words = list(filter(lambda t: t.lower() not in stop, words))
    # Lemmatize Words into root words
    tokens = [wnl.lemmatize(word) for word in words]
    return tokens

In [13]:
#tokens column for btc
btc['tokens'] = btc.text.apply(tokenizer)
btc.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,tokens
0,0.3612,0.077,0.0,0.923,Musk.MARK RALSTON/AFP via Getty Images\r\nElon...,"[Musk.MARK, RALSTON/AFP, via, Getty, Images, E..."
1,0.0,0.0,0.0,1.0,When Denis Rusinovich set up cryptocurrency mi...,"[Denis, Rusinovich, set, cryptocurrency, minin..."
2,0.3182,0.105,0.0,0.895,El Salvador introduced Bitcoin as a legal tend...,"[El, Salvador, introduced, Bitcoin, legal, ten..."
3,-0.4404,0.0,0.083,0.917,Were officially building an open Bitcoin minin...,"[officially, building, open, Bitcoin, mining, ..."
4,-0.3182,0.045,0.084,0.871,Israeli national Tal Prihar pled guilty to rou...,"[Israeli, national, Tal, Prihar, pled, guilty,..."


In [14]:
#tokens column for eth
eth['tokens'] = eth.text.apply(tokenizer)
eth.head()

Unnamed: 0,Compound,Positive,Negative,Neutral,text,tokens
0,0.0,0.0,0.0,1.0,"In a new blog post the company said that 4,836...","[new, blog, post, company, said, 4,836, ETH, 4..."
1,0.0,0.0,0.0,1.0,Hackers who made off with roughly $15 million ...,"[Hackers, made, roughly, 15, million, ethereum..."
2,0.1779,0.052,0.0,0.948,"On some level, the new mayor is simply employi...","[level, new, mayor, simply, employing, age-old..."
3,0.0772,0.038,0.0,0.962,"Back in September\r\n, Robinhood announced pla...","[Back, September, Robinhood, announced, plan, ..."
4,-0.1027,0.056,0.067,0.877,Trading platform Crypto.com lost about $34 mil...,"[Trading, platform, Crypto.com, lost, 34, mill..."


NGrams and Frequency Analysis

In [15]:
#additional imports
from collections import Counter
from nltk import ngrams

In [16]:
#btc N-grams where N = 2
N = 2
grams = ngrams(tokenizer(btc.text.str.cat()), N)
Counter(grams).most_common(20)

[(('mining', 'system'), 4),
 (('El', 'Salvador'), 3),
 (('Illustration', 'Alex'), 3),
 (('Alex', 'Castro'), 3),
 (('Castro', 'Verge'), 3),
 (('building', 'open'), 3),
 (('blog', 'post'), 3),
 (('Elon', 'Musk'), 2),
 (('alongside', 'US'), 2),
 (('US', 'dollar'), 2),
 (('International', 'Monetary'), 2),
 (('Monetary', 'Fund'), 2),
 (('officially', 'building'), 2),
 (('open', 'Bitcoin'), 2),
 (('Bitcoin', 'mining'), 2),
 (('Raedle/Getty', 'Images'), 2),
 (('Images', 'Block'), 2),
 (('new', 'blog'), 2),
 (('post', 'company'), 2),
 (('company', 'said'), 2)]

In [17]:
#eth N-grams where N = 2
N = 2
grams = ngrams(tokenizer(eth.text.str.cat()), N)
Counter(grams).most_common(20)

[(('blog', 'post'), 3),
 (('new', 'blog'), 2),
 (('post', 'company'), 2),
 (('company', 'said'), 2),
 (('according', 'new'), 2),
 (('New', 'York'), 2),
 (('York', 'City'), 2),
 (('char', 'Crypto'), 2),
 (('Crypto', 'Street'), 2),
 (('Clearwater', 'Beach'), 2),
 (('Beach', 'Florida'), 2),
 (('li', 'NFT'), 2),
 (('char', 'BELIEVERS'), 2),
 (('BELIEVERS', 'OPEN'), 2),
 (('OPEN', 'public'), 2),
 (('public', 'blockchains'), 2),
 (('blockchains', 'provide'), 2),
 (('provide', 'second'), 2),
 (('second', 'chance'), 2),
 (('chance', 'building'), 2)]

In [30]:
#generates top 10 words for a given coin
def token_count(tokens, N):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [31]:
# Use token_count to get the top 10 words for Bitcoin
all_tokens = tokenizer(btc.text.str.cat())
token_count(all_tokens, 10)

[('char', 20),
 ('Bitcoin', 9),
 ('company', 8),
 ('cryptocurrency', 7),
 ('mining', 7),
 ('open', 5),
 ('new', 5),
 ('bitcoin', 5),
 ('El', 4),
 ('building', 4)]

In [32]:
# Use token_count to get the top 10 words for Ethereum
all_tokens = tokenizer(eth.text.str.cat())
token_count(all_tokens, 10)

[('char', 20),
 ('Ethereum', 7),
 ('new', 6),
 ('cryptocurrency', 5),
 ('Bitcoin', 5),
 ('company', 4),
 ('digital', 4),
 ('2021', 4),
 ('blockchains', 4),
 ('blog', 3)]

Word Clouds

In [21]:
#additional imports
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [20.0, 10.0]