In [2]:
# import SentimentIntensityAnalyzer class 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np

# Update VADER dictionary with financial lexicon

In [3]:
# Load the Master Dictionary word list downloaded from
# https://sraf.nd.edu/textual-analysis/resources/#Master%20Dictionary
# The relative path is resolved against the current working directory.
fin_dict = pd.read_csv('Financial Dictionary.csv')

In [4]:
# Discard the columns that are not needed to build the sentiment lexicon.
unused_columns = [
    'Sequence Number', 'Word Count', 'Word Proportion', 'Average Proportion',
    'Std Dev', 'Doc Count', 'Uncertainty', 'Litigious', 'Constraining',
    'Superfluous', 'Interesting', 'Modal', 'Irr_Verb', 'Harvard_IV',
    'Syllables', 'Source',
]
fin_dict = fin_dict.drop(columns=unused_columns)

In [5]:
# Normalise the vocabulary: coerce every entry to a string, then lower-case it.
fin_dict['Word'] = fin_dict['Word'].astype(str).str.lower()

In [6]:
# Split the dictionary by polarity flag and attach a fixed score to each side:
# -1 for rows with Negative > 0, +1 for rows with Positive > 0.
# The two flag columns are removed once they have served as filters.
flag_columns = ['Negative', 'Positive']
fin_dict_neg = (
    fin_dict.loc[fin_dict['Negative'] > 0]
    .drop(columns=flag_columns)
    .assign(score=-1)
)
fin_dict_pos = (
    fin_dict.loc[fin_dict['Positive'] > 0]
    .drop(columns=flag_columns)
    .assign(score=1)
)

In [7]:
# Convert each frame into a {word: [values of the remaining columns]} mapping:
# index by word, transpose so words become columns, export columns as lists.
fin_dict_neg = (
    fin_dict_neg
    .set_index(keys='Word')
    .transpose()
    .to_dict(orient='list')
)
fin_dict_pos = (
    fin_dict_pos
    .set_index(keys='Word')
    .transpose()
    .to_dict(orient='list')
)

In [9]:
# Combine the negative and positive word maps into a single lexicon update.
# A fresh dict is built on purpose: a plain `update_lexicon = fin_dict_neg`
# would only alias fin_dict_neg, so merging into it would silently modify
# fin_dict_neg as well. On duplicate words the positive entry wins.
update_lexicon = {**fin_dict_neg, **fin_dict_pos}

In [10]:
# Replace each list value by its first element so the lexicon maps
# word -> number. Values that are no longer lists are kept untouched, which
# makes this cell safe to re-run (indexing an int would raise TypeError).
update_lexicon = {
    word: (value[0] if isinstance(value, list) else value)
    for word, value in update_lexicon.items()
}

In [11]:
# Hand-picked overrides, grouped by the score they assign:
# -1 for the first group, +1 for the second, 0 for the third.
manual_updates = {
    **dict.fromkeys(('sell', 'short', 'shorts'), -1),
    **dict.fromkeys(('monster', 'explode', 'explodes', 'raised', 'raise'), 1),
    **dict.fromkeys(('share', 'shares'), 0),
}

In [12]:
# Merge the hand-picked scores into the lexicon update; for a word present in
# both, the manual value replaces the dictionary-derived one.
update_lexicon.update(manual_updates)

In [13]:
# Create the VADER analyzer used by the rest of the notebook.
Analyzer = SentimentIntensityAnalyzer()

# Overlay the financial vocabulary and the manual updates on the analyzer's
# lexicon; entries for words already present are overwritten.
Analyzer.lexicon.update(update_lexicon) 

# Simple ML using Lexicon

In [8]:
# Load the tweets to score. Later cells read the 'clean_tweet' and 'Label'
# columns from this frame.
df = pd.read_csv('Tweets_clean_vader.csv')

In [16]:
# Alias the already-configured analyzer (no new object is created here);
# the scoring helpers below refer to it as sid_obj.
sid_obj = Analyzer 

In [17]:
def sentiment_scores1(sentence):
    """Return the sentiment dictionary VADER produces for ``sentence``.

    The call is delegated to ``sid_obj.polarity_scores`` and its result is
    handed back unchanged; that dictionary holds the ``pos``, ``neg``,
    ``neu`` and ``compound`` scores.
    """
    return sid_obj.polarity_scores(sentence)

In [40]:
#return a final label by thresholding the compound score
def sentiment_scores2(sentence, threshold=0.1):
    """Classify ``sentence`` as "Positive", "Negative" or "Neutral".

    Args:
        sentence: Text passed to ``sid_obj.polarity_scores``.
        threshold: Cut-off applied to the compound score. A score at or above
            ``threshold`` is "Positive", a score at or below ``-threshold``
            is "Negative". Defaults to 0.1, the value this function used
            before the cut-off became a parameter.

    Returns:
        One of the strings "Positive", "Negative" or "Neutral".
    """
    # Only the compound score is needed for the decision.
    compound = sid_obj.polarity_scores(sentence)['compound']
    if compound >= threshold:
        return "Positive"
    if compound <= -threshold:
        return "Negative"
    return "Neutral"

In [41]:
# Score every cleaned tweet: the full sentiment dictionary goes into 'scores',
# the thresholded label into 'prediction'.
clean_tweets = df['clean_tweet']
df['scores'] = clean_tweets.apply(sentiment_scores1)
df['prediction'] = clean_tweets.apply(sentiment_scores2)

In [42]:
# Translate the numeric labels into the words used for the predictions,
# then flag each row with 1 when label and prediction agree and 0 otherwise.
di = {1: "Positive", 2: "Neutral", 0: "Negative"}
df['Label_Map'] = df['Label'].map(di)
df['result'] = df['Label_Map'].eq(df['prediction']).astype(int)

In [43]:
# Accuracy: the fraction of rows whose prediction matched the mapped label
# (the mean of the 0/1 'result' flags).
df['result'].mean()

0.538546255506608

In [22]:
# Write the current contents of df to disk. No `index` argument is passed,
# so to_csv's default applies and the row index is written as well.
df.to_csv('VADER_results.csv')

In [23]:
#code to test any sentence

sentence = 'wheaton precious metals corp announces quarterly dividend stocks'

tokenized_sentence = word_tokenize(sentence)
pos_word_list=[]
neu_word_list=[]
neg_word_list=[]

# Bucket each token by its own compound score. The score is computed once per
# token and reused for both comparisons instead of calling the analyzer again
# in the elif branch.
for word in tokenized_sentence:
    word_compound = Analyzer.polarity_scores(word)['compound']
    if word_compound >= 0.1:
        pos_word_list.append(word)
    elif word_compound <= -0.1:
        neg_word_list.append(word)
    else:
        neu_word_list.append(word)

print('Positive:',pos_word_list)
print('Neutral:',neu_word_list)
print('Negative:',neg_word_list)
# Score of the sentence as a whole (not an aggregate of the per-word buckets).
score = Analyzer.polarity_scores(sentence)
print('\nScores:', score)

Positive: ['precious']
Neutral: ['wheaton', 'metals', 'corp', 'announces', 'quarterly', 'dividend', 'stocks']
Negative: []

Scores: {'neg': 0.0, 'neu': 0.654, 'pos': 0.346, 'compound': 0.5719}
