In [11]:
import pandas as pd
import numpy as np
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

# Fetch the VADER lexicon used by SentimentIntensityAnalyzer
# (the logged output shows it is skipped when already up to date).
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Notebook-level VADER analyzer instance.
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/work/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
# Load the topic-annotated table. The displayed frame shows the columns
# 'content', 'fcontent' and 'topic' plus two leftover index columns
# ('Unnamed: 0.1' and 'Unnamed: 0').
data = pd.read_csv('../data/topic_enron.csv')
data

Unnamed: 0.1,Unnamed: 0,content,fcontent,topic
0,0,Here is our forecast\n\n,['forecast'],"[(0, 0.012501605), (1, 0.012501605), (2, 0.012..."
1,1,Traveling to have a business meeting takes the...,"['travel', 'business', 'meeting', 'take', 'fun...","[(0, 0.15656357), (1, 0.03257292), (2, 0.01856..."
2,2,test successful. way to go!!!,"['test', 'successful', 'way']","[(10, 0.2562595), (34, 0.5062273)]"
3,3,"Randy,\n\n Can you send me a schedule of the s...","['randy', 'send', 'schedule', 'salary', 'level...","[(6, 0.120515056), (18, 0.19427584), (19, 0.16..."
4,4,Let's shoot for Tuesday at 11:45.,"['let', 'shoot', 'tuesday']","[(15, 0.34167174), (37, 0.3415987)]"
...,...,...,...,...
98218,98218,Effective 11/1/00 deliveries to Eastrans is 30...,"['effective', 'delivery', 'eastrans', 'mmbtudy...","[(9, 0.17084026), (18, 0.17078577), (19, 0.170..."
98219,98219,"PW, \n\nHPL's spot and base purchases by zone ...","['pw', 'hpls', 'spot', 'base', 'purchase', 'zo...","[(8, 0.11305909), (11, 0.12829624), (15, 0.149..."
98220,98220,The schedule is attached. I will remind you a...,"['schedule', 'attach', 'remind', 'day', 'advan...","[(0, 0.36822718), (20, 0.09971282), (25, 0.202..."
98221,98221,Enron Methanol nominates the following natural...,"['enron', 'methanol', 'nominate', 'follow', 'n...","[(5, 0.2574057), (8, 0.07809814), (18, 0.33238..."


# Setting up the sentiment analysis
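We first define a function that scores an email with VADER: the email is split into sentences, each sentence receives a compound score, and the function returns the mean of those scores.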

In [21]:
def vader_analyzer(doc):
    """Return the mean VADER compound score over the sentences of ``doc``.

    The text is split with ``tokenize.sent_tokenize``; every sentence is
    scored with the notebook-level ``analyzer`` and the ``'compound'``
    values are averaged.

    Parameters
    ----------
    doc : str
        Text to score. ``None`` or NaN (a missing cell) is treated like
        empty text.

    Returns
    -------
    float
        Mean compound score of the sentences, or ``0.0`` when there is
        nothing to score. The result is always a float so that an un-typed
        ``np.vectorize`` wrapper never infers an integer output dtype from
        an empty first document.
    """
    # Missing or blank text has no sentences; 0.0 is the score such input
    # has always effectively received, and it falls in the neutral band.
    is_missing = doc is None or (isinstance(doc, float) and doc != doc)
    if is_missing or (isinstance(doc, str) and not doc.strip()):
        return 0.0

    sentences = tokenize.sent_tokenize(doc)
    if not sentences:
        return 0.0

    # Reuse the shared analyzer instead of rebuilding one for every document.
    scores = [analyzer.polarity_scores(sentence)['compound'] for sentence in sentences]
    return sum(scores) / len(scores)

In [22]:
# Score every e-mail body. `otypes` pins the result to float: without it
# np.vectorize infers the dtype from the first call only and refuses
# size-0 input.
data['vader_sentiment'] = np.vectorize(vader_analyzer, otypes=[float])(data['content'])

We now create the sentiment bins.

In [23]:
# Edges of the four negative bins: five evenly spaced points on
# [-1, -0.05] with the leading -1 discarded.
neg_sent_bins = np.linspace(start=-1, stop=-0.05, num=5)[1:]
neg_sent_bins

array([-0.7625, -0.525 , -0.2875, -0.05  ])

In [24]:
# Mirror the negative edges onto the positive side (new array, same order,
# so the largest threshold comes first).
pos_sent_bins = np.negative(neg_sent_bins)
pos_sent_bins

array([0.7625, 0.525 , 0.2875, 0.05  ])

In [25]:
# All eight edges in one array, running from the largest positive
# threshold down to the most negative one.
sb = np.concatenate((pos_sent_bins, neg_sent_bins[::-1]))
sb

array([ 0.7625,  0.525 ,  0.2875,  0.05  , -0.05  , -0.2875, -0.525 ,
       -0.7625])

In [26]:
# Intensity words, strongest first; sentimentLabel picks one by the position
# of the first threshold in pos_sent_bins that the score exceeds.
s_intensity = ['Extremely','Very','Moderately','Slightly']
# Polarity words: index 0 is the positive label, index 1 the negative one.
s_label = ['Positive','Negative']
# Display order from most negative to most positive. The positive labels are
# listed weakest-first so the sequence is monotonic across 'Neutral'.
s_order = (
    [intensity+' '+s_label[1] for intensity in s_intensity]
    + ['Neutral']
    + [intensity+' '+s_label[0] for intensity in reversed(s_intensity)]
)

In [27]:
# Negative labels run strongest -> weakest, positive labels weakest ->
# strongest, so the combined list goes from most negative to most positive.
neg_order = [' '.join((word, 'Negative')) for word in s_intensity]
pos_order = [' '.join((word, 'Positive')) for word in reversed(s_intensity)]

s_order = [*neg_order, 'Neutral', *pos_order]
s_order

['Extremely Negative',
 'Very Negative',
 'Moderately Negative',
 'Slightly Negative',
 'Neutral',
 'Slightly Positive',
 'Moderately Positive',
 'Very Positive',
 'Extremely Positive']

We now convert each VADER sentiment score into its labelled sentiment bin.

In [28]:
import math

In [29]:
def sentimentLabel(sentiment):
    """Map a VADER score in [-1, 1] to its sentiment bin label.

    Scores with magnitude at most 0.05 are 'Neutral'. Any other score is
    labelled '<intensity> <polarity>': the polarity word comes from
    ``s_label`` according to the sign of the score, and the intensity word
    is the entry of ``s_intensity`` at the position of the first threshold
    in ``pos_sent_bins`` that the magnitude exceeds.

    Parameters
    ----------
    sentiment : float
        Score to label.

    Returns
    -------
    str
        'Neutral' or a label such as 'Very Negative'.

    Raises
    ------
    ValueError
        If ``sentiment`` is NaN or lies outside [-1, 1].
    """
    # Reject values that cannot be a valid score instead of mislabelling them.
    if math.isnan(sentiment) or abs(sentiment) > 1:
        raise ValueError(
            'sentiment must be a number in [-1, 1], got {!r}'.format(sentiment)
        )

    abs_sent = abs(sentiment)
    if abs_sent <= 0.05:
        return 'Neutral'

    # Take the polarity from the sign itself. Deriving a list index from
    # math.ceil(sentiment) - 1 turns exactly -1.0 into index -2, i.e. 'Positive'.
    label = s_label[0] if sentiment > 0 else s_label[1]

    # Thresholds are ordered largest first, so the first one exceeded
    # identifies the strongest applicable intensity.
    ii = [abs_sent > x for x in pos_sent_bins].index(True)
    intensity = s_intensity[ii]

    return '{} {}'.format(intensity, label)

In [30]:
# Label every row's VADER score with its sentiment bin, applying
# sentimentLabel element-wise through np.vectorize.
data['vader_sentiment_bin'] = np.vectorize(sentimentLabel)(data['vader_sentiment'])

In [40]:
# Write the frame, including the two new sentiment columns, to a new CSV.
# Only 'Unnamed: 0' is dropped here.
# NOTE(review): 'Unnamed: 0.1' (visible in the loaded frame) is kept, and
# to_csv writes the index by default - confirm downstream readers expect
# those extra columns.
data.drop(columns='Unnamed: 0').to_csv('../data/both_enron.csv')