In [1]:
from textblob import TextBlob
import pandas as pd
from tqdm import tqdm
from vaderSentiment import vaderSentiment
import matplotlib.pyplot as plt
%matplotlib inline
# set some nicer defaults for matplotlib
from matplotlib import rcParams

# Plot styling defaults for the whole notebook.
# These colors come from colorbrewer2.org. Each is an RGB triplet.
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (8, 5)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' has been deprecated since matplotlib 1.5 and is rejected
# as an invalid key by newer releases; 'axes.prop_cycle' is its replacement.
rcParams['axes.prop_cycle'] = plt.cycler('color', dark2_colors)
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
# NOTE(review): the grid color equals the axes face color, so the grid is
# effectively invisible -- confirm this is intended.
rcParams['grid.color'] = 'white'
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 16
rcParams['patch.edgecolor'] = 'none'

Since I do not want to manually label each tweet in order to run sentiment analysis, I will instead use `TextBlob`, a pre-trained sentiment classifier that has already been trained on a corpus of IMDB movie reviews.

# Individual Tweet Analysis

In [2]:
# Load the cleaned tweets; the first CSV column is used as the row index.
tweets_df = pd.read_csv(
    filepath_or_buffer='cleaned_tweets.csv',
    index_col=0,
)

In [3]:
# Discard every row that has a missing value in any column.
tweets_df = tweets_df.dropna(axis=0, how='any')

In [4]:
# TextBlob sentiment for every lemmatized tweet.
polarity = []      # Lies in range [-1,1]  Very negative sentiment to very positive
subjectivity = []  # [0,1] Very objective statement to very subjective statement

for tweet in tqdm(tweets_df.lemmatized):
    try:
        sentiment = TextBlob(tweet).sentiment
    except Exception as err:
        # Report the offending tweet but still record a placeholder, so both
        # lists stay the same length as tweets_df and can be assigned as
        # columns afterwards. (A bare `except:` would also have swallowed
        # KeyboardInterrupt, making the loop impossible to stop.)
        print(tweet, '->', repr(err))
        polarity.append(float('nan'))
        subjectivity.append(float('nan'))
    else:
        polarity.append(sentiment.polarity)
        subjectivity.append(sentiment.subjectivity)

100%|██████████| 210289/210289 [01:42<00:00, 2046.22it/s]


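In addition to the TextBlob analysis, let's incorporate VADER (`vaderSentiment`), since it was designed to work well with social-media text such as tweets. However, VADER uses punctuation (for example, exclamation marks) as an intensity cue, and since we have already stripped punctuation from the tweets, it will not be as useful as it could be.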

In [5]:
# VADER compound score for every lemmatized tweet.
analyzer = vaderSentiment.SentimentIntensityAnalyzer()
vader_polarity = []  # Lies in range [-1,1]  Very negative sentiment to very positive

for tweet in tqdm(tweets_df.lemmatized):
    try:
        scores = analyzer.polarity_scores(tweet)
    except Exception as err:
        # Report the offending tweet but still record a placeholder, so the
        # list stays the same length as tweets_df and can be assigned as a
        # column afterwards. (A bare `except:` would also have swallowed
        # KeyboardInterrupt.)
        print(tweet, '->', repr(err))
        vader_polarity.append(float('nan'))
    else:
        vader_polarity.append(scores['compound'])

100%|██████████| 210289/210289 [00:38<00:00, 5473.79it/s]


In [6]:
# Store the TextBlob scores next to the tweets they were computed from.
for column_name, column_values in (
    ('textblob_polarity', polarity),
    ('textblob_subjectivity', subjectivity),
):
    tweets_df[column_name] = column_values

In [7]:
# Store the VADER compound scores as a new column; the list must have exactly
# one entry per row of tweets_df.
tweets_df['vader_polarity'] = vader_polarity

In [8]:
# Preview the first five rows, including the new sentiment columns.
tweets_df.head(n=5)

Unnamed: 0,full_text,screen_name,retweet,filtered_tweets,lemmatized,textblob_polarity,textblob_subjectivity,vader_polarity
0,We are mobilizing to support those impacted by...,sundarpichai,0,we are mobilizing to support those impacted by...,mobilize support impact campfire hillfire wool...,0.0,0.0,0.5719
1,RT @stshank: The @GoogleAI and @googlecloud fo...,sundarpichai,1,rt the and folks are helping the process 5 7 m...,folk help process 5 7 million analog photo mor...,0.0,0.0,0.4019
2,RT @Google: Go Vote: Here’s where to find the ...,sundarpichai,1,rt go vote here s where to find the informatio...,go vote find information need make voice hear ...,0.0,0.0,0.0
3,RT @AndrewCrow: Amazing change to Google's sit...,sundarpichai,1,rt amazing change to google s site govote,amazing change google site govote,0.6,0.9,0.5859
4,We’re announcing our AI for Social Good progra...,sundarpichai,0,we re announcing our ai for social good progra...,announce ai social good program apply expertis...,0.320202,0.40404,0.872


In [23]:
# Per-tweet gap between the two analyzers: VADER compound score minus
# TextBlob polarity.
difference_in_polarity = tweets_df['vader_polarity'].sub(tweets_df['textblob_polarity'])

In [9]:
# Cache the scored tweets (row index included) so the slow scoring loops
# do not need to be re-run.
tweets_df.to_csv('tweets_with_sentiment.csv', index=True)

In [10]:
# Reload the scored tweets from the cached CSV; the first column is the index.
tweets_df = pd.read_csv(
    filepath_or_buffer='tweets_with_sentiment.csv',
    index_col=0,
)

In [11]:
# Check that the reloaded frame still has the sentiment columns.
tweets_df.head(n=5)

Unnamed: 0,full_text,screen_name,retweet,filtered_tweets,lemmatized,textblob_polarity,textblob_subjectivity,vader_polarity
0,We are mobilizing to support those impacted by...,sundarpichai,0,we are mobilizing to support those impacted by...,mobilize support impact campfire hillfire wool...,0.0,0.0,0.5719
1,RT @stshank: The @GoogleAI and @googlecloud fo...,sundarpichai,1,rt the and folks are helping the process 5 7 m...,folk help process 5 7 million analog photo mor...,0.0,0.0,0.4019
2,RT @Google: Go Vote: Here’s where to find the ...,sundarpichai,1,rt go vote here s where to find the informatio...,go vote find information need make voice hear ...,0.0,0.0,0.0
3,RT @AndrewCrow: Amazing change to Google's sit...,sundarpichai,1,rt amazing change to google s site govote,amazing change google site govote,0.6,0.9,0.5859
4,We’re announcing our AI for Social Good progra...,sundarpichai,0,we re announcing our ai for social good progra...,announce ai social good program apply expertis...,0.320202,0.40404,0.872


# Sentiment Analysis per User

In [None]:
# Load the per-user table; the first CSV column is used as the row index.
user_df = pd.read_csv(
    filepath_or_buffer='twitter_users.csv',
    index_col=0,
)

In [14]:
# TextBlob sentiment for each entry of user_df.documents.
polarity = []      # Lies in range [-1,1]  Very negative sentiment to very positive
subjectivity = []  # [0,1] Very objective statement to very subjective statement

for tweet in tqdm(user_df.documents):
    try:
        sentiment = TextBlob(tweet).sentiment
    except Exception as err:
        # Report the offending document but still record a placeholder, so
        # both lists stay the same length as user_df and can be assigned as
        # columns afterwards. (A bare `except:` would also have swallowed
        # KeyboardInterrupt.)
        print(tweet, '->', repr(err))
        polarity.append(float('nan'))
        subjectivity.append(float('nan'))
    else:
        polarity.append(sentiment.polarity)
        subjectivity.append(sentiment.subjectivity)

100%|██████████| 79/79 [00:12<00:00,  7.25it/s]


In [15]:
# VADER compound score (range [-1, 1], very negative to very positive) for
# the first three user documents only.
vader_polarity = [
    analyzer.polarity_scores(document)['compound']
    for document in tqdm(user_df.documents[0:3])
]

100%|██████████| 3/3 [05:49<00:00, 110.81s/it]


Computing the VADER polarity on one concatenated document per user took far too long (almost six minutes for just three users). Instead, we can group the tweets by user and average the per-tweet sentiment scores.

In [None]:
# Average the tweet-level sentiment scores for each user, rather than scoring
# one large concatenated document per user. This replaces an unfinished cell
# that held only a truncated identifier and raised NameError when run.
user_sentiment_df = (
    tweets_df
    .groupby('screen_name')[['textblob_polarity', 'textblob_subjectivity', 'vader_polarity']]
    .mean()
)
user_sentiment_df.head()

In [73]:
# Store the per-user TextBlob scores as new columns of user_df.
for column_name, column_values in (
    ('polarity', polarity),
    ('subjectivity', subjectivity),
):
    user_df[column_name] = column_values

In [74]:
# Cache the scored user table (row index included).
user_df.to_csv('user_with_sentiment.csv', index=True)

In [12]:
# Reload the scored user table from the cached CSV; the first column is the index.
user_df = pd.read_csv(
    filepath_or_buffer='user_with_sentiment.csv',
    index_col=0,
)