In [1]:
%pylab inline
import os
import pickle
import json
import pandas as pd
from plotly import express as px
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from afinn import Afinn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_pickle('translated_tweets.pkl')
additional_stopwords = ['na', 'https', 'ha', 'hahaha', 'haha', 'lang', 'ang', 'yan', 'ng', 'sa', 'rin', 'yun', 'yang', 'si', 'ako', 'siya', 'ka', 'po', 
                        'mga', 'yung', 'pa', 'pala', 'na', 'ni', 'sya', 'ba', 'ko', 'nyo', 'man']
additional_stopwords = set(list(ENGLISH_STOP_WORDS) + additional_stopwords)

In [3]:
afinn = Afinn(emoticons=True)
vader_analyzer = SentimentIntensityAnalyzer()

In [4]:
df['afinn'] = df['translated_text'].apply(lambda x: x['translatedText']).apply(lambda x: afinn.score(x))

In [5]:
df['vader'] = df['translated_text'].apply(lambda x: x['translatedText']).apply(lambda x: vader_analyzer.polarity_scores(x))

In [6]:
df['translated_text'] = df['translated_text'].apply(lambda x: x['translatedText'])

In [7]:
## NRC
def text_emotion(df, column):
    '''
    Takes a DataFrame and a specified column of text and adds 10 columns to the
    DataFrame for each of the 10 emotions in the NRC Emotion Lexicon, with each
    column containing the value of the text in that emotions
    INPUT: DataFrame, string
    OUTPUT: the original DataFrame with ten new columns
    '''

    new_df = df.copy()

    filepath = ('data/'
                'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt')
    emolex_df = pd.read_csv(filepath,
                            names=["word", "emotion", "association"],
                            sep='\t')
    emolex_words = emolex_df.pivot(index='word',
                                   columns='emotion',
                                   values='association').reset_index()
    emotions = emolex_words.columns.drop('word')
    emo_df = pd.DataFrame(0, index=df.index, columns=emotions)

    stemmer = SnowballStemmer("english")

    
    book = ''
    chapter = ''
    
    with tqdm(total=len(list(new_df.iterrows()))) as pbar:
        for i, row in new_df.iterrows():
            pbar.update(1)
            document = word_tokenize(new_df.loc[i][column])
            for word in document:
                word = stemmer.stem(word.lower())
                emo_score = emolex_words[emolex_words.word == word]
                if not emo_score.empty:
                    for emotion in list(emotions):
                        emo_df.at[i, emotion] += emo_score[emotion]

    new_df = pd.concat([new_df, emo_df], axis=1)

    return new_df

In [8]:
nrc_df = text_emotion(df, 'translated_text')

  4%|‚ñà‚ñà‚ñà‚ñç                                                                          | 655/14923 [01:15<27:23,  8.68it/s]


KeyboardInterrupt: 

In [None]:
df['created_at_date'] = pd.to_datetime(df['created_at_date'])

In [None]:
df['vader_neg'] = df['vader'].apply(lambda x: x['neg'])
df['vader_neu'] = df['vader'].apply(lambda x: x['neu'])
df['vader_pos'] = df['vader'].apply(lambda x: x['pos'])
df['vader_compound'] = df['vader'].apply(lambda x: x['compound'])

In [10]:
tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('vacc')].groupby(['created_at_date'])['afinn'].sum().reset_index(),
    x='created_at_date', y='afinn', title='Tweets that contain the Vaccine Keyword - AFINN per day'
)

with open('plots/sentiment/AFINN_daily_Tweets that contain the Vaccine Keyword.html', 'w') as f:
    f.write(tmp.to_html())

tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('vacc')].groupby(['created_at_date'])['afinn'].sum().resample('M').sum().reset_index(),
    x='created_at_date', y='afinn', title='Tweets that contain the Vaccine Keyword - AFINN per month'
)
with open('plots/sentiment/AFINN_monthly_Tweets that contain the Vaccine Keyword.html', 'w') as f:
    f.write(tmp.to_html())


tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('vacc')].groupby(['created_at_date'])['afinn'].sum().resample('Y').sum().reset_index(),
    x='created_at_date', y='afinn', title='Tweets that contain the Vaccine Keyword - AFINN per year'
)
with open('plots/sentiment/AFINN_yearly_Tweets that contain the Vaccine Keyword.html', 'w') as f:
    f.write(tmp.to_html())

In [11]:
tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengue')].groupby(['created_at_date'])['afinn'].sum().reset_index(),
    x='created_at_date', y='afinn', title='Tweets that contain the Dengue Keyword - AFINN per day'
)

with open('plots/sentiment/AFINN_daily_Tweets that contain the Dengue Keyword.html', 'w') as f:
    f.write(tmp.to_html())

tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengue')].groupby(['created_at_date'])['afinn'].sum().resample('M').sum().reset_index(),
    x='created_at_date', y='afinn', title='Tweets that contain the Dengue Keyword - AFINN per month'
)
with open('plots/sentiment/AFINN_monthly_Tweets that contain the Dengue Keyword.html', 'w') as f:
    f.write(tmp.to_html())


tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengue')].groupby(['created_at_date'])['afinn'].sum().resample('Y').sum().reset_index(),
    x='created_at_date', y='afinn', title='Tweets that contain the Dengue Keyword - AFINN per year'
)
with open('plots/sentiment/AFINN_yearly_Tweets that contain the Dengue Keyword.html', 'w') as f:
    f.write(tmp.to_html())

In [12]:
tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengvax')].groupby(['created_at_date'])['afinn'].sum().reset_index(),
    x='created_at_date', y='afinn', title='Tweets that contain the Dengvaxia Keyword - AFINN per day'
)

with open('plots/sentiment/AFINN_daily_Tweets that contain the Dengvaxia Keyword.html', 'w') as f:
    f.write(tmp.to_html())

tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengvax')].groupby(['created_at_date'])['afinn'].sum().resample('M').sum().reset_index(),
    x='created_at_date', y='afinn', title='Tweets that contain the Dengvaxia Keyword - AFINN per month'
)
with open('plots/sentiment/AFINN_monthly_Tweets that contain the Dengvaxia Keyword.html', 'w') as f:
    f.write(tmp.to_html())


tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengvax')].groupby(['created_at_date'])['afinn'].sum().resample('Y').sum().reset_index(),
    x='created_at_date', y='afinn', title='Tweets that contain the Dengvaxia Keyword - AFINN per year'
)
with open('plots/sentiment/AFINN_yearly_Tweets that contain the Dengvaxia Keyword.html', 'w') as f:
    f.write(tmp.to_html())

In [14]:
tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('vacc')].groupby(['created_at_date'])['vader_compound'].sum().reset_index(),
    x='created_at_date', y='vader_compound', title='Tweets that contain the Vaccine Keyword - VADER per day'
)

with open('plots/sentiment/VADER_daily_Tweets that contain the Vaccine Keyword.html', 'w') as f:
    f.write(tmp.to_html())

tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('vacc')].groupby(['created_at_date'])['vader_compound'].sum().resample('M').sum().reset_index(),
    x='created_at_date', y='vader_compound', title='Tweets that contain the Vaccine Keyword - VADER per month'
)
with open('plots/sentiment/VADER_monthly_Tweets that contain the Vaccine Keyword.html', 'w') as f:
    f.write(tmp.to_html())


tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('vacc')].groupby(['created_at_date'])['vader_compound'].sum().resample('Y').sum().reset_index(),
    x='created_at_date', y='vader_compound', title='Tweets that contain the Vaccine Keyword - VADER per year'
)
with open('plots/sentiment/VADER_yearly_Tweets that contain the Vaccine Keyword.html', 'w') as f:
    f.write(tmp.to_html())
    
tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengue')].groupby(['created_at_date'])['vader_compound'].sum().reset_index(),
    x='created_at_date', y='vader_compound', title='Tweets that contain the Dengue Keyword - VADER per day'
)

with open('plots/sentiment/VADER_daily_Tweets that contain the Dengue Keyword.html', 'w') as f:
    f.write(tmp.to_html())

tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengue')].groupby(['created_at_date'])['vader_compound'].sum().resample('M').sum().reset_index(),
    x='created_at_date', y='vader_compound', title='Tweets that contain the Dengue Keyword - VADER per month'
)
with open('plots/sentiment/VADER_monthly_Tweets that contain the Dengue Keyword.html', 'w') as f:
    f.write(tmp.to_html())


tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengue')].groupby(['created_at_date'])['vader_compound'].sum().resample('Y').sum().reset_index(),
    x='created_at_date', y='vader_compound', title='Tweets that contain the Dengue Keyword - VADER per year'
)
with open('plots/sentiment/VADER_yearly_Tweets that contain the Dengue Keyword.html', 'w') as f:
    f.write(tmp.to_html())
    
tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengvax')].groupby(['created_at_date'])['vader_compound'].sum().reset_index(),
    x='created_at_date', y='vader_compound', title='Tweets that contain the Dengvaxia Keyword - VADER per day'
)

with open('plots/sentiment/VADER_daily_Tweets that contain the Dengvaxia Keyword.html', 'w') as f:
    f.write(tmp.to_html())

tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengvax')].groupby(['created_at_date'])['vader_compound'].sum().resample('M').sum().reset_index(),
    x='created_at_date', y='vader_compound', title='Tweets that contain the Dengvaxia Keyword - VADER per month'
)
with open('plots/sentiment/VADER_monthly_Tweets that contain the Dengvaxia Keyword.html', 'w') as f:
    f.write(tmp.to_html())


tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengvax')].groupby(['created_at_date'])['vader_compound'].sum().resample('Y').sum().reset_index(),
    x='created_at_date', y='vader_compound', title='Tweets that contain the Dengvaxia Keyword - VADER per year'
)
with open('plots/sentiment/VADER_yearly_Tweets that contain the Dengvaxia Keyword.html', 'w') as f:
    f.write(tmp.to_html())

In [15]:
nrc_cols = {col: 'nrc_' + col for col in ['anger',
       'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive',
       'sadness', 'surprise', 'trust']}

In [16]:
df = df.merge(
    nrc_df.rename(nrc_cols, axis=1)[nrc_cols.values()],
    left_index=True, right_index=True)

In [17]:
df['nrc_sum'] = df['nrc_positive'] - df['nrc_negative']

In [18]:
tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('vacc')].groupby(['created_at_date'])['nrc_sum'].sum().reset_index(),
    x='created_at_date', y='nrc_sum', title='Tweets that contain the Vaccine Keyword - NRC per day'
)

with open('plots/sentiment/NRC_daily_Tweets that contain the Vaccine Keyword.html', 'w') as f:
    f.write(tmp.to_html())

tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('vacc')].groupby(['created_at_date'])['nrc_sum'].sum().resample('M').sum().reset_index(),
    x='created_at_date', y='nrc_sum', title='Tweets that contain the Vaccine Keyword - NRC per month'
)
with open('plots/sentiment/NRC_monthly_Tweets that contain the Vaccine Keyword.html', 'w') as f:
    f.write(tmp.to_html())


tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('vacc')].groupby(['created_at_date'])['nrc_sum'].sum().resample('Y').sum().reset_index(),
    x='created_at_date', y='nrc_sum', title='Tweets that contain the Vaccine Keyword - NRC per year'
)
with open('plots/sentiment/NRC_yearly_Tweets that contain the Vaccine Keyword.html', 'w') as f:
    f.write(tmp.to_html())
    
tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengue')].groupby(['created_at_date'])['nrc_sum'].sum().reset_index(),
    x='created_at_date', y='nrc_sum', title=f'Tweets that contain the Dengue Keyword - NRC {emotion} per day'
)

with open('plots/sentiment/NRC_daily_Tweets that contain the Dengue Keyword.html', 'w') as f:
    f.write(tmp.to_html())

tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengue')].groupby(['created_at_date'])['nrc_sum'].sum().resample('M').sum().reset_index(),
    x='created_at_date', y='nrc_sum', title='Tweets that contain the Dengue Keyword - NRC per month'
)
with open('plots/sentiment/NRC_monthly_Tweets that contain the Dengue Keyword.html', 'w') as f:
    f.write(tmp.to_html())


tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengue')].groupby(['created_at_date'])['nrc_sum'].sum().resample('Y').sum().reset_index(),
    x='created_at_date', y='nrc_sum', title='Tweets that contain the Dengue Keyword - NRC per year'
)
with open('plots/sentiment/NRC_yearly_Tweets that contain the Dengue Keyword.html', 'w') as f:
    f.write(tmp.to_html())
    
tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengvax')].groupby(['created_at_date'])['nrc_sum'].sum().reset_index(),
    x='created_at_date', y='nrc_sum', title='Tweets that contain the Dengvaxia Keyword - NRC per day'
)

with open('plots/sentiment/NRC_daily_Tweets that contain the Dengvaxia Keyword.html', 'w') as f:
    f.write(tmp.to_html())

tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengvax')].groupby(['created_at_date'])['nrc_sum'].sum().resample('M').sum().reset_index(),
    x='created_at_date', y='nrc_sum', title='Tweets that contain the Dengvaxia Keyword - NRC per month'
)
with open('plots/sentiment/NRC_monthly_Tweets that contain the Dengvaxia Keyword.html', 'w') as f:
    f.write(tmp.to_html())


tmp = px.line(
    df[df['translated_text'].str.lower().str.contains('dengvax')].groupby(['created_at_date'])['nrc_sum'].sum().resample('Y').sum().reset_index(),
    x='created_at_date', y='nrc_sum', title='Tweets that contain the Dengvaxia Keyword - NRC per year'
)
with open('plots/sentiment/NRC_yearly_Tweets that contain the Dengvaxia Keyword.html', 'w') as f:
    f.write(tmp.to_html())

In [19]:
tmp = df.drop([
    'contributors', 'coordinates', 'entities','geo', 'id_str',
       'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'lang', 'matching_rules',
       'place', 'source', 'text', 'truncated','display_text_range', 'extended_entities', 'extended_tweet',
    'quoted_status_id_str', 'vader',
       'quoted_status_permalink', 
], axis=1)


In [20]:
tmp['user_id'] = tmp['user'].apply(lambda x: x['id'])
tmp['user_name'] = tmp['user'].apply(lambda x: x['screen_name'])

In [21]:
tmp['created_at'] = tmp['created_at'].astype(str)

tmp.drop('user', axis=1).to_excel('tweets_with_sentiment.xlsx')


Ignoring URL 'https://t.co/ZTVKFGmKYA%20SHOW%20na%20naman%20si%20Sen.%20Dick%20Gordon.%20No.1%20pa%20naman%20ikaw%20sa%20senator%20list%20ko%20kaso%20sa%20napapanood%20ko%20ngayon%20sa%20dengvaxia%20hearing%20nag.eexplain%20pa%20ung%20mga%20ininvite%20na%20resource%20person%20eh%20binabara%20mo%20na%20agad%20kasi%20ayaw%20mo%20nung%20sinasabi%20mapa-Pro%20or%20anti%20admin%20man.' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS


Ignoring URL 'https://t.co/lcG5aoJZ8S
%22Acosta‚Äôs%20allegations%20have%20resulted¬†in%20the%20decline%20in%20vaccine%20confidence%20&amp;%20a%20rise%20in%20cases%20of%20Measles%20&amp;%20other%20vaccine%20preventable%20diseases.
DOH%20noted%20that%20cases%20of%20measles%20&amp;%20rubella%20dramatically¬†increased%20to%20over%2018,000%20in%202018%20vs%203,804%20cases%20in%202017.%22%20üò†' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS


Ignoring URL 'https://t.co/lcG5aoJZ8S%20&quo

In [9]:
tmp = pd.read_excel('tweets_with_sentiment.xlsx')

In [17]:
tmp['keyword_vaccine'] = tmp['translated_text'].str.lower().str.contains('vacc') | tmp['translated_text'].str.lower().str.contains('bakun')

In [14]:
tmp['keyword_dengvaxia'] = tmp['translated_text'].str.lower().str.contains('dengvax') 

In [15]:
tmp['keyword_dengue'] = tmp['translated_text'].str.lower().str.contains('dengue') 

In [41]:
tmp.to_excel('tweets_with_sentiment.xlsx')

In [42]:
with open('NRC for Tweets with Vaccine keyword per Month.html', 'w') as f:
    t = px.line(tmp[tmp['keyword_vaccine']].groupby(['created_at_date'])[['nrc_anger', 'nrc_anticipation', 'nrc_disgust',
       'nrc_fear', 'nrc_joy', 'nrc_negative', 'nrc_positive', 'nrc_sadness',
       'nrc_surprise', 'nrc_trust']].sum().resample('M').sum().reset_index().melt('created_at_date'),
        x='created_at_date', y='value', color='variable',
        title='NRC for Tweets with Vaccine keyword per Month')
    f.write(t.to_html())

In [43]:
with open('NRC for Tweets with Dengue keyword per Month.html', 'w') as f:
    t = px.line(tmp[tmp['keyword_dengue'].fillna(False)].groupby(['created_at_date'])[['nrc_anger', 'nrc_anticipation', 'nrc_disgust',
       'nrc_fear', 'nrc_joy', 'nrc_negative', 'nrc_positive', 'nrc_sadness',
       'nrc_surprise', 'nrc_trust']].sum().resample('M').sum().reset_index().melt('created_at_date'),
        x='created_at_date', y='value', color='variable',
        title='NRC for Tweets with Dengue keyword per Month')
    f.write(t.to_html())

In [44]:
with open('NRC for Tweets with Dengvaxia keyword per Month.html', 'w') as f:
    t = px.line(tmp[tmp['keyword_dengvaxia'].fillna(False)].groupby(['created_at_date'])[['nrc_anger', 'nrc_anticipation', 'nrc_disgust',
       'nrc_fear', 'nrc_joy', 'nrc_negative', 'nrc_positive', 'nrc_sadness',
       'nrc_surprise', 'nrc_trust']].sum().resample('M').sum().reset_index().melt('created_at_date'),
        x='created_at_date', y='value', color='variable',
        title='NRC for Tweets with Dengvaxia keyword per Month')
    f.write(t.to_html())