In [1]:
%pylab inline
import os
import pickle
import json
import pandas as pd
from plotly import express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 
from wordcloud import WordCloud
from google.cloud import translate


Populating the interactive namespace from numpy and matplotlib


In [2]:
# List the files sitting directly inside data/ (only the first os.walk entry is
# used, so sub-directories are not descended into).
# os.walk yields nothing for a missing directory; turn the resulting bare
# StopIteration into an explicit error.
try:
    dirname, _, files = next(os.walk('data/'))
except StopIteration:
    raise FileNotFoundError("data directory 'data/' does not exist") from None
# The order os.walk reports files in is arbitrary; sort so the load order is
# the same on every run.
files = sorted(files)

In [3]:
# Load every pickled dump in the data directory into one flat list of records.
# A dump may hold a list of records or a single record.
# NOTE(security): pickle.load executes arbitrary code embedded in the file;
# only run this against dumps you produced yourself.
results = []
for filename in files:
    # os.path.join works whether or not dirname carries a trailing separator.
    with open(os.path.join(dirname, filename), 'rb') as handle:
        payload = pickle.load(handle)
    if isinstance(payload, list):
        results.extend(payload)
    else:
        results.append(payload)
# Drop records that carry a 'timePeriod' key.
# NOTE(review): these look like count/summary payloads rather than tweets - confirm.
results = [record for record in results if 'timePeriod' not in record]


In [4]:
# De-duplicate records: dicts are unhashable, so each one is serialised to a
# canonical JSON string (sorted keys) and the strings are collected in a set.
set_of_jsons = {json.dumps(d, sort_keys=True) for d in results}
# Set iteration order for strings changes between interpreter runs (hash
# randomisation); sorting makes the row order of X reproducible.
X = [json.loads(t) for t in sorted(set_of_jsons)]

In [5]:
# Number of unique records left after de-duplication.
len(X)

14923

In [6]:
# One row per unique record; columns are the union of the records' top-level keys.
df = pd.DataFrame(X)

In [7]:
# Parse the creation timestamps into pandas datetimes so the .dt accessor can be used below.
df['created_at'] = pd.to_datetime(df['created_at'])

In [8]:
# Calendar date (time of day dropped), used as the daily bucket for the activity plot.
df['created_at_date'] = df['created_at'].dt.date

In [9]:
# Prefer the untruncated text stored under extended_tweet['full_text']; fall
# back to the plain 'text' column when it is absent.
# Anything that is not a dict (NaN, None, ...) or lacks the 'full_text' key is
# treated as "no extended text" instead of raising.
df['full_text'] = df['extended_tweet'].apply(
    lambda extended: extended.get('full_text') if isinstance(extended, dict) else None
).fillna(df['text'])

In [10]:
# Search terms used to tag tweets; matching is done on lower-cased text.
keywords = [
    'dengvaxia',
    'vaccine',
    'bakuna',
    'vax',
    'dengue',
    'anti vax',
    'vaccination',
]

In [11]:
# Build a long table with one copy of each tweet per keyword it contains
# (a tweet matching several keywords appears once for each of them).
lowered_text = df['full_text'].str.lower()  # lower-case once, not once per keyword
matches = []
for word in keywords:
    # regex=False: keywords are literal substrings, not patterns.
    # na=False: rows with missing text simply do not match.
    hits = df[lowered_text.str.contains(word, regex=False, na=False)].copy()
    hits['keyword'] = word
    matches.append(hits)
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; concatenate once instead.
tmp = pd.concat(matches) if matches else pd.DataFrame()

In [12]:
# Daily tweet counts per search term, exported as a standalone HTML page.
daily_counts = (
    tmp.groupby(['keyword', 'created_at_date'])['text']
    .count()  # counts rows with a non-null 'text' value
    .reset_index()
    .rename({'text': 'tweet_count', 'created_at_date': 'date'}, axis=1)
)
fig = px.line(
    daily_counts,
    x='date', y='tweet_count', color='keyword',
    title='Tweet activity by Search Term')
# Make sure the output directory exists, and write UTF-8 explicitly so the
# export does not depend on the platform's default encoding.
os.makedirs('plots', exist_ok=True)
with open('plots/tweet_activity_by_search_term.html', 'w', encoding='utf-8') as f:
    f.write(fig.to_html())

In [13]:
# Tokens excluded from the word clouds: scikit-learn's English stop words plus
# the extra tokens listed below.
extra_stop_words = [
    'https', 'co', 'to',
    'sa', 'at', 'na', 'ang', 'ng', 'mga', 'ko', 'ay', 'ako', 'lang',
    'hindi', 'di', 'pa', 'mo', 'may', 'kung', 'ni', 'naman', 'si', 'ka',
    'ito', 'yung', 'ung', 'para', 'isa', "isa't", 'nga', 'ba', 'pero',
    'siya', 'niya', 'nya', 'rin', 'din', 'kaya', 'o', 'kasi', 'kase',
    'dahil', 'wala', 'nang', 'lahat', 'nila', 'sila', 'kami', 'talaga',
    'ngayon', 'ano', 'anu-ano', 'alam', 'pô', 'eh', 'iyon', 'yon', 'yun',
    'kahit', 'dito', 'namin', 'kaniya', 'kanya', 'tao',
]
stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_words)

In [None]:
# One word cloud per (year, month) of tweet activity, saved as a PNG under plots/.
os.makedirs('plots', exist_ok=True)
created = df['created_at'].dt
# groupby visits the (year, month) keys in ascending order and hands over each
# month's rows directly, so no boolean mask is rebuilt per iteration.
by_month = df.groupby([created.year.rename('year'), created.month.rename('month')])
for (y, m), monthly in by_month:
    corpus = ' '.join(monthly['full_text'].dropna())
    if not corpus.strip():
        # Nothing to draw for this month.
        continue
    wc = WordCloud(stopwords=stop_words, background_color='white', width=1000, height=800, random_state=42).generate(corpus)
    # A fresh figure per month: drawing on the implicit current figure would
    # stack every image onto the same axes and never release them.
    fig, ax = plt.subplots(figsize=(20, 20))
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(f'{y} - {m} - Word Cloud')
    fig.savefig(f'plots/{y} - {m} - Word Cloud.png', bbox_inches='tight')
    plt.show()
    plt.close(fig)

In [13]:
# Total number of characters across all tweets when joined with single spaces.
# NOTE(review): presumably a size estimate for the translation step below - confirm.
len(" ".join(df['full_text'].values))

1562316

In [14]:
# Instantiate the translation client from a service-account key file located
# in the notebook's working directory.
# NOTE(review): the key file name is hard-coded; consider reading the path from
# an environment variable, and make sure the key file is never committed.
translate_client = translate.Client.from_service_account_json('twitter-vaccine-59bc3f2229fe.json')


In [None]:
# Translate every tweet to English, one API call per tweet. Results are
# appended to `res` in the same order as the rows of df.
target = 'en'  # target language (loop-invariant)
res = []
# The loop variable is deliberately NOT called `text`: that name is the
# sklearn.feature_extraction.text module imported at the top of the notebook,
# and shadowing it breaks the stop-words cell on re-run.
for tweet in df['full_text']:
    # Strip every non-ASCII character before logging and sending.
    ascii_tweet = tweet.encode('ascii', errors='ignore').decode()
    print(f"translating: {ascii_tweet}")

    # Translate into English. The source language is fixed to 'tl' (Tagalog).
    # NOTE(review): tweets written in other languages are also sent as 'tl' - confirm this is intended.
    translation = translate_client.translate(
        ascii_tweet.replace(', ', ' '),
        target_language=target, source_language='tl')

    res.append(translation)
    print(f"Result: {translation}")

In [25]:
# Sanity check: there should be one translation per row of df.
len(res)

14923

In [26]:
# Attach the translation results positionally; res was built by iterating
# df['full_text'] in row order, so entry i belongs to row i.
df['translated_text'] = res

In [27]:
# Persist the frame, including the translation results, to a pickle file in the
# working directory.
df.to_pickle('translated_tweets.pkl')