In [1]:
%pylab inline
import os
import pickle
import json
import pandas as pd
from plotly import express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import scattertext as st

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the DataFrame that the rest of the notebook works on; later cells read its
# 'created_at_date' and 'full_text' columns.
# NOTE(review): unpickling can execute arbitrary code -- only load a file you
# produced yourself or otherwise trust.
df = pd.read_pickle('translated_tweets.pkl')

In [3]:
# Stop-word set passed to scattertext's `remove_terms` further down:
# scikit-learn's English list plus extra tokens to exclude (laughter, URL
# fragments and what look like Filipino function words -- TODO confirm).
additional_stopwords = set(ENGLISH_STOP_WORDS).union([
    'na', 'https', 'ha', 'hahaha', 'haha', 'lang', 'ang', 'yan', 'ng', 'sa',
    'rin', 'yun', 'yang', 'si', 'ako', 'siya', 'ka', 'po',
    'mga', 'yung', 'pa', 'pala', 'na', 'ni', 'sya', 'ba', 'ko', 'nyo', 'man',
])

In [4]:
# Parse the timestamp column once, then expose its calendar parts as their own
# columns; the grouping cells below key on YEAR and MONTH.
df['created_at_date'] = pd.to_datetime(df['created_at_date'])
created_parts = df['created_at_date'].dt
df['YEAR'], df['MONTH'], df['DAY'] = (
    created_parts.year,
    created_parts.month,
    created_parts.day,
)

In [5]:
# For every (YEAR, MONTH) group and every keyword: label each row as
# "<keyword>" or "not <keyword>", fit an NMF topic model on that group's text
# and write a scattertext explorer comparing the two labels to an HTML file.
output_dir = 'plots/scattertext/by_term/raw'
os.makedirs(output_dir, exist_ok=True)  # open(..., 'w') will not create missing folders
keywords = ['dengvax', 'vacci', 'dengue']

for period, period_df in df.sort_values(['YEAR', 'MONTH']).groupby(['YEAR', 'MONTH']):
    print(period)
    # NOTE(review): this skips months 1-7 of every year up to 2016, so e.g.
    # (2015, 11) is still processed. If the intent was "start at 2016-08",
    # `period < (2016, 8)` would express that -- behaviour left unchanged here.
    if period[0] <= 2016 and period[1] < 8:
        continue

    # Rows without text can be neither matched nor parsed.
    period_df = period_df.dropna(subset=['full_text'])
    # Lower-case once per group instead of once per keyword test.
    text_lower = period_df['full_text'].str.lower()

    for word in keywords:
        print(word)
        has_keyword = text_lower.str.contains(word, na=False, regex=False)
        if word == 'vacci':
            # 'bakuna' counts as a hit for the 'vacci' keyword as well.
            has_keyword = has_keyword | text_lower.str.contains('bakuna', na=False, regex=False)

        # Always start from the untouched group and split it with ONE mask and
        # its exact complement. Previously the group variable was overwritten on
        # every keyword and the negative side only tested `word`, so 'bakuna'
        # rows ended up in both categories and were carried over as duplicates.
        labelled = period_df.assign(
            full_text=text_lower,
            keyword=has_keyword.map({True: word, False: 'not ' + word}),
        )
        # Keep the original row order: keyword rows first, the rest after.
        labelled = pd.concat([labelled[has_keyword], labelled[~has_keyword]])

        try:
            labelled['parse'] = labelled['full_text'].apply(st.whitespace_nlp_with_sentences)

            unigram_corpus = (
                st.CorpusFromParsedDocuments(labelled,
                                             category_col='keyword',
                                             parsed_col='parse')
                .build()
                .remove_terms(additional_stopwords, ignore_absences=True)
                .get_stoplisted_unigram_corpus()
            )

            # NOTE(review): NMF's `alpha` argument was replaced by
            # alpha_W/alpha_H in newer scikit-learn releases; kept as-is because
            # the notebook's imports point at an older version -- confirm.
            topic_model_nmf = st.SentencesForTopicModeling(unigram_corpus).get_topics_from_model(
                Pipeline([
                    ('tfidf', TfidfTransformer(sublinear_tf=True)),
                    ('nmf', (NMF(n_components=100, alpha=.1, l1_ratio=.5, random_state=0)))
                ]),
                num_terms_per_topic=20
            )

            topic_feature_builder_nmf = st.FeatsFromTopicModel(topic_model_nmf)

            # Second corpus over the same rows, featurised by topic instead of by term.
            topic_corpus_nmf = st.CorpusFromParsedDocuments(
                labelled,
                category_col='keyword',
                parsed_col='parse',
                feats_from_spacy_doc=topic_feature_builder_nmf
            ).build()

            html = st.produce_scattertext_explorer(
                topic_corpus_nmf,
                category=word,
                width_in_pixels=1000,
                use_non_text_features=True,
                use_full_doc=True,
                pmi_threshold_coefficient=0,
                topic_model_term_lists=topic_feature_builder_nmf.get_top_model_term_lists(),
                topic_model_preview_size=20
            )
            # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
            with open(f'{output_dir}/{period}_{word} - NMF Topic Model.html', 'w', encoding='utf-8') as f:
                f.write(html)
        except Exception as exc:
            # Best effort per (period, keyword): report why it was skipped rather
            # than hiding it, and let KeyboardInterrupt stop the run.
            print(f'  skipped {period} / {word}: {exc!r}')
            continue

(2015, 11)
dengvax
vacci
dengue
(2015, 12)
dengvax
vacci
dengue
(2016, 1)
(2016, 2)
(2016, 3)
(2016, 4)
(2016, 5)
(2016, 6)
(2016, 7)
(2016, 8)
dengvax
vacci
dengue
(2016, 9)
dengvax
vacci
dengue
(2016, 10)
dengvax
vacci
dengue
(2016, 11)
dengvax
vacci
dengue
(2016, 12)
dengvax
vacci
dengue
(2017, 1)
dengvax
vacci
dengue
(2017, 2)
dengvax
vacci
dengue
(2017, 3)
dengvax
vacci
dengue
(2017, 4)
dengvax
vacci
dengue
(2017, 5)
dengvax
vacci
dengue
(2017, 6)
dengvax
vacci
dengue
(2017, 7)
dengvax
vacci
dengue
(2017, 8)
dengvax
vacci
dengue
(2017, 9)
dengvax
vacci
dengue
(2017, 10)
dengvax
vacci
dengue
(2017, 11)
dengvax
vacci
dengue
(2017, 12)
dengvax
vacci
dengue
(2018, 1)
dengvax
vacci
dengue
(2018, 2)
dengvax
vacci
dengue
(2018, 3)
dengvax
vacci
dengue
(2018, 4)
dengvax
vacci
dengue
(2018, 5)
dengvax
vacci
dengue
(2018, 6)
dengvax
vacci
dengue
(2018, 7)
dengvax
vacci
dengue
(2018, 8)
dengvax
vacci
dengue
(2018, 9)
dengvax
vacci
dengue
(2018, 10)
dengvax
vacci
dengue
(2018, 11)
dengvax
vac

In [6]:
# Same analysis as the monthly cell, but with one group per calendar year:
# label each row as "<keyword>" or "not <keyword>", fit an NMF topic model on
# the year's text and write a scattertext explorer to an HTML file.
output_dir = 'plots/scattertext/by_term/raw'
os.makedirs(output_dir, exist_ok=True)  # open(..., 'w') will not create missing folders
keywords = ['dengvax', 'vacci', 'dengue']

# Group on the scalar column name: a one-element list makes newer pandas yield
# 1-tuple keys such as (2015,), which would change the output file names.
for year, year_df in df.sort_values(['YEAR', 'MONTH']).groupby('YEAR'):
    print(year)

    # Rows without text can be neither matched nor parsed.
    year_df = year_df.dropna(subset=['full_text'])
    # Lower-case once per group instead of once per keyword test.
    text_lower = year_df['full_text'].str.lower()

    for word in keywords:
        print(word)
        has_keyword = text_lower.str.contains(word, na=False, regex=False)
        if word == 'vacci':
            # 'bakuna' counts as a hit for the 'vacci' keyword as well.
            has_keyword = has_keyword | text_lower.str.contains('bakuna', na=False, regex=False)

        # Always start from the untouched group and split it with ONE mask and
        # its exact complement. Previously the group variable was overwritten on
        # every keyword and the negative side only tested `word`, so 'bakuna'
        # rows ended up in both categories and were carried over as duplicates.
        labelled = year_df.assign(
            full_text=text_lower,
            keyword=has_keyword.map({True: word, False: 'not ' + word}),
        )
        # Keep the original row order: keyword rows first, the rest after.
        labelled = pd.concat([labelled[has_keyword], labelled[~has_keyword]])

        try:
            labelled['parse'] = labelled['full_text'].apply(st.whitespace_nlp_with_sentences)

            unigram_corpus = (
                st.CorpusFromParsedDocuments(labelled,
                                             category_col='keyword',
                                             parsed_col='parse')
                .build()
                .remove_terms(additional_stopwords, ignore_absences=True)
                .get_stoplisted_unigram_corpus()
            )

            # NOTE(review): NMF's `alpha` argument was replaced by
            # alpha_W/alpha_H in newer scikit-learn releases; kept as-is because
            # the notebook's imports point at an older version -- confirm.
            topic_model_nmf = st.SentencesForTopicModeling(unigram_corpus).get_topics_from_model(
                Pipeline([
                    ('tfidf', TfidfTransformer(sublinear_tf=True)),
                    ('nmf', (NMF(n_components=100, alpha=.1, l1_ratio=.5, random_state=0)))
                ]),
                num_terms_per_topic=20
            )

            topic_feature_builder_nmf = st.FeatsFromTopicModel(topic_model_nmf)

            # Second corpus over the same rows, featurised by topic instead of by term.
            topic_corpus_nmf = st.CorpusFromParsedDocuments(
                labelled,
                category_col='keyword',
                parsed_col='parse',
                feats_from_spacy_doc=topic_feature_builder_nmf
            ).build()

            html = st.produce_scattertext_explorer(
                topic_corpus_nmf,
                category=word,
                width_in_pixels=1000,
                use_non_text_features=True,
                use_full_doc=True,
                pmi_threshold_coefficient=0,
                topic_model_term_lists=topic_feature_builder_nmf.get_top_model_term_lists(),
                topic_model_preview_size=20
            )
            # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
            with open(f'{output_dir}/{year}_{word} - NMF Topic Model.html', 'w', encoding='utf-8') as f:
                f.write(html)
        except Exception as exc:
            # Best effort per (year, keyword): report why it was skipped rather
            # than hiding it, and let KeyboardInterrupt stop the run.
            print(f'  skipped {year} / {word}: {exc!r}')
            continue

2015
dengvax
vacci
dengue
2016
dengvax
vacci
dengue
2017
dengvax
vacci
dengue
2018
dengvax
vacci
dengue
2019
dengvax
vacci
dengue
