In [2]:
import pickle
import pandas as pd
import numpy as np
import sys
import re

from tqdm import tqdm
from tokenize import tokenize

# Import module from gsdmm repository
sys.path.insert(0, '../gsdmm/')
from gsdmm import MovieGroupProcess

In [3]:
# Import customs module to tokenize and clean tweet dataset
from clean_tokenizer import tokenize_tweets
data_dir = '../input/english_health_tweets.csv'
tweets_df = pd.read_csv(data_dir) # to be used with cleaned tokenized english only csv
# tweets_df = tokenize_tweets(data_dir) # to be used with non-clean raw health tweets csv
tweets_df.head()

Unnamed: 0,id,date,timezone,tweet,hashtags,username,name,day,hour,retweet,nlikes,nreplies,nretweets,clean_tweet
0,1244004590699384833,2020-03-28 20:53:01,UTC,"Schlifke said, he and members of the CovidVent...",['#covid19'],KHNews,Kaiser Health News,4,3,False,0,0,1,schlifke say members covidvent coalition help ...
1,1243979929437507585,2020-03-28 19:15:01,UTC,Millions of Americans are seeking care by conn...,"['#covid19', '#telemedicine']",KHNews,Kaiser Health News,6,2,False,6,0,2,millions americans seek care connect doctor el...
2,1243956772123090944,2020-03-28 17:43:00,UTC,We're following the #coronaviruspandemic close...,"['#coronaviruspandemic', '#healthcare', '#heal...",KHNews,Kaiser Health News,5,4,False,0,0,1,follow closely bring best investigation surrou...
3,1243941673270460418,2020-03-28 16:43:00,UTC,About $100 billion of the funding is intended ...,"['#coronavirus', '#relieffunds', '#healthbent']",KHNews,Kaiser Health News,5,12,False,1,0,1,billion fund intend put reimburse eligible hea...
4,1243911473480556544,2020-03-28 14:43:00,UTC,Read KHN's top #COVID19 coverage: The U.S. mil...,['#covid19'],KHNews,Kaiser Health News,6,4,False,4,1,7,read coverage military fly specialize swab ita...


### Short Term Text Modeling (STTM)

In [4]:
# Convert cleaned tweet into tokens list
tweets_df['clean_tokens'] = tweets_df.clean_tweet.apply(lambda x: re.split('\s', x))

In [5]:
# Create list of tweet tokens
docs = tweets_df['clean_tokens'].tolist()
docs

[['schlifke',
  'say',
  'members',
  'covidvent',
  'coalition',
  'help',
  'organize',
  'federal',
  'executive',
  'order',
  'enable',
  'conversion',
  'surgery',
  'center',
  'hospital',
  'care',
  'sit'],
 ['millions',
  'americans',
  'seek',
  'care',
  'connect',
  'doctor',
  'electronically',
  'time'],
 ['follow',
  'closely',
  'bring',
  'best',
  'investigation',
  'surround',
  'outbreak',
  'read',
  'stories'],
 ['billion',
  'fund',
  'intend',
  'put',
  'reimburse',
  'eligible',
  'health',
  'care',
  'providers',
  'health',
  'care',
  'relate',
  'expense',
  'lose',
  'revenues',
  'attributable'],
 ['read',
  'coverage',
  'military',
  'fly',
  'specialize',
  'swab',
  'italy',
  'chicago',
  'area',
  'medical',
  'supply',
  'firm',
  'take',
  'sky',
  'weekslong',
  'boat',
  'trip',
  'ocean'],
 ['hospitals',
  'patients',
  'community',
  'health',
  'center',
  'rework',
  'care',
  'patients'],
 ['nurse', 'sit', 'sideline', 'want', 'help', 'kn

In [132]:
%%time

# Train STTM model
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
# K = number of potential topic (which we don't know a priori)
# alpha = 
# beta = 
# n_iters = number of iterations to 
mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.1, n_iters=30)
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
y = mgp.fit(docs, n_terms)

# Save model
with open('dumps/trained_models/v3.model', 'wb') as f:
    pickle.dump(mgp, f)
    f.close()

In stage 0: transferred 161638 clusters with 13 clusters populated
In stage 1: transferred 125678 clusters with 13 clusters populated
In stage 2: transferred 89367 clusters with 13 clusters populated
In stage 3: transferred 65696 clusters with 13 clusters populated
In stage 4: transferred 54166 clusters with 13 clusters populated
In stage 5: transferred 48752 clusters with 13 clusters populated
In stage 6: transferred 45499 clusters with 13 clusters populated
In stage 7: transferred 43169 clusters with 13 clusters populated
In stage 8: transferred 41604 clusters with 13 clusters populated
In stage 9: transferred 40111 clusters with 13 clusters populated
In stage 10: transferred 39128 clusters with 13 clusters populated
In stage 11: transferred 38576 clusters with 13 clusters populated
In stage 12: transferred 37981 clusters with 13 clusters populated
In stage 13: transferred 37167 clusters with 13 clusters populated
In stage 14: transferred 36776 clusters with 13 clusters populated
In 

In [6]:
# Load in trained model with 10 topics 
filehandler = open('dumps/trained_models/v1.model', 'rb')
mgp = pickle.load(filehandler)

In [7]:
# Helper function
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print(' — — — — — — — — —')

In [8]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)
# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)
# Show the top 5 words in term frequency for each cluster 
top_words(mgp.cluster_word_distribution, top_index, 5)

Number of documents per topic : [18387 21321 18434 22806 20195 26692 26134  8562  6816 13494]
********************
Most important clusters (by number of docs inside): [5 6 3 1 4 2 0 9 7 8]
********************
Cluster 5 : [('help', 2031), ('diet', 1855), ('healthy', 1701), ('health', 1539), ('weight', 1456)]
 — — — — — — — — —
Cluster 6 : [('health', 3405), ('care', 3100), ('doctor', 2999), ('patients', 2380), ('nurse', 1654)]
 — — — — — — — — —
Cluster 3 : [('study', 3204), ('risk', 1917), ('health', 1601), ('say', 1562), ('kid', 1401)]
 — — — — — — — — —
Cluster 1 : [('ebola', 3734), ('case', 2347), ('coronavirus', 2330), ('outbreak', 2280), ('zika', 1851)]
 — — — — — — — — —
Cluster 4 : [('cancer', 4738), ('study', 2504), ('drug', 2017), ('risk', 1852), ('disease', 1664)]
 — — — — — — — — —
Cluster 2 : [('baby', 1176), ('woman', 1131), ('die', 1081), ('cancer', 1022), ('year', 941)]
 — — — — — — — — —
Cluster 0 : [('health', 6114), ('care', 2585), ('plan', 1651), ('obamacare', 1597)

In [12]:
# Helper function to get the word importance for each topic
def cluster_importance(mgp):
    n_z_w = mgp.cluster_word_distribution
    beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
    phi = [{} for i in range(K)]        
    for z in range(K):
        for w in n_z_w[z]:
            phi[z][w] = (n_z_w[z][w]+beta)/(sum(n_z_w[z].values())+V*beta)
    return phi
phi = cluster_importance(mgp)

# phi[i][w] would be the importance of word w in topic i.

In [24]:
phi[1]['ebola'] # verify for topic 2 on virus 

0.025117225152337205

In [None]:
# Helper function to attribute topics to documents
# def topic_attribution(data, model, topic_dict, threshold):
    
    


In [1]:
# topic_dict = {}
# topic_names = ['diet and excercise',
#                'health and medical workers',
#                'miscellaneous studies affecting women/children',
#                'virus/outbreaks',
#                'cancer and heart disease',
#                'cancer studies affecting woman/babies',
#                'health insurance',
#                'drug costs and opiod crisis',
#                'abortion',
#                'vaping and cigarettes']

# for i, topic_num in enumerate(top_index):
#     topic_dict[topic_num]=topic_names[i] 
    
# # Create dataframe 
# pred_df = topic_attribution(tweets_df, mgp, topic_dict, threshold=0.4)