In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import re

from tqdm import tqdm
from tokenize import tokenize
from clean_tokenizer import tokenize_tweets

# Import module from gsdmm repository
sys.path.insert(0, '../gsdmm/')
from gsdmm import MovieGroupProcess

In [None]:
# Location of the preprocessed clean tweet CSV
# NOTE: despite its name, data_dir holds the path of a single file, not a directory
data_dir = r"E:\Project\Data\tweet_data\5.0 Tokenized_Topic_Modeling\clean_local_time_2019-03_2020-02_state_level_tokenized.csv"

# Read the CSV into a dataframe and preview its first two rows
tweets_df = pd.read_csv(filepath_or_buffer=data_dir)
tweets_df.head(n=2)

### Short Text Topic Modeling (STTM)

In [None]:
# Make every clean tweet a string. Missing values are replaced with an empty
# string first: astype(str) alone would turn NaN into the literal text 'nan',
# which would then be tokenized as if it were a real word.
tweets_df['clean_tweet'] = tweets_df['clean_tweet'].fillna('').astype(str)

In [None]:
# Convert cleaned tweet into tokens list.
# str.split() with no separator splits on runs of whitespace and drops empty
# strings, so leading/trailing/repeated whitespace no longer produces '' tokens
# (the previous re.split('\s', x) also relied on an invalid '\s' escape in a
# non-raw string literal).
tweets_df['clean_tokens'] = tweets_df['clean_tweet'].str.split()

# Create list of tweet tokens (one token list per tweet)
docs = tweets_df['clean_tokens'].tolist()

In [None]:
%%time

# Train STTM model
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
# Parameter meanings below follow the gsdmm README -- confirm against the copy in ../gsdmm:
# K = upper bound on the number of topics/clusters (which we don't know a priori)
# alpha = controls how readily a document is placed in a currently empty cluster
# beta = controls how strongly a document prefers clusters with similar words
# n_iters = number of sampling iterations to run over the documents

# Fix the seed so repeated runs give the same clustering.
# NOTE(review): assumes gsdmm samples through NumPy's global random state -- confirm.
np.random.seed(42)

# Vocabulary size = number of distinct tokens across all documents
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)

mgp = MovieGroupProcess(K=6, alpha=0.1, beta=0.1, n_iters=30)
y = mgp.fit(docs, n_terms)

In [None]:
# Persist the trained model to disk with pickle;
# the with-block closes the file when it exits
with open(r'C:\Users\Administrator\Desktop\topic-modeling-health-tweets-master\dumps\trained_models\5clusters_2019.model', 'wb') as model_file:
    pickle.dump(mgp, model_file)

In [None]:
# Load in the previously saved trained model (its number of clusters is mgp.K).
# The with-block guarantees the file handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code -- only load model files you trust.
with open(r'C:\Users\Administrator\Desktop\topic-modeling-health-tweets-master\dumps\trained_models\5clusters_2019.model', 'rb') as filehandler:
    mgp = pickle.load(filehandler)

In [None]:
# Helper function
def top_words(cluster_word_distribution, top_cluster, values):
    '''prints the top words in each cluster

    cluster_word_distribution: sequence indexed by cluster, each entry a
        mapping of word -> count
    top_cluster: iterable of cluster indices to print
    values: how many of the highest-count words to show per cluster
    '''
    for cluster in top_cluster:
        # Use the distribution passed in (previously the global `mgp` was read
        # here and the argument was silently ignored)
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print(' — — — — — — — — —')

In [None]:
# Number of documents assigned to each cluster
doc_count = np.array(mgp.cluster_doc_count)
separator = '*'*20
print('Number of documents per topic :', doc_count)
print(separator)

# Rank clusters by document count, largest first, keeping at most ten
top_index = np.argsort(doc_count)[::-1][:10]
print('Most important clusters (by number of docs inside):', top_index)
print(separator)

# Show the 10 highest-count words for every cluster
topic_indices = np.arange(len(doc_count))
top_words(mgp.cluster_word_distribution, topic_indices, 10)

In [None]:
# Helper function
def cluster_importance(mgp):
    '''returns a word-topic matrix[phi] where each value represents
    the word importance for that particular cluster;
    phi[i][w] would be the importance of word w in topic i.

    mgp: object exposing cluster_word_distribution (per-cluster mapping of
        word -> count), beta, vocab_size and K.
    Returns a list of K dicts; phi[z][w] = (count + beta) / (cluster total + V*beta).
    '''
    n_z_w = mgp.cluster_word_distribution
    beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
    phi = []
    for z in range(K):
        word_counts = n_z_w[z]
        # The denominator is the same for every word in the cluster, so compute
        # it once instead of re-summing all counts for each word
        denominator = sum(word_counts.values()) + V * beta
        phi.append({w: (count + beta) / denominator for w, count in word_counts.items()})
    return phi

In [None]:
# Helper function(s)
def get_topic_name(doc, topic_dict):
    '''looks up `doc` as a key in `topic_dict` and returns the stored topic name'''
    return topic_dict[doc]

def topic_allocation(df, docs, mgp, topic_dict):
    '''labels every document with its best cluster and stores the result on df.

    Adds two columns to `df` in place: 'dominant_topic' (the label returned by
    mgp.choose_best_label for each entry of `docs`) and 'topic_name' (that
    label looked up in `topic_dict`). Prints a completion message.
    '''
    # choose_best_label returns a pair; only its first element (the label) is kept
    best_labels = [mgp.choose_best_label(tokens)[0] for tokens in tqdm(docs)]
    df['dominant_topic'] = best_labels

    # Translate each numeric label into its descriptive name
    df['topic_name'] = df['dominant_topic'].apply(get_topic_name, args=(topic_dict,))
    print('Complete. Number of documents with topic allocated: {}'.format(len(df)))
        

In [None]:
# Descriptive names for the clusters, listed in the same sequential order as
# the cluster indices printed from the STTM model above
topic_names = [
    'health insurance',
    'virus/outbreaks',
    'cancer studies affecting woman/babies',
    'miscellaneous studies affecting women/children',
    'cancer and heart disease',
    'diet and excercise',
    'health and medical workers',
    'abortion',
    'vaping and cigarettes',
    'drug costs and opioid crisis',
]

# Map each cluster index to the name found at the same position in the list
topic_dict = {topic_num: topic_names[position] for position, topic_num in enumerate(topic_indices)}

# Write the dominant topic and its name onto the original dataframe
topic_allocation(tweets_df, docs, mgp, topic_dict)

In [None]:
# Record the cluster count of the model actually in use (mgp.K) instead of a
# hard-coded number that can drift from the trained/loaded model
tweets_df['num_clusters'] = mgp.K

# Spot-check ten random tweets with their allocated topic
tweets_df[['num_clusters','text', 'dominant_topic','topic_name']].sample(n=10)

### Heat Map Visualizations by Year/User

In [None]:
# Re-index dominant topics by user (news source).
# set_index builds a new object, so the 'dominant_topic' column held by
# tweets_df keeps its original index (assigning .index on the Series returned
# by tweets_df['dominant_topic'] can alter the column object the frame hands
# back on later lookups).
reindexed_tweets = tweets_df.set_index('username')['dominant_topic']

In [None]:
from collections import Counter

# Helper function
def keys_to_counts(keys):
    '''
    tallies how often each key occurs and returns a tuple
    (categories, counts): the distinct keys in ascending order and,
    position by position, the number of occurrences of each
    '''
    tally = Counter(keys)
    categories = sorted(tally)
    counts = [tally[category] for category in categories]
    return (categories, counts)

In [None]:
# Generate counts of tweets per topic across each news source
n_topics = mgp.K
news_sources = tweets_df['username'].unique()

# Cross-tabulate source x topic from the raw column values, then reindex so
# every source has exactly one column per topic id (0 .. n_topics-1).
# Topics a source never tweeted about get an explicit 0 instead of being
# skipped, which previously shifted counts under the wrong topic label or
# produced rows of unequal length.
user_topic_counts = (
    pd.crosstab(tweets_df['username'].to_numpy(), tweets_df['dominant_topic'].to_numpy())
    .reindex(index=news_sources, columns=range(n_topics), fill_value=0)
    .rename_axis(index=None, columns=None)
)
user_topic_counts.columns = ['Topic {}'.format(i) for i in range(n_topics)]
user_topic_counts

In [None]:
# Add column to sum total topics.
# Only the per-topic columns are summed, so re-running this cell does not fold
# an already present 'total_topics' column back into the total.
topic_columns = [col for col in user_topic_counts.columns if col != 'total_topics']
user_topic_counts['total_topics'] = user_topic_counts[topic_columns].sum(axis=1)

# Convert topic counts to a fraction of each news source's total
user_topic_counts_ratio = user_topic_counts[topic_columns].div(user_topic_counts['total_topics'], axis=0)
user_topic_counts_ratio

In [None]:
# Display heat map of topics vs news sources
fig, ax = plt.subplots(figsize=(14,10))
sns.set(font_scale=1)
sns.heatmap(user_topic_counts_ratio, cmap="YlGnBu", ax=ax)

# Label the figure so it can be read on its own
ax.set(title='Share of tweets per topic by news source', xlabel='Topic', ylabel='News source')
plt.show()

In [None]:
# Parse the date column; values that cannot be parsed are coerced to NaT
parsed_dates = pd.to_datetime(tweets_df['date'], errors='coerce')
tweets_df['date'] = parsed_dates

# Derive the calendar year of each tweet from the parsed dates
tweets_df['year'] = parsed_dates.dt.year

In [None]:
# Re-index dominant topics by year.
# set_index builds a new object, so the 'dominant_topic' column held by
# tweets_df keeps its original index (assigning .index on the Series returned
# by tweets_df['dominant_topic'] can alter the column object the frame hands
# back on later lookups).
reindexed_tweets = tweets_df.set_index('year')['dominant_topic']

In [None]:
# Generate counts of tweets per topic across years
n_topics = mgp.K

# Work on the raw column values; rows whose date failed to parse have no year
# and are left out of the yearly counts
tweet_years = tweets_df['year'].to_numpy()
tweet_topics = tweets_df['dominant_topic'].to_numpy()
has_year = ~pd.isna(tweet_years)
valid_years = tweet_years[has_year].astype(int)

# Years actually present in the data, ascending (replaces the hard-coded
# 2014-2020 index, which fails whenever the data covers a different span)
years_range = np.unique(valid_years).tolist()

# Cross-tabulate year x topic, then reindex so every year has one column per
# topic id; topics absent in a year get an explicit 0 rather than shifting
# the remaining counts under the wrong topic label
yearly_topic_counts = (
    pd.crosstab(valid_years, tweet_topics[has_year])
    .reindex(index=years_range, columns=range(n_topics), fill_value=0)
    .rename_axis(index=None, columns=None)
)
yearly_topic_counts.columns = ['Topic {}'.format(i) for i in range(n_topics)]

yearly_topic_counts

In [None]:
# Display heat map of topics vs years
fig, ax = plt.subplots(figsize=(14,10))
sns.heatmap(yearly_topic_counts, cmap="YlGnBu", ax=ax)

# Label the figure so it can be read on its own
ax.set(title='Tweets per topic by year', xlabel='Topic', ylabel='Year')
plt.show()