In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import re

from tqdm import tqdm
from tokenize import tokenize
from clean_tokenizer import tokenize_tweets

# Import module from gsdmm repository
sys.path.insert(0, '../gsdmm/')
from gsdmm import MovieGroupProcess

In [3]:
# Import preprocessed clean tweet data 
data_dir = r'E:\OneDrive - University of Georgia\Project\Data\tweet_data_3_groccery\4.3 FourColumns 3+1csv - Copy\grocery_2021_tokenized.csv'
tweets_df = pd.read_csv(data_dir) 
tweets_df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet,lon,lat,local_time,clean_tweet
0,0,0,The BLOCC LIFE Shop is bacc up and running... ...,-118.018856,33.975918,1/20/2021 15:59,blocc life shop bacc run sure stop blocc life ...
1,1,1,"I'm at Costco in North Riverside, IL https://t...",-87.829289,41.846539,1/20/2021 17:59,costco north riverside


### Short Term Text Modeling (STTM)

In [5]:
tweets_df['clean_tweet'] = tweets_df['clean_tweet'].astype(str)

# Convert cleaned tweet into tokens list
tweets_df['clean_tokens'] = tweets_df.clean_tweet.apply(lambda x: re.split('\s', x))

print(tweets_df['clean_tokens'])

0         [blocc, life, shop, bacc, run, sure, stop, blo...
1                                [costco, north, riverside]
2                                                   [foods]
3         [see, friend, soror, kroger, pearl, pink, gree...
4         [gonna, hate, kodak, home, sign, cancel, steph...
                                ...                        
129985    [apparently, people, complain, biden, harris, ...
129986    [costco, folsom, california, family, members, ...
129987    [important, data, support, safety, goals, bow,...
129988              [literally, think, trick, publix, vest]
129989                                      [look, package]
Name: clean_tokens, Length: 129990, dtype: object


In [6]:
# Create list of tweet tokens
docs = tweets_df['clean_tokens'].tolist()

docs

[['blocc',
  'life',
  'shop',
  'bacc',
  'run',
  'sure',
  'stop',
  'blocc',
  'life',
  'gear',
  'support',
  'movement'],
 ['costco', 'north', 'riverside'],
 ['foods'],
 ['see', 'friend', 'soror', 'kroger', 'pearl', 'pink', 'green', 'week'],
 ['gonna',
  'hate',
  'kodak',
  'home',
  'sign',
  'cancel',
  'stephen',
  'ross',
  'micky',
  'arison',
  'company',
  'like',
  'publix',
  'chick'],
 ['unable',
  'local',
  'demand',
  'handle',
  'high',
  'theft',
  'scabbing',
  'local',
  'retailers',
  'walmart',
  'target',
  'resort',
  'ration',
  'pack',
  'basketball',
  'baseball',
  'pokemon',
  'card',
  'nunreal',
  'card',
  'market',
  'continue',
  'surge',
  'boom',
  'bust',
  'rat'],
 ['remind', 'tell', 'story', 'hell', 'customer', 'go', 'costco'],
 ['virus',
  'transmission',
  'come',
  'indoor',
  'grocery',
  'store',
  'home',
  'depot',
  'walmart',
  'construction',
  'sit',
  'target',
  'real'],
 ['aldi', 'spot', 'carb', 'foods'],
 ['feel', 'safer', 'par

In [13]:
%%time

# Train STTM model
# Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
# K = number of potential topic (which we don't know a priori)
# alpha = 
# beta = 
# n_iters = number of iterations to 
mgp = MovieGroupProcess(K=8, alpha=0.1, beta=0.1, n_iters=40)
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
y = mgp.fit(docs, n_terms)

In stage 0: transferred 107346 clusters with 8 clusters populated
In stage 1: transferred 87506 clusters with 8 clusters populated
In stage 2: transferred 73505 clusters with 8 clusters populated
In stage 3: transferred 61571 clusters with 8 clusters populated
In stage 4: transferred 53086 clusters with 8 clusters populated
In stage 5: transferred 47367 clusters with 8 clusters populated
In stage 6: transferred 43344 clusters with 8 clusters populated
In stage 7: transferred 40994 clusters with 8 clusters populated
In stage 8: transferred 39529 clusters with 8 clusters populated
In stage 9: transferred 38092 clusters with 8 clusters populated
In stage 10: transferred 37119 clusters with 8 clusters populated
In stage 11: transferred 36284 clusters with 8 clusters populated
In stage 12: transferred 35752 clusters with 8 clusters populated
In stage 13: transferred 35608 clusters with 8 clusters populated
In stage 14: transferred 35382 clusters with 8 clusters populated
In stage 15: transf

In [14]:
# Save model
with open(r'E:\OneDrive - University of Georgia\Project\Script\5.0 TopicModeling\dumps\trained_models\8clusters_2021.model', 'wb') as f:
    pickle.dump(mgp, f)
    f.close()

In [15]:
# Load in trained model with 10 topics 
filehandler = open(r'E:\OneDrive - University of Georgia\Project\Script\5.0 TopicModeling\dumps\trained_models\8clusters_2021.model', 'rb')
mgp = pickle.load(filehandler)

In [16]:
# Helper function
def top_words(cluster_word_distribution, top_cluster, values):
    '''prints the top words in each cluster'''
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print(' — — — — — — — — —')

In [17]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)
print('*'*20)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*20)

# Show the top 5 words in term frequency for each cluster 
topic_indices = np.arange(start=0, stop=len(doc_count), step=1)
top_words(mgp.cluster_word_distribution, topic_indices, 20)

Number of documents per topic : [22394  7480 25212 14845  9379 10778 33327  6575]
********************
Most important clusters (by number of docs inside): [6 2 0 3 5 4 1 7]
********************
Cluster 0 : [('costco', 6618), ('foods', 3593), ('walmart', 2818), ('publix', 2574), ('like', 1904), ('kroger', 1730), ('aldi', 1551), ('go', 1352), ('good', 1346), ('chicken', 1324), ('food', 1239), ('buy', 1016), ('love', 866), ('need', 829), ('shop', 820), ('store', 809), ('today', 796), ('time', 786), ('sell', 781), ('pizza', 771)]
 — — — — — — — — —
Cluster 1 : [('walmart', 2577), ('publix', 1054), ('kroger', 1015), ('vaccine', 936), ('shoot', 929), ('covid', 770), ('store', 475), ('costco', 433), ('appointment', 430), ('appointments', 416), ('people', 391), ('today', 387), ('go', 366), ('available', 362), ('thank', 353), ('park', 328), ('shop', 325), ('stop', 284), ('time', 282), ('open', 272)]
 — — — — — — — — —
Cluster 2 : [('walmart', 11122), ('costco', 5516), ('store', 2194), ('go', 20

In [13]:
# Helper function
def cluster_importance(mgp):
    '''returns a word-topic matrix[phi] where each value represents
    the word importance for that particular cluster; 
    phi[i][w] would be the importance of word w in topic i.
    '''
    n_z_w = mgp.cluster_word_distribution
    beta, V, K = mgp.beta, mgp.vocab_size, mgp.K
    phi = [{} for i in range(K)]        
    for z in range(K):
        for w in n_z_w[z]:
            phi[z][w] = (n_z_w[z][w]+beta)/(sum(n_z_w[z].values())+V*beta)
    return phi

In [14]:
# Helper function(s)
def get_topic_name(doc, topic_dict):
    '''returns the topic name string value from a dictionary of topics'''
    topic_desc = topic_dict[doc]
    return topic_desc

def topic_allocation(df, docs, mgp, topic_dict):
    '''allocates all topics to each document in original dataframe,
    adding two columns for cluster number and cluster description'''
    topic_allocations=[]
    for doc in tqdm(docs):
        topic_label, score = mgp.choose_best_label(doc)
        topic_allocations.append(topic_label)

    df['dominant_topic'] = topic_allocations
    
    df['topic_name'] = df.dominant_topic.apply(lambda x: get_topic_name(x, topic_dict))
    print('Complete. Number of documents with topic allocated: {}'.format(len(df)))    
        

In [16]:
# Define dictionary topics in same sequential order 
# based on clusters and word distributions in STTM model above

topic_dict = {}
topic_names = ['health',
               'virus',
               '',
               '',
               'cancer and heart disease',
               'diet and excercise',
               'health and medical workers',
               'abortion',
               'vaping and cigarettes',
               'drug costs and opioid crisis']

for i, topic_num in enumerate(topic_indices):
    topic_dict[topic_num]=topic_names[i]
    
# Allocate topics to original dataframe 
topic_allocation(tweets_df, docs, mgp, topic_dict)

100%|██████████| 129990/129990 [00:59<00:00, 2178.07it/s]


Complete. Number of documents with topic allocated: 129990


In [17]:
tweets_df['num_clusters'] = 10
tweets_df[['clean_tweet', 'dominant_topic','topic_name']].sample(n=10)

Unnamed: 0,clean_tweet,dominant_topic,topic_name
18074,family local meijer mask workers crazy,6,health and medical workers
122609,right walmart size amazon hasten downfall main...,6,health and medical workers
9313,onion ring great value,2,
4950,walmart childish,3,
97152,look clearance stuff walmart,3,
122793,head point investigation reveal store head,6,health and medical workers
43810,post photo publix greenwise market main olas,4,cancer and heart disease
33519,think costco favorite know bird lunds time may...,2,
43479,want shoot shoot people charge food chain deci...,6,health and medical workers
71000,sample world heal,2,


### Heat Map Visualizations by Year/User

In [18]:
# Re-index dataframe by user(news source)
reindexed_tweets = tweets_df['dominant_topic'] 
reindexed_tweets.index = tweets_df['username'] 

KeyError: 'username'

In [19]:
from collections import Counter

# Helper function
def keys_to_counts(keys):
    '''
    returns a tuple of topic categories and their 
    accompanying magnitudes for a given list of keys
    '''
    count_pairs = sorted(Counter(keys).items(), reverse=False)
    categories = [pair[0] for pair in count_pairs]
    counts = [pair[1] for pair in count_pairs]
    return (categories, counts)

In [None]:
# Generate counts of tweets per topic across each news source
n_topics = mgp.K
news_sources = tweets_df['username'].unique() 

user_keys = []
for source in news_sources:
    user_keys.append(reindexed_tweets[source].values)
    
user_counts = []
for keys in user_keys:
    categories, counts = keys_to_counts(keys)
    user_counts.append(counts)

user_topic_counts = pd.DataFrame(np.array(user_counts), index=news_sources)
user_topic_counts.columns = ['Topic {}'.format(i) for i in range(n_topics)]
user_topic_counts

In [None]:
# Add column to sum total topics 
user_topic_counts['total_topics'] =  user_topic_counts.sum(axis=1)

# Convert topic counts to percentages for each news source 
user_topic_counts_ratio =  user_topic_counts.apply(lambda x: (x / user_topic_counts['total_topics']))
user_topic_counts_ratio = user_topic_counts_ratio.drop(columns=['total_topics'])
user_topic_counts_ratio

In [None]:
# Display heat map of topics vs news sources
fig, ax = plt.subplots(figsize=(14,10))
sns.set(font_scale=1)
sns.heatmap(user_topic_counts_ratio, cmap="YlGnBu", ax=ax);
plt.show()

In [None]:
# Convert date to datetime
tweets_df['date'] = pd.to_datetime(tweets_df['date'], errors='coerce')

# Add column for year
tweets_df['year'] = tweets_df['date'].dt.year

In [None]:
# Re-index dataframe by year
reindexed_tweets = tweets_df['dominant_topic']
reindexed_tweets.index = tweets_df['year']

In [None]:
# Generate counts of tweets per topic across years
n_topics = mgp.K
years_range = sorted(tweets_df['year'].unique(), reverse=False)

yearly_keys = []
for year in years_range:
    yearly_keys.append(reindexed_tweets[year].values)
    
yearly_counts = []
for keys in yearly_keys:
    categories, counts = keys_to_counts(keys)
    yearly_counts.append(counts)

yearly_topic_counts = pd.DataFrame(np.array(yearly_counts), index=range(2014,2020+1))
yearly_topic_counts.columns = ['Topic {}'.format(i) for i in range(n_topics)]

yearly_topic_counts

In [None]:
# Display heat map of topics vs years
fig, ax = plt.subplots(figsize=(14,10))
sns.heatmap(yearly_topic_counts, cmap="YlGnBu", ax=ax);
plt.show()