In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import gensim
import re
import nltk
from tqdm import tqdm

from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()

In [2]:
# Import customs module to tokenize and clean tweet dataset
from clean_tokenizer import tokenize_tweets
data_dir = r'E:\OneDrive - University of Georgia\Project\Data\tweet_data_3_groccery\4.3 FourColumns 3+1csv - Copy\grocery_2020_tokenized.csv'
tweets_df = pd.read_csv(data_dir) 
tweets_df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweet,lon,lat,local_time,clean_tweet
0,0,0,@andylassner Walmart specials,-75.386461,39.145023,2020-01-20 18:59:29,walmart specials
1,1,1,Walmart just gave me the biggest fucking attit...,-81.986834,26.63648,2020-01-20 18:56:59,walmart give biggest fuck attitude stupid peop...


In [3]:
# Convert date to datetime
tweets_df['date'] = pd.to_datetime(tweets_df['local_time'], errors='coerce')

# Add column for year
tweets_df['year'] = tweets_df['date'].dt.year

In [4]:
# View tweet density over time

monthly_counts = tweets_df.set_index('date').resample('M').count()
yearly_counts = tweets_df.set_index('date').resample('A').count()
daily_counts = tweets_df.set_index('date').resample('D').count()

#daily_counts.to_csv("daily_counts.csv",index=True,sep=',')

### Gensim Module - LDA Approach

In [5]:
tweets_df['clean_tweet'] = tweets_df['clean_tweet'].astype(str)
tweets_df.dtypes

Unnamed: 0               int64
Unnamed: 0.1             int64
tweet                   object
lon                    float64
lat                    float64
local_time              object
clean_tweet             object
date            datetime64[ns]
year                     int64
dtype: object

In [6]:
# Create list of each tweets' tokens 
tweets_tokens = tweets_df.clean_tweet.apply(lambda x: re.split('\s', x))


# Create dictionary
dictionary = gensim.corpora.Dictionary(tweets_tokens)

# Test dictionary
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break


0 specials
1 walmart
2 attitude
3 biggest
4 fuck
5 give
6 people
7 stupid
8 world
9 best
10 change


In [7]:
# Bag of Words
bow_corpus = [dictionary.doc2bow(tweet) for tweet in tweets_tokens]

In [8]:
# Verify BOW is set up correctly
print(tweets_df['clean_tweet'].sample(n=1, random_state=0))
print(bow_corpus[8336])

bow_tweet_8336 = bow_corpus[8336]
for i in range(len(bow_tweet_8336)):
    print("Word {} (\"{}\") appears {} time.".format(bow_tweet_8336[i][0], 
                                               dictionary[bow_tweet_8336[i][0]], 
bow_tweet_8336[i][1]))

201669    week costco
Name: clean_tweet, dtype: object
[(1, 1), (134, 1), (452, 1), (461, 1), (669, 1), (1073, 1), (2392, 1), (2727, 1), (3049, 1), (3089, 1), (3704, 1), (7464, 1), (9066, 1), (9853, 1), (9854, 1)]
Word 1 ("walmart") appears 1 time.
Word 134 ("try") appears 1 time.
Word 452 ("think") appears 1 time.
Word 461 ("brand") appears 1 time.
Word 669 ("bust") appears 1 time.
Word 1073 ("end") appears 1 time.
Word 2392 ("shortly") appears 1 time.
Word 2727 ("movie") appears 1 time.
Word 3049 ("grind") appears 1 time.
Word 3089 ("sadly") appears 1 time.
Word 3704 ("career") appears 1 time.
Word 7464 ("skate") appears 1 time.
Word 9066 ("skateboard") appears 1 time.
Word 9853 ("ollie") appears 1 time.
Word 9854 ("sickest") appears 1 time.


#### Determining best number of topics

In [9]:
import pyLDAvis.gensim
from gensim import models
from gensim.models.coherencemodel import CoherenceModel

In [10]:
# Create list of cleaned tweets
tweets_list =  tweets_df['clean_tweet']

  and should_run_async(code)


In [11]:
%%time

# Iterate through range of k-topics fitting LDA model to each and computing coherence scores for each model
coherenceList_umass = []
coherenceList_cv = []
num_topics_list = np.arange(4,14+1)

Wall time: 0 ns


  and should_run_async(code)


In [12]:
dictionary.filter_extremes(no_below=2, no_above=1.0)

  and should_run_async(code)


In [None]:
for num_topics in tqdm(num_topics_list):
    #print()
    lda = models.LdaMulticore(corpus=bow_corpus, num_topics=num_topics, id2word=dictionary,passes=10,chunksize=4000,random_state=0)    
    cm = CoherenceModel(model=lda, corpus=bow_corpus,coherence='u_mass')
    print(dsfafas)
    coherenceList_umass.append(cm.get_coherence())
    viz = pyLDAvis.gensim.prepare(lda, bow_corpus, dictionary)
    pyLDAvis.save_html(viz,f'pyLDAvis_{num_topics}.html')

In [None]:
# Plot coherence scores across topic numbers

plotData = pd.DataFrame({'Number of topics':num_topics_list,
                         'CoherenceScore':coherenceList_umass})
f,ax = plt.subplots(figsize=(16,10))
sns.set_style("darkgrid")
sns.set(font_scale = 2)
sns.pointplot(x='Number of topics', y= 'CoherenceScore',data=plotData)
plt.axhline(y=-4.8, color='red')
plt.title('Topic Coherence')

__LDA Modeling Using 10 Topics__

In [None]:
%%time
# LDA Model using BOW
lda_model_bow = gensim.models.LdaMulticore(corpus=bow_corpus, num_topics=10, id2word=dictionary, decay=0.5,chunksize=10000, passes=10, workers=4, random_state=0)

  and should_run_async(code)


In [None]:
topic_desc = []
for idx, topic in lda_model_bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
%%time
import pyLDAvis
import pyLDAvis.gensim 

# LDA Visualization Tool
lda_viz = pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, dictionary, mds='tsne')

In [None]:
pyLDAvis.enable_notebook()
lda_viz