In [60]:
%run ../00_AdvancedPythonConcepts/talktools.py

# Topic Modelling

<font color="grey">Python for Data Science (AY250, UC Berkeley 2016&mdash;2018, J. Bloom)</font>

What are some recent topics tweeted about with the #GoBears hashtag? 

We can use LDA  (http://ai.stanford.edu/~ang/papers/jair03-lda.pdf) to help us find themes.

"a generative probabilistic model for collections of discrete dataset such as text corpora. It is also a topic model that is used for discovering abstract topics from a collection of documents."

- http://scikit-learn.org/stable/modules/decomposition.html#latentdirichletallocation


### Get Tweet IDs from near Berkeley, CA where GOBears is mentioned

In [None]:
import pandas as pd
import tweepy
import csv

import json
cred = json.load(open(".cred.json","r"))

consumer_key = cred["consumer_key"]
consumer_secret = cred["consumer_secret"]
access_token = cred["access_token"]
access_secret = cred["access_secret"]

In [None]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, 
                     retry_count=3, retry_delay=5, retry_errors=set([401, 404, 500, 503]))

In [None]:
n=1000
ids = []
a = tweepy.Cursor(api.search, q='gobears', geocode="37.8716,-122.2727,100km",since_id=874829847523414016)
for t in a.items():
    ids.append(t.id)
    n-=1
    if n<0:
        break

In [None]:
len(ids)

In [None]:
import pandas as pd
df = pd.DataFrame(ids,columns=["berkeley_ids"])
df.to_csv("berkeley.csv",index=False)

In [None]:
!head berkeley.csv

### Get the body of text from those tweets

In [None]:
# this takes awhile!
%run get_tweets.py
retrieve_tweets("berkeley.csv","tweet_berkeley.csv")

In [None]:
#!pip install tweet-preprocessor
#!conda install --channel mpi4py mpich mpi4py -y

In [None]:
#!pip install pyLDAvis
#!pip install gensim

In [61]:
import pandas as pd
import preprocessor as p
df = pd.read_csv("tweet_berkeley.csv",usecols=["text","id"],index_col=["id"])

In [62]:
print(len(df))

1001


In [63]:
df.head(20)

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
974986220097888256,RT @CalBaseball: B8 | It's a three-hit night f...
975022990374162433,.@aliharrison07 wins her 200 breast heat in a ...
975261240069992449,RT @CalWSwim: AMERICAN RECORD!! @KathleenBaker...
974851949425119233,RT @CalBaseball: B4 | @_JonahDavis_ delivers! ...
974716376811110400,RT @CalWSwim: 5 individuals and 2 relays will ...
974059557596020736,RT @CalBaseball: The Pac-12 season is here! Ma...
973390645992673280,RT @CalBaseball: For the second time in three ...
974853874229043200,"RT @Cal: Happy 150, UC Berkeley! #Berkeley150🌟..."
975432521880973313,RT @CalBaseball: We're ready for some Sunday b...
974326599943315456,RT @CalAthletics: .@CalWBBall is South Carolin...


In [64]:
# p.set_options(p.OPT.URL, p.OPT.EMOJI,p.OPT.MENTION,p.OPT.SMILEY)
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.HASHTAG, p.OPT.MENTION)
df["clean"] = df["text"].apply(p.clean).apply(lambda x: x.replace("RT", "")).apply(lambda x: x.replace(":", ""))

In [65]:
df["clean"]

id
974986220097888256      B8 | It's a three-hit night for as he leads ...
975022990374162433    . wins her 200 breast heat in a PR 210.21 and ...
975261240069992449      AMERICAN RECORD!! defends her NCAA title in ...
974851949425119233      B4 | delivers! It's a double down the left f...
974716376811110400      5 individuals and 2 relays will be in the wa...
974059557596020736      The Pac-12 season is here! Make your plans t...
973390645992673280      For the second time in three weeks, a Golden...
974853874229043200                              Happy 150, UC Berkeley!
975432521880973313      We're ready for some Sunday baseball! Bears ...
974326599943315456                           . is South Carolina bound!
974108786548813824      RECAP Cal takes third in the 800 free relay ...
975229993092608000      AMERICAN RECORD!! defends her NCAA title in ...
975190548184104960      The No. 1 Bears continue to roll! Cal beats ...
975226815156428800                   The only comparison that

## Generate the LDA topics

In [69]:
%run make_corpus.py

In [70]:
make_corpus(df["clean"].values, outdictfile='berkeley.dict',mmfile='berkeley.mm')

In [71]:
from gensim import corpora, models, similarities

In [78]:
lda_params      = {'num_topics': 10, 'passes': 25, 'alpha': 0.001}

# Load the corpus and Dictionary
corpus = corpora.MmCorpus("berkeley.mm")
dictionary = corpora.Dictionary.load("berkeley.dict")

print("Running LDA with: %s  " % lda_params)
lda = models.LdaModel(corpus, id2word=dictionary,
                      num_topics=lda_params['num_topics'],
                      passes=lda_params['passes'],
                      alpha = lda_params['alpha'])
lda.save("berkeley.lda")

Running LDA with: {'num_topics': 10, 'passes': 25, 'alpha': 0.001}  


In [80]:
lda = models.LdaModel.load("berkeley.lda")
lda.print_topics()

[(0,
  '0.084*"cal" + 0.044*"bears" + 0.034*"inning" + 0.030*"ends" + 0.020*"osu" + 0.018*"collegiate" + 0.018*"challenge" + 0.018*"tennis" + 0.016*"two" + 0.015*"fly"'),
 (1,
  '0.043*"time" + 0.043*"second" + 0.040*"golden" + 0.039*"bear" + 0.038*"three" + 0.037*"pac" + 0.036*"recognized" + 0.034*"congrats" + 0.034*"weekly" + 0.034*"weeks"'),
 (2,
  '0.067*"gets" + 0.047*"bears" + 0.032*"record" + 0.031*"title" + 0.030*"first" + 0.027*"baker" + 0.023*"ball" + 0.020*"week" + 0.020*"broke" + 0.019*"time"'),
 (3,
  '0.060*"final" + 0.047*"cal" + 0.037*"usf" + 0.033*"free" + 0.032*"run" + 0.031*"heading" + 0.030*"home" + 0.029*"bottom" + 0.028*"friends" + 0.025*"relay"'),
 (4,
  '0.094*"bears" + 0.042*"week" + 0.040*"rankings" + 0.038*"friday" + 0.033*"alone" + 0.031*"beavers" + 0.028*"sunday" + 0.028*"home" + 0.027*"time" + 0.025*"night"'),
 (5,
  '0.089*"record" + 0.083*"back" + 0.071*"american" + 0.051*"wins" + 0.047*"ncaa" + 0.045*"title" + 0.044*"defends" + 0.043*"https" + 0.036*"pl

## Visualize

In [81]:
import pyLDAvis.gensim

debate_data =  pyLDAvis.gensim.prepare(lda,corpus, dictionary)
pyLDAvis.display(debate_data)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
