In [163]:
import pandas as pd
import numpy as np

from helpers import *


%load_ext autoreload
%autoreload 2

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import gensim
import pyLDAvis.gensim_models

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
# Read the cleaned tweet export and pass it straight through `eda_processing`
# (presumably provided by the `helpers` star import — confirm).
df = eda_processing(pd.read_csv('data/out_clean.csv'))

In [189]:
# Every distinct value found in the `whcs` column.
countries = df.whcs.unique()
# The `whcs` value whose tweets are modelled in the cells below.
country = 'India'
# Shared settings: `max_df` / `min_df` are handed to the vectorizers, `seed`
# becomes the random_state of the gensim model.
# (Looser alternative tried earlier: max_df=1.0, min_df=1, n_components=5.)
params = dict(max_df=0.95, min_df=3, seed=50)

In [190]:
# Cleaned text of the tweets whose `whcs` equals `country`, with missing
# entries removed; `.dropna()` returns a new Series, so `df` is left untouched.
clean_tweets = df.loc[df.whcs == country, 'clean'].dropna()
assert not clean_tweets.isna().any()

 ## LDA
 
 ### LDA with `sklearn.LatentDirichletAllocation`
 We run LDA for several numbers of topics (`k`) and compare the fits 
 just by displaying the top terms of each topic.

In [123]:
def display_topics(model, feature_names, no_top_words = 10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted decomposition exposing ``components_``; each row is read as
        one topic's weight for every term.
    feature_names : sequence of str
        Term labels, indexed by column position in ``components_``.
    no_top_words : int, default 10
        How many of the heaviest terms to list per topic.

    Topics are numbered from 1 in the printed output.
    """
    for number, weights in enumerate(model.components_, start=1):
        # Sort ascending, flip to descending, then keep the leading entries.
        heaviest = weights.argsort()[::-1][:no_top_words]
        terms = [feature_names[position] for position in heaviest]
        print("Topic %d:" % number)
        print(", ".join(terms))
        


In [191]:
# The document-term matrix depends only on `params`, never on k, so it is
# built once here instead of being refitted on every pass of the loop.
tf_vectorizer = CountVectorizer(
    max_df=params['max_df'],
    min_df=params['min_df'],
    ngram_range=(2, 3),  # bigrams and trigrams only; documented as a tuple
)
tf = tf_vectorizer.fit_transform(clean_tweets)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement whenever the installed version provides it.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# Fit one LDA model per topic count and print its top terms for comparison.
for k in range(3, 10 + 1):
    print("****** k = %d ******"%(k))
    lda = LatentDirichletAllocation(
        n_components=k,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
    ).fit(tf)

    display_topics(lda, tf_feature_names)
    print()

****** k = 3 ******
Topic 1:
imran khan, prime minister, minister imran, prime minister imran, minister imran khan, prime ministerik, prime ministerl, may allah, vote confidence, tehreek insaf
Topic 2:
peace upon, election commission, maryam nawaz, peace blessing, good morning, pakistan army, surprise day, yusuf raza, blessing upon, happy surprise day
Topic 3:
nawaz sharif, open ballot, senate election, nation demand, demand open, nation demand open, demand open ballot, pakistan day, people party, high court

****** k = 4 ******
Topic 1:
imran khan, prime minister, minister imran, prime minister imran, minister imran khan, prime ministerik, vote confidence, prime ministerl, god bless, may god
Topic 2:
election commission, senate election, nawaz sharif, maryam nawaz, open ballot, nation demand, demand open, nation demand open, demand open ballot, pakistan army
Topic 3:
pakistan day, tehreek insaf, people party, high court, khatim al, follow back, march pakistan, chairman senate, pakista

### LDA with `gensim.LdaMulticore`

In [192]:
def add_bi_tri_grams(data_words):
    """Merge frequently co-occurring tokens into multi-word phrase tokens.

    Two gensim ``Phrases`` models are trained on the corpus itself: the first
    on the raw tokens, the second on the output of the first, so that phrases
    found in the first pass can be extended (e.g. into trigrams).

    Parameters
    ----------
    data_words : list of list of str
        Tokenised documents. The collection is iterated several times, so it
        must be re-iterable (a list, not a one-shot generator).

    Returns
    -------
    list
        One entry per input document: the document after applying the bigram
        model and then the trigram model.
    """
    # Higher threshold -> fewer phrases.
    bigram = gensim.models.Phrases(data_words, min_count=3, threshold=1)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=1)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    # A single pass is enough: the earlier bigram-only pass was discarded
    # immediately, because the trigram model is applied on top of the bigram
    # model's output for each document anyway.
    return [trigram_mod[bigram_mod[doc]] for doc in data_words]

In [206]:
# models
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary


# Tokenise every tweet on single spaces, then merge frequent phrases.
data_words = [tweet.split(' ') for tweet in clean_tweets]
data_words = add_bi_tri_grams(data_words)

# Vocabulary and bag-of-words corpus. The frequency cut-offs are given here
# directly rather than taken from `params`.
dictionary = Dictionary(data_words)
dictionary.filter_extremes(no_below=2, no_above=0.9)
corpus = [dictionary.doc2bow(tokens) for tokens in data_words]

# Training settings for the gensim model; the seed is shared with `params`.
params2 = {'passes': 10, 'random_state': params['seed']}
base_models = {}
model = LdaMulticore(
    corpus=corpus,
    num_topics=4,
    id2word=dictionary,
    workers=6,
    passes=params2['passes'],
    random_state=params2['random_state'],
)

In [207]:
# Five heaviest terms of every topic.
print(model.show_topics(num_words=5))
# Topic mixture of the first document, most probable topic first.
print(sorted(model[corpus[0]], key=lambda pair: pair[1], reverse=True))

[(0, '0.012*"pakistan" + 0.012*"people" + 0.008*"today" + 0.007*"say" + 0.006*"come"'), (1, '0.011*"pakistan" + 0.009*"imran_khan" + 0.007*"say" + 0.006*"give" + 0.005*"even"'), (2, '0.007*"one" + 0.007*"amp" + 0.005*"say" + 0.004*"kashmir" + 0.004*"also"'), (3, '0.008*"country" + 0.006*"make" + 0.006*"imran_khan" + 0.005*"every" + 0.005*"become"')]
[(2, 0.606948), (3, 0.31873545), (0, 0.037533898), (1, 0.036782637)]


In [208]:
# Prepare the pyLDAvis view of the fitted gensim model, then render it inline.
data = pyLDAvis.gensim_models.prepare(
    model,
    corpus,
    dictionary,
)
pyLDAvis.display(data)

In [202]:
# Bare expression: shows the repr of the fitted gensim model as the cell output.
model

<gensim.models.ldamulticore.LdaMulticore at 0x22e408a70a0>

## NMF

In [110]:

# NMF is able to use tf-idf, so the tweets are vectorised with TfidfVectorizer.
# The matrix depends only on `params`, never on k, so it is built once here
# instead of being refitted on every pass of the loop.
tfidf_vectorizer = TfidfVectorizer(max_df=params['max_df'], min_df=params['min_df'])
tfidf = tfidf_vectorizer.fit_transform(clean_tweets)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement whenever the installed version provides it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Fit one NMF model per topic count and print its top terms for comparison.
for k in range(3, 10 + 1):
    print("****** k = %d ******"%(k))

    # NOTE(review): newer scikit-learn replaces `alpha` with `alpha_W` /
    # `alpha_H` — confirm the installed version still accepts this argument.
    nmf = NMF(n_components=k, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

    display_topics(nmf, tfidf_feature_names)
    print()


****** k = 3 ******
Topic 0:
via, back, freedom, violence, zionism, peace, pedophile, sect, matter, jeane, manson, life, black, sudanese, migrant, center, knife, god, kill, berry
Topic 1:
soral, alain, answer, episode, bravery, piece, fr, jus, family, question, de, race, first, female, hebrew, health, farmer, hai, federal, great
Topic 2:
like, well, look, see, jew, fire, trump, people, everything, right, chose, child, know, catholic, official, semitic, even, week, anti, day

****** k = 4 ******
Topic 0:
via, back, freedom, violence, zionism, peace, pedophile, sect, matter, jeane, manson, life, black, sudanese, migrant, center, knife, god, berry, kill
Topic 1:
soral, alain, answer, episode, bravery, piece, fr, jus, family, question, de, race, first, female, hebrew, health, farmer, hai, federal, great
Topic 2:
like, well, look, see, jew, fire, trump, people, right, everything, chose, child, know, catholic, official, semitic, week, even, anti, day
Topic 3:
woman, one, kill, afrin, come, m



In [92]:
# Run NMF
#nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
