### Gensim LDA

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Import nltk stopwords and spacy for lemmatization
from nltk.corpus import stopwords

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
import spacy
%matplotlib inline

# Enable logging for Gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)



#### 1. Import data set

In [4]:
# Create a list of words
from os import listdir
from os.path import isfile, join

mypath = 'C:/Users/Mikhail/Documents/GitHub/topic_modeling/data/elections'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [5]:
# Collect all Tweets available and store them as list of dicts
import json
all_tweets = []

for f in onlyfiles:
    full_name = mypath+'/'+f

    with open(full_name, 'r') as f:
        data = json.load(f)
        all_tweets.extend(data)

In [24]:
# Statistics
print('Number of keys assigned:', len(all_tweets[0].keys()), '\n')
print('Sample of a Tweet:', all_tweets[0]['text'], '\n')
print('Info available in a Tweet:', '\n', all_tweets[0].keys())
print('\n','Info avaialable in the whole set:', '\n', pd.DataFrame(all_tweets[:10]).columns.values)

Number of keys assigned: 28 

Sample of a Tweet: RT @Derksen_Gelul: Nederlands bekendste droogkloot, Marcel van Roosmalen sloopt D66 helemaal de moeder!! Luister zelf naar deze topper en g… 

Info available in a Tweet: 
 dict_keys(['created_at', 'id', 'id_str', 'text', 'source', 'truncated', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'retweeted_status', 'is_quote_status', 'quote_count', 'reply_count', 'retweet_count', 'favorite_count', 'entities', 'favorited', 'retweeted', 'filter_level', 'lang', 'timestamp_ms'])

 Info avaialable in the whole set: 
 ['contributors' 'coordinates' 'created_at' 'display_text_range' 'entities'
 'extended_tweet' 'favorite_count' 'favorited' 'filter_level' 'geo' 'id'
 'id_str' 'in_reply_to_screen_name' 'in_reply_to_status_id'
 'in_reply_to_status_id_str' 'in_reply_to_user_id'
 'in_reply_to_user_id_str' 'is_quote_stat

In [25]:
# Extract the data (id and text), and put it into dict and list
text_dict = {} 
text_list = []
id_list = []
missing_tweets = 0

for tweet in all_tweets:
    if ('text' in tweet.keys()):
        if ('id' in tweet.keys()):
            my_id = tweet['id']
            if (tweet['text'] != None):
                text_dict[my_id] = tweet['text']
                id_list.append(my_id)
                text_list.append([tweet['text']])
                
            elif (tweet['text'] == None):
                missing_values = missing_tweets + 1

In [27]:
# Statistics
t = len(text_dict)
max_length = 0
average_length = 0

for tweet in text_list : 
    average_length = average_length + sum(len(i) for i in tweet) 
    if (sum(len(i) for i in tweet) > max_length) : 
        max_length = sum(len(i) for i in tweet)
        max_tweet = tweet

print('The numver of blank Tweets = ' + str(missing_tweets))
print('In total ' + str(t) + ' Tweets collected')
print('On average Tweet is ' + str(round(average_length / t)) + ' charachters long', '\n')
print('Longest Tweet is:', max_tweet, '\n')
print('It is', sum(len(i) for i in max_tweet), 'characters long')

The numver of blank Tweets = 0
In total 795867 Tweets collected
On average Tweet is 124 charachters long 

Longest Tweet is: ['@tacticsoftuchel Courtois&gt;Karius\nAzpi&gt;Clyne \nChristensen&lt;VVD (for now)\nZouma/Rudiger&gt;&gt;&gt;&gt;&gt;&gt;\nEmerson/Alonso&lt;Robertson… https://t.co/5YNNJog70t'] 

It is 170 characters long


In [28]:
# Let's inspect the Tweets
pd.DataFrame(text_list[:5])

Unnamed: 0,0
0,RT @Derksen_Gelul: Nederlands bekendste droogk...
1,"RT @GerBStruik: Niet alleen Ollongren, maar ni..."
2,"@JoostNiemoller Joost, trap er toch niet in. H..."
3,@EenVandaag @thierrybaudet @D66 @APechtold Ald...
4,Dolhuysbrug moet na tien jaar definitief van t...


#### 2. Remove emails and newline characters 

In [29]:
# Before
pprint(text_list[:3])

[['RT @Derksen_Gelul: Nederlands bekendste droogkloot, Marcel van Roosmalen '
  'sloopt D66 helemaal de moeder!! Luister zelf naar deze topper en g…'],
 ['RT @GerBStruik: Niet alleen Ollongren, maar niemand in #Rutte3 heeft '
  'kennelijk enig benul van staatsrecht. Een minister die als minister ee…'],
 ['@JoostNiemoller Joost, trap er toch niet in. Hier is één grote minne '
  'manipulatie aan de gang tegen FVD. Deze heeft… https://t.co/lVFOuFFYF3']]


In [30]:
# data = df.content.values.tolist()
# TODO: include other Tweets into analysis
data = text_list[:10000]

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', str(sent)) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', str(sent)) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", str(sent)) for sent in data]

# TODO: remove https and links
data = [re.sub("https", "", str(sent)) for sent in data]

In [31]:
# After
pprint(data[:3])

['[RT Nederlands bekendste droogkloot, Marcel van Roosmalen sloopt D66 '
 'helemaal de moeder!! Luister zelf naar deze topper en g…]',
 '[RT Niet alleen Ollongren, maar niemand in #Rutte3 heeft kennelijk enig '
 'benul van staatsrecht. Een minister die als minister ee…]',
 'Joost, trap er toch niet in. Hier is één grote minne manipulatie aan de gang '
 'tegen FVD. Deze heeft… ://t.co/lVFOuFFYF3]']


#### 3. Tokenize words and Clean-up text

In [32]:
def sent_to_words(sentences) :
    '''Split sentences into words'''
    for sentence in sentences :
        # The main function here is Gensim's simple_preprocess
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

# Let's explore what do we have now
pd.DataFrame(data_words[:5])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,rt,nederlands,bekendste,droogkloot,marcel,van,roosmalen,sloopt,helemaal,de,...,luister,zelf,naar,deze,topper,en,,,,
1,rt,niet,alleen,ollongren,maar,niemand,in,rutte,heeft,kennelijk,...,benul,van,staatsrecht,een,minister,die,als,minister,ee,
2,joost,trap,er,toch,niet,in,hier,is,een,grote,...,manipulatie,aan,de,gang,tegen,fvd,deze,heeft,co,lvfouffyf
3,aldus,leugenaar,hoe,durft,deze,partij,nog,de,van,co,...,kqfumv,,,,,,,,,
4,dolhuysbrug,moet,na,tien,jaar,definitief,van,tafel,vvd,haarlem,...,saanwvxg,nmaak,uw,eigen,filter,ove,co,pbveexkhtf,,


#### 4. Remove Stopwords, Make Bigrams and Lemmatize

Bigrams are two words frequently occurring together in the document. Trigrams are 3 words frequently occurring.

In [33]:
# NLTK stop words collection
stop_words = stopwords.words('dutch')

# TODO: Extend stop words collection using native speaker knowledge
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [34]:
print('The number of Dutch stop words avaialable:', len(stop_words))

The number of Dutch stop words avaialable: 101


In [35]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[5]]])



['rt', 'niet', 'alleen_ollongren', 'maar', 'niemand', 'in', 'rutte', 'heeft', 'kennelijk_enig_benul', 'van', 'staatsrecht', 'een', 'minister', 'die', 'als', 'minister_ee']


In [36]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts) :
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts) :
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts) :
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) :
    texts_out = []
    for sent in texts :
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

Lemmatization is converting a word to its root word. Spacy NLP package responsible for that. Thanks we have Dutch language version.

In [37]:
# Remove stop words
data_words_nostops = remove_stopwords(data_words)

# Form bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

nlp = spacy.load('nl', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['nederlands_bekendste', 'droogkloot_marcel', 'roosmalen_sloopt', 'helemaal', 'moeder_luister', 'topper']]


In [38]:
print(data_words[:1])

[['rt', 'nederlands', 'bekendste', 'droogkloot', 'marcel', 'van', 'roosmalen', 'sloopt', 'helemaal', 'de', 'moeder', 'luister', 'zelf', 'naar', 'deze', 'topper', 'en']]


In [39]:
print(data_words_nostops[:1])

[['rt', 'nederlands', 'bekendste', 'droogkloot', 'marcel', 'roosmalen', 'sloopt', 'helemaal', 'moeder', 'luister', 'topper']]


In [40]:
print(data_words_bigrams[:1])

[['rt', 'nederlands_bekendste', 'droogkloot_marcel', 'roosmalen_sloopt', 'helemaal', 'moeder_luister', 'topper']]


In [41]:
print(data_lemmatized[:1])

[['nederlands_bekendste', 'droogkloot_marcel', 'roosmalen_sloopt', 'helemaal', 'moeder_luister', 'topper']]


#### 6. Create the Dictionary and Corpus needed for Topic Modeling

In [42]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Corpus contains pairs - a unique id for each word in the document and its frequency (word_id, word_frequency).
print(pd.DataFrame(corpus[:1]))

        0       1       2       3       4       5
0  (0, 1)  (1, 1)  (2, 1)  (3, 1)  (4, 1)  (5, 1)


In [43]:
# We also can check what was the word from the dictionary - id2word
pprint(corpus[:2])
print(id2word[11])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]]
rutte


In [44]:
# Readable format
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:2]]

[[('droogkloot_marcel', 1),
  ('helemaal', 1),
  ('moeder_luister', 1),
  ('nederlands_bekendste', 1),
  ('roosmalen_sloopt', 1),
  ('topper', 1)],
 [('alleen_ollongren', 1),
  ('benul', 1),
  ('kennelijk_enig', 1),
  ('minister', 1),
  ('minister_ee', 1),
  ('rutte', 1),
  ('staatsrecht', 1)]]

#### 7. Build the model

In [45]:
# TODO: tune the parameters
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # corpus
                                           id2word=id2word, # dict
                                           num_topics=10, # number of topic to be extracted
                                           random_state=100,
                                           update_every=1, #
                                           chunksize=100, # 
                                           passes=10, # 
                                           alpha='auto', # 
                                           per_word_topics=True) # 

#### 8. Print the topics

In [46]:
# Print the keywords in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.123*"racisme" + 0.105*"baudet" + 0.075*"fvd" + 0.073*"doet" + '
  '0.033*"hart" + 0.029*"aantijgingen" + 0.029*"idiote" + '
  '0.029*"wervelende_week" + 0.029*"vol_onterechte" + 0.021*"opvalt"'),
 (1,
  '0.027*"partijkartel" + 0.025*"steeds" + 0.020*"jaar" + 0.019*"helemaal" + '
  '0.015*"groningen" + 0.012*"zon" + 0.011*"nederland" + 0.011*"politiek" + '
  '0.011*"motie" + 0.010*"zullen"'),
 (2,
  '0.062*"leefbaar" + 0.052*"rotterdam" + 0.041*"co" + 0.037*"regering" + '
  '0.027*"boom" + 0.026*"joost_eerdmans" + 0.022*"sluit" + 0.016*"nleugens" + '
  '0.010*"zelfs" + 0.010*"vast"'),
 (3,
  '0.082*"referendum" + 0.060*"weer" + 0.054*"gaat" + 0.041*"schaffen" + '
  '0.041*"raadgevend" + 0.040*"proberen" + 0.040*"komende_donderdag" + '
  '0.040*"tegenover" + 0.031*"co" + 0.027*"gaan"'),
 (4,
  '0.104*"pechtold" + 0.097*"media" + 0.061*"kabinet" + 0.058*"discriminatie" '
  '+ 0.026*"politicus" + 0.021*"ras" + 0.012*"ooit" + 0.011*"geschapen" + '
  '0.007*"geven" + 0.006*"alexand

#### 9. Compute Model Perplexity and Coherence Score

In [47]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.294330783861692

Coherence Score:  0.5200181958046101


#### 10. Visualize the topics-keywords

Each bubble is a topic. The larger the bubble, the more prevalent the topic.

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


### LDA Mallet Model