### Gensim LDA

In [2]:
import re, os, time
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Import nltk stopwords and spacy for lemmatization
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
import spacy
%matplotlib inline

# Enable logging for Gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)

import time

import bs4


# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import textacy

from bokeh.charts import Chord

from pysankey import sankey

The bokeh.charts API has moved to a separate 'bkcharts' package.

This compatibility shim will remain until Bokeh 1.0 is released.
After that, if you want to use this API you will have to install
the bkcharts package explicitly.

  warn(message)


In [3]:
current_dir = os.getcwd()
os.chdir(current_dir)

#### 1. Import dataset

#### Dutch elections dataset

First of all, we need to load and preprocess the data. All Tweets were collected and stored in _JSON_ (JavaScript Object Notation) format. As reference: "JSON is a lightweight data-interchange format. It is easy for humans to read and write. It is easy for machines to parse and generate.". After loading it, we want to manipulate the data in _lists_ and _dictionaries_. In addition, we will collect statistics about the dataset.

In [4]:
# Collect names of Tweet files
from os import listdir
from os.path import isfile, join
import pickle
import json
mypath = 'data/election'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [None]:
# Collect all Tweets available and store them as list of dicts

all_tweets = []

for f in onlyfiles:
    full_name = mypath+'/'+f

    with open(full_name, 'r') as f:
        data = json.load(f)
        all_tweets.extend(data)

In [None]:
all_tweets_df = pd.DataFrame(all_tweets)
all_tweets_df = all_tweets_df[all_tweets_df['lang'] == 'nl']

In [None]:
print('All tweets :',len(all_tweets))

In [None]:
all_tweets = all_tweets_df.to_dict('records')

In [None]:
print('All dutch only tweets :',len(all_tweets))

In [None]:
# Save collected Tweets
with open('elections_tweets', 'w') as fout:
    json.dump(all_tweets, fout)

In [None]:
# Load collected Tweets
import json
with open('elections_tweets', 'r') as fout:
        all_tweets = json.load(fout)

In [None]:
# Statistics
print('Sample of a Tweet:', all_tweets[0]['text'], '\n')
print('Number of fields available:', len(all_tweets[0].keys()), '\n')
print('Fields available in a Tweet:', '\n', [*all_tweets[0]], '\n')
print('Info avaialable in the whole set:', '\n', pd.DataFrame(all_tweets[:10]).columns.values)

In [None]:
# Extract the data (id and text), and put it into dict and list
text_dict = {} 
text_list = []
id_list = []
missing_tweets = 0

for tweet in all_tweets:
    if ('text' in tweet.keys()):
        if ('id' in tweet.keys()):
            my_id = tweet['id']
            if (tweet['text'] != None):
                text_dict[my_id] = tweet['text']
                id_list.append(my_id)
                text_list.append([tweet['text']])
                
            elif (tweet['text'] == None):
                missing_values = missing_tweets + 1

In [None]:
# Save processed Tweets
with open('elections_text_list', 'wb') as fp:
    pickle.dump(text_list, fp)
    
with open('elections_text_dict', 'wb') as fp:
    pickle.dump(text_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
# Load processed Tweets
with open('elections_text_list', 'rb') as fp:
    text_list = pickle.load(fp)
    
with open('elections_text_dict', 'rb') as fp:
    text_dict = pickle.load(fp)

In [6]:
# Statistics
t = len(text_dict)
max_length = 0
average_length = 0

for tweet in text_list : 
    average_length = average_length + sum(len(i) for i in tweet) 
    if (sum(len(i) for i in tweet) > max_length) : 
        max_length = sum(len(i) for i in tweet)
        max_tweet = tweet

print('In total ' + str(t) + ' Tweets collected')
print('On average Tweet is ' + str(round(average_length / t)) + ' charachters long', '\n')
print('Longest Tweet is:', max_tweet, '\n')
print('It is', sum(len(i) for i in max_tweet), 'characters long')

In total 715155 Tweets collected
On average Tweet is 126 charachters long 

Longest Tweet is: ['RT @RoosjeXS: &amp;#39;Als angst mijn&amp;nbsp;raadgever wordt,&amp;nbsp;dan moet ik&amp;nbsp;meteen iets anders&amp;nbsp;gaan doen.&amp;#39; - Sigrid Kaag https:/…'] 

It is 164 characters long


In [7]:
# Let's inspect the Tweets
pd.DataFrame(text_list[:5])

Unnamed: 0,0
0,RT @Derksen_Gelul: Nederlands bekendste droogk...
1,"RT @GerBStruik: Niet alleen Ollongren, maar ni..."
2,"@JoostNiemoller Joost, trap er toch niet in. H..."
3,@EenVandaag @thierrybaudet @D66 @APechtold Ald...
4,Dolhuysbrug moet na tien jaar definitief van t...


#### 2. Remove emails and newline characters 

Now it is time to preprocess the Tweets. We will exclude some symbols, links, etc. Thereafter for algorithmic purposes, we need to split each Tweet (a combination of sentences) into a combination of words.

In [8]:
# Before
pprint(text_list[:3])

[['RT @Derksen_Gelul: Nederlands bekendste droogkloot, Marcel van Roosmalen '
  'sloopt D66 helemaal de moeder!! Luister zelf naar deze topper en g…'],
 ['RT @GerBStruik: Niet alleen Ollongren, maar niemand in #Rutte3 heeft '
  'kennelijk enig benul van staatsrecht. Een minister die als minister ee…'],
 ['@JoostNiemoller Joost, trap er toch niet in. Hier is één grote minne '
  'manipulatie aan de gang tegen FVD. Deze heeft… https://t.co/lVFOuFFYF3']]


In [9]:
def tweet_cleaner(text):
    soup =  bs4.BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
        
    return bom_removed

data = text_list
# data = text_list[-int(1e5):]

# Remove Emails and links to users
data = [re.sub('\S*@\S*\s?', '', str(sent)) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', str(sent)) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", str(sent)) for sent in data]

# Remove https and links
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))

data = [re.sub(combined_pat, "", str(sent)) for sent in data]

data = [re.sub(r"https\S+", "", str(sent)) for sent in data]

data = [re.sub(r"www.[^ ]+", "", str(sent)) for sent in data]


data = [tweet_cleaner(sent) for sent in data]


In [10]:
# After
pprint(data[:3])
# TODO: take care of \\n

['[RT Nederlands bekendste droogkloot, Marcel van Roosmalen sloopt D66 '
 'helemaal de moeder!! Luister zelf naar deze topper en g…]',
 '[RT Niet alleen Ollongren, maar niemand in #Rutte3 heeft kennelijk enig '
 'benul van staatsrecht. Een minister die als minister ee…]',
 'Joost, trap er toch niet in. Hier is één grote minne manipulatie aan de gang '
 'tegen FVD. Deze heeft… ]']


#### 3. Tokenize words and Clean-up text

In [11]:
def sent_to_words(sentences) :
    '''Split sentences into words'''
    for sentence in sentences :
        # The main function here is Gensim's simple_preprocess
        yield(simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

# Let's explore what do we have now
pd.DataFrame(data_words[:5])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,rt,nederlands,bekendste,droogkloot,marcel,van,roosmalen,sloopt,helemaal,de,moeder,luister,zelf,naar,deze,topper,en,,,
1,rt,niet,alleen,ollongren,maar,niemand,in,rutte,heeft,kennelijk,enig,benul,van,staatsrecht,een,minister,die,als,minister,ee
2,joost,trap,er,toch,niet,in,hier,is,een,grote,minne,manipulatie,aan,de,gang,tegen,fvd,deze,heeft,
3,aldus,leugenaar,hoe,durft,deze,partij,nog,de,van,,,,,,,,,,,
4,dolhuysbrug,moet,na,tien,jaar,definitief,van,tafel,vvd,haarlem,nmaak,uw,eigen,filter,ove,,,,,


#### 4. Remove Stopwords, Make Bigrams and Lemmatize

Here we're continuing to preprocess the Tweets by excluding so-called stopwords - just a set of common used words such as 'and', 'so', etc. in English. NLTK package has a collections of stopwords for Dutch language as well. Also, we can extend the set by adding words that should be considered as redundant ones.

In [12]:
# NLTK stop words collection
stop_words = stopwords.words('dutch')

# TODO: Extend stop words collection using native speaker knowledge
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
stop_words.extend(['rt', 'htt', 'via'])

In [13]:
print('The number of Dutch stop words avaialable:', len(stop_words), '\n')
print(stop_words)

The number of Dutch stop words avaialable: 104 

['de', 'en', 'van', 'ik', 'te', 'dat', 'die', 'in', 'een', 'hij', 'het', 'niet', 'zijn', 'is', 'was', 'op', 'aan', 'met', 'als', 'voor', 'had', 'er', 'maar', 'om', 'hem', 'dan', 'zou', 'of', 'wat', 'mijn', 'men', 'dit', 'zo', 'door', 'over', 'ze', 'zich', 'bij', 'ook', 'tot', 'je', 'mij', 'uit', 'der', 'daar', 'haar', 'naar', 'heb', 'hoe', 'heeft', 'hebben', 'deze', 'u', 'want', 'nog', 'zal', 'me', 'zij', 'nu', 'ge', 'geen', 'omdat', 'iets', 'worden', 'toch', 'al', 'waren', 'veel', 'meer', 'doen', 'toen', 'moet', 'ben', 'zonder', 'kan', 'hun', 'dus', 'alles', 'onder', 'ja', 'eens', 'hier', 'wie', 'werd', 'altijd', 'doch', 'wordt', 'wezen', 'kunnen', 'ons', 'zelf', 'tegen', 'na', 'reeds', 'wil', 'kon', 'niets', 'uw', 'iemand', 'geweest', 'andere', 'rt', 'htt', 'via']


Thereafter, we need to identify combinations of words that frequently appear together. For these purposes, we need to create so-called 'bigrams ' and 'trigrams'. Bigrams are two words frequently occurring together in the document. Trigrams are 3 words frequently occurring.

In [14]:
# Build the bigram and trigram models

# One of the most important parameters here is min_count 
# If set up, for example, as 5, then only those bigrams and trigrams will be stored that appear 5 times or more

# Threshold is 
# TODO: set up various parameters for min_count and threshold
start = time.time()
bigram = Phrases(data_words, min_count=5, threshold=10) 
trigram = Phrases(bigram[data_words], threshold=10)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)
end = time.time()
print('Algorithm time =', round((end-start)/60), 'min')

# See trigram example
print(trigram_mod[bigram_mod[data_words[5]]])



Algorithm time = 1 min
['rt', 'niet', 'alleen', 'ollongren', 'maar', 'niemand', 'in', 'rutte', 'heeft_kennelijk_enig', 'benul', 'van', 'staatsrecht', 'een', 'minister', 'die', 'als', 'minister', 'ee']


Lemmatization is converting a word to its root word. Spacy NLP package responsible for that. Thanks we have Dutch language version.

In [15]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts) :
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts) :
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts) :
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) :
    texts_out = []
    for sent in texts :
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [16]:
# Remove stop words
start = time.time()
data_words_nostops = remove_stopwords(data_words)

# Form bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

nlp = spacy.load('nl', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

end = time.time()

In [17]:
print('Algorithm time =', round((end-start)/60), 'min')
print(data_words[:1])
print(data_words_nostops[:1])
print(data_words_bigrams[:1])
print(data_lemmatized[:1])

Algorithm time = 27 min
[['rt', 'nederlands', 'bekendste', 'droogkloot', 'marcel', 'van', 'roosmalen', 'sloopt', 'helemaal', 'de', 'moeder', 'luister', 'zelf', 'naar', 'deze', 'topper', 'en']]
[['nederlands', 'bekendste', 'droogkloot', 'marcel', 'roosmalen', 'sloopt', 'helemaal', 'moeder', 'luister', 'topper']]
[['nederlands_bekendste', 'droogkloot_marcel', 'roosmalen_sloopt', 'helemaal', 'moeder_luister', 'topper']]
[['nederlands_bekendste', 'droogkloot_marcel', 'roosmalen_sloopt', 'helemaal', 'moeder_luister', 'topper']]


In [18]:
len(data_lemmatized)

715668

In [19]:
data_lemmatized

[['nederlands_bekendste',
  'droogkloot_marcel',
  'roosmalen_sloopt',
  'helemaal',
  'moeder_luister',
  'topper'],
 ['alleen',
  'ollongren',
  'rutte',
  'kennelijk_enig',
  'benul',
  'staatsrecht',
  'minister',
  'minister',
  'ee'],
 ['joost', 'trap', 'grote_minne', 'manipulatie', 'gang', 'fvd'],
 ['aldus', 'leugenaar', 'durft', 'partij'],
 ['dolhuysbrug',
  'tien_jaar',
  'definitief',
  'tafel',
  'vvd',
  'haarlem_nmaak',
  'eigen_filter'],
 ['alleen',
  'ollongren',
  'rutte',
  'kennelijk_enig',
  'benul',
  'staatsrecht',
  'minister',
  'minister',
  'ee'],
 ['gemeente', 'geeft', 'geheim', 'kiezer', 'afgerekend_ton'],
 ['rutte', 'vvd', 'steunt', 'rutte', 'neemt', 'maat', 'racis'],
 ['maarten',
  'jan',
  'staat',
  'nummer',
  'kandidatenlijst',
  'gelooft',
  'talenten',
  'ontwik'],
 ['compleet_kafka',
  'nrutte',
  'vvd',
  'ollongren',
  'vinden',
  'baudet',
  'fvd',
  'geobsedeerd',
  'praten',
  'rassen'],
 ['maart',
  'mag',
  'stemmen',
  'sleepwet',
  'stemadvi

In [20]:
# Save results
with open('elections_data_lemmatized', 'wb') as fp:
    pickle.dump(data_lemmatized, fp)

In [None]:
# Load results
with open('elections_data_lemmatized', 'rb') as fp:
    data_lemmatized = pickle.load(fp)

In [21]:
import re, os, time
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Import nltk stopwords and spacy for lemmatization
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
import spacy
%matplotlib inline

# Enable logging for Gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)

import time
import pickle
import bs4

# Collect names of Tweet files
from os import listdir
from os.path import isfile, join
import json

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import textacy
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from bokeh.charts import Chord

from ipysankeywidget import SankeyWidget
from ipywidgets import Layout
from IPython.display import display

from pysankey import sankey

In [22]:
# Document training with doc2vec

# Tag each tweets 
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(data_lemmatized)]

# Train documents(tweets) with doc2vec library
model = Doc2Vec(documents, vector_size=300, window_size = 15, min_count=1, epochs=150, 
                sampling_threshold = 1e-5,negative_size = 5, 
                workers=32,train_epoch = 150, dm = 0) #0 = dbow; 1 = dmpv 

# Save the training set model for further use
model.save("d2v.model")

#### 6. Create the Dictionary and Corpus needed for Topic Modeling

In [None]:
# Create Dictionary
dictionary = corpora.Dictionary(data_lemmatized)

# Filter out words that occur less than 20 documents, or more than 50% of the documents
# dictionary.filter_extremes(no_below=20, no_above=0.5)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [dictionary.doc2bow(text) for text in texts]

# Corpus contains pairs - a unique id for each word in the document and its frequency (word_id, word_frequency).
print(pd.DataFrame(corpus[:1]))

In [None]:
# We also can check what was the word from the dictionary - dictionary
pprint(corpus[:1])
print(dictionary[0])

In [None]:
# Readable format
[[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]]

In [None]:
print('Number of unique tokens (words + bigrams + trigrams): %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

#### 7. Build the model

In [None]:
# # Build Gensim LDA model
# start = time.time()
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # corpus
#                                            dictionary=dictionary, # dict
#                                            num_topics=10, # number of topic to be extracted
#                                            random_state=100,
#                                            update_every=1, #
#                                            chunksize=100, # 
#                                            passes=10, # 
#                                            alpha='auto', # 
#                                            per_word_topics=True) # 
# end = time.time()
# print('Algorithm time =', round((end-start)/60), 'min')

In [None]:
# Gensim LDA with Multiprocessing
from gensim.models import LdaMulticore

num_topics = 10
chunksize = 100
passes = 10
%time lda_model = LdaMulticore(corpus=corpus, \
                       id2word=dictionary.id2token, \
                       num_topics=num_topics, \
                       random_state=100, \
                       chunksize=chunksize, \
                       passes=passes, \
                       per_word_topics=True)

In [None]:
# Save the model for further usage
from gensim.test.utils import datapath
temp_file = datapath('lda_model_300k') 
lda_model.save(temp_file)

In [None]:
# Load a potentially pretrained model
from gensim.test.utils import datapath
temp_file = datapath('lda_model_300k') 
lda_model = LdaModel.load(temp_file)

#### 8. Print the topics

In [None]:
# Print one topic
lda_model.show_topic(0)

In [None]:
# Print all topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

#### 9. Compute Model Perplexity and Coherence Score

To measure perfomance there are two metrics: perplexity and coherence score. 

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Scores
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score c_v measure: ', coherence_lda)

coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score UMass measure: ', coherence_lda)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model=LdaMulticore(corpus=corpus, 
                       id2word=dictionary, 
                       num_topics=num_topics, 
                       random_state=100,
                       chunksize=chunksize, 
                       passes=passes,
                       per_word_topics=True)
        print('Model with', num_topics, 'topics done')
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
start = 3
limit = 20
step = 1
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=data_lemmatized, start=start, limit=limit, step=step)

# Show graph
import matplotlib.pyplot as plt
limit=limit; start=start; step=step;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# We can decrease the number of words per topic and show them separately

topn = 10 # to keep it readable
top_topics = lda_model.top_topics(corpus=corpus, dictionary=dictionary, topn=topn)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

#### 10. Visualize topics-keywords

Each bubble is a topic. The larger the bubble, the more prevalent the topic.

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis