# Data Modelling

In [4]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LdaMulticore
import pickle 
import pyLDAvis

## 1) Data preparation

In [5]:
# Load the posts dataset from a CSV path relative to the notebook's
# working directory, then preview the first three rows.
data = pd.read_csv("datasets/posts_clean.csv")
data.head(3)

Unnamed: 0,Id,Title,Body,Tags
0,415192,good way create simple python web service,I use python year I little experience python w...,<python><web-services>
1,415344,log implementation prefer,I implement log class c try decide I curious k...,<debugging><language-agnostic><logging>
2,414981,directly modify list element,I struct struct map public int size public map...,<c#><.net>


In [6]:
# Tags arrive delimited by angle brackets (e.g. "<c#><.net>").
# Every "<" and every ">" is swapped for a single space in one regex pass;
# the resulting whitespace is not trimmed or collapsed here.
data["Tags"] = data["Tags"].replace(r"[<>]", " ", regex=True)

In [7]:
# Preview the first rows to check the Tags column after bracket removal.
data.head()

Unnamed: 0,Id,Title,Body,Tags
0,415192,good way create simple python web service,I use python year I little experience python w...,python web-services
1,415344,log implementation prefer,I implement log class c try decide I curious k...,debugging language-agnostic logging
2,414981,directly modify list element,I struct struct map public int size public map...,c# .net
3,415580,regex name group java,understanding java regex package not support n...,java regex
4,415753,instance cache objective c,I want cache instance certain class class keep...,objective-c weak-references


In [8]:
# Pull the "Body" column out as a plain Python list, one entry per post.
# The following cells rebind `docs` to tokenised versions of these entries.
docs = data["Body"].to_list()

In [9]:
# Tokenise every document and filter the tokens in a single pass:
#   * lowercase the text and split it on runs of word characters,
#   * drop tokens that are purely numeric (words containing digits are kept),
#   * drop tokens that are only one character long.
tokenizer = RegexpTokenizer(r'\w+')
docs = [
    [
        word
        for word in tokenizer.tokenize(text.lower())
        if len(word) > 1 and not word.isnumeric()
    ]
    for text in docs
]

In [10]:
# Reduce every token to its WordNet lemma (lemmatizer default settings),
# keeping the per-document list structure.
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

In [11]:
# Detect bigrams and add them to the documents.
# Only bigrams that appear 20 times or more are kept (min_count=20).
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # Phrase tokens are recognised by the '_' joining their parts. They are
    # collected into a list first and then appended after the original tokens.
    # NOTE: re-running this cell appends the same bigrams a second time.
    doc.extend([token for token in bigram[doc] if '_' in token])

In [12]:
# Create a dictionary representation (token <-> integer id) of the documents,
# then remove rare and common tokens from it in place.
# Per gensim's filter_extremes: no_below=20 drops tokens present in fewer than
# 20 documents, no_above=0.5 drops tokens present in more than 50% of them.
dct = Dictionary(docs)
dct.filter_extremes(no_below=20, no_above=0.5)

In [13]:
# Bag-of-words representation of the documents: each document is converted
# with the filtered dictionary, so tokens removed from `dct` are not counted.
corpus = [dct.doc2bow(doc) for doc in docs]

In [14]:
# Report vocabulary size and corpus size, one figure per line.
print(
    'Number of unique tokens: %d' % len(dct),
    'Number of documents: %d' % len(corpus),
    sep='\n',
)

Number of unique tokens: 24935
Number of documents: 164598


## 2) Topic Modelling

In [15]:
# Set training parameters.
# These names are notebook-level on purpose: later cells (the coherence helper
# and the grid search) read chunksize / passes / iterations / eval_every.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 100  # Fixed seed so the baseline model is reproducible.

# Make an index-to-word dictionary.
# Indexing the dictionary once is only there to "load" it, i.e. to make sure
# dct.id2token is populated before it is handed to the model.
dct[0]
id2word = dct.id2token

# Baseline LDA model; alpha and eta are learned from the data ('auto').
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [16]:
# Score the baseline model with the 'c_v' coherence measure, computed from the
# tokenised documents (`docs`) and the filtered dictionary (`dct`).
coherence_model_lda = CoherenceModel(model=model, texts=docs, dictionary=dct, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.5934874812843955


In [17]:
def compute_coherence_values(corpus, dictionary, k, a, b, random_state=None):
    """Train an LDA model and return its 'c_v' coherence score.

    Parameters
    ----------
    corpus : iterable of bag-of-words documents
        Training corpus passed to ``LdaMulticore``.
    dictionary : gensim.corpora.Dictionary
        Vocabulary used both as the model's ``id2word`` mapping and as the
        dictionary of the coherence computation.
    k : int
        Number of topics.
    a : float or str
        Value passed as ``alpha`` (document-topic prior).
    b : float or str
        Value passed as ``eta`` (topic-word prior).
    random_state : int, optional
        Seed forwarded to ``LdaMulticore``; ``None`` (default) leaves the
        training unseeded, as before.

    Returns
    -------
    float
        Coherence of the trained model.

    Notes
    -----
    The remaining training settings (``chunksize``, ``iterations``,
    ``passes``, ``eval_every``) and the reference texts (``docs``) are read
    from notebook-level variables. Coherence is therefore always computed
    against the full ``docs``, regardless of which ``corpus`` is passed in.
    """
    # Use the dictionary that was passed in rather than notebook globals, so
    # the argument is honoured and the function does not depend on a
    # separately prepared `id2word` mapping.
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             chunksize=chunksize,
                             alpha=a,
                             eta=b,
                             iterations=iterations,
                             num_topics=k,
                             passes=passes,
                             eval_every=eval_every,
                             random_state=random_state)

    coherence_model = CoherenceModel(model=lda_model, texts=docs,
                                     dictionary=dictionary, coherence='c_v')

    return coherence_model.get_coherence()

In [18]:
from gensim.utils import ClippedCorpus
import gensim
import tqdm

In [19]:
# Grid search over number of topics, alpha and beta (eta), evaluated on two
# corpus sizes. Every combination trains one LDA model and records its
# coherence in `model_results`.

# Not read anywhere in this cell; kept so the notebook namespace is unchanged.
grid = {'Validation_Set': {}}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter: numeric values plus gensim's named priors
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets: the first 75% of the corpus and the full corpus.
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run; set to False to skip the search.
run_grid_search = True
if run_grid_search:
    # Derive the progress-bar total from the grid itself so it stays correct
    # when any of the ranges above is edited.
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

    try:
        # The context manager closes the bar even if a run fails.
        with tqdm.tqdm(total=total_runs) as pbar:
            # iterate through validation corpuses
            for title, corpus_set in zip(corpus_title, corpus_sets):
                # iterate through number of topics
                for k in topics_range:
                    # iterate through alpha values
                    for a in alpha:
                        # iterate through beta values
                        for b in beta:
                            # get the coherence score for the given parameters
                            cv = compute_coherence_values(corpus=corpus_set, dictionary=dct,
                                                          k=k, a=a, b=b)
                            # Save the model results
                            model_results['Validation_Set'].append(title)
                            model_results['Topics'].append(k)
                            model_results['Alpha'].append(a)
                            model_results['Beta'].append(b)
                            model_results['Coherence'].append(cv)

                            pbar.update(1)
    finally:
        # Always persist what has been computed so far: an error or interrupt
        # late in the search must not discard the completed runs.
        pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

100%|██████████| 540/540 [38:30:52<00:00, 256.76s/it]


In [24]:
# Rank every grid-search run by coherence, best configuration first.
pd.DataFrame(model_results).sort_values("Coherence", ascending=False)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
491,100% Corpus,9,0.61,0.31,0.640743
535,100% Corpus,10,asymmetric,0.01,0.636625
195,75% Corpus,8,0.91,0.01,0.629944
252,75% Corpus,10,0.61,0.61,0.629695
221,75% Corpus,9,0.61,0.31,0.626676
...,...,...,...,...,...
15,75% Corpus,2,0.91,0.01,0.440731
290,100% Corpus,2,symmetric,0.01,0.439272
280,100% Corpus,2,0.61,0.01,0.434665
279,100% Corpus,2,0.31,symmetric,0.434100


In [25]:
# Final model, using the best-scoring grid-search configuration
# (9 topics, alpha=0.61, eta=0.31).
# chunksize / passes / iterations / eval_every reuse the notebook-level
# training parameters, i.e. the same settings under which this configuration
# was scored, so the final model matches the one that was evaluated.
lda_model = LdaMulticore(corpus=corpus,
                        id2word=id2word,
                        num_topics=9,
                        random_state=100,
                        chunksize=chunksize,
                        passes=passes,
                        iterations=iterations,
                        eval_every=eval_every,
                        alpha=0.61,
                        eta=0.31)

In [36]:
# Visualize the topics
# pyLDAvis' gensim adapter builds the visualisation data from the final
# model, the bag-of-words corpus and the dictionary.
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(lda_model, corpus, dct)

  default_term_info = default_term_info.sort_values(


In [37]:
# Display the prepared pyLDAvis visualisation as the cell output.
lda_vis