#SoMe Topic Modeling Notebook | Release canvas 1 📖

## Installations and Libraries 💽

In [1]:
import time 
# Time the running of everything
start_of_notebook_time = time.time()

In [2]:
# Installations
import sys
if 'google.colab' in sys.modules:
    !pip install emoji --upgrade
    !pip install pandas-profiling==2.*
    !pip install plotly==4.*
    !python -m spacy download en_core_web_lg
    !pip install pyldavis
    !pip install gensim
    !pip install chart_studio
    #!pip install --upgrade autopep8

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/ff/1c/1f1457fe52d0b30cbeebfd578483cedb3e3619108d2d5a21380dfecf8ffd/emoji-0.6.0.tar.gz (51kB)
[K     |██████▍                         | 10kB 19.4MB/s eta 0:00:01[K     |████████████▉                   | 20kB 6.1MB/s eta 0:00:01[K     |███████████████████▎            | 30kB 7.2MB/s eta 0:00:01[K     |█████████████████████████▊      | 40kB 7.2MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.8MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.6.0-cp36-none-any.whl size=49716 sha256=150b1d01e7101aff78cce81e4de210bd374495f66d17e6375a7f2b846ea82128
  Stored in directory: /root/.cache/pip/wheels/46/2c/8b/9dcf5216ca68e14e0320e283692dce8ae321cdc01e73e17796
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.6.0
Collecting pandas-profiling==2.*
[

In [3]:
# Required Libraries

#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.


The usage of `cmp` is deprecated and will be removed on or after 2021-06-01.  Please use `eq` and `order` instead.


`scipy.sparse.sparsetools` is deprecated!
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.



In [4]:
# Setting up Chart Studio to save visualizations.
# The credentials are read from the environment (or prompted for) so that the
# API key is never stored in the notebook. Any key that was previously
# committed with this notebook should be revoked and regenerated.
import os
from getpass import getpass

Username = os.environ.get('CHART_STUDIO_USERNAME', 'so-me')
api_key = os.environ.get('CHART_STUDIO_API_KEY')
if not api_key:
    # Interactive fallback: the typed value is not echoed into the cell output
    api_key = getpass('Chart Studio API key: ')

chart_studio.tools.set_credentials_file(username=Username, api_key=api_key)


In [5]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

##Data Cleaning 🧹

In [6]:
# Loading the JSON file 
url_elon = 'https://raw.githubusercontent.com/Lambda-School-Labs/social-media-strategy-ds/feature/topic/python_notebooks/elonmusk_followers_english.json'
url_dutchbros = 'https://raw.githubusercontent.com/Lambda-School-Labs/social-media-strategy-ds/feature/topic/python_notebooks/dutchbros_followers.json'

# Fail fast on a hung connection or an HTTP error instead of trying to parse
# an error page as JSON
response = requests.get(url_elon, timeout=30)
response.raise_for_status()
raw_tweets = response.json()

# Converting the dataset to a pandas DataFrame (one row per value of the JSON
# object) and renaming the single column
df = pd.DataFrame(list(raw_tweets.values())).rename(columns={0: 'original_tweets'})

#Removing emojis from text
#Reference 1 : https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
#Reference 2 : https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python

def give_emoji_free_text(text):
    """
    Removes emoji's from tweets.

    Every whitespace-separated word that contains at least one emoji
    character is dropped entirely; the remaining words are re-joined with
    single spaces (so newlines and repeated spaces are collapsed).

    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    # The emoji lookup table moved between releases of the `emoji` package:
    #   >= 2.0 : emoji.EMOJI_DATA (UNICODE_EMOJI was removed)
    #   1.x    : emoji.UNICODE_EMOJI nested by language code, e.g. ['en']
    #   0.x    : emoji.UNICODE_EMOJI keyed directly by emoji
    # The install cell does not pin a version, so support all three layouts.
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        emoji_table = emoji.UNICODE_EMOJI
        emoji_table = emoji_table.get('en', emoji_table)

    # Emoji characters that actually occur in this text
    emoji_chars = {char for char in text if char in emoji_table}

    # Keep only the words that contain none of those characters
    clean_text = ' '.join(
        word for word in text.split()
        if not any(char in emoji_chars for char in word)
    )
    return clean_text

def url_free_text(text):
    '''
    Strips URLs from the text: every run of non-whitespace characters that
    starts with "http" is deleted (surrounding whitespace is left untouched).
    '''
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
df['emoji_free_tweets'] = df['original_tweets'].apply(call_emoji_free)

#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

df

Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets
0,This kid will forever be a legend 😂 https://t....,This kid will forever be a legend https://t.co...,This kid will forever be a legend
1,"If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit..."
2,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!
3,@Bhuvan_Bam ❤️❤️,@Bhuvan_Bam,@Bhuvan_Bam
4,I'm not crying you're crying.\nhttps://t.co/Bc...,I'm not crying you're crying. https://t.co/BcF...,I'm not crying you're crying.
...,...,...,...
9941,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...
9942,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...
9943,"@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu..."
9944,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...


##Tokenizing 🕵🏻‍♂

In [7]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab
nlp = spacy.load('en_core_web_lg')

OSError: ignored

In [None]:
"""
Import Gensim and Wordcloud to use their stopwords as well and use the combined stopwords of ALL as the variable:
ALL_STOP_WORDS
"""
# Timing Start
program_start_time = time.time()

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)


# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


# Lower-case every token and keep it only if it is not in the combined
# stop-word set (ALL_STOP_WORDS, not just the spaCy + custom STOP_WORDS)
tokens = []

for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
    doc_tokens = []
    for token in doc:
        lowered = token.text.lower()
        if lowered not in ALL_STOP_WORDS:
            doc_tokens.append(lowered)
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

# Timing End
program_end_time = time.time()

# View df
df

In [None]:
# See how long it took
print(program_end_time - program_start_time, "seconds to finish")

##Lemmatization🇬🇧

In [None]:
# Reference 4 : https://stackoverflow.com/questions/45306988/column-of-lists-convert-list-to-string-as-a-new-column

# Timing Start
program_start_time = time.time()

# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''
    Lemmatizes a processed tweet with the loaded spaCy pipeline.

    Returns the list of lemmas for every token that is not a stop word,
    not punctuation and not tagged as a pronoun.
    '''
    return [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.pos_ == 'PRON')
    ]

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
# df[['original_tweet', 'lemmas_back_to_text']]

# Timing End
program_end_time = time.time()


In [None]:
#Printing Lemmetization Time
print(program_end_time - program_start_time, "seconds to finish")

In [None]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Timing Start
program_start_time = time.time()

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)

    Cleaning steps, each applied to the result of the previous one:
      1. remove URLs,
      2. remove every character that is not an ASCII letter, a digit or
         whitespace (this covers punctuation as well as @ ! $),
      3. remove words that contain a digit,
      4. lower-case and split on whitespace.

    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    tokens = re.sub(r"http\S+", "", text)
    # Whitespace (not only the space character) is preserved so that words
    # separated by a newline or tab are not glued together
    tokens = re.sub(r"[^a-zA-Z0-9\s]", "", tokens)
    # Remove words containing numbers
    tokens = re.sub(r"\w*\d\w*", "", tokens)

    # Make text lowercase and split it
    return tokens.lower().split()

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

# Timing End
program_end_time = time.time()

# View those tokens (the 4th column)

df

In [None]:
#Printing Tokenization Time
print(program_end_time - program_start_time, "seconds to finish")


##Topic Modeling ㊙️

###id2word 📒

In [None]:
# Create a id2word dictionary
id2word = Dictionary(df['lemma_tokens'])
print(len(id2word))

In [None]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

###Corpus Object & Generating Base Model Topics 📚

In [None]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

####Base Model

In [None]:
# Timing Start
base_model_program_start_time = time.time()

# Instantiating a LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=5, id2word=id2word, workers=12, passes=5)

# Timing End
base_model_program_end_time = time.time()

In [None]:
#Printing First Model Time
base_model_runtime = round(base_model_program_end_time - base_model_program_start_time, 2)
print(base_model_runtime)


In [None]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]


In [None]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]


In [None]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


In [None]:
# Compute Perplexity
# log_perplexity returns the per-word log-likelihood bound (perplexity = 2^(-bound)),
# so a higher (less negative) value means a better fit
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)

#### Base Model Topic Distance Visualization 📈

In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(base_model, corpus, id2word)

###Grid Search 🔍

In [None]:
lemmas_df = df['lemmas_back_to_text']
print(type(lemmas_df[0]))


In [None]:
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])


In [None]:
gs_start_time = time.time()

# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
# (The pasted GridSearchCV(...) repr that used to follow this call was removed:
# it built a second, unused estimator with stale arguments such as `iid` and
# `n_topics`, and its construction time was counted in the timing below.)
model.fit(data_vectorized)

gs_end_time = time.time()

In [None]:
print(gs_end_time - gs_start_time, "seconds to finish")


In [None]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
# Best Model's Params:  {'learning_decay': 0.9, 'n_topics': 10}


###Hyperparameter Tuning 🦾

####Model iteration 1 (number of topics)


#####1.0 Topics = 5

In [None]:
# Let's start with parameter tuning for the LDA model and,
# find an optimal number of topics to reach the best coherence score

# Define chunksize and passes
# Chunksize is Number of documents to be used in each training chunk
# Passes is Number of passes through the corpus during training

# Timing Start
model_1_0_start_time = time.time()

model_1_0 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=5,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_0_end_time = time.time()

In [None]:
#Printing First Model Time
model_1_0_runtime = round(model_1_0_end_time - model_1_0_start_time, 2)
print(model_1_0_runtime)


In [None]:
# Filtering for words 
words_1_0 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_0.print_topics()]

# Create Topics
topics_1_0 = [' '.join(t[0:10]) for t in words_1_0]

# Getting the topics
for id, t in enumerate(topics_1_0): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


In [None]:
# Compute Perplexity
# log_perplexity returns the per-word log-likelihood bound (perplexity = 2^(-bound)),
# so a higher (less negative) value means a better fit
model_1_0_perplexity = model_1_0.log_perplexity(corpus)
print('\nPerplexity: ', model_1_0_perplexity) 

# Compute Coherence Score
coherence_model_1_0 = CoherenceModel(model=model_1_0, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_0 = coherence_model_1_0.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_0)



In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_0, corpus, id2word)


#####1.1 Topics = 10

In [None]:
#Increasing number of topics to 10
#Timing Start
model_1_1_start_time = time.time()

model_1_1 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=10,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_1_end_time = time.time()

In [None]:
#Printing First Model Time
model_1_1_runtime = round(model_1_1_end_time - model_1_1_start_time, 2)
print(model_1_1_runtime)


In [None]:
# Filtering for words 
words_1_1 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_1.print_topics()]

# Create Topics
topics_1_1 = [' '.join(t[0:10]) for t in words_1_1]

# Getting the topics
for id, t in enumerate(topics_1_1): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


In [None]:
# Compute Perplexity
# log_perplexity returns the per-word log-likelihood bound (perplexity = 2^(-bound)),
# so a higher (less negative) value means a better fit
model_1_1_perplexity = model_1_1.log_perplexity(corpus)
print('\nPerplexity: ', model_1_1_perplexity) 

# Compute Coherence Score
coherence_model_1_1 = CoherenceModel(model=model_1_1, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_1 = coherence_model_1_1.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_1)

In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_1, corpus, id2word)


#####1.2 Topics = 15

In [None]:
#Increasing number of topics to 15
#Timing Start
model_1_2_start_time = time.time()

model_1_2 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=15,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_2_end_time = time.time()

In [None]:
#Printing First Model Time
model_1_2_runtime = round(model_1_2_end_time - model_1_2_start_time, 2)
print(model_1_2_runtime)

In [None]:
# Filtering for words 
words_1_2 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_2.print_topics()]

# Create Topics
topics_1_2 = [' '.join(t[0:10]) for t in words_1_2]

# Getting the topics
for id, t in enumerate(topics_1_2): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


In [None]:
# Compute Perplexity
# log_perplexity returns the per-word log-likelihood bound (perplexity = 2^(-bound)),
# so a higher (less negative) value means a better fit
model_1_2_perplexity = model_1_2.log_perplexity(corpus)
print('\nPerplexity: ', model_1_2_perplexity) 

# Compute Coherence Score
coherence_model_1_2 = CoherenceModel(model=model_1_2, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_2 = coherence_model_1_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_2)

In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_2, corpus, id2word)


##### 1.3 Topics = 20

In [None]:
#Increasing number of topics to 20
#Timing Start
model_1_3_start_time = time.time()

model_1_3 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=20,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_3_end_time = time.time()

In [None]:
#Printing First Model Time
model_1_3_runtime = round(model_1_3_end_time - model_1_3_start_time, 2)
print(model_1_3_runtime)


In [None]:
# Filtering for words 
words_1_3 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_3.print_topics()]

# Create Topics
topics_1_3 = [' '.join(t[0:10]) for t in words_1_3]

# Getting the topics
for id, t in enumerate(topics_1_3): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


In [None]:
# Compute Perplexity
# log_perplexity returns the per-word log-likelihood bound (perplexity = 2^(-bound)),
# so a higher (less negative) value means a better fit
model_1_3_perplexity = model_1_3.log_perplexity(corpus)
print('\nPerplexity: ', model_1_3_perplexity) 

# Compute Coherence Score
coherence_model_1_3 = CoherenceModel(model=model_1_3, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_3 = coherence_model_1_3.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_3)

In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_3, corpus, id2word)


##### 1.4 Topics = 25

In [None]:
#Increasing number of topics to 25
#Timing Start
model_1_4_start_time = time.time()

model_1_4 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=25,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_4_end_time = time.time()

In [None]:
#Printing First Model Time
model_1_4_runtime = round(model_1_4_end_time - model_1_4_start_time, 2)
print(model_1_4_runtime)

In [None]:
# Filtering for words: pull the quoted terms out of each topic string
# NOTE(review): print_topics() is called with its defaults here, which
# presumably limits the listing to 20 topics — confirm against gensim docs
words_1_4 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_4.print_topics()]

# Create Topics (built from model 1.4's own words, not model 1.3's)
topics_1_4 = [' '.join(t[0:10]) for t in words_1_4]

# Getting the topics
for topic_id, t in enumerate(topics_1_4): 
    print(f"------ Topic {topic_id} ------")
    print(t, end="\n\n")


In [None]:
# Compute Perplexity
# log_perplexity returns the per-word log-likelihood bound (perplexity = 2^(-bound)),
# so a higher (less negative) value means a better fit
model_1_4_perplexity = model_1_4.log_perplexity(corpus)
print('\nPerplexity: ', model_1_4_perplexity) 

# Compute Coherence Score
coherence_model_1_4 = CoherenceModel(model=model_1_4, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_4 = coherence_model_1_4.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_4)

In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_4, corpus, id2word)


#####1.5 Topics = 30

In [None]:
#We have had a reduction from .39 to .35 in coherence score 
#by going from 20 to 25. Let's try 30 topics and see what 
#coherence score we'll get. 

# Timing Start
model_1_5_start_time = time.time()

model_1_5 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=30,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_5_end_time = time.time()

In [None]:
#Printing First Model Time
model_1_5_runtime = round(model_1_5_end_time - model_1_5_start_time, 2)
print(model_1_5_runtime)


In [None]:
# Filtering for words 
words_1_5 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_5.print_topics()]

# Create Topics
topics_1_5 = [' '.join(t[0:10]) for t in words_1_5]

# Getting the topics
for id, t in enumerate(topics_1_5): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


In [None]:
# Compute Perplexity
# log_perplexity returns the per-word log-likelihood bound (perplexity = 2^(-bound)),
# so a higher (less negative) value means a better fit
model_1_5_perplexity = model_1_5.log_perplexity(corpus)
print('\nPerplexity: ', model_1_5_perplexity) 

# Compute Coherence Score
coherence_model_1_5 = CoherenceModel(model=model_1_5, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_5 = coherence_model_1_5.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_5)


In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_5, corpus, id2word)


#####1.6 Topics = 35

In [None]:
# Coherence score jumped back to 0.39, let's try 35 topics
# Timing Start
model_1_6_start_time = time.time()

model_1_6 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=35,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_6_end_time = time.time()

In [None]:
#Printing First Model Time
model_1_6_runtime = round(model_1_6_end_time - model_1_6_start_time, 2)
print(model_1_6_runtime)


In [None]:
# Filtering for words 
words_1_6 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_6.print_topics()]

# Create Topics
topics_1_6 = [' '.join(t[0:10]) for t in words_1_6]

# Getting the topics
for id, t in enumerate(topics_1_6): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


In [None]:
# Compute Perplexity
# log_perplexity returns the per-word log-likelihood bound (perplexity = 2^(-bound)),
# so a higher (less negative) value means a better fit
model_1_6_perplexity = model_1_6.log_perplexity(corpus)
print('\nPerplexity: ', model_1_6_perplexity) 

# Compute Coherence Score
coherence_model_1_6 = CoherenceModel(model=model_1_6, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_6 = coherence_model_1_6.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_6)


In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_6, corpus, id2word)


#####1.7 Topics = 40

In [None]:
# Let's try 40 topics
# Timing Start
model_1_7_start_time = time.time()

model_1_7 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=40,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_7_end_time = time.time()

In [None]:
#Printing First Model Time
model_1_7_runtime = round(model_1_7_end_time - model_1_7_start_time, 2)
print(model_1_7_runtime)


In [None]:
# Filtering for words 
words_1_7 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_7.print_topics()]

# Create Topics
topics_1_7 = [' '.join(t[0:10]) for t in words_1_7]

# Getting the topics
for id, t in enumerate(topics_1_7): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


In [None]:
# Compute Perplexity
# log_perplexity returns the per-word log-likelihood bound (perplexity = 2^(-bound)),
# so a higher (less negative) value means a better fit
model_1_7_perplexity = model_1_7.log_perplexity(corpus)
print('\nPerplexity: ', model_1_7_perplexity) 

# Compute Coherence Score
coherence_model_1_7 = CoherenceModel(model=model_1_7, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_7 = coherence_model_1_7.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_7)


In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_7, corpus, id2word)


#####1.8 Topics = 5-200


In [None]:
#Defining a function that loops over candidate numbers of topics to find the
#optimal number of topics
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=None):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every value in range(start, limit, step)
    and scored with a c_v CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try
    step : Increment between consecutive numbers of topics
    random_state : Optional seed forwarded to each LDA model; the default
        None leaves the models unseeded

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the 
    LDA model with respective number of topics
    """
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary that was passed in, so the model and the
        # coherence computation below always share the same vocabulary
        model = LdaMulticore(corpus=corpus, num_topics=num_topics,
                             id2word=dictionary, random_state=random_state)
        model_list_topic.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    return model_list_topic, coherence_values_topic      

In [None]:
# Can take a long time to run.
model_1_8_start_time = time.time()

model_list_topic, coherence_values_topic = compute_coherence_values(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=df['lemma_tokens'],
                                                        start=2, limit=200, step=6)
model_1_8_end_time = time.time()

In [None]:
#Printing First Model Time
model_1_8_runtime = round(model_1_8_end_time - model_1_8_start_time, 2)
print(model_1_8_runtime)


In [None]:
limit=200; start=2; step=6;
x_topic = range(start, limit, step)

topic_ts = {'coherence_value': coherence_values_topic,
            'number_of_topics': x_topic}

topic_chart = pd.DataFrame(data=topic_ts)

topic_fig = px.line(topic_chart, x="number_of_topics", y="coherence_value")
topic_fig.show()


In [None]:
#Saving track sheet chart on chart studios to be used in documentation
py.plot(topic_fig, filename = 'num_of_topics_chart', auto_open=True)


#####1.9 Topics = 68

In [None]:
# Based on 1.8 the optimal number of topics are 68
# Timing Start
model_1_9_start_time = time.time()

model_1_9 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_9_end_time = time.time()



In [None]:
#Printing First Model Time
model_1_9_runtime = round(model_1_9_end_time - model_1_9_start_time, 2)
print(model_1_9_runtime)


In [None]:
# Filtering for words 
words_1_9 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_9.print_topics()]

# Create Topics
topics_1_9 = [' '.join(t[0:10]) for t in words_1_9]

# Getting the topics
for id, t in enumerate(topics_1_9): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")



In [None]:
# Compute Perplexity
# log_perplexity returns the per-word log-likelihood bound (perplexity = 2^(-bound)),
# so a higher (less negative) value means a better fit
model_1_9_perplexity = model_1_9.log_perplexity(corpus)
print('\nPerplexity: ', model_1_9_perplexity) 

# Compute Coherence Score
coherence_model_1_9 = CoherenceModel(model=model_1_9, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_9 = coherence_model_1_9.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_9)


In [None]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_9, corpus, id2word)


##### Track Sheet

In [None]:
# Let's keep track of our progress

# Models in track-sheet order: the base model, then iterations 1.0-1.7 and 1.9
tracked_models = [base_model, model_1_0, model_1_1, model_1_2, model_1_3,
                  model_1_4, model_1_5, model_1_6, model_1_7, model_1_9]

topic_ts = {
    'model_iteration': [1] * 10,
    'model': [0.0, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.9],
    'runtime_seconds': [
        base_model_runtime,
        model_1_0_runtime, model_1_1_runtime, model_1_2_runtime,
        model_1_3_runtime, model_1_4_runtime, model_1_5_runtime,
        model_1_6_runtime, model_1_7_runtime, model_1_9_runtime,
    ],
    'coherence_score': [
        coherence_lda_model_base,
        coherence_lda_model_1_0, coherence_lda_model_1_1, coherence_lda_model_1_2,
        coherence_lda_model_1_3, coherence_lda_model_1_4, coherence_lda_model_1_5,
        coherence_lda_model_1_6, coherence_lda_model_1_7, coherence_lda_model_1_9,
    ],
    'perplexity': [
        base_perplexity,
        model_1_0_perplexity, model_1_1_perplexity, model_1_2_perplexity,
        model_1_3_perplexity, model_1_4_perplexity, model_1_5_perplexity,
        model_1_6_perplexity, model_1_7_perplexity, model_1_9_perplexity,
    ],
    # Read straight from each trained model
    'number_of_topics': [lda_model.num_topics for lda_model in tracked_models],
    'passes': [lda_model.passes for lda_model in tracked_models],
}

topic_track_sheet = pd.DataFrame(data=topic_ts)

topic_track_sheet

In [None]:
#Visualizing our progress
topic_fig_1 = px.line(topic_track_sheet, x="number_of_topics", y="coherence_score",
                    hover_name='perplexity', )
topic_fig_1.show()


In [None]:
# Upload the number-of-topics chart to Chart Studio so it can be embedded in
# the documentation; the call's return value is shown as the cell output.
py.plot(topic_fig_1, filename = 'num_of_topics', auto_open=True)


####Model iteration 2 (number of passes)

---



#####2.0 Passes = 10 
*Refer to model 1.9, which the track sheet below uses as the passes = 10 baseline (the original note cited model 1.3, topics = 20)*

#####2.1 Passes = 15

In [None]:
# num_topics=68 was chosen based on the coherence and perplexity scores;
# per the experiment notes, going above 68 overfits and gives incoherent topics.
# Now change the number of passes to assess whether coherence improves.
# passes = 15
model_2_1_start_time = time.time()

model_2_1 = LdaMulticore(corpus=corpus,
                         id2word=id2word,
                         num_topics=68,
                         random_state=42,
                         chunksize=2000,
                         passes=15)  # was 10, which did not match this section's setting

model_2_1_end_time = time.time()



In [None]:
# Wall-clock training time of model 2.1 in seconds, rounded to two decimals
model_2_1_runtime = round(model_2_1_end_time - model_2_1_start_time, 2)
print(model_2_1_runtime, "seconds to finish")


In [None]:
# print_topics() returns (topic_id, formatted_string) pairs for a subset of
# the topics (20 by default), not necessarily topics 0..19 - keep the real ids.
shown_topics_2_1 = model_2_1.print_topics()

# Filtering for words: pull the quoted terms out of each formatted string
words_2_1 = [re.findall(r'"([^"]*)"', topic_str) for _, topic_str in shown_topics_2_1]

# Create Topics: up to 10 words per topic joined into one string
topics_2_1 = [' '.join(words[0:10]) for words in words_2_1]

# Print every topic under the id gensim reports for it, not its list position
# (this also avoids rebinding the builtin `id` in the notebook namespace)
for (topic_id, _topic_str), topic in zip(shown_topics_2_1, topics_2_1):
    print(f"------ Topic {topic_id} ------")
    print(topic, end="\n\n")


In [None]:
# log_perplexity() returns gensim's per-word likelihood bound for the corpus
# (perplexity itself is 2 ** -bound), so values closer to zero are better.
model_2_1_perplexity = model_2_1.log_perplexity(corpus)
print('\nPerplexity: ', model_2_1_perplexity)

# c_v coherence of model 2.1, evaluated against df['lemma_tokens']
coherence_model_2_1 = CoherenceModel(
    model=model_2_1,
    texts=df['lemma_tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_model_2_1 = coherence_model_2_1.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_2_1)

In [None]:
# Topic distance visualization for model 2.1; the prepared pyLDAvis object is
# the cell's last expression, so it is rendered as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_2_1, corpus, id2word)


#####2.2 Passes = 20

In [None]:
# Model 2.2: same settings as 2.1 except passes = 20; timed with time.time()
model_2_2_start_time = time.time()

model_2_2 = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=68,
    random_state=42,
    chunksize=2000,
    passes=20,
)

model_2_2_end_time = time.time()

In [None]:
# Wall-clock training time of model 2.2 in seconds, rounded to two decimals
model_2_2_runtime = round(model_2_2_end_time - model_2_2_start_time, 2)
print(model_2_2_runtime, "seconds to finish")


In [None]:
# print_topics() returns (topic_id, formatted_string) pairs for a subset of
# the topics (20 by default), not necessarily topics 0..19 - keep the real ids.
shown_topics_2_2 = model_2_2.print_topics()

# Filtering for words: pull the quoted terms out of each formatted string
words_2_2 = [re.findall(r'"([^"]*)"', topic_str) for _, topic_str in shown_topics_2_2]

# Create Topics: up to 10 words per topic joined into one string
topics_2_2 = [' '.join(words[0:10]) for words in words_2_2]

# Print every topic under the id gensim reports for it, not its list position
# (this also avoids rebinding the builtin `id` in the notebook namespace)
for (topic_id, _topic_str), topic in zip(shown_topics_2_2, topics_2_2):
    print(f"------ Topic {topic_id} ------")
    print(topic, end="\n\n")


In [None]:
# log_perplexity() returns gensim's per-word likelihood bound for the corpus
# (perplexity itself is 2 ** -bound), so values closer to zero are better.
model_2_2_perplexity = model_2_2.log_perplexity(corpus)
print('\nPerplexity: ', model_2_2_perplexity)

# c_v coherence of model 2.2, evaluated against df['lemma_tokens']
coherence_model_2_2 = CoherenceModel(
    model=model_2_2,
    texts=df['lemma_tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_model_2_2 = coherence_model_2_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_2_2)

In [None]:
# Topic distance visualization for model 2.2; the prepared pyLDAvis object is
# the cell's last expression, so it is rendered as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_2_2, corpus, id2word)


#####2.3 Passes = 25

In [None]:
# Model 2.3: same settings as 2.2 except passes = 25; timed with time.time()
model_2_3_start_time = time.time()

model_2_3 = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=68,
    random_state=42,
    chunksize=2000,
    passes=25,
)

model_2_3_end_time = time.time()

In [None]:
# Wall-clock training time of model 2.3 in seconds, rounded to two decimals
model_2_3_runtime = round(model_2_3_end_time - model_2_3_start_time, 2)
print(model_2_3_runtime, "seconds to finish")


In [None]:
# print_topics() returns (topic_id, formatted_string) pairs for a subset of
# the topics (20 by default), not necessarily topics 0..19 - keep the real ids.
shown_topics_2_3 = model_2_3.print_topics()

# Filtering for words: pull the quoted terms out of each formatted string
words_2_3 = [re.findall(r'"([^"]*)"', topic_str) for _, topic_str in shown_topics_2_3]

# Create Topics: up to 10 words per topic joined into one string
topics_2_3 = [' '.join(words[0:10]) for words in words_2_3]

# Print every topic under the id gensim reports for it, not its list position
# (this also avoids rebinding the builtin `id` in the notebook namespace)
for (topic_id, _topic_str), topic in zip(shown_topics_2_3, topics_2_3):
    print(f"------ Topic {topic_id} ------")
    print(topic, end="\n\n")


In [None]:
# log_perplexity() returns gensim's per-word likelihood bound for the corpus
# (perplexity itself is 2 ** -bound), so values closer to zero are better.
model_2_3_perplexity = model_2_3.log_perplexity(corpus)
print('\nPerplexity: ', model_2_3_perplexity)

# c_v coherence of model 2.3, evaluated against df['lemma_tokens']
coherence_model_2_3 = CoherenceModel(
    model=model_2_3,
    texts=df['lemma_tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_model_2_3 = coherence_model_2_3.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_2_3)

In [None]:
# Topic distance visualization for model 2.3; the prepared pyLDAvis object is
# the cell's last expression, so it is rendered as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_2_3, corpus, id2word)


#####Track Sheet

In [None]:
# Let's keep track of our progress

# Model 2.0 reuses model 1.9 from iteration 1 as the baseline.
# 'passes' is read from the trained models instead of being typed in by hand,
# so the sheet always reports what each model was actually trained with
# (consistent with the other track sheets in this notebook).
passes_ts = {'model_iteration': [2, 2, 2, 2],
             'model': [2.0, 2.1, 2.2, 2.3],
             'runtime_seconds': [model_1_9_runtime, model_2_1_runtime,
                                 model_2_2_runtime, model_2_3_runtime],
             'coherence_score': [coherence_lda_model_1_9, coherence_lda_model_2_1,
                                 coherence_lda_model_2_2, coherence_lda_model_2_3],
             'perplexity': [model_1_9_perplexity, model_2_1_perplexity,
                            model_2_2_perplexity, model_2_3_perplexity],
             'number_of_topics': [model_1_9.num_topics, model_2_1.num_topics,
                                  model_2_2.num_topics, model_2_3.num_topics],
             'passes': [model_1_9.passes, model_2_1.passes,
                        model_2_2.passes, model_2_3.passes]}

passes_track_sheet = pd.DataFrame(data=passes_ts)

passes_track_sheet

In [None]:
# Coherence as a function of the number of passes (model iteration 2);
# each point's hover title is the value of the perplexity column.
passes_fig = px.line(
    passes_track_sheet,
    x="passes",
    y="coherence_score",
    hover_name='perplexity',
)
passes_fig.show()


In [None]:
# Upload the passes chart to Chart Studio so it can be embedded in the
# documentation; the call's return value is shown as the cell output.
py.plot(passes_fig, filename = 'passes_track_chart', auto_open=True)


####Model iteration 3 (Alpha)

#####3.0 alpha = symmetric
*Refer to model 2.3 passes = 25*

#####3.1 alpha = asymmetric

In [None]:
# passes = 25 was chosen based on the coherence and perplexity scores;
# per the experiment notes, going above 25 would not improve them significantly.
# Now change alpha to assess whether coherence improves.
# alpha = asymmetric
model_3_1_start_time = time.time()

model_3_1 = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=68,
    random_state=42,
    chunksize=2000,
    passes=25,
    alpha='asymmetric',
)

model_3_1_end_time = time.time()

In [None]:
# Wall-clock training time of model 3.1 in seconds, rounded to two decimals
model_3_1_runtime = round(model_3_1_end_time - model_3_1_start_time, 2)
print(model_3_1_runtime, "seconds to finish")


In [None]:
# print_topics() returns (topic_id, formatted_string) pairs for a subset of
# the topics (20 by default), not necessarily topics 0..19 - keep the real ids.
shown_topics_3_1 = model_3_1.print_topics()

# Filtering for words: pull the quoted terms out of each formatted string
words_3_1 = [re.findall(r'"([^"]*)"', topic_str) for _, topic_str in shown_topics_3_1]

# Create Topics: up to 10 words per topic joined into one string
topics_3_1 = [' '.join(words[0:10]) for words in words_3_1]

# Print every topic under the id gensim reports for it, not its list position
# (this also avoids rebinding the builtin `id` in the notebook namespace)
for (topic_id, _topic_str), topic in zip(shown_topics_3_1, topics_3_1):
    print(f"------ Topic {topic_id} ------")
    print(topic, end="\n\n")

    

In [None]:
# log_perplexity() returns gensim's per-word likelihood bound for the corpus
# (perplexity itself is 2 ** -bound), so values closer to zero are better.
model_3_1_perplexity = model_3_1.log_perplexity(corpus)
print('\nPerplexity: ', model_3_1_perplexity)

# c_v coherence of model 3.1, evaluated against df['lemma_tokens']
coherence_model_3_1 = CoherenceModel(
    model=model_3_1,
    texts=df['lemma_tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_model_3_1 = coherence_model_3_1.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_3_1)

In [None]:
# Topic distance visualization for model 3.1; the prepared pyLDAvis object is
# the cell's last expression, so it is rendered as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_3_1, corpus, id2word)


#####Track Sheet

In [None]:
# Let's keep track of our progress

# Model 3.0 reuses model 2.3 (trained without an explicit alpha) as baseline.
# 'passes' is read from the trained models, like the other track sheets do;
# 'alpha' is recorded as the label of the setting used for each model.
alpha_ts = {'model_iteration': [3, 3],
            'model': [3.0, 3.1],
            'runtime_seconds': [model_2_3_runtime, model_3_1_runtime],
            'coherence_score': [coherence_lda_model_2_3, coherence_lda_model_3_1],
            'perplexity': [model_2_3_perplexity, model_3_1_perplexity],
            'number_of_topics': [model_2_3.num_topics, model_3_1.num_topics],
            'passes': [model_2_3.passes, model_3_1.passes],
            'alpha': ['symmetric', 'asymmetric']}

alpha_track_sheet = pd.DataFrame(data=alpha_ts)

alpha_track_sheet


####Model iteration 4 (Decay)

#####4.0 Decay = 0.5
*Refer to 2.3*

#####4.1 Decay = 0.7

In [None]:
# Model 4.1: settings of model 2.3 with decay = 0.7; timed with time.time()
model_4_1_start_time = time.time()

model_4_1 = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=68,
    random_state=42,
    chunksize=2000,
    passes=25,
    decay=0.7,
)

model_4_1_end_time = time.time()

In [None]:
# Wall-clock training time of model 4.1 in seconds, rounded to two decimals
model_4_1_runtime = round(model_4_1_end_time - model_4_1_start_time, 2)
print(model_4_1_runtime, "seconds to finish")


In [None]:
# print_topics() returns (topic_id, formatted_string) pairs for a subset of
# the topics (20 by default), not necessarily topics 0..19 - keep the real ids.
shown_topics_4_1 = model_4_1.print_topics()

# Filtering for words: pull the quoted terms out of each formatted string
words_4_1 = [re.findall(r'"([^"]*)"', topic_str) for _, topic_str in shown_topics_4_1]

# Create Topics: up to 10 words per topic joined into one string
topics_4_1 = [' '.join(words[0:10]) for words in words_4_1]

# Print every topic under the id gensim reports for it, not its list position
# (this also avoids rebinding the builtin `id` in the notebook namespace)
for (topic_id, _topic_str), topic in zip(shown_topics_4_1, topics_4_1):
    print(f"------ Topic {topic_id} ------")
    print(topic, end="\n\n")


In [None]:
# log_perplexity() returns gensim's per-word likelihood bound for the corpus
# (perplexity itself is 2 ** -bound), so values closer to zero are better.
model_4_1_perplexity = model_4_1.log_perplexity(corpus)
print('\nPerplexity: ', model_4_1_perplexity)

# c_v coherence of model 4.1, evaluated against df['lemma_tokens']
coherence_model_4_1 = CoherenceModel(
    model=model_4_1,
    texts=df['lemma_tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_model_4_1 = coherence_model_4_1.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_4_1)

In [None]:
# Topic distance visualization for model 4.1; the prepared pyLDAvis object is
# the cell's last expression, so it is rendered as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_4_1, corpus, id2word)


#####4.2 Decay = 0.9

In [None]:
# Model 4.2: settings of model 2.3 with decay = 0.9; timed with time.time()
model_4_2_start_time = time.time()

model_4_2 = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=68,
    random_state=42,
    chunksize=2000,
    passes=25,
    decay=0.9,
)

model_4_2_end_time = time.time()

In [None]:
# Wall-clock training time of model 4.2 in seconds, rounded to two decimals
model_4_2_runtime = round(model_4_2_end_time - model_4_2_start_time, 2)
print(model_4_2_runtime, "seconds to finish")


In [None]:
# print_topics() returns (topic_id, formatted_string) pairs for a subset of
# the topics (20 by default), not necessarily topics 0..19 - keep the real ids.
shown_topics_4_2 = model_4_2.print_topics()

# Filtering for words: pull the quoted terms out of each formatted string
words_4_2 = [re.findall(r'"([^"]*)"', topic_str) for _, topic_str in shown_topics_4_2]

# Create Topics: up to 10 words per topic joined into one string
topics_4_2 = [' '.join(words[0:10]) for words in words_4_2]

# Print every topic under the id gensim reports for it, not its list position
# (this also avoids rebinding the builtin `id` in the notebook namespace)
for (topic_id, _topic_str), topic in zip(shown_topics_4_2, topics_4_2):
    print(f"------ Topic {topic_id} ------")
    print(topic, end="\n\n")


In [None]:
# log_perplexity() returns gensim's per-word likelihood bound for the corpus
# (perplexity itself is 2 ** -bound), so values closer to zero are better.
model_4_2_perplexity = model_4_2.log_perplexity(corpus)
print('\nPerplexity: ', model_4_2_perplexity)

# c_v coherence of model 4.2, evaluated against df['lemma_tokens']
coherence_model_4_2 = CoherenceModel(
    model=model_4_2,
    texts=df['lemma_tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_model_4_2 = coherence_model_4_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_4_2)

In [None]:
# Topic distance visualization for model 4.2; the prepared pyLDAvis object is
# the cell's last expression, so it is rendered as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_4_2, corpus, id2word)


#####Track Sheet

In [None]:
# Track sheet for model iteration 4 (decay); model 4.0 is model 2.3 reused
# as the baseline. Model attributes are read straight from the trained models.
decay_ts = {
    'model_iteration': [4, 4, 4],
    'model': [4.0, 4.1, 4.2],
    'runtime_seconds': [model_2_3_runtime, model_4_1_runtime, model_4_2_runtime],
    'coherence_score': [
        coherence_lda_model_2_3,
        coherence_lda_model_4_1,
        coherence_lda_model_4_2,
    ],
    'perplexity': [model_2_3_perplexity, model_4_1_perplexity, model_4_2_perplexity],
    'number_of_topics': [m.num_topics for m in (model_2_3, model_4_1, model_4_2)],
    'passes': [m.passes for m in (model_2_3, model_4_1, model_4_2)],
    'alpha': ['symmetric'] * 3,
    'decay': [m.decay for m in (model_2_3, model_4_1, model_4_2)],
}

decay_track_sheet = pd.DataFrame(data=decay_ts)

decay_track_sheet


In [None]:
# Coherence as a function of decay (model iteration 4);
# each point's hover title is the value of the perplexity column.
decay_fig = px.line(
    decay_track_sheet,
    x="decay",
    y="coherence_score",
    hover_name='perplexity',
)
decay_fig.show()


In [None]:
# Upload the decay chart to Chart Studio so it can be embedded in the
# documentation; the call's return value is shown as the cell output.
py.plot(decay_fig, filename = 'decay_track_chart', auto_open=True)


####Model iteration 5 (iterations)

#####5.0 iterations = 50 
*refer to 2.3*

#####5.1 iterations = 60

In [None]:
# Model 5.1: decay kept at 0.5, iterations raised to 60; timed with time.time()
model_5_1_start_time = time.time()

model_5_1 = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=68,
    random_state=42,
    chunksize=2000,
    passes=25,
    decay=0.5,
    iterations=60,
)

model_5_1_end_time = time.time()

In [None]:
# Wall-clock training time of model 5.1 in seconds, rounded to two decimals
model_5_1_runtime = round(model_5_1_end_time - model_5_1_start_time, 2)
print(model_5_1_runtime, "seconds to finish")


In [None]:
# print_topics() returns (topic_id, formatted_string) pairs for a subset of
# the topics (20 by default), not necessarily topics 0..19 - keep the real ids.
shown_topics_5_1 = model_5_1.print_topics()

# Filtering for words: pull the quoted terms out of each formatted string
words_5_1 = [re.findall(r'"([^"]*)"', topic_str) for _, topic_str in shown_topics_5_1]

# Create Topics: up to 10 words per topic joined into one string
topics_5_1 = [' '.join(words[0:10]) for words in words_5_1]

# Print every topic under the id gensim reports for it, not its list position
# (this also avoids rebinding the builtin `id` in the notebook namespace)
for (topic_id, _topic_str), topic in zip(shown_topics_5_1, topics_5_1):
    print(f"------ Topic {topic_id} ------")
    print(topic, end="\n\n")


In [None]:
# log_perplexity() returns gensim's per-word likelihood bound for the corpus
# (perplexity itself is 2 ** -bound), so values closer to zero are better.
model_5_1_perplexity = model_5_1.log_perplexity(corpus)
print('\nPerplexity: ', model_5_1_perplexity)

# c_v coherence of model 5.1, evaluated against df['lemma_tokens']
coherence_model_5_1 = CoherenceModel(
    model=model_5_1,
    texts=df['lemma_tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_model_5_1 = coherence_model_5_1.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_5_1)

In [None]:
# Topic distance visualization for model 5.1; the prepared pyLDAvis object is
# the cell's last expression, so it is rendered as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_5_1, corpus, id2word)


#####5.2 iterations = 70

In [None]:
# Model 5.2: decay kept at 0.5, iterations raised to 70; timed with time.time()
model_5_2_start_time = time.time()

model_5_2 = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=68,
    random_state=42,
    chunksize=2000,
    passes=25,
    decay=0.5,
    iterations=70,
)

model_5_2_end_time = time.time()

In [None]:
# Wall-clock training time of model 5.2 in seconds, rounded to two decimals
model_5_2_runtime = round(model_5_2_end_time - model_5_2_start_time, 2)
print(model_5_2_runtime, "seconds to finish")


In [None]:
# print_topics() returns (topic_id, formatted_string) pairs for a subset of
# the topics (20 by default), not necessarily topics 0..19 - keep the real ids.
shown_topics_5_2 = model_5_2.print_topics()

# Filtering for words: pull the quoted terms out of each formatted string
words_5_2 = [re.findall(r'"([^"]*)"', topic_str) for _, topic_str in shown_topics_5_2]

# Create Topics: up to 10 words per topic joined into one string
topics_5_2 = [' '.join(words[0:10]) for words in words_5_2]

# Print every topic under the id gensim reports for it, not its list position
# (this also avoids rebinding the builtin `id` in the notebook namespace)
for (topic_id, _topic_str), topic in zip(shown_topics_5_2, topics_5_2):
    print(f"------ Topic {topic_id} ------")
    print(topic, end="\n\n")


In [None]:
# log_perplexity() returns gensim's per-word likelihood bound for the corpus
# (perplexity itself is 2 ** -bound), so values closer to zero are better.
model_5_2_perplexity = model_5_2.log_perplexity(corpus)
print('\nPerplexity: ', model_5_2_perplexity)

# c_v coherence of model 5.2, evaluated against df['lemma_tokens']
coherence_model_5_2 = CoherenceModel(
    model=model_5_2,
    texts=df['lemma_tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_model_5_2 = coherence_model_5_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_5_2)

In [None]:
# Topic distance visualization for model 5.2; the prepared pyLDAvis object is
# the cell's last expression, so it is rendered as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_5_2, corpus, id2word)


#####5.3 iterations = 70-140 (step 10; the limit of 150 is exclusive)

In [None]:
# Defining a function to loop over `iterations` values to find an optimal number of iterations
def compute_coherence_values_1(dictionary, corpus, texts, limit, start=70, step=10):
    """
    Compute c_v coherence for LDA models trained with various numbers of iterations.

    One LdaMulticore model is trained per value in range(start, limit, step);
    every other hyper-parameter is fixed (68 topics, 25 passes, decay 0.5,
    chunksize 2000, random_state 42).

    Parameters:
    ----------
    dictionary : Gensim dictionary, used both to train each model (id2word)
        and to score its coherence
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of iterations (exclusive upper bound of the range)
    start : First number of iterations to try
    step : Increment between consecutive numbers of iterations

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the
    LDA model with respective number of iterations
    """
    coherence_values_its = []
    model_list_its = []
    for iterations in range(start, limit, step):
        # Train with the dictionary passed in by the caller; the previous code
        # silently used the notebook-global id2word and ignored this argument.
        model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             num_topics=68,
                             random_state=42,
                             chunksize=2000,
                             passes=25,
                             decay=0.5,
                             iterations=iterations)
        model_list_its.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_its.append(coherencemodel.get_coherence())

    return model_list_its, coherence_values_its

In [None]:
# Can take a long time to run: one model is trained per iterations value
# in range(70, 150, 10). The whole sweep is timed with time.time().
model_5_3_start_time = time.time()

model_list_its, coherence_values_its = compute_coherence_values_1(
    dictionary=id2word,
    corpus=corpus,
    texts=df['lemma_tokens'],
    limit=150,
    start=70,
    step=10,
)
model_5_3_end_time = time.time()

In [None]:
# Wall-clock time of the whole iterations sweep (5.3) in seconds,
# rounded to two decimals
model_5_3_runtime = round(model_5_3_end_time - model_5_3_start_time, 2)
print(model_5_3_runtime)
      

In [None]:
# Iteration counts tried in the sweep; these must mirror the start/limit/step
# values passed to compute_coherence_values_1 so both columns line up.
start = 70
limit = 150
step = 10
x_2 = range(start, limit, step)

its_ts = {
    'coherence_value': coherence_values_its,
    'number_of_iterations': x_2,
}

its_track_sheet = pd.DataFrame(data=its_ts)

# Coherence as a function of the number of iterations across the sweep
its_fig = px.line(its_track_sheet, x="number_of_iterations", y="coherence_value")
its_fig.show()


In [None]:
# Saving the iterations sweep chart on Chart Studio to be used in documentation.
# It gets its own filename: 'iterations_track_chart' is also used for the
# iteration-5 track sheet chart, so the two uploads would replace each other.
py.plot(its_fig, filename = 'iterations_sweep_chart', auto_open=True)


#####Track Sheet

In [None]:
# Track sheet for model iteration 5 (iterations); model 5.0 is model 2.3
# reused as the baseline. Model attributes are read from the trained models.
iterations_ts = {
    'model_iteration': [5, 5, 5],
    'model': [5.0, 5.1, 5.2],
    'runtime_seconds': [model_2_3_runtime, model_5_1_runtime, model_5_2_runtime],
    'coherence_score': [
        coherence_lda_model_2_3,
        coherence_lda_model_5_1,
        coherence_lda_model_5_2,
    ],
    'perplexity': [model_2_3_perplexity, model_5_1_perplexity, model_5_2_perplexity],
    'number_of_topics': [m.num_topics for m in (model_2_3, model_5_1, model_5_2)],
    'passes': [m.passes for m in (model_2_3, model_5_1, model_5_2)],
    'alpha': ['symmetric'] * 3,
    'decay': [m.decay for m in (model_2_3, model_5_1, model_5_2)],
    'iterations': [m.iterations for m in (model_2_3, model_5_1, model_5_2)],
}

iterations_track_sheet = pd.DataFrame(data=iterations_ts)

iterations_track_sheet


In [None]:
# Coherence as a function of iterations (model iteration 5);
# each point's hover title is the value of the perplexity column.
iterations_fig = px.line(
    iterations_track_sheet,
    x="iterations",
    y="coherence_score",
    hover_name='perplexity',
)
iterations_fig.show()

In [None]:
# Upload the iteration-5 track sheet chart to Chart Studio so it can be
# embedded in the documentation.
# NOTE(review): the sweep chart (its_fig) is uploaded under this same filename
# earlier in the notebook - confirm which chart should keep the name.
py.plot(iterations_fig, filename = 'iterations_track_chart', auto_open=True)


####Model iteration 6 (minimum_probability)
*minimum_probability – Topics with a probability lower than this threshold will be filtered out.*

*eval_every – Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.*

#####6.0 minimum_probability = 0.01
*Refer to 5.2*

#####6.1 minimum_probability = 0.05


In [None]:
# Model 6.1: settings of model 5.2 with minimum_probability = 0.05.
# NOTE(review): gensim documents minimum_probability as a filter on returned
# topic probabilities - confirm it can affect training or coherence at all.
model_6_1_start_time = time.time()

model_6_1 = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=68,
    random_state=42,
    chunksize=2000,
    passes=25,
    decay=0.5,
    iterations=70,
    minimum_probability=0.05,
)

model_6_1_end_time = time.time()

In [None]:
# Wall-clock training time of model 6.1 in seconds, rounded to two decimals
model_6_1_runtime = round(model_6_1_end_time - model_6_1_start_time, 2)
print(model_6_1_runtime, "seconds to finish")


In [None]:
# print_topics() returns (topic_id, formatted_string) pairs for a subset of
# the topics (20 by default), not necessarily topics 0..19 - keep the real ids.
shown_topics_6_1 = model_6_1.print_topics()

# Filtering for words: pull the quoted terms out of each formatted string
words_6_1 = [re.findall(r'"([^"]*)"', topic_str) for _, topic_str in shown_topics_6_1]

# Create Topics: up to 10 words per topic joined into one string
topics_6_1 = [' '.join(words[0:10]) for words in words_6_1]

# Print every topic under the id gensim reports for it, not its list position
# (this also avoids rebinding the builtin `id` in the notebook namespace)
for (topic_id, _topic_str), topic in zip(shown_topics_6_1, topics_6_1):
    print(f"------ Topic {topic_id} ------")
    print(topic, end="\n\n")
 

In [None]:
# log_perplexity() returns gensim's per-word likelihood bound for the corpus
# (perplexity itself is 2 ** -bound), so values closer to zero are better.
model_6_1_perplexity = model_6_1.log_perplexity(corpus)
print('\nPerplexity: ', model_6_1_perplexity)

# c_v coherence of model 6.1, evaluated against df['lemma_tokens']
coherence_model_6_1 = CoherenceModel(
    model=model_6_1,
    texts=df['lemma_tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_model_6_1 = coherence_model_6_1.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_6_1)

In [None]:
# Topic distance visualization for model 6.1; the prepared pyLDAvis object is
# the cell's last expression, so it is rendered as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_6_1, corpus, id2word)


#####6.2 minimum_probability = 0.1


In [None]:
# Model 6.2: settings of model 5.2 with minimum_probability = 0.1;
# timed with time.time()
model_6_2_start_time = time.time()

model_6_2 = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=68,
    random_state=42,
    chunksize=2000,
    passes=25,
    decay=0.5,
    iterations=70,
    minimum_probability=0.1,
)

model_6_2_end_time = time.time()

In [None]:
# Wall-clock training time of model 6.2 in seconds, rounded to two decimals
model_6_2_runtime = round(model_6_2_end_time - model_6_2_start_time, 2)
print(model_6_2_runtime, "seconds to finish")


In [None]:
# print_topics() returns (topic_id, formatted_string) pairs for a subset of
# the topics (20 by default), not necessarily topics 0..19 - keep the real ids.
shown_topics_6_2 = model_6_2.print_topics()

# Filtering for words: pull the quoted terms out of each formatted string
words_6_2 = [re.findall(r'"([^"]*)"', topic_str) for _, topic_str in shown_topics_6_2]

# Create Topics: up to 10 words per topic joined into one string
topics_6_2 = [' '.join(words[0:10]) for words in words_6_2]

# Print every topic under the id gensim reports for it, not its list position
# (this also avoids rebinding the builtin `id` in the notebook namespace)
for (topic_id, _topic_str), topic in zip(shown_topics_6_2, topics_6_2):
    print(f"------ Topic {topic_id} ------")
    print(topic, end="\n\n")

  

In [None]:
# log_perplexity() returns gensim's per-word likelihood bound for the corpus
# (perplexity itself is 2 ** -bound), so values closer to zero are better.
model_6_2_perplexity = model_6_2.log_perplexity(corpus)
print('\nPerplexity: ', model_6_2_perplexity)

# c_v coherence of model 6.2, evaluated against df['lemma_tokens']
coherence_model_6_2 = CoherenceModel(
    model=model_6_2,
    texts=df['lemma_tokens'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda_model_6_2 = coherence_model_6_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_6_2)

In [None]:
# Topic distance visualization for model 6.2; the prepared pyLDAvis object is
# the cell's last expression, so it is rendered as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_6_2, corpus, id2word)


#####Track Sheet

In [None]:
# Track sheet for model iteration 6 (minimum_probability); model 6.0 is
# model 5.2 reused as the baseline. Attributes come from the trained models.
minimum_probability_ts = {
    'model_iteration': [6, 6, 6],
    'model': [6.0, 6.1, 6.2],
    'runtime_seconds': [model_5_2_runtime, model_6_1_runtime, model_6_2_runtime],
    'coherence_score': [
        coherence_lda_model_5_2,
        coherence_lda_model_6_1,
        coherence_lda_model_6_2,
    ],
    'perplexity': [model_5_2_perplexity, model_6_1_perplexity, model_6_2_perplexity],
    'number_of_topics': [m.num_topics for m in (model_5_2, model_6_1, model_6_2)],
    'passes': [m.passes for m in (model_5_2, model_6_1, model_6_2)],
    'alpha': ['symmetric'] * 3,
    'decay': [m.decay for m in (model_5_2, model_6_1, model_6_2)],
    'iterations': [m.iterations for m in (model_5_2, model_6_1, model_6_2)],
    'minimum_probability': [
        m.minimum_probability for m in (model_5_2, model_6_1, model_6_2)
    ],
}

minimum_probability_track_sheet = pd.DataFrame(data=minimum_probability_ts)

minimum_probability_track_sheet


In [None]:
# Coherence as a function of minimum_probability (model iteration 6);
# each point's hover title is the value of the perplexity column.
minimum_probability_fig = px.line(
    minimum_probability_track_sheet,
    x="minimum_probability",
    y="coherence_score",
    hover_name='perplexity',
)
minimum_probability_fig.show()


In [None]:
# Upload the minimum_probability chart to Chart Studio so it can be embedded
# in the documentation; the call's return value is shown as the cell output.
py.plot(minimum_probability_fig, filename = 'minproba_track_chart', auto_open=True)


In [None]:
# Elapsed wall-clock time since start_of_notebook_time was taken.
# NOTE(review): cells located after this one are not included in the total.
end_of_notebook_time = time.time()
total_notebook_time_seconds = end_of_notebook_time - start_of_notebook_time
total_running_time_of_notebook_minutes = total_notebook_time_seconds / 60
print('Duration for the entire notebook to run: {} seconds.'.format(total_notebook_time_seconds))
print(f'Which is {total_running_time_of_notebook_minutes} minutes.')


####Gensim Mallet 🧛

In [None]:
 # Upload the Mallet archive through Colab's file picker (the next cell
 # expects a file named mallet.zip in the working directory).
 # NOTE(review): google.colab is only importable on Colab; elsewhere, place
 # mallet.zip in the working directory and skip this cell.
 from google.colab import files 
 uploaded=files.upload()
 

In [None]:
!unzip mallet.zip

In [None]:
# lemmas_df = df['lemmas_back_to_text']
# vectorizer = CountVectorizer()
# data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])


In [None]:
# NOTE: gensim.models.wrappers (and LdaMallet with it) was removed in
# gensim 4.0, so this import needs a gensim 3.x installation.
from gensim.models.wrappers import LdaMallet
import os

# Location of the Mallet launcher script, relative to the working directory
# (presumably where mallet.zip was extracted - verify the folder name).
# The earlier `mallet_path = !/content/mallet/binn/mallett` line was dropped:
# it tried to execute a misspelled path and its result was overwritten here.
mallet_path = r'mallet-2.0.8/bin/mallet'

# Stop with a real error if the binary is missing; sys.exit() in a notebook
# reports a "successful" exit and hides the cause of later failures.
if not os.path.exists(mallet_path):
    raise FileNotFoundError('{} not found'.format(mallet_path))

In [None]:
# Train a Mallet LDA model with 20 topics on the same corpus and dictionary.
# NOTE(review): the tuned gensim models in this notebook use num_topics=68;
# confirm that 20 is the intended value here.
ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

####Progress Tracksheet 🧑‍🔬

In [None]:
# Stack the per-iteration track sheets into one table with a fresh index
track_sheet = pd.concat([topic_track_sheet, passes_track_sheet, alpha_track_sheet,
                         decay_track_sheet, iterations_track_sheet,
                         minimum_probability_track_sheet], ignore_index=True)

# Sheets from earlier iterations do not have these columns, so after the
# concat they hold NaN there; fill each gap with the next available value.
# .bfill() replaces fillna(method='backfill'), which newer pandas deprecates.
backfilled_columns = ['alpha', 'decay', 'minimum_probability', 'iterations']
track_sheet[backfilled_columns] = track_sheet[backfilled_columns].bfill()

# Spelled-out iteration label; values outside 1..6 are left unchanged
track_sheet['model_iteration_str'] = track_sheet['model_iteration'].replace(
    {1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six'})

track_sheet

In [None]:
# Coherence of every tracked model, ordered along the model number axis;
# each point's hover title is the value of the perplexity column.
track_sheet_fig = px.line(
    track_sheet,
    x="model",
    y="coherence_score",
    hover_name='perplexity',
)
track_sheet_fig.show()

In [None]:
# Saving the overall track sheet chart on Chart Studio to be used in
# documentation. Upload the figure (track_sheet_fig), not the DataFrame
# it was built from: py.plot expects a figure or plotly data.
py.plot(track_sheet_fig, filename = 'track_chart', auto_open=True)


In [None]:
# Coherence per model as a scatter: color encodes the model iteration and
# marker size the training runtime; marginal plots show the distributions.
fig1 = px.scatter(
    track_sheet,
    x="model",
    y="coherence_score",
    color='model_iteration',
    size='runtime_seconds',
    marginal_x='violin',
    marginal_y='histogram',
)

fig1.show()


In [None]:
# Perplexity column against coherence: color encodes the model iteration and
# marker size the training runtime; marginal plots show the distributions.
fig2 = px.scatter(
    track_sheet,
    x="coherence_score",
    y="perplexity",
    color='model_iteration',
    size='runtime_seconds',
    marginal_x='violin',
    marginal_y='histogram',
)

fig2.show()