In [1]:
# Required Libraries

#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim_models
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)

2023-01-05 11:48:28.192280: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [24]:
import os
print(os.getcwd())
os.chdir('/Users/davidchu/Desktop/Carleton/Courses/Comps/CS Comps/carleton-cscomps-dataviz-FW22/Tweets')

/Users/davidchu/Desktop/Carleton/Courses/Comps/CS Comps/carleton-cscomps-dataviz-FW22/Tweets


In [3]:
id = "tedcruz"
df = pd.read_csv("./Data/"+id+"_tweets.csv")

In [4]:
def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
# df['emoji_free_tweets'] = df['text'].apply(call_emoji_free)

#Create a new column with url free tweets
df['url_free_tweets'] = df['text'].apply(url_free_text)

In [5]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab
nlp = spacy.load('en_core_web_lg')

In [6]:
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)


# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


tokens = []

for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in ALL_STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

In [7]:
# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

In [8]:
# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # removing hashtags and mentions
    tokens = re.sub(r"@[A-Za-z0-9_]+","", text)
    tokens = re.sub(r"#[A-Za-z0-9_]+","", tokens)
    # removing numbers
    tokens = re.sub(r"\b\d+\b","", tokens)
    # removing &amp
    tokens = re.sub(r"&amp|amp", "", tokens)
    # removing urls
    tokens = re.sub(r"http\S+", "", tokens)
    tokens = re.sub(r"www.\S+", "", tokens)
    # removing punctuations 
    tokens = re.sub(r'[()!?]', ' ', tokens)
    tokens = re.sub(r'\[.*?\]',' ', tokens)
    # removing non-alphanumeric characters
    tokens = re.sub(r'\W+',' ', tokens)
    # make text lowercase and split it
    tokens = tokens.lower().split() 
    # removing stop words
    stop_words = open("./data/stopwords.txt",'r').read().splitlines()
    stop_words = [word.strip() for word in stop_words]
    tokens = [w for w in tokens if not w in stop_words]

    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

In [59]:
# Create a id2word dictionary
id2word = Dictionary(df['lemma_tokens'])
print(len(id2word))
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
# Instantiating a Base LDA model 
base_model = LdaMulticore(corpus=corpus, 
                          num_topics=10, 
                          id2word=id2word, 
                          workers=12, 
                          passes=5, 
                          decay = 0.7)
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]
# Create Topics
topics = [' '.join(t[0:15]) for t in words]
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

3867
1975
------ Topic 0 ------
verdict episode democrat youtube people vote dem podcast spotify kid

------ Topic 1 ------
democrats day american house america white friend time yes inflation

------ Topic 2 ------
vote school democrats woman police man border friend know today

------ Topic 3 ------
verdict fact episode border people say youtube today democrat american

------ Topic 4 ------
verdict episode podcast apple spotify late youtube miss iheart people

------ Topic 5 ------
need court verdict right corrupt supreme good law world wow

------ Topic 6 ------
new book justice leave corrupt democrat order today powerful legal

------ Topic 7 ------
illegal border inflation immigrant alien vineyard martha democrat price tell

------ Topic 8 ------
verdict justice new illegal democrats right country border work fbi

------ Topic 9 ------
true need verdict america vote school today country friend new



In [62]:
# get top topics
topics = base_model.top_topics(corpus)
top_topics = [topics[i][0][0][1] for i in range(len(topics))]
np.unique(top_topics, return_counts=True)

(array(['democrats', 'illegal', 'need', 'new', 'true', 'verdict', 'vote'],
       dtype='<U9'),
 array([1, 1, 1, 1, 1, 4, 1]))

In [294]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -7.580846434116392

Coherence Score:  0.35844886165144707


In [295]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(base_model, corpus, id2word)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [None]:
# Things to filter: "amp, w/, blank space, w, +, "
# denser grid search
# learn about hyperparameters and evaluation metrics
# ask Eric about whether it is better to train the model on 1 set of data or multiple sets of data

In [22]:
df['lemmas_back_to_text_clean'] =  df['lemma_tokens'].transform(lambda x: ' '.join(x))

In [25]:
# HYPERPARAMETERS TUNING
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text_clean'])

# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=100, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001, 
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1, 
                                                 random_state=None,
                                                 topic_word_prior=None, 
                                                 total_samples=1000000.0, 
                                                 verbose=0),
             n_jobs=1,
             param_grid={'n_topics': [1, 3, 5, 7, 10, 12, 15, 20, 30], 
                         'learning_decay': [0.2, 0.35, 0.5, 0.7, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
             scoring=None, verbose=0)


GridSearchCV(error_score='raise',
             estimator=LatentDirichletAllocation(learning_method=None,
                                                 n_jobs=1),
             n_jobs=1,
             param_grid={'learning_decay': [0.2, 0.35, 0.5, 0.7, 0.9],
                         'n_topics': [1, 3, 5, 7, 10, 12, 15, 20, 30]},
             return_train_score='warn')

In [26]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -45977.037297281066
Model Perplexity:  2947.4065033993957


In [41]:
model.best_params_['n_components']

10

In [26]:
# Test BTM
import bitermplus as btm

texts = df['lemmas_back_to_text'].str.strip().tolist()

X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
biterms = btm.get_biterms(docs_vec)

# Initializing and running model
model = btm.BTM(
    X, vocabulary, seed=12321, T=8, M=20, alpha=50/8, beta=0.01)
model.fit(biterms, iterations=20)

100%|████████████████████████████████████████████| 20/20 [00:00<00:00, 113.86it/s]


['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_compute_p_wz',
 'alpha_',
 'beta_',
 'biterms_',
 'coherence_',
 'coherence_window_',
 'df_words_topics_',
 'fit',
 'fit_transform',
 'has_background_',
 'iterations_',
 'labels_',
 'matrix_docs_topics_',
 'matrix_topics_docs_',
 'matrix_topics_words_',
 'matrix_words_topics_',
 'perplexity_',
 'theta_',
 'topics_num_',
 'transform',
 'vocabulary_',
 'vocabulary_size_',
 'window_']

In [52]:
model.__doc__

'BTM(n_dw, vocabulary, int T, int M=20, double alpha=1., double beta=0.01, unsigned int seed=0, int win=15, bool has_background=False)\nBiterm Topic Model.\n\n    Parameters\n    ----------\n    n_dw : csr.csr_matrix\n        Documents vs words frequency matrix. Typically, it should be the output\n        of `CountVectorizer` from sklearn package.\n    vocabulary : list\n        Vocabulary (a list of words).\n    T : int\n        Number of topics.\n    M : int = 20\n        Number of top words for coherence calculation.\n    alpha : float = 1\n        Model parameter.\n    beta : float = 0.01\n        Model parameter.\n    seed : int = 0\n        Random state seed. If seed is equal to 0 (default),\n        use ``time(NULL)``.\n    win : int = 15\n        Biterms generation window.\n    has_background : bool = False\n        Use a background topic to accumulate highly frequent words.\n    '

In [28]:
p_zd = model.transform(docs_vec)

100%|█████████████████████████████████████| 2113/2113 [00:00<00:00, 131176.76it/s]


In [29]:
perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
# or
perplexity = model.perplexity_
coherence = model.coherence_

In [30]:
import tmplot as tmp

# Run the interactive report interface
tmp.report(model=model, docs=texts)

Run `pip install tomotopy` in the console.
  warn(
Run `pip install tomotopy` in the console.
  warn(
Run `pip install tomotopy` in the console.
  warn(
Run `pip install tomotopy` in the console.
  warn(


VBox(children=(VBox(children=(HBox(children=(HTML(value='<b>Select a topic</b>:'), Dropdown(options=((0, 0), (…

In [34]:
import pickle as pkl
import tmplot as tmp
import glob

# Loading saved models
models_files = sorted(glob.glob(r'results/model[0-9].pkl'))
models = []
for fn in models_files:
    file = open(fn, 'rb')
    models.append(pkl.load(file))
    file.close()

In [39]:
# Choosing reference model
np.random.seed(122334)
reference_model = np.random.randint(1, 6)

# Getting close topics
close_topics, close_kl = tmp.get_closest_topics(
    models, method="skzlb", ref=reference_model)

# Getting stable topics
stable_topics, stable_kl = tmp.get_stable_topics(
    close_topics, close_kl, ref=reference_model, thres=0.7)

# Stable topics indices list
print(stable_topics[:, reference_model])

IndexError: list index out of range