# Biterm Topic Model Implementation on SLO Twitter Dataset

### Joseph Jinn and Keith VanderLinden

### Import libraries and set parameters:

In [None]:
# Import libraries.
import logging as log
import warnings
import time
import pandas as pd
import numpy as np
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib import pyplot as plt
import seaborn as sns
import pyLDAvis
from biterm.cbtm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary  # helper functions

# Import custom utility functions.
# import topic_extraction_utility_functions as lda_util

#############################################################

# Pandas options: lift the display limits so wide/long dataframes are shown in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = 1000
# Pandas float precision display.
# Use the fully qualified option name: the bare 'precision' pattern is ambiguous on recent
# pandas releases (it also matches 'styler.format.precision') and raises an OptionError.
pd.options.display.precision = 12
# Seaborn setting.
sns.set()
# Don't output these types of warnings to terminal.
for ignored_warning_category in (FutureWarning, DeprecationWarning, UserWarning):
    warnings.simplefilter(action='ignore', category=ignored_warning_category)
# Matplotlib log settings.
mylog = log.getLogger("matplotlib")
mylog.setLevel(log.INFO)

# Turn debug log statements for various sections of code on/off.
# (adjust log level as necessary)
log.basicConfig(level=log.INFO)
# Passed positionally so the call does not depend on the keyword name of logging.disable().
log.disable(log.DEBUG)

### Pre-process and Post-process Tweets:

In [None]:
# Run the preprocessor twice over the same 100-example test subset, once per text column.
# NOTE(review): tweet_dataset_preprocessor is neither defined nor imported in the visible part of
# this file (the custom utility import near the top is commented out) - confirm where it comes from.
# The three arguments are presumably (input CSV, output CSV, column to process) - TODO confirm.
attribute_datasets_dir = "D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/"
test_subset_csv = attribute_datasets_dir + "twitter-dataset-7-10-19-test-subset-100-examples.csv"

# Tweet text column.
tweet_dataset_preprocessor(
    test_subset_csv,
    attribute_datasets_dir +
    "twitter-dataset-7-10-19-lda-ready-tweet-text-with-hashtags-excluded-created-7-17-19.csv",
    "text_derived")

# User description column.
tweet_dataset_preprocessor(
    test_subset_csv,
    attribute_datasets_dir +
    "twitter-dataset-7-10-19-lda-ready-user-description-text-with-hashtags-excluded-created-7-17-19.csv",
    "user_description")

### Import and prepare the preprocessed dataset for use in Biterm topic extraction:

In [None]:
# # Import the dataset (relative path).
# tweet_dataset_processed = \
#     pd.read_csv("twitter-dataset-7-10-19-lda-ready-tweet-text-with-hashtags-excluded-created-7-17-19.csv", sep=",")

# Import the dataset (absolute path).
tweet_dataset_processed = \
    pd.read_csv("D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/"
                "twitter-dataset-7-10-19-lda-ready-tweet-text-with-hashtags-excluded-created-7-17-19.csv", sep=",")

# # Import the dataset (test/debug).
# tweet_dataset_processed = \
#     pd.read_csv("twitter-dataset-7-10-19-lda-ready-tweet-text-test.csv", sep=",")

# # Import the dataset (test/debug).
# tweet_dataset_processed = \
#     pd.read_csv("D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/"
#                 "twitter-dataset-7-10-19-lda-ready-tweet-text-test.csv", sep=",")

# Reindex and shuffle the data randomly.
# Use the directly imported numpy: the "pd.np" alias was deprecated in pandas 1.0 and removed in 2.0.
tweet_dataset_processed = tweet_dataset_processed.reindex(
    np.random.permutation(tweet_dataset_processed.index))

# Generate a Pandas dataframe.
tweet_text_dataframe = pd.DataFrame(tweet_dataset_processed)

# Print shape and column names.
log.info("\nThe shape of the Tweet text dataframe:")
log.info(tweet_text_dataframe.shape)
log.info("\nThe columns of the Tweet text dataframe:")
log.info(tweet_text_dataframe.columns)

# Drop any NaN or empty Tweet rows in dataframe (or else CountVectorizer will blow up).
tweet_text_dataframe = tweet_text_dataframe.dropna()

# Print shape and column names.
log.info("\nThe shape of the Tweet text dataframe with NaN (empty) rows dropped:")
log.info(tweet_text_dataframe.shape)
log.info("\nThe columns of the Tweet text dataframe with NaN (empty) rows dropped:")
log.info(tweet_text_dataframe.columns)

# Reindex everything so the rows are labelled 0..n-1 again after the shuffle and the drop.
tweet_text_dataframe.index = pd.RangeIndex(len(tweet_text_dataframe.index))

# Assign column names.
# NOTE(review): this assignment requires the CSV to contain exactly three columns - confirm.
tweet_text_dataframe_column_names = ['text_derived', 'text_derived_preprocessed', 'text_derived_postprocessed']

# Rename column in dataframe.
tweet_text_dataframe.columns = tweet_text_dataframe_column_names

# Create input feature (copied so later changes do not touch the source dataframe).
selected_features = tweet_text_dataframe[['text_derived_postprocessed']]
processed_features = selected_features.copy()

# Check what we are using as inputs.
log.info("\nA sample Tweet in our input feature:")
log.info(processed_features['text_derived_postprocessed'][0])

# Create feature set: the post-processed text as a Series and as a plain list.
slo_feature_series = processed_features['text_derived_postprocessed']
slo_feature_list = slo_feature_series.tolist()

### Perform the topic extraction:

In [None]:

def biterm_topic_model_topic_extraction(num_texts_to_display=10):
    """
    Function performs topic extraction on Tweets using the online Biterm Topic Model (oBTM).

    Vectorizes the module-level "slo_feature_series" into raw term counts, converts the counts
    to biterms, trains the model chunk by chunk, then prints the topic summary and the most
    probable topic for the first Tweets.

    :param num_texts_to_display: maximum number of Tweets printed together with their assigned
        topic.  Pass None to print every Tweet (on a large dataset this floods the terminal and
        pushes the topic summary out of the scroll-back buffer).
    :return: None.
    """
    # Use raw term counts (bag-of-words); the biterms below are derived from these counts.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; prefer its
    # replacement when available and fall back to the old method on older installations.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        vocabulary = list(tf_vectorizer.get_feature_names_out())
    else:
        vocabulary = tf_vectorizer.get_feature_names()

    log.info("\n.fit_transform - Learn the vocabulary dictionary and return term-document matrix.")
    log.info(tf)
    log.info("\n.get_feature_names - Array mapping from feature integer indices to feature name")
    log.info(vocabulary)

    # Convert corpus of documents (vectorized text) to numpy array.
    # NOTE(review): this densifies the whole term-document matrix - check memory on large datasets.
    tf_array = tf.toarray()

    # Convert dictionary of words (vocabulary) to numpy array.
    tf_feature_names = np.array(vocabulary)

    # Get biterms.
    biterms = vec_to_biterms(tf_array)

    # Create btm.
    btm = oBTM(num_topics=20, V=tf_feature_names)

    print("\n\n Train Online BTM ..")
    chunk_size = 100  # number of texts processed per online training step
    for chunk_start in range(0, len(biterms), chunk_size):
        btm.fit(biterms[chunk_start:chunk_start + chunk_size], iterations=50)
    topics = btm.transform(biterms)
    # NOTE(review): presumably pauses so pending progress output is flushed before printing - confirm.
    time.sleep(3)

    # print("\n\n Visualize Topics ..")
    # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(tf_array, axis=1), tf_feature_names, np.sum(tf_array, axis=0))
    # pyLDAvis.save_html(vis, './vis/online_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, tf_array, tf_feature_names, 10)

    print("\n\n Texts & Topics ..")
    # Pair every Tweet with its topic distribution and stop once the display limit is reached.
    for shown, (tweet_text, topic_distribution) in enumerate(zip(slo_feature_series, topics)):
        if num_texts_to_display is not None and shown >= num_texts_to_display:
            break
        print("{} (topic: {})".format(tweet_text, topic_distribution.argmax()))

In [None]:
"""
Main function.  Execute the program.
"""
if __name__ == '__main__':
    # Wall-clock timestamp taken right before the run starts.
    my_start_time = time.time()

    # Perform the topic extraction.
    biterm_topic_model_topic_extraction()

    # Wall-clock timestamp taken right after the run finishes.
    my_end_time = time.time()

    # Derive the elapsed time once and convert it step by step (seconds -> minutes -> hours).
    time_elapsed_in_seconds = my_end_time - my_start_time
    time_elapsed_in_minutes = time_elapsed_in_seconds / 60.0
    time_elapsed_in_hours = time_elapsed_in_minutes / 60.0

    # Report the total processing time in all three units.
    print("".join([
        "\n\nTime taken to process dataset: ", str(time_elapsed_in_seconds), " seconds, ",
        str(time_elapsed_in_minutes), " minutes, ", str(time_elapsed_in_hours), " hours.\n",
    ]))

### Topic Extraction Results on Twitter Dataset Tweet Text:

I was stupid and didn't limit the print statement that displays Tweets and their topic assignments, so it printed all 650k+ Tweets and I couldn't scroll back far enough to see the topic coherence data and the actual topics.  I am waiting on a 2nd and 3rd execution run, executing simultaneously (limited to displaying the first 10 Tweets and their associated topics this time around).

In [None]:
"""
Biterm Output: (subset of all output)

(not the topics - just example of Tweets with assigned topics )
 world record objection 276 submission prepare submit invasive gas field    (topic: 13)
accenture join bhp set 50/50 gender target    (topic: 12)
colin barnett wrong iron ore bhp boss    (topic: 6)
approval adani queensland coalmine face legal challenge    hint (topic: 11)
declare buy 5 large parcel land 15 minute drive plan route        (topic: 18)
having whale time fortescue bay pic ig    (topic: 6)
question greens want govern australia greens voter support lnp continual attack australian defend attack labor lnp (topic: 17)
courage tear adani 's license people australia new legal advice show adani wo n’t able damn thing    (topic: 11)
's critical adani need controversial government loan    (topic: 0)
money matters 10 thing need know open bell spy spx qqq dia jpm aapl    (topic: 0)
right alp reconsider support adani    24 hour convince reject    email labor cabinet tell > >    (topic: 11)
annastacia palaszczuk billion dollar handout billionaire adani sign petition    (topic: 15)
dr nahan payroll tax hike tax job large employer coles woolies rio tinto etc (topic: 12)
matt canavan delusional think seq (topic: 0)
great silence single weatherill minister available talk listener bhp job hit (topic: 0)
australia 's climate bomb senselessness adani carmichael coal    (topic: 4)
listen local farmer tourism operator want protect land water coral    (topic: 17)


Time taken to process dataset: 39906.75524163246 seconds, 665.112587360541 minutes, 11.08520978934235 hours.


Process finished with exit code 0

###########################################################################################

(run 1)
Placeholder.

###########################################################################################

(run 2)
Placeholder.

"""

## Resources Used: