In [None]:
import os
import random
import sys
from datetime import datetime
from pickle import load, dump

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from wordcloud import WordCloud
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaModel, LdaMulticore, LsiModel

In [None]:
sys.path.append("..")
from utils import preprocess
from utils.tool_simple import get_keywords
from data.dataset import *

## Record loading

In [None]:
# Path to the raw tweet dump; it must be filled in before running this cell.
path_file = ""
# Read as UTF-8 explicitly: tweets routinely contain emoji and other non-ASCII
# text, and the platform default encoding is not guaranteed to decode them.
with open(path_file, 'r', encoding='utf-8') as f:
    data_original = f.readlines()

In [None]:
# Parse the raw dump into per-user tweet lists.
# Layout as implied by the parsing below (TODO confirm against the dump):
#   a line starting with "Username" opens a user section, and every following
#   line starting with "20" holds one tweet of that user.
tweet_user, tweet_user_all, tweet_user_all_username = [], [], []
dict_username_tweet = {}
username = None  # user whose section is currently being read
for line in data_original:
    if line.startswith('Username'):
        # Store the finished section under the user it belongs to BEFORE
        # switching to the next username; otherwise every tweet list ends up
        # attached to the following user.
        if username is not None:
            dict_username_tweet[username] = tweet_user
        username = line.strip().split('Username:')[-1]
        tweet_user = []
    if line.startswith('20'):
        # Keep what follows the second ':' and skip three more characters
        # (presumably the rest of the timestamp prefix — verify on real data).
        line = line.split(':', 2)[-1][3:]
        line = preprocess.process_for_modeling(line)
        tweet_user.append(line)
        tweet_user_all.append(line)
        tweet_user_all_username.append(username)
# The last section is not followed by another "Username" line: flush it here.
if username is not None:
    dict_username_tweet[username] = tweet_user
print(f"{len(dict_username_tweet)} users with {len(tweet_user_all)} depression tweet")

In [None]:
# Every processed tweet concatenated into one space-separated string.
text = " ".join(tweet_user_all)

In [None]:
# Spot check: pick one user at random and show the tweets stored for them.
username = random.choice(list(dict_username_tweet))
dict_username_tweet[username]

In [None]:
# Flat table: one row per processed tweet together with the user it was read under.
df = pd.DataFrame(dict(username=tweet_user_all_username, full_text=tweet_user_all))
df

## Preparation

In [None]:
# Depression keywords read from the resources folder; they are merged into the
# stop-word set further down.
list_depress = get_keywords('../resources/keywords_depression_strict_list.txt')

In [None]:
# Extra corpus-specific stop words; currently none.
list_common = list()

In [None]:
# Stop words = gensim's generic list + the depression keywords + the extra
# common words. To keep the depression keywords in the corpus, leave
# `list_depress` out of the union.
STOPWORDS_depress = set(STOPWORDS).union(list_depress, list_common)
print(len(STOPWORDS), len(list_depress), len(list_common))
print(len(STOPWORDS_depress))

In [None]:
# NLTK helpers used by the token-cleaning cell.
tokenizer, lemmatizer = TweetTokenizer(), WordNetLemmatizer()

In [None]:
# 1) Tokenize each tweet.
df["tokens"] = df.full_text.apply(tokenizer.tokenize)
print(len(df))
# 2) Lower-case FIRST, then filter: the stop-word lookup must see the lower-cased
#    token, otherwise capitalised stop words ("The", "And", ...) pass the filter
#    and are only lower-cased afterwards. Tokens must also be longer than two
#    characters and purely alphabetic.
df["tokens"] = df.tokens.apply(
    lambda x: [
        t for t in (tok.lower() for tok in x)
        if len(t) > 2 and t.isalpha() and t not in STOPWORDS_depress
    ]
)
print(len(df))
# 3) Lemmatize the surviving tokens.
df["tokens"] = df.tokens.apply(lambda x: [lemmatizer.lemmatize(t) for t in x])
print(len(df))

In [None]:
# Learn frequent bigrams and append them to each document.
# NOTE: `corpus` holds the very same list objects as df["tokens"], so the
# in-place extend below also updates the DataFrame column. Re-running this cell
# appends the bigrams again.
corpus = df.tokens.values.tolist()
phrases = Phraser(Phrases(corpus))
for doc in corpus:
    # Detected phrases are joined with "_"; keep only those and add them to the doc.
    doc.extend([token for token in phrases[doc] if "_" in token])

In [None]:
# Second stop-word pass, run after lemmatization and bigram augmentation.
df["tokens"] = df.tokens.apply(
    lambda doc: [tok for tok in doc if tok not in STOPWORDS_depress]
)
print(len(df))

In [None]:
# Inspect the frame after bigram augmentation and the second stop-word pass.
df

## Word Cloud

In [None]:
# Flatten all documents into one comma-separated string for the word cloud,
# leaving out the two dominant terms "covid" and "pandemic".
corpus = df.tokens.values.tolist()
long_string = ",".join(
    ",".join(tok for tok in doc if tok not in ("covid", "pandemic"))
    for doc in corpus
)

In [None]:
# Create the WordCloud object (optional extras: max_font_size=50, min_font_size=5).
wordcloud = WordCloud(
    scale=4,
    random_state=0,
    background_color="white",
    max_words=5000,
    contour_width=2,
    contour_color='steelblue',
    collocations=False,
)
# Generate the word cloud from the comma-joined tokens.
wordcloud.generate(long_string)
# Switch matplotlib to the non-interactive Agg backend. One call is enough
# (the original issued it twice with different capitalisation).
# NOTE(review): the image below is rendered via wordcloud.to_image(), not via
# matplotlib, so this switch may be unnecessary — and it stays in effect for the
# plotting cells at the end of the notebook. Confirm before removing.
plt.switch_backend('agg')
# Visualize the word cloud.
image = wordcloud.to_image()
image.show()
# To persist it: wordcloud.to_file('LDA/word_cloud_depress.png')

In [None]:
# wordcloud.to_file('LDA/word_cloud_depress.png')

In [None]:
# Save the tokens so the modeling stage can restart from this file.
file_name = 'LDA/mental_token.csv'
# Create the output folder first: to_csv does not create missing directories.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
df[["username", "tokens"]].to_csv(file_name, index=False)

## Modeling

In [None]:
from pickle import load, dump
from ast import literal_eval
from datetime import datetime
from gensim.corpora import Dictionary
from gensim.models import LdaModel, LdaMulticore, LsiModel

In [None]:
# Topic modeling starts from the saved token file.
# keep_default_na=False keeps strings such as "NA" or "null" as text instead of
# letting pandas turn them into NaN.
df = pd.read_csv(
    file_name,
    keep_default_na=False,
)
df

In [None]:
# Convert the stringified token lists written by to_csv back into real lists.
# literal_eval only accepts Python literals, so unexpected CSV content cannot
# execute code the way eval() would. Values that are already lists (cell
# re-run) are passed through untouched.
df.tokens = df.tokens.apply(lambda v: literal_eval(v) if isinstance(v, str) else v)
corpus = df.tokens.values.tolist()
# Map every distinct token to an integer id.
dictionary = Dictionary(corpus)

In [None]:
# Vocabulary pruning. Changing these numbers can increase/decrease the run time,
# but thresholds that are too exclusive lead to worse results.
# NOTE: this cell rebinds `corpus` from token lists to bag-of-words vectors, so
# it cannot be re-run without re-running the previous cell first.
no_below = 5
dictionary.filter_extremes(no_below=no_below, no_above=0.5)
corpus = list(map(dictionary.doc2bow, corpus))
print(f'vocab size: {len(dictionary)}')
print(f'documents in corpus: {len(corpus)}')

In [None]:
# Persist the bag-of-words corpus and the dictionary next to the models.
os.makedirs(f"LDA/Models/{no_below}/", exist_ok=True)
savefile = f'LDA/Models/{no_below}/all.PICKLE'
print('saving dataset to {}...'.format(savefile))
loaddict = {'corpus': corpus, 'dictionary': dictionary}
# Use a context manager so the file is flushed and closed even if dump() fails.
with open(savefile, 'wb') as f_pickle:
    dump(loaddict, f_pickle)

In [None]:
#topic model
def topic_modeling(num_topics=5, iterations=50, passes=5):
    """Train one LDA model on the notebook-level corpus and save it to disk.

    Relies on the globals ``dictionary``, ``corpus`` (bag-of-words) and
    ``no_below`` defined in earlier cells.

    Args:
        num_topics: number of topics of the LDA model.
        iterations: value passed to ``LdaModel(iterations=...)``.
        passes: value passed to ``LdaModel(passes=...)``.

    Returns:
        None. The parameters, the average topic coherence and a timestamp are
        printed, and the model is saved as
        ``LDA/Models/{no_below}/all_p{passes}_i{iterations}_t{num_topics}``.
        A failure while saving is printed, not raised.
    """
    np.random.seed(0)
    _ = dictionary[0]  # This is only to "load" the dictionary (fills id2token).
    id2word = dictionary.id2token

    print('topics: {}'.format(num_topics))
    print('interations: {}'.format(iterations))
    print('passes: {}'.format(passes))
    print('vocab size: {}'.format(len(dictionary)))
    print('documents in corpus: {}'.format(len(corpus)))

    # No trailing slash here, so the model path below has a single separator
    # and matches the path used when the models are loaded again.
    model_directory = f"LDA/Models/{no_below}"
    os.makedirs(model_directory, exist_ok=True)
    model_name = f"{model_directory}/all_p{passes}_i{iterations}_t{num_topics}"
    print("Model: ", model_name)

    ##Create new model with desired parameters
    # https://radimrehurek.com/gensim/models/ldamulticore.html
    model = LdaModel(
        corpus=corpus,  # leave commented out for batch training, uncomment to train on full corpus at once
        id2word=id2word,
        iterations=iterations,
        passes=passes,
        num_topics=num_topics,
        random_state=0
    )

    # List of ((probability, term) list, coherence) tuples, one per topic.
    top_topics = model.top_topics(corpus)

    # Average topic coherence is the sum of topic coherences of all topics,
    # divided by the number of topics actually scored.
    avg_topic_coherence = sum(t[1] for t in top_topics) / len(top_topics)
    print('\nAverage topic coherence: %.4f.' % avg_topic_coherence)

    print(datetime.now())
    # Saving is best effort: a failure is reported but does not stop the sweep
    # over the remaining topic counts.
    try:
        print('saving model...')
        model.save(model_name)
        print('model saved as {}.'.format(model_name))
    except Exception as e:
        print('saving error: {}'.format(e))
    print("----------------", "\n")

In [None]:
# Topic counts to sweep: 10, 15, ..., 200.
n_topics = [*range(10, 201, 5)]
len(n_topics)

In [None]:
# Train and save one model per candidate topic count.
for k in n_topics:
    topic_modeling(num_topics=k)

## Perplexity and coherence

In [None]:
from gensim.models import LdaModel, LdaMulticore, LsiModel

In [None]:
# Same sweep as used for training: 10, 15, ..., 200.
n_topics = [*range(10, 201, 5)]

In [None]:
# Dry run: evaluates only the FIRST model (note the `break` at the end of the
# loop body) to check that loading and scoring work.
# Requires `corpus` to be the bag-of-words corpus built in the modeling section.
perplexity_list = []
coherence_list = []
# The context manager closes the record file even if a model fails to load.
with open("LDA/new_topics_record.txt", 'w') as f_out:
    for num_topics in n_topics:
        # NOTE(review): the path hard-codes no_below=5, passes=5, iterations=50.
        model_name = f'LDA/Models/5/all_p5_i50_t{num_topics}'
        f_out.write(f"Loading model of {num_topics} topic \n")
        lda_model = LdaModel.load(model_name)
        topic_list = lda_model.print_topics(num_topics=10, num_words=20)
        for topic in topic_list:
            f_out.write(f"{topic[0]} - {topic[1]}\n")
        f_out.write('\n')
        perplexity_list.append(lda_model.log_perplexity(corpus))
        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        top_topics = lda_model.top_topics(corpus)
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        coherence_list.append(avg_topic_coherence)
        break

In [None]:
# Evaluate every saved model: record the words of up to 10 of its topics and
# collect log-perplexity and average coherence.
# Requires `corpus` to be the bag-of-words corpus built in the modeling section.
perplexity_list = []
coherence_list = []
# The context manager closes the record file even if a model fails to load.
with open("LDA/new_topics_record.txt", 'w') as f_out:
    for num_topics in n_topics:
        # NOTE(review): the path hard-codes no_below=5, passes=5, iterations=50.
        model_name = f'LDA/Models/5/all_p5_i50_t{num_topics}'
        f_out.write(f"Loading model of {num_topics} topic \n")
        lda_model = LdaModel.load(model_name)
        # formatted=False yields (topic_id, [(word, probability), ...]) pairs,
        # so the words are read directly instead of being cut out of the
        # formatted 'prob*"word" + ...' string.
        topic_list = lda_model.show_topics(num_topics=10, num_words=20, formatted=False)
        for idx, (_, word_probs) in enumerate(topic_list):
            str_list_word = ", ".join(word for word, _ in word_probs)
            # idx is the position in the returned list, not the model's topic id.
            f_out.write(f"{idx} - {str_list_word}\n")
        f_out.write('\n')
        perplexity_list.append(lda_model.log_perplexity(corpus))
        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        top_topics = lda_model.top_topics(corpus)
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        coherence_list.append(avg_topic_coherence)

In [None]:
# Turn both metric lists into NumPy arrays for fitting and plotting.
perplexity_list, coherence_list = map(np.asarray, (perplexity_list, coherence_list))

In [None]:
font_size = 16
# x positions: one evenly spaced tick per evaluated topic count
t = np.arange(len(n_topics))
# main plot
fig, ax1 = plt.subplots(figsize=(10, 6))
# label each position with its topic count, rotated to avoid overlap
ax1.set_xticks(t)
ax1.set_xticklabels(n_topics, rotation=90)
ax1.set_xlabel('Topics', fontsize=font_size)

In [None]:
# subplot 1: perplexity on the left y-axis
# (the plotted values are the LdaModel.log_perplexity results collected above)
color = 'tab:red'
ax1.set_ylabel('Perplexity', color=color, fontsize=font_size)
p1 = ax1.plot(t, perplexity_list, marker='o', color=color, label = 'Perplexity')
# Linear fit (b = intercept, m = slope); only needed for the optional trend line.
b, m = np.polynomial.polynomial.polyfit(t, perplexity_list, 1)
# ax1.plot(t, b + m * t, '--', color=color)
# Set tick-label color and size in one call. The former per-tick loop used
# `tick.label`, which is deprecated and no longer available in recent Matplotlib.
ax1.tick_params(axis='y', labelcolor=color, labelsize=font_size-1)
# ax1.set_ylim([0, 0.26])

In [None]:
# instantiate a second axes that shares the same x-axis
ax2 = ax1.twinx()
# subplot 2: model coherence on the right y-axis
color = 'tab:blue'
ax2.set_ylabel('Model Coherence', color=color, fontsize=font_size)  # we already handled the x-label with ax1
p2 = ax2.plot(t, coherence_list, marker='o', color=color, label = 'Model Coherence')
# Linear fit (b = intercept, m = slope); only needed for the optional trend line.
b, m = np.polynomial.polynomial.polyfit(t, coherence_list, 1)
# ax2.plot(t, b + m * t, '--', color=color)
# Use explicit Axes methods instead of pyplot's "current axes" helpers: the
# figure is built across several cells, so the current axes is not guaranteed
# to still be this one.
ax2.tick_params(axis='y', labelcolor=color, labelsize=font_size-1)
# ax2.set_ylim([0, 0.131])

# Pad margins so that markers don't get clipped by the axes
ax2.margins(0.1)

# fig.tight_layout()  # otherwise the right y-label is slightly clipped
# One combined legend for the lines of both axes.
lns = p1+p2
labs = [l.get_label() for l in lns]

# adjust legends location
ax1.legend(lns, labs, loc=0)

# ax1.set_title("", fontsize=font_size)

# Save before showing, so the file is written from the fully drawn figure.
fig.savefig("LDA/pc5.pdf", bbox_inches='tight')
plt.show()