In [3]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import numpy as np
import time

# Stop words and vectorizer setup

In [5]:
# Load the custom stop-word list (one entry per line of stops.txt) and add
# the lowercase Roman numerals i-x to it.
with open("stops.txt", "r") as infile:
    # Skip blank lines so no empty-string entries end up in the stop list.
    stops = [word for word in (line.strip() for line in infile) if word]
stops.extend(["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"])

In [6]:
# Bag-of-words vectorizer:
#   max_df=0.7 -> drop terms that occur in more than 70% of the documents
#   min_df=30  -> drop terms that occur in fewer than 30 documents
#   stop_words -> drop every entry of the custom `stops` list
vec = CountVectorizer(
    stop_words=stops,
    min_df=30,
    max_df=0.7,
)

*********************

In [7]:
# Read the corpus file: every line becomes one document, lowercased and with
# surrounding whitespace removed.
ojs = []
with open("ojsEd_corpus.txt", "r") as corpus_file:
    for raw_line in corpus_file:
        ojs.append(raw_line.lower().strip())

In [8]:
# Learn the vocabulary from the corpus lines and build the document-term
# count matrix (one row per entry of `ojs`).
matrix_all = vec.fit_transform(ojs)

In [9]:
def get_scores(data, min_epoch, max_epoch, n_components=50, n_top_words=25, random_state=100):
    """Fit one LDA model per iteration budget and score each of them.

    For every ``epoch`` in ``range(min_epoch, max_epoch)`` a fresh model is
    built with ``max_iter=epoch`` and fitted on ``data``; a model is never
    continued from the previous one. Progress is printed for every model.

    Parameters
    ----------
    data : document-term matrix
        Passed unchanged to the model's ``fit`` and ``perplexity``.
    min_epoch : int
        First ``max_iter`` value tried (inclusive).
    max_epoch : int
        End of the ``max_iter`` range (exclusive).
    n_components : int, default 50
        Number of topics of every model.
    n_top_words : int, default 25
        Number of highest-weighted words per topic used for topic diversity.
    random_state : int, default 100
        Seed handed to every model.

    Returns
    -------
    tuple of four lists, each with one entry per fitted model, in fit order:
        div_scores    -- topic diversity
        ppl_scores    -- perplexity on ``data``
        models        -- the fitted model objects
        time_elapseds -- seconds spent fitting and scoring the model
    """
    div_scores = []     # topic diversity per model
    ppl_scores = []     # perplexity per model
    models = []
    time_elapseds = []  # seconds per model (was never filled in before)
    for epoch in range(min_epoch, max_epoch):
        start = time.perf_counter()
        model = LDA(n_components=n_components, random_state=random_state, max_iter=epoch)
        # fit() is sufficient: the document-topic matrix that fit_transform()
        # returns was thrown away, so computing it only cost time.
        model.fit(data)
        models.append(model)
        print(f"Epoch: {epoch}")
        TD = _topic_diversity(model.components_, n_top_words)
        ppl = model.perplexity(data)
        div_scores.append(TD)
        ppl_scores.append(ppl)
        print(f"Topic diversity: {TD}")
        print(f"Perplexity: {ppl}")
        elapsed = time.perf_counter() - start
        time_elapseds.append(elapsed)
        print(f"Time elapsed: {elapsed}")
        print("\n")
    return div_scores, ppl_scores, models, time_elapseds


def _topic_diversity(components, n_top_words):
    """Return the share of distinct words among the top words of all topics."""
    # Row-normalise so every topic is a distribution over the vocabulary.
    word_dist = components / components.sum(axis=1)[:, np.newaxis]
    # Per topic: word indices sorted by descending weight, cut to the top n.
    top_idx = np.argsort(word_dist, axis=1)[:, ::-1][:, :n_top_words]
    # Distinct top words divided by the total number of top-word slots.
    return len(np.unique(top_idx)) / top_idx.size

In [None]:
# Sweep max_iter over 1..50 (max_epoch is exclusive) on the full corpus matrix.
div_scores_all, ppl_scores_all, models, times = get_scores(
    data=matrix_all,
    min_epoch=1,
    max_epoch=51,
)

Epoch: 1
Topic diversity: 0.1088
Perplexity: 1633.5203335205267
Time elapsed: 45.544429063797


Epoch: 2
Topic diversity: 0.272
Perplexity: 1078.4996700746979
Time elapsed: 52.91321086883545


Epoch: 3
Topic diversity: 0.42
Perplexity: 867.4943236276775
Time elapsed: 56.5989625453949


Epoch: 4
Topic diversity: 0.5248
Perplexity: 772.6301240833568
Time elapsed: 62.65300107002258


Epoch: 5
Topic diversity: 0.5872
Perplexity: 725.7736786268343
Time elapsed: 69.77685284614563


Epoch: 6
Topic diversity: 0.6264
Perplexity: 703.3946851339342
Time elapsed: 75.50285458564758


Epoch: 7
Topic diversity: 0.6656
Perplexity: 692.83005317195
Time elapsed: 80.28208804130554


Epoch: 8
Topic diversity: 0.6912
Perplexity: 686.9259313738627
Time elapsed: 86.17986106872559


Epoch: 9
Topic diversity: 0.7024
Perplexity: 684.1801608230222
Time elapsed: 91.19336819648743


Epoch: 10
Topic diversity: 0.7168
Perplexity: 682.5102941898463
Time elapsed: 95.7994544506073


Epoch: 11
Topic diversity: 0.7328
Pe

In [None]:
# NOTE(review): this cell repeats the get_scores call of the previous code
# cell verbatim; running it fits all 50 models (max_iter 1..50) a second time
# and overwrites the same four variables.
div_scores_all, ppl_scores_all, models, times = get_scores(data=matrix_all, min_epoch=1, max_epoch=51)

*********************

## Plot topics in the best LDA model (highest topic diversity) where Epoch = 49

In [10]:
# NOTE(review): TfidfVectorizer is not referenced in any of the visible cells.
from sklearn.feature_extraction.text import TfidfVectorizer
# Vocabulary terms of the fitted CountVectorizer `vec`; used further down to
# label the bars of each topic.
feature_names = vec.get_feature_names_out()
# Number of highest-weighted words shown per topic in the plot.
n_top_words = 25

In [None]:
# Refit the configuration chosen from the sweep: same n_components and
# random_state as in get_scores, with the iteration budget fixed at 49.
best_model = LDA(
    max_iter=49,
    n_components=50,
    random_state=100,
)
best_model.fit_transform(matrix_all)

In [None]:
import matplotlib.pyplot as plt

# One horizontal bar chart per topic on a 10 x 5 grid (50 panels, one for each
# row of best_model.components_); all panels share the x axis.
fig, axes = plt.subplots(10, 5, figsize=(40, 89), sharex=True)
axes = axes.flatten()

# Figure-level title: set once instead of once per topic inside the loop.
fig.suptitle("Topics in LDA model with highest topic diversity (epoch = 49)", fontsize=80)

for topic_idx, topic in enumerate(best_model.components_):
    # Indices of the n_top_words largest weights, in descending order.
    top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
    top_features = [feature_names[i] for i in top_features_ind]
    weights = topic[top_features_ind]

    ax = axes[topic_idx]
    ax.barh(top_features, weights, height=0.7)
    ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
    # Put the highest-weighted word at the top of the panel.
    ax.invert_yaxis()
    ax.tick_params(axis="both", which="major", labelsize=20)
    # Keep only the bottom spine as a frame.
    for side in ("top", "right", "left"):
        ax.spines[side].set_visible(False)

fig.subplots_adjust(top=0.90, bottom=0.05, wspace=0.70, hspace=0.3)
fig.savefig("LDAtopics_model_epoch49.png")
plt.show()