In [1]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import numpy as np
import time

In [2]:
with open("../stops.txt", "r") as infile:
    stops = [line.strip() for line in infile.readlines()]
stops.extend(["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"])

In [3]:
vec = CountVectorizer(
    max_df=0.7, min_df=30, tokenizer=word_tokenize, stop_words=stops)

*********************

In [4]:
with open("ojs10k.txt", "r") as infile:
    ojs = [line.lower().strip() for line in infile.readlines()]

In [5]:
matrix10k = vec.fit_transform(ojs)

In [9]:
div_scores = []
ppl_scores = []
for epoch in [51]:
    tock = time.time()
    model = LDA(n_components=50, random_state=100, max_iter=epoch)
    model.fit_transform(matrix10k)
    compnorm = model.components_ / model.components_.sum(axis=1)[:, np.newaxis]
    top_words = []
    for comp in compnorm:
        word_idx = np.argsort(comp)[::-1][:25]
        top_words.append(word_idx)
    toparr = np.array(top_words, dtype=object)
    n_unique = len(np.unique(toparr))
    TD = n_unique / (25 * 50)
    ppl = model.perplexity(matrix10k)
    div_scores.append(TD)
    ppl_scores.append(ppl)
    print(f"Topic diversity: {TD}")
    print(f"Perplexity: {ppl}")
    tick = time.time()
    print(f"Time elapsed: {tick-tock}")
    print("\n")

Topic diversity: 0.5848
Perplexity: 635.3513055739534
Time elapsed: 18.835319995880127




In [7]:
print(div_scores)

[0.1792, 0.3424, 0.4448, 0.5032, 0.54, 0.5616, 0.5704, 0.5752, 0.5792, 0.5792, 0.5808, 0.5824, 0.5824, 0.5832, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848, 0.5848]


In [8]:
print(ppl_scores)

[1638.957043496083, 1084.5247795811815, 874.0653238407803, 769.2024894603917, 713.3436031694742, 678.0442913591781, 656.517590220653, 643.0224567039692, 634.9184720653984, 627.39897376911, 625.0664079276967, 622.4325198701247, 622.2478326228463, 622.8185393802062, 623.5071843224905, 625.0591739513877, 626.0063419588045, 627.2824629990034, 628.267803783191, 629.5548947311705, 630.4741902137304, 631.0598882581356, 631.5434487484998, 631.9050950595386, 632.6861328062639, 632.8913806777184, 632.9022967969017, 633.2593811654409, 633.2033255916316, 633.2579024765824, 633.5899045166253, 633.8454847411685, 634.0308697653455, 634.0322428717282, 633.9605680844026, 634.1362272825982, 634.0668135361599, 633.9708459863779, 634.0995505769522, 634.3280683708138, 634.5653621101097, 634.7809367853109, 634.9253233010514, 634.9505925050368, 634.9927175681447, 635.1303102244185, 635.3523102589699, 635.312725717959, 635.4074651877027]


*********************

In [2]:
with open("ojs64k.txt", "r") as infile:
    ojs = [line.lower().strip() for line in infile.readlines()]

In [5]:
matrix64k = vec.fit_transform(ojs)

In [8]:
div_scores = []
ppl_scores = []
for epoch in range(1,50):
    tock = time.time()
    model = LDA(n_components=50, random_state=100, max_iter=epoch)
    model.fit_transform(matrix64k)
    compnorm = model.components_ / model.components_.sum(axis=1)[:, np.newaxis]
    top_words = []
    for comp in compnorm:
        word_idx = np.argsort(comp)[::-1][:25]
        top_words.append(word_idx)
    toparr = np.array(top_words, dtype=object)
    n_unique = len(np.unique(toparr))
    TD = n_unique / (25 * 50)
    ppl = model.perplexity(matrix)
    div_scores.append(TD)
    ppl_scores.append(ppl)
    print(f"Topic diversity: {TD}")
    print(f"Perplexity: {ppl}")
    tick = time.time()
    print(f"Time elapsed: {tick-tock}")
    print("\n")

Topic diversity: 0.116
Perplexity: 2854.761656927356
Time elapsed: 22.73287081718445


Topic diversity: 0.2512
Perplexity: 2106.3345890359064
Time elapsed: 25.60282802581787


Topic diversity: 0.3912
Perplexity: 1778.8357824954041
Time elapsed: 28.48859715461731


Topic diversity: 0.508
Perplexity: 1606.400581765635
Time elapsed: 31.96041989326477


Topic diversity: 0.5904
Perplexity: 1511.5201508318912
Time elapsed: 34.738621950149536


Topic diversity: 0.6456
Perplexity: 1459.0277650150788
Time elapsed: 38.07667779922485


Topic diversity: 0.6808
Perplexity: 1431.6033013105764
Time elapsed: 41.16506099700928


Topic diversity: 0.7032
Perplexity: 1416.2685547709768
Time elapsed: 43.74478888511658


Topic diversity: 0.728
Perplexity: 1408.5576251974067
Time elapsed: 46.61259603500366


Topic diversity: 0.7384
Perplexity: 1406.205034073722
Time elapsed: 49.16384816169739


Topic diversity: 0.756
Perplexity: 1406.4643615549212
Time elapsed: 51.60255408287048


Topic diversity: 0.7704
Per

In [11]:
del ojs

In [12]:
with open("ojs500k.txt", "r") as infile:
    ojs = [line.lower().strip() for line in infile.readlines()]

In [13]:
%time matrix500k = vec.fit_transform(ojs)

CPU times: user 35.6 s, sys: 178 ms, total: 35.8 s
Wall time: 35.8 s


In [14]:
div_scores = []
ppl_scores = []
for epoch in range(1,50):
    tock = time.time()
    model = LDA(n_components=50, random_state=100, max_iter=epoch)
    model.fit_transform(matrix500k)
    compnorm = model.components_ / model.components_.sum(axis=1)[:, np.newaxis]
    top_words = []
    for comp in compnorm:
        word_idx = np.argsort(comp)[::-1][:25]
        top_words.append(word_idx)
    toparr = np.array(top_words, dtype=object)
    n_unique = len(np.unique(toparr))
    TD = n_unique / (25 * 50)
    ppl = model.perplexity(matrix)
    div_scores.append(TD)
    ppl_scores.append(ppl)
    print(f"Topic diversity: {TD}")
    print(f"Perplexity: {ppl}")
    tick = time.time()
    print(f"Time elapsed: {tick-tock}")
    print("\n")

Topic diversity: 0.104
Perplexity: 529705.16497336
Time elapsed: 153.53865504264832


Topic diversity: 0.2312
Perplexity: 357239.77817375556
Time elapsed: 184.82922911643982


Topic diversity: 0.3664
Perplexity: 262162.3676364781
Time elapsed: 215.99445390701294


Topic diversity: 0.4672
Perplexity: 210075.25988814034
Time elapsed: 241.2549111843109


Topic diversity: 0.5384
Perplexity: 178127.94399345573
Time elapsed: 270.4438371658325


Topic diversity: 0.5864
Perplexity: 158229.79810808485
Time elapsed: 295.6091718673706


Topic diversity: 0.628
Perplexity: 145298.6883369173
Time elapsed: 319.11403012275696


Topic diversity: 0.656
Perplexity: 137248.36704476358
Time elapsed: 344.42570900917053


Topic diversity: 0.676
Perplexity: 132339.08541819896
Time elapsed: 373.3389058113098


Topic diversity: 0.6904
Perplexity: 128240.7091014212
Time elapsed: 396.9002482891083


Topic diversity: 0.7064
Perplexity: 125102.84176365162
Time elapsed: 417.159903049469


Topic diversity: 0.7224
Per

KeyboardInterrupt: 

In [15]:
print(div_scores)

[0.104, 0.2312, 0.3664, 0.4672, 0.5384, 0.5864, 0.628, 0.656, 0.676, 0.6904, 0.7064, 0.7224, 0.732, 0.7408, 0.7456, 0.7472, 0.7552, 0.7592, 0.7624, 0.768, 0.7688, 0.7712, 0.7736, 0.7784, 0.7832, 0.7856, 0.7856, 0.7896, 0.792, 0.7928, 0.7936, 0.7944, 0.7952, 0.7968, 0.7992, 0.8024, 0.8032, 0.8072, 0.8072, 0.8072, 0.808, 0.8072, 0.8104, 0.8104]


In [16]:
print(ppl_scores)

[529705.16497336, 357239.77817375556, 262162.3676364781, 210075.25988814034, 178127.94399345573, 158229.79810808485, 145298.6883369173, 137248.36704476358, 132339.08541819896, 128240.7091014212, 125102.84176365162, 122800.72058666038, 121126.49543107278, 119824.260984731, 118959.23554296845, 118271.89985770587, 117405.57783363378, 116624.77884358006, 116017.22963021044, 115493.20871774826, 115058.15452598486, 114542.9051164241, 114364.53611299909, 114213.955805536, 113873.56567804309, 113619.99548817471, 113482.11808547263, 113145.51707892837, 112953.85492940046, 112807.26851060646, 112642.37800812886, 112477.73541860063, 112290.35613346822, 112189.57879912382, 112123.05592636655, 112017.88713414945, 111859.37041045308, 111750.52982795938, 111679.33235719959, 111607.69725627739, 111628.00969065951, 111479.31463650324, 111326.53983971805, 111172.70966825036]


In [17]:
del ojs

*************************************

In [7]:
with open("ojs1M.txt", "r") as infile:
    ojs = [line.lower().strip() for line in infile.readlines()]

In [8]:
%time matrix1M = vec.fit_transform(ojs)

CPU times: user 1min 10s, sys: 420 ms, total: 1min 11s
Wall time: 1min 11s


In [None]:
div_scores = []
ppl_scores = []
for epoch in range(1,50):
    tock = time.time()
    model = LDA(n_components=50, random_state=100, max_iter=epoch)
    model.fit_transform(matrix1M)
    compnorm = model.components_ / model.components_.sum(axis=1)[:, np.newaxis]
    top_words = []
    for comp in compnorm:
        word_idx = np.argsort(comp)[::-1][:25]
        top_words.append(word_idx)
    toparr = np.array(top_words, dtype=object)
    n_unique = len(np.unique(toparr))
    TD = n_unique / (25 * 50)
    ppl = model.perplexity(matrix1M)
    div_scores.append(TD)
    ppl_scores.append(ppl)
    print(f"Topic diversity: {TD}")
    print(f"Perplexity: {ppl}")
    tick = time.time()
    print(f"Time elapsed: {tick-tock}")
    print("\n")

Topic diversity: 0.1008
Perplexity: 3510.1443782144133
Time elapsed: 413.35098099708557


Topic diversity: 0.2344
Perplexity: 2695.6644189751164
Time elapsed: 457.41803884506226




In [7]:
x = (div_scores, ppl_scores)