# LDA Model

In [1]:
import logging

# Show INFO-level log records, each prefixed with its timestamp and severity.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Data

In [2]:
import io
import os.path
import re
import tarfile

import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    """Yield the text of each paper found in the tar archive at ``url``.

    The archive is opened via ``smart_open.open`` in binary mode and every
    regular file whose name matches ``nipstxt/nips<digits>/<digits>.txt`` is
    read and decoded.

    Args:
        url: Location of the tar archive, handed to ``smart_open.open``.

    Yields:
        str: The content of one matching member, decoded as UTF-8. Bytes that
        cannot be decoded are replaced instead of raising an error.
    """
    paper_path = re.compile(r'nipstxt/nips\d+/\d+\.txt')
    with smart_open.open(url, "rb") as file:
        with tarfile.open(fileobj=file) as tar:
            for member in tar.getmembers():
                if not (member.isfile() and paper_path.search(member.name)):
                    continue
                # extractfile() returns an open file object; close it as soon
                # as its bytes have been read instead of leaking the handle.
                with tar.extractfile(member) as member_file:
                    member_bytes = member_file.read()
                yield member_bytes.decode('utf-8', errors='replace')

# Consume the generator so that all documents are held in memory as a list.
docs = [document for document in extract_documents()]

In [3]:
# Sanity check: number of documents loaded, then the first 500 characters
# of the first document.
doc_count = len(docs)
print(doc_count)
first_doc = docs[0]
print(first_doc[:500])

1740
387 
Neural Net and Traditional Classifiers  
William Y. Huang and Richard P. Lippmann 
MIT Lincoln Laboratory 
Lexington, MA 02173, USA 
Abstract
Previous work on nets with continuous-valued inputs led to generative 
procedures to construct convex decision regions with two-layer percepttons (one hidden 
layer) and arbitrary decision regions with three-layer percepttons (two hidden layers). 
Here we demonstrate that two-layer perceptton classifiers trained with back propagation 
can form both c


# Pre-process and vectorize the documents

In [4]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Lowercase every document and split it into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
docs = [tokenizer.tokenize(text.lower()) for text in docs]

# Drop tokens that are purely numeric (words that merely contain digits are
# kept) and tokens that are only one character long.
docs = [
    [token for token in tokens if len(token) > 1 and not token.isnumeric()]
    for tokens in docs
]

In [5]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

# Replace every token by its lemma; only the token itself is passed, so the
# lemmatizer's default settings apply.
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, tokens)) for tokens in docs]

In [6]:
# Compute bigrams.
from gensim.models import Phrases

# Learn phrases from the corpus (only ones that appear 20 times or more) and
# append every detected bigram to its document; the original tokens are kept.
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # A token containing '_' is treated as a bigram.
    # NOTE(review): the r'\w+' tokenizer also keeps underscores, so a plain
    # token that already contains '_' passes this test too -- confirm that
    # this is acceptable.
    detected = [token for token in bigram[doc] if '_' in token]
    doc.extend(detected)

2023-04-04 15:43:10,352 : INFO : collecting all words and their counts
2023-04-04 15:43:10,354 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2023-04-04 15:43:21,592 : INFO : collected 1120198 token types (unigram + bigrams) from a corpus of 4629808 words and 1740 sentences
2023-04-04 15:43:21,594 : INFO : merged Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>
2023-04-04 15:43:21,596 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 11.24s', 'datetime': '2023-04-04T15:43:21.596308', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [7]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Build the dictionary (token <-> integer id) from the tokenized documents.
dictionary = Dictionary(docs)

# Discard tokens that occur in fewer than 20 documents or in more than 50%
# of the documents.
min_doc_count = 20
max_doc_fraction = 0.5
dictionary.filter_extremes(no_below=min_doc_count, no_above=max_doc_fraction)

2023-04-04 15:43:40,609 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-04-04 15:43:45,153 : INFO : built Dictionary<79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...> from 1740 documents (total 4953968 corpus positions)
2023-04-04 15:43:45,156 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...> from 1740 documents (total 4953968 corpus positions)", 'datetime': '2023-04-04T15:43:45.156161', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2023-04-04 15:43:45,311 : INFO : discarding 70785 tokens: [('1ooooo', 1), ('25oo', 2), ('2o00', 6), ('4ooo', 2), ('64k', 6), ('a', 1740), ('aaditional', 1), ('above', 1114), ('abstract', 1740), ('acase', 1)]...
2023-04-04 15:43:45,314 : INFO : keeping 8644 tokens which were in no less than 20 and no more than 870 (=50.0%) documents
2

In [8]:
# Convert each tokenized document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

In [9]:
# Report the vocabulary size after filtering and the number of documents.
vocabulary_size = len(dictionary)
print('Number of unique tokens: %d' % vocabulary_size)
document_count = len(corpus)
print('Number of documents: %d' % document_count)

Number of unique tokens: 8644
Number of documents: 1740


# Training

In [10]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
# Fixed seed: LDA training is randomized, so without it a fresh
# "Restart & Run All" produces different topics on every run.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# alpha and eta are set to 'auto' rather than to fixed values.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

2023-04-04 15:43:48,176 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2023-04-04 15:43:48,181 : INFO : using serial LDA version on this node
2023-04-04 15:43:48,203 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 1740 documents, updating model once every 1740 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2023-04-04 15:43:48,205 : INFO : PROGRESS: pass 0, at document #1740/1740
2023-04-04 15:44:43,161 : INFO : optimized alpha [0.09683638, 0.062146228, 0.076947145, 0.072598696, 0.085596, 0.0891738, 0.058290202, 0.0696312, 0.0775034, 0.06945777]
2023-04-04 15:44:43,184 : INFO : topic #6 (0.058): 0.006*"image" + 0.004*"visual" + 0.004*"field" + 0.004*"filter" + 0.003*"cell" + 0.003*"response" + 0.003*"solution" + 0.003*"signal" + 0.003*"optimal" + 0.003*"control"
2023-04-04 15:44:43,189 : INFO : topic #1 (0.062): 0.005*"recogni

In [11]:
from pprint import pprint

# Each entry pairs a topic's top terms with that topic's coherence score.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
coherence_total = sum(coherence for _terms, coherence in top_topics)
avg_topic_coherence = coherence_total / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

2023-04-04 15:50:29,581 : INFO : CorpusAccumulator accumulated stats from 1000 documents


Average topic coherence: -1.1811.
[([(0.0055544167, 'gaussian'),
   (0.0052463426, 'hidden'),
   (0.004963446, 'mixture'),
   (0.0045803944, 'noise'),
   (0.004493264, 'prediction'),
   (0.0044829464, 'estimate'),
   (0.0044161193, 'likelihood'),
   (0.004328939, 'density'),
   (0.0041987626, 'prior'),
   (0.004131406, 'approximation'),
   (0.003772136, 'bayesian'),
   (0.0035632188, 'variance'),
   (0.0034792833, 'log'),
   (0.0032770261, 'em'),
   (0.0031307193, 'posterior'),
   (0.003015177, 'generalization'),
   (0.002973318, 'optimal'),
   (0.0028702398, 'estimation'),
   (0.002810206, 'matrix'),
   (0.0027650697, 'sample')],
  -0.905946189552042),
 ([(0.02259867, 'neuron'),
   (0.015441564, 'cell'),
   (0.009849928, 'spike'),
   (0.0081646135, 'synaptic'),
   (0.007798447, 'activity'),
   (0.0073849056, 'firing'),
   (0.0070876563, 'response'),
   (0.006667137, 'stimulus'),
   (0.005451517, 'signal'),
   (0.00497962, 'potential'),
   (0.0042268345, 'noise'),
   (0.004157701, 'con