In [1]:
import io
import os.path
import re
import tarfile

import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    with smart_open.open(url, "rb") as file:
        with tarfile.open(fileobj=file) as tar:
            for member in tar.getmembers():
                if member.isfile() and re.search(r'nipstxt/nips\d+/\d+\.txt', member.name):
                    member_bytes = tar.extractfile(member).read()
                    yield member_bytes.decode('utf-8', errors='replace')

docs = list(extract_documents())

In [2]:
print(len(docs))
print(docs[0][:500])

1740
387 
Neural Net and Traditional Classifiers  
William Y. Huang and Richard P. Lippmann 
MIT Lincoln Laboratory 
Lexington, MA 02173, USA 
Abstract
Previous work on nets with continuous-valued inputs led to generative 
procedures to construct convex decision regions with two-layer percepttons (one hidden 
layer) and arbitrary decision regions with three-layer percepttons (two hidden layers). 
Here we demonstrate that two-layer perceptton classifiers trained with back propagation 
can form both c


In [3]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [4]:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

     Interpreting the result using "no_above = .75 "

In [5]:
from gensim.corpora import Dictionary

dictionary = Dictionary(docs)

dictionary.filter_extremes(no_below=20, no_above=0.75)

In [6]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [7]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 6754
Number of documents: 1740


     Interpreting the result using "no_above = .9"

In [9]:
from gensim.corpora import Dictionary

dictionary = Dictionary(docs)

dictionary.filter_extremes(no_below=20, no_above=0.9)

corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 6798
Number of documents: 1740


The no_above parameter specifies the maximum proportion of documents in which a token can appear to be included in the dictionary. By setting it to 0.75, tokens that appear in more than 75% of the documents are excluded from the dictionary. Similarly, setting it to 0.9 excludes tokens that appear in more than 90% of the documents.

The difference in the number of unique tokens between the two settings reflects the stricter filtering applied when no_above is set to 0.9, resulting in fewer tokens being included in the dictionary. However, the number of documents remains the same in both cases since it is independent of the dictionary filtering.

These changes affect the vocabulary size and potentially the quality of the bag-of-words representation. Adjusting no_above allows for control over the vocabulary size and the inclusion of more or fewer frequent terms in the dictionary based on the specific requirements of the application.

    Interpreting the results after training the LDA model using num_topics = 10

In [11]:
from gensim.models import LdaModel

num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = 50  


temp = dictionary[0] 
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)


top_topics = model.top_topics(corpus)
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

def check_topic_threshold(x, topic,threshold):
    topics = model.get_document_topics(corpus[x])
    for i in topics:
        if i[0]==topic and i[1]>threshold:
            return True
    else: return False
t=[docs[x] for x in range(len(corpus)) if check_topic_threshold(x, 0, .9)]

Average topic coherence: -0.6771.
[([(0.031453587, 'network'),
   (0.019327786, 'learning'),
   (0.015130068, 'unit'),
   (0.011830874, 'input'),
   (0.011249237, 'weight'),
   (0.009832398, 'training'),
   (0.009467361, 'output'),
   (0.008128904, 'error'),
   (0.0080774445, 'time'),
   (0.0078066825, 'hidden'),
   (0.006048526, 'state'),
   (0.0059344866, 'layer'),
   (0.005556106, 'problem'),
   (0.005482669, 'figure'),
   (0.0053764316, 'task'),
   (0.0050665326, 'net'),
   (0.0049162614, 'control'),
   (0.0047234627, 'algorithm'),
   (0.0046429625, 'value'),
   (0.0040557957, 'number')],
  -0.37357946210282633),
 ([(0.010582791, 'learning'),
   (0.010535968, 'algorithm'),
   (0.0077663343, 'state'),
   (0.0072821225, 'value'),
   (0.006413359, 'error'),
   (0.0060458826, 'problem'),
   (0.005713942, 'example'),
   (0.0052246987, 'then'),
   (0.0052204872, 'probability'),
   (0.005070462, 'number'),
   (0.0049493858, 'case'),
   (0.0048729638, 'model'),
   (0.0048715696, 'bound'),


    Interpreting the results after training the LDA model using num_topics = 15

In [12]:
from gensim.models import LdaModel

num_topics = 15
chunksize = 2000
passes = 20
iterations = 400
eval_every = 50  


temp = dictionary[0] 
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)


top_topics = model.top_topics(corpus)
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

def check_topic_threshold(x, topic,threshold):
    topics = model.get_document_topics(corpus[x])
    for i in topics:
        if i[0]==topic and i[1]>threshold:
            return True
    else: return False
t=[docs[x] for x in range(len(corpus)) if check_topic_threshold(x, 0, .9)]

Average topic coherence: -0.7639.
[([(0.011681974, 'network'),
   (0.008170731, 'bound'),
   (0.007343298, 'then'),
   (0.0072680283, 'algorithm'),
   (0.0071850433, 'number'),
   (0.0069085457, 'let'),
   (0.006411364, 'theorem'),
   (0.0063346955, 'case'),
   (0.006192115, 'probability'),
   (0.006028156, 'any'),
   (0.0055739232, 'learning'),
   (0.0055321176, 'given'),
   (0.0053626937, 'class'),
   (0.0052585984, 'example'),
   (0.005192798, 'will'),
   (0.00470866, 'distribution'),
   (0.004705537, 'some'),
   (0.00464571, 'point'),
   (0.004587281, 'vector'),
   (0.0045482274, 'there')],
  -0.33981173187597286),
 ([(0.034033764, 'network'),
   (0.021554386, 'unit'),
   (0.014828183, 'input'),
   (0.010212382, 'output'),
   (0.010178825, 'training'),
   (0.009407514, 'hidden'),
   (0.008490284, 'layer'),
   (0.007414752, 'pattern'),
   (0.0070998813, 'net'),
   (0.0068112053, 'weight'),
   (0.00654177, 'recognition'),
   (0.006393011, 'learning'),
   (0.0056892936, 'word'),
   (0

    Interpreting the results after training the LDA model using num_topics = 20

In [15]:
from gensim.models import LdaModel

num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = 50  


temp = dictionary[0] 
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)


top_topics = model.top_topics(corpus)
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

def check_topic_threshold(x, topic,threshold):
    topics = model.get_document_topics(corpus[x])
    for i in topics:
        if i[0]==topic and i[1]>threshold:
            return True
    else: return False
t=[docs[x] for x in range(len(corpus)) if check_topic_threshold(x, 0, .9)]

Average topic coherence: -0.7664.
[([(0.04230769, 'network'),
   (0.019953296, 'training'),
   (0.014985912, 'input'),
   (0.014737401, 'unit'),
   (0.012810181, 'weight'),
   (0.012579819, 'output'),
   (0.011903695, 'hidden'),
   (0.011820152, 'error'),
   (0.0110104345, 'layer'),
   (0.008734973, 'net'),
   (0.00811618, 'learning'),
   (0.007656898, 'time'),
   (0.0076067955, 'performance'),
   (0.0067791734, 'task'),
   (0.0062481873, 'trained'),
   (0.005962459, 'were'),
   (0.0058288663, 'architecture'),
   (0.00565377, 'figure'),
   (0.0051911557, 'problem'),
   (0.004972692, 'number')],
  -0.33725877952929234),
 ([(0.02583927, 'network'),
   (0.02177658, 'rule'),
   (0.016142791, 'learning'),
   (0.0063816234, 'weight'),
   (0.005741633, 'example'),
   (0.0049804077, 'training'),
   (0.004803268, 'number'),
   (0.0041650045, 'approach'),
   (0.0041186204, 'our'),
   (0.004000623, 'figure'),
   (0.0039258953, 'node'),
   (0.003924177, 'algorithm'),
   (0.0038684248, 'input'),
  