In [1]:
import logging

# Show timestamped messages at INFO level and above for the rest of the notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [5]:
import os
import tempfile

# Files written by later cells are placed under the system temporary directory.
TEMP_FOLDER = tempfile.gettempdir()
folder_message = 'Folder "{}" will be used to save temporary dictionary and corpus.'
print(folder_message.format(TEMP_FOLDER))

Folder "/var/folders/9j/wnpgv9rd1852fcj3qqttm7_40000gn/T" will be used to save temporary dictionary and corpus.


In [1]:
from gensim import corpora

In [2]:
# Toy corpus: nine short titles, one document per string.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [3]:
# The corpus is tiny (nine sentences). Tokenize it, drop the stop words, and then
# remove every word that occurs only once in the whole corpus.
from collections import defaultdict
from pprint import pprint

# NOTE(review): the stop list contains 'end' rather than 'and', so 'and' survives
# the filtering below -- confirm this is intended.
stopwords = set('for a of the end to in'.split())

# Lower-case each document, split on whitespace, and discard stop words.
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stopwords]
    texts.append(kept_words)

# Count the occurrences of every remaining token across all documents.
frequency = defaultdict(int)
for tokens in texts:
    for token in tokens:
        frequency[token] += 1

# Keep only the tokens that occur more than once in the corpus.
texts = [[token for token in tokens if frequency[token] > 1] for tokens in texts]
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'and', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees', 'and'],
 ['graph', 'minors', 'survey']]


In [6]:
# Build the token -> integer id mapping from the filtered texts.
dictionary = corpora.Dictionary(texts)

# Store the dictionary on disk for future reference.
dict_path = os.path.join(TEMP_FOLDER, 'deerwester.dict')
dictionary.save(dict_path)
print(dictionary)

Dictionary(13 unique tokens: [u'and', u'minors', u'graph', u'system', u'trees']...)


Here we assigned a unique integer id to every word appearing in the corpus with the gensim.corpora.dictionary.Dictionary class. This sweeps across the texts, collecting word counts and relevant statistics. In the end, we see there are thirteen distinct words in the processed corpus, which means each document will be represented by thirteen numbers (i.e., by a 13-D vector). To see the mapping between words and their ids:

In [9]:
# Show the token -> integer id mapping held by the dictionary.
print(dictionary.token2id)

{u'and': 9, u'minors': 12, u'graph': 11, u'system': 6, u'trees': 10, u'eps': 8, u'computer': 1, u'survey': 5, u'user': 7, u'human': 2, u'time': 4, u'interface': 0, u'response': 3}


In [10]:
# Convert a new, already-tokenized document into a sparse bag-of-words vector:
# a list of (token_id, count) pairs built with the existing dictionary.
new_doc = "Human and computer interaction"
new_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(new_tokens)
print(new_vec)

[(1, 1), (2, 1), (9, 1)]


In [13]:
# Turn every filtered document into its bag-of-words vector.
corpus_vec = []
for text in texts:
    corpus_vec.append(dictionary.doc2bow(text))

# Store the corpus to disk in Matrix Market format, for later use.
mm_path = os.path.join(TEMP_FOLDER, 'deerwester.mm')
corpora.MmCorpus.serialize(mm_path, corpus_vec)

for bow in corpus_vec:
    print(bow)

2017-05-30 20:20:57,461 : INFO : storing corpus in Matrix Market format to /var/folders/9j/wnpgv9rd1852fcj3qqttm7_40000gn/T/deerwester.mm
2017-05-30 20:20:57,465 : INFO : saving sparse matrix to /var/folders/9j/wnpgv9rd1852fcj3qqttm7_40000gn/T/deerwester.mm
2017-05-30 20:20:57,468 : INFO : PROGRESS: saving document #0
2017-05-30 20:20:57,470 : INFO : saved 9x13 matrix, density=25.641% (30/117)
2017-05-30 20:20:57,472 : INFO : saving MmCorpus index to /var/folders/9j/wnpgv9rd1852fcj3qqttm7_40000gn/T/deerwester.mm.index


[(0, 1), (1, 1), (2, 1)]
[(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(0, 1), (6, 1), (7, 1), (8, 1)]
[(2, 1), (6, 2), (8, 1), (9, 1)]
[(3, 1), (4, 1), (7, 1)]
[(10, 1)]
[(10, 1), (11, 1)]
[(9, 1), (10, 1), (11, 1), (12, 1)]
[(5, 1), (11, 1), (12, 1)]


In [14]:
class MyCorpus(object):
    """Stream a text corpus from disk as bag-of-words vectors, one document at a time.

    The file is assumed to hold one document per line, with tokens separated by
    whitespace. Each line is lower-cased, split, and converted with the
    notebook-level ``dictionary``, which is looked up when the corpus is
    iterated rather than when it is constructed.
    """

    def __init__(self, path='datasets/mycorpus.txt'):
        """Remember where the corpus lives.

        :param path: location of the corpus file. The default is the original
            hard-coded location, so ``MyCorpus()`` behaves as before.
        """
        self.path = path

    def __iter__(self):
        """Yield one bag-of-words vector per line of the corpus file."""
        # Open the file inside ``with`` so the handle is closed once iteration
        # ends, instead of being left open after every pass over the corpus.
        with open(self.path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

In [15]:
# Creating the object reads nothing: the corpus file is only opened when the object is iterated.
corpus_memory_friendly = MyCorpus() # doesn't load the corpus into memory!
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x10c4e2dd0>


In [16]:
# Iterating calls MyCorpus.__iter__, which reads 'datasets/mycorpus.txt' and yields one
# bag-of-words vector per line. The path is relative, so the file must be present under
# the current working directory for this loop to run.
for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)

IOError: [Errno 2] No such file or directory: 'datasets/mycorpus.txt'

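Provided datasets/mycorpus.txt exists (the run above raised an IOError because the file was not found in the working directory), the output is the same as for the plain Python list, but the corpus is now much more memory friendly, because at most one vector resides in RAM at a time. Your corpus can now be as large as you want.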
We are going to create the dictionary from the mycorpus.txt file without loading the entire file into memory. Then, we will generate the list of token ids to remove from this dictionary by querying the dictionary for the token ids of the stop words, and by querying the document frequencies dictionary (dictionary.dfs) for token ids that only appear once. Finally, we will filter these token ids out of our dictionary and call dictionary.compactify() to remove the gaps in the token id series.

In [None]:
from six import iteritems

# collect statistics about all tokens, reading the file one line at a time;
# the `with` block closes the file handle once the dictionary has been built
with open('datasets/mycorpus.txt') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)

# ids of the stop words that actually occur in the dictionary
# (`stopwords` is the stop-word set defined in the preprocessing cell of this notebook)
stop_ids = [dictionary.token2id[stopword] for stopword in stopwords
            if stopword in dictionary.token2id]
# ids of the tokens whose document frequency is 1
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]

# remove stop words and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)

# remove gaps in id sequence after words that were removed
dictionary.compactify()
print(dictionary)

In [17]:
from gensim import corpora, models, similarities

# Reload the dictionary and the Matrix Market corpus that were written to disk
# earlier (they come from the first tutorial, "From strings to vectors").
dict_path = os.path.join(TEMP_FOLDER, 'deerwester.dict')
mm_path = os.path.join(TEMP_FOLDER, 'deerwester.mm')

dictionary = corpora.Dictionary.load(dict_path)
corpus = corpora.MmCorpus(mm_path)
print(corpus)

2017-05-30 20:25:13,643 : INFO : loading Dictionary object from /var/folders/9j/wnpgv9rd1852fcj3qqttm7_40000gn/T/deerwester.dict
2017-05-30 20:25:13,645 : INFO : loaded /var/folders/9j/wnpgv9rd1852fcj3qqttm7_40000gn/T/deerwester.dict
2017-05-30 20:25:13,647 : INFO : loaded corpus index from /var/folders/9j/wnpgv9rd1852fcj3qqttm7_40000gn/T/deerwester.mm.index
2017-05-30 20:25:13,649 : INFO : initializing corpus reader from /var/folders/9j/wnpgv9rd1852fcj3qqttm7_40000gn/T/deerwester.mm
2017-05-30 20:25:13,651 : INFO : accepted corpus with 9 documents, 13 features, 30 non-zero entries


MmCorpus(9 documents, 13 features, 30 non-zero entries)


In [18]:
# Train an LSI model with two topics on the corpus; the dictionary is passed as id2word
# (presumably so topics can be reported as words rather than ids, as in the log output).
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

2017-05-30 20:25:26,401 : INFO : using serial LSI version on this node
2017-05-30 20:25:26,404 : INFO : updating model with new documents
2017-05-30 20:25:26,407 : INFO : preparing a new chunk of documents
2017-05-30 20:25:26,410 : INFO : using 100 extra samples and 2 power iterations
2017-05-30 20:25:26,411 : INFO : 1st phase: constructing (13, 102) action matrix
2017-05-30 20:25:26,416 : INFO : orthonormalizing (13, 102) action matrix
2017-05-30 20:25:26,425 : INFO : 2nd phase: running dense svd on (13, 9) matrix
2017-05-30 20:25:26,429 : INFO : computing the final decomposition
2017-05-30 20:25:26,431 : INFO : keeping 2 factors (discarding 44.243% of energy spectrum)
2017-05-30 20:25:26,517 : INFO : processed documents up to #9
2017-05-30 20:25:26,523 : INFO : topic #0(3.402): 0.651*"system" + 0.369*"user" + 0.308*"eps" + 0.238*"response" + 0.238*"time" + 0.233*"human" + 0.220*"computer" + 0.212*"and" + 0.194*"survey" + 0.186*"interface"
2017-05-30 20:25:26,525 : INFO : topic #1(2.6

In [19]:
# Build the query: tokenize it, convert it to a bag-of-words vector with the
# dictionary, then project that vector into the LSI space of the trained model.
doc = "Human computer interaction"
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)
vec_lsi = lsi[vec_bow]
print(vec_lsi)

[(0, 0.45309173631434096), (1, -0.072100884097923493)]


Initializing query structures
To prepare for similarity queries, we need to enter all documents which we want to compare against subsequent queries. In our case, they are the same nine documents used for training LSI, converted to 2-D LSI space. But that’s only incidental; we might also be indexing a different corpus altogether.

In [20]:
# lsi[corpus] maps the corpus documents into the LSI space of the trained model;
# MatrixSimilarity builds the index that later similarity queries run against.
index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it

2017-05-30 20:29:53,311 : INFO : creating matrix with 9 documents and 2 features


In [21]:
# Persist the similarity index under the temporary folder.
index.save(os.path.join(TEMP_FOLDER, 'deerwester.index'))
# To reload it later, load from the same path that was saved above:
#index = similarities.MatrixSimilarity.load(os.path.join(TEMP_FOLDER, 'deerwester.index'))

2017-05-30 20:30:17,841 : INFO : saving MatrixSimilarity object under /var/folders/9j/wnpgv9rd1852fcj3qqttm7_40000gn/T/deerwester.index, separately None
2017-05-30 20:30:17,844 : INFO : saved /var/folders/9j/wnpgv9rd1852fcj3qqttm7_40000gn/T/deerwester.index


Saving and loading an index with save() and load(), as shown above, works the same way for all similarity indexing classes (similarities.Similarity, similarities.MatrixSimilarity and similarities.SparseMatrixSimilarity). Also in the following, index can be an object of any of these. When in doubt, use similarities.Similarity, as it is the most scalable version, and it also supports adding more documents to the index later.

In [22]:
# Query the index with the LSI-space query vector; the result holds one similarity
# score per indexed document, in document order.
sims = index[vec_lsi] # perform a similarity query against the corpus
print(list(enumerate(sims))) # print (document_number, document_similarity) 2-tuples

[(0, 0.99893081), (1, 0.99663335), (2, 0.99971139), (3, 0.96275038), (4, 0.94880307), (5, -0.061702289), (6, -0.0480875), (7, 0.052598059), (8, 0.12318273)]


In [23]:
# Rank the documents by similarity to the query, highest score first.
# The scores are queried from the index here instead of being read from `sims`, so the
# cell can be re-run safely: `sims` is rebound below to a list of
# (document_number, score) tuples, and enumerating that list a second time would hand
# tuples to the numeric sort key and fail.
sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
print(sims) # print sorted (document number, similarity score) 2-tuples

[(2, 0.99971139), (0, 0.99893081), (1, 0.99663335), (3, 0.96275038), (4, 0.94880307), (8, 0.12318273), (7, 0.052598059), (6, -0.0480875), (5, -0.061702289)]
