# Gensim tutorial: Corpora and Vector Spaces

## 1. Set up logging

In [1]:
import logging

# Route log records of level INFO and above to the root logger, each line
# formatted as "<timestamp> : <level name> : <message>".
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

## 2. From Strings to Vectors

**Fire up gensim**

In [2]:
from gensim import corpora, models, similarities

**Start from documents represented as strings**

In [3]:
# Toy corpus: nine short documents, each stored as one plain string.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

**Tokenize the documents, remove common words (using a toy stoplist), and remove words that appear only once in the corpus**

In [4]:
# Tokenize: lowercase each document, split it on whitespace, and keep only
# the words that are not in the stoplist.
# NOTE(review): 'in' is not part of this stoplist (it survives into the
# tokenized output shown below); 'it' may be a typo for 'in' -- confirm.
stoplist = set('for a of the and to it'.split())
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept_words)

In [5]:
# Inspect the tokenized documents after stoplist filtering.
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'in', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [6]:
# Count how often every token occurs across the whole corpus, then rebuild
# each document keeping only the tokens that occur more than once.
from collections import defaultdict

frequency = defaultdict(int)
for tokens in texts:
    for token in tokens:
        frequency[token] += 1

filtered_texts = []
for tokens in texts:
    filtered_texts.append([token for token in tokens if frequency[token] > 1])
texts = filtered_texts

In [7]:
# Pretty-print the filtered token lists.
from pprint import pprint
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [8]:
# Display the filtered token lists as the cell's last-expression output.
texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

**Convert the documents to vectors using a document representation called bag-of-words**

In [10]:
# Build the token -> integer id mapping from the filtered texts and save it
# to './deerwester.dict'.
dictionary = corpora.Dictionary(texts)
dictionary.save('./deerwester.dict')
# print is called as a function (as in the doc2bow cell of this notebook) so
# the cell runs on Python 3 as well as Python 2.
print(dictionary)

Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)


**To see the mapping between words and their ids:**

In [11]:
# Show the token -> integer id mapping held by the dictionary.
# print is called as a function so the cell runs on Python 2 and Python 3.
print(dictionary.token2id)

{u'minors': 11, u'graph': 10, u'system': 6, u'trees': 9, u'eps': 8, u'computer': 1, u'survey': 5, u'user': 7, u'human': 2, u'time': 4, u'interface': 0, u'response': 3}


**To actually convert tokenized documents to vectors**

In [12]:
# Vectorize a previously unseen document. Words that have no id in the
# dictionary (here 'interaction') do not appear in the resulting vector.
new_doc = 'Human computer interaction'
new_doc_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(new_doc_tokens)
print(new_vec)

[(1, 1), (2, 1)]


In [14]:
# Convert every tokenized document to its sparse bag-of-words vector and
# serialize the resulting corpus to './deerwester.mm' via corpora.MmCorpus.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('./deerwester.mm', corpus)
# print is called as a function so the cell runs on Python 2 and Python 3.
print(corpus)

[[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(0, 1), (6, 1), (7, 1), (8, 1)], [(2, 1), (6, 2), (8, 1)], [(3, 1), (4, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(5, 1), (10, 1), (11, 1)]]


## 3. Corpus Streaming - One Document at a Time

**Gensim requires that a corpus be able to return one document vector at a time**

In [15]:
class MyCorpus(object):
    """Corpus that streams bag-of-words vectors from 'mycorpus.txt'.

    The file is read one line per document each time the corpus is iterated,
    so the documents are never all held in memory together.
    """

    def __iter__(self):
        """Yield one bag-of-words vector per line of 'mycorpus.txt'.

        Each line is lowercased, split on whitespace and converted with the
        module-level ``dictionary`` (looked up when the line is processed).
        """
        # The context manager closes the file when iteration finishes (or the
        # generator is discarded) instead of leaking one handle per pass.
        with open('mycorpus.txt') as corpus_file:
            for line in corpus_file:
                yield dictionary.doc2bow(line.lower().split())

**Convert the tokens via a dictionary to their ids and yield the resulting sparse vector inside `__iter__`**

In [16]:
# Creating the corpus object does not read the file; printing it shows only
# the default object repr, not the documents.
corpus_memory_friendly = MyCorpus()
# print is called as a function so the cell runs on Python 2 and Python 3.
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x000000000677D5C0>


**Let's iterate over the corpus and print each document vector**<br>
**The corpus can now be as large as you want, because only one document is read at a time**

In [17]:
# Iterating the corpus yields one bag-of-words vector per document.
# print is called as a function so the cell runs on Python 2 and Python 3.
for vector in corpus_memory_friendly:
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(0, 1), (6, 1), (7, 1), (8, 1)]
[(2, 1), (6, 2), (8, 1)]
[(3, 1), (4, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(5, 1), (10, 1), (11, 1)]


**To construct the dictionary without loading all texts into memory**

In [18]:
# Collect statistics about all tokens, feeding the dictionary one tokenized
# line at a time. The context manager closes 'mycorpus.txt' once the
# dictionary has been built instead of leaving the handle open.
with open('mycorpus.txt') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)

In [19]:
# Collect the ids of the stoplist words that the dictionary actually contains
# (they are removed from the dictionary in a later cell).
stop_ids = []
for stopword in stoplist:
    if stopword in dictionary.token2id:
        stop_ids.append(dictionary.token2id[stopword])

In [20]:
# Collect the ids of tokens whose document frequency (dictionary.dfs) is 1.
# dict.items() replaces the Python 2-only dict.iteritems() so the cell runs
# on both Python 2 and Python 3.
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]

In [21]:
# Drop both groups of unwanted ids from the dictionary in a single call:
# the stop words and the words that appear only once.
unwanted_ids = stop_ids + once_ids
dictionary.filter_tokens(unwanted_ids)

In [22]:
# Close the gaps left in the id sequence by the words that were removed.
dictionary.compactify()

In [23]:
# Show a summary of the dictionary after filtering and compactifying.
# print is called as a function so the cell runs on Python 2 and Python 3.
print(dictionary)

Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)


## 4. Corpus Formats