Reloading the `logging` module lets you change the logging level without restarting the kernel.

In [5]:
import importlib
import logging

# Re-import the logging module so its level can be reconfigured in this
# session without restarting the kernel. Assigning the result also keeps the
# cell from echoing the module repr.
logging = importlib.reload(logging)

### Setting logging level

In [7]:
# Layout of every log line: timestamp, severity, then the message itself.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)

### Basic imports and setting tmp location

In [10]:
import os
import tempfile
from pprint import pprint  # pretty-printer
from collections import defaultdict
# The dictionary and corpus written by this notebook go to the system temp directory.
TEMP_FOLDER = tempfile.gettempdir()
folder_message = 'Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER)
print(folder_message)

Folder "/var/folders/9x/cd3tjw3s5yvghvt6z_ls6fn00000gn/T" will be used to save temporary dictionary and corpus.


### GenSim import 

In [16]:
from gensim import corpora, models

#### Corpus and dictionary creation in memory 

In [8]:
# Nine short document titles used as the toy corpus throughout this notebook.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

**Step 1:** convert documents to tokens (remove stop words and words with frequency = 1)

In [15]:
# Words that carry no topical information and are dropped during tokenization.
stoplist = set('for a of the and to in'.split())


def _tokenize(document):
    """Lowercase a document, split it on whitespace and drop stop words."""
    return [word for word in document.lower().split() if word not in stoplist]


texts = [_tokenize(document) for document in documents]

# Count how often every token occurs across the whole collection.
frequency = defaultdict(int)
for tok in (tok for doc_tokens in texts for tok in doc_tokens):
    frequency[tok] += 1

# Keep only tokens seen more than once in the collection.
texts = [[tok for tok in doc_tokens if frequency[tok] > 1] for doc_tokens in texts]

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


**Step 2:** Create and save a dictionary using the tokens generated in Step 1

In [18]:
# Build the token -> integer id mapping from the tokenized texts.
dictionary = corpora.Dictionary(texts)

# Store the dictionary in the temp folder, for future reference.
dict_path = os.path.join(TEMP_FOLDER, 'deerwester.dict')
dictionary.save(dict_path)

print(dictionary)

Dictionary(12 unique tokens: ['trees', 'time', 'interface', 'user', 'eps']...)


In [19]:
# Show the token -> integer id mapping held by the dictionary.
print(dictionary.token2id)

{'trees': 9, 'time': 6, 'interface': 2, 'user': 7, 'eps': 8, 'system': 5, 'computer': 0, 'minors': 11, 'survey': 4, 'response': 3, 'graph': 10, 'human': 1}


Converting a new document to a bag-of-words vector, using the dictionary created above

In [10]:
new_doc = "Human computer interaction"
new_tokens = new_doc.lower().split()
# The word "interaction" does not appear in the dictionary and is ignored.
new_vec = dictionary.doc2bow(new_tokens)
print(new_vec)

[(0, 1), (1, 1)]


**Step 3:** Convert all tokenized texts to a bag-of-words corpus and serialize it to disk

In [21]:
# One bag-of-words vector, i.e. a list of (token id, count) pairs, per text.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in texts]
for bow_vector in corpus:
    print(bow_vector)

# Store the corpus to disk in Matrix Market format, for later use.
mm_path = os.path.join(TEMP_FOLDER, 'deerwester.mm')
corpora.MmCorpus.serialize(mm_path, corpus)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


#### Corpus and dictionary creation from disk (streaming)

Create a Python iterable that reads the file lazily and yields one bag-of-words vector per line

In [23]:
class MyCorpus(object):
    """Memory-friendly corpus that streams bag-of-words vectors from a text file.

    Every line of the file is treated as one document: it is lowercased,
    split on whitespace and converted with ``doc2bow``. Only one line is held
    in memory at a time, and the object can be iterated repeatedly because the
    file is reopened on every pass.

    Parameters
    ----------
    path : str, optional
        Text file with one document per line. Defaults to ``'deerwester.txt'``.
    dictionary : object with a ``doc2bow(tokens)`` method, optional
        Dictionary used for the conversion. When ``None`` (the default) the
        notebook-level ``dictionary`` variable is looked up at iteration time,
        so ``MyCorpus()`` behaves as it did before these parameters existed.
    """

    def __init__(self, path='deerwester.txt', dictionary=None):
        self.path = path
        self.dictionary = dictionary

    def __iter__(self):
        # Resolve the dictionary lazily so that a later re-assignment of the
        # global `dictionary` is picked up.
        bow_dictionary = self.dictionary if self.dictionary is not None else dictionary
        # `with` guarantees the file handle is closed once iteration finishes;
        # a bare open() in the loop header left it to the garbage collector.
        with open(self.path) as corpus_file:
            for line in corpus_file:
                yield bow_dictionary.doc2bow(line.lower().split())

In [24]:
# Creating the corpus object reads nothing; the file is only opened when the object is iterated.
mem_friendly_corpus = MyCorpus()

In [25]:
# Each iteration step yields the bag-of-words vector for one line of the file.
for vector in mem_friendly_corpus:
    print(vector)

[(0, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (7, 1)]
[(2, 1), (7, 1), (8, 1)]
[(1, 1), (5, 1)]
[(3, 1), (6, 1), (7, 1)]
[]
[(10, 1)]
[(9, 1), (11, 1)]
[(11, 1)]


**Step 1:** Creating a dictionary from the raw text documents on disk (Streaming corpus)

In [28]:
from six import iteritems
# Build the dictionary by streaming the file one line at a time. The `with`
# block closes the file handle as soon as the dictionary has consumed it;
# opening the file inside the generator expression left it unclosed.
with open('deerwester.txt') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)

# Ids of the stop words that actually occur in the dictionary.
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]
# Ids of tokens whose document frequency is exactly 1.
# Plain dict.items() is used; no compatibility helper is needed on Python 3.
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]

# Drop both groups, then compact the ids to remove the gaps left by the removal.
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()

In [29]:
# Save the dictionary. The relative path places the file in the current working
# directory, not in TEMP_FOLDER where the earlier dictionary was stored.
dictionary.save("deerwester.dict")

**Step 2:** Create the corpus 

In [32]:
# Reload the dictionary that was saved to the current working directory.
dictionary = corpora.Dictionary.load('deerwester.dict')

# NOTE(review): this .mm file was serialized earlier from the in-memory corpus,
# which was built with the first dictionary. Confirm its token ids agree with
# the dictionary loaded above.
corpus_path = os.path.join(TEMP_FOLDER, 'deerwester.mm')
corpus = corpora.MmCorpus(corpus_path)

**Step 3:** Fit a tf-idf model on the corpus.

In [33]:
# Fit the tf-idf model on the bag-of-words corpus; the fitted model exposes
# the `dfs` and `idfs` statistics inspected in the following cells.
tfidf = models.TfidfModel(corpus)

In [38]:
# Document frequency of each token id (number of documents containing it).
tfidf.dfs

{0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 3, 6: 2, 7: 3, 8: 2, 9: 3, 10: 3, 11: 2}

In [40]:
# Inverse document frequency (idf) weight of each token id.
tfidf.idfs

{0: 2.1699250014423126,
 1: 2.1699250014423126,
 2: 2.1699250014423126,
 3: 2.1699250014423126,
 4: 2.1699250014423126,
 5: 1.5849625007211563,
 6: 2.1699250014423126,
 7: 1.5849625007211563,
 8: 2.1699250014423126,
 9: 1.5849625007211563,
 10: 1.5849625007211563,
 11: 2.1699250014423126}

In [36]:
# Apply the model to a single bag-of-words document:
# (token id, count) pairs in, (token id, tf-idf weight) pairs out.
doc = [(9, 1), (11, 1)]
tfidf[doc]

[(9, 0.5898341626740045), (11, 0.8075244024440723)]

In [37]:
# Apply the model to every document of the corpus and print each weighted vector.
for doc in tfidf[corpus]:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]
