In [3]:
import pprint

In [4]:
# A "document" is a single piece of text, represented here as one Python string.
document = "Gensim is an open source library in python"

In [5]:
# A "corpus" is a collection of documents; here it is a plain list of strings,
# one string per document.
corpus = [
    "A survey of user opinion of computer system response time",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
]

In [6]:
# Very common words that should be excluded from every document
stoplist = set('for a of the and to in'.split(' '))

# Normalise each document in the corpus: lowercase it, split it on
# whitespace, and keep only the tokens that are not in the stoplist.
texts = [
    [token for token in raw_text.lower().split() if token not in stoplist]
    for raw_text in corpus
]

pprint.pprint(texts)

[['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering']]


In [7]:
# Tally how often each token occurs across the whole tokenised corpus
from collections import defaultdict
frequency = defaultdict(int)
for token in (word for words in texts for word in words):
    frequency[token] += 1

# Rebuild each document keeping only the tokens that occur more than once
# corpus-wide; documents keep their original order.
processed_corpus = []
for text in texts:
    processed_corpus.append([word for word in text if frequency[word] > 1])
pprint.pprint(processed_corpus)

[['user', 'response', 'time'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'trees']]


In [8]:
# Assign each distinct token in the processed corpus a unique integer ID.
from gensim import corpora
dictionary = corpora.Dictionary(processed_corpus)
# Printing the dictionary shows a summary: the number of unique tokens and the tokens themselves.
print(dictionary)

Dictionary(5 unique tokens: ['response', 'time', 'user', 'trees', 'graph'])


In [9]:
# Show the token -> integer ID mapping held by the dictionary.
pprint.pprint(dictionary.token2id)

{'graph': 4, 'response': 0, 'time': 1, 'trees': 3, 'user': 2}


In [10]:
# Convert every processed document into a bag-of-words vector: a list of
# (token_id, count) pairs, using the IDs assigned by the dictionary.
BoW_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(BoW_corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (1, 1), (2, 1)],
 [(3, 1)],
 [(3, 1), (4, 1)],
 [(3, 1), (4, 1)]]


In [11]:
from gensim import models
# Train a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(BoW_corpus)
# Tokenise a new piece of text the same way as the corpus (lowercase, whitespace split),
# convert it to bag-of-words, and print its TF-IDF weights as (token_id, weight) pairs.
words = "trees graph".lower().split()
print(tfidf[dictionary.doc2bow(words)])

[(3, 0.4869354917707381), (4, 0.8734379353188121)]


In [12]:
from gensim import similarities

# Index the TF-IDF-weighted corpus for similarity queries. The number of
# features is the vocabulary size, so it is read from the dictionary instead
# of being hard-coded; the index then stays valid if the corpus or the
# frequency filter changes the number of unique tokens.
index = similarities.SparseMatrixSimilarity(tfidf[BoW_corpus], num_features=len(dictionary))

# Tokenise the query the same way as the corpus (lowercase, whitespace split).
# 'system' was filtered out of the processed corpus, so it is not in the
# dictionary and does not contribute to the query vector.
query_document = 'trees system'.lower().split()
query_bow = dictionary.doc2bow(query_document)

# Similarity of the query to every document in the corpus, in corpus order,
# printed as (document_index, score) pairs.
simils = index[tfidf[query_bow]]
print(list(enumerate(simils)))

[(0, 0.0), (1, 0.0), (2, 1.0), (3, 0.4869355), (4, 0.4869355)]


NameError: name 'sims' is not defined