# Similarity Queries

## 1. Set up logging

In [1]:
import logging
# Configure the root logger: INFO threshold, records formatted as
# "<time> : <level> : <message>".
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

## 2. Similarity interface

In [2]:
from gensim import corpora, models, similarities
# Load the dictionary and the corpus from files in the current directory.
dictionary = corpora.Dictionary.load('./deerwester.dict')
corpus = corpora.MmCorpus('./deerwester.mm')
# Function-call form of print: valid on Python 3 and, with a single
# argument, prints the same thing on Python 2.
print(corpus)

MmCorpus(9 documents, 12 features, 28 non-zero entries)


In [3]:
# Build a 2-topic LSI model over the corpus; `dictionary` is passed as id2word.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

In [4]:
# Turn the query text into an LSI vector:
# lowercase + whitespace split -> bag-of-words -> LSI model.
doc = 'Human computer interaction'
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
# Function-call form of print: valid on Python 3 and, with a single
# argument, prints the same thing on Python 2.
print(vec_lsi)

[(0, 0.46182100453271585), (1, -0.070027665279000617)]


## 3. Initializing query structures

In [5]:
# Apply the LSI model to the whole corpus and build a similarity index
# over the transformed documents.
index = similarities.MatrixSimilarity(lsi[corpus])

In [6]:
# Write the index to disk ...
index.save('./deerwester.index')
# ... and read it back from the same path, rebinding `index` to the loaded copy.
index = similarities.MatrixSimilarity.load('./deerwester.index')

## 4. Performing queries

In [7]:
# Query the index with the LSI vector of the query document; the result is
# later enumerated to pair each similarity score with a document position.
sims = index[vec_lsi]

In [8]:
# Show (document position, similarity) pairs in corpus order.
# Function-call form of print: valid on Python 3 and, with a single
# argument, prints the same thing on Python 2.
print(list(enumerate(sims)))

[(0, 0.99809301), (1, 0.93748635), (2, 0.99844527), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.098794639), (8, 0.050041765)]


In [9]:
# Rank (document position, similarity) pairs from most to least similar;
# negating the score makes the ascending sort yield descending order.
sims = sorted(enumerate(sims), key=lambda item: -item[1])
# Function-call form of print: valid on Python 3 and, with a single
# argument, prints the same thing on Python 2.
print(sims)

[(2, 0.99844527), (0, 0.99809301), (3, 0.9865886), (1, 0.93748635), (4, 0.90755945), (8, 0.050041765), (7, -0.098794639), (6, -0.10639259), (5, -0.12416792)]
