In [1]:
'''
Ad hoc Information Retrieval System: practicing the vector space model

Imagine we have a collection of documents, and we would like to query the
software to retrieve the document most relevant to the query. What technique
should we use? One simple model that can be used is called the vector space
model. The idea is to create a hyperspace where each unique word (term) in the
collection represents a separate dimension, and each document is represented
by a vector composed of the weights (usually correlated with the number of
appearances) of each term (dimension). For example, if we have 2 recipes in a
collection, the fried chicken recipe fc = ['chicken', 'fried', 'oil',
'pepper'] and the poached chicken recipe pc = ['chicken', 'water'], we would
have a collection (hyperspace) of 5 dimensions: ['chicken', 'fried', 'oil',
'pepper', 'water']. Further assume that in fc the weights (word frequencies)
of the terms are [8, 2, 7, 4], and in pc the weights are [6, 5]; then the
vectors in our hyperspace are fc = [8, 2, 7, 4, 0] and pc = [6, 0, 0, 0, 5].
Suppose we have a query q = ['fried', 'chicken'] with each term weighted 1,
i.e. q = [1, 1, 0, 0, 0]. Then in the vector space model, we only need to
calculate the cosine similarity between (q, fc) and (q, pc) and compare the
results. The more similar the topic, the larger the cosine similarity. This
notebook is a simple implementation of this idea.

Footnote: a collection is usually represented by a so-called term-by-document
sparse matrix, where the rows represent the weights of each term (feature)
and the columns represent the documents. Note that the matrices printed in
this notebook are laid out the other way around: one row per document and one
column per term.
'''
__author__ = 'Xia Wang'

In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Build a toy collection of two documents. Each document is a plain string of
# space-separated terms; repeating a term controls its frequency.
a = 'chicken ' * 8 + 'fried ' * 2
b = 'chicken ' * 6
# Drop the trailing space left over from the string repetition.
documents = tuple(text.strip() for text in (a, b))

## There are at least two ways of vectorizing the collection: one is a simple term count, the other is term frequency * inverse document frequency (TF-IDF), which reduces the weight of very common but uninformative words such as "a", "the", "and", etc. Let's start with the first one.

In [4]:
# Create the collection matrix using raw term counts. Fitting learns the
# vocabulary from `documents`; the result is a sparse matrix with one row per
# document and one column per vocabulary term.
countVectorizer = CountVectorizer(min_df=1)
count_matrix = countVectorizer.fit_transform(documents)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(count_matrix)

  (0, 0)	8
  (0, 1)	2
  (1, 0)	6


In [5]:
# Create the query matrix. Each string in `query` is vectorized as its own
# one-term query, so q_matrix has one row per query string.
query = ('chicken', 'fried')
# Use transform(), not fit_transform(): the query must be projected onto the
# vocabulary already learned from the collection. Refitting on the query would
# rebuild the vocabulary from the query alone, so the columns of q_matrix
# would no longer be guaranteed to line up with those of count_matrix.
q_matrix = countVectorizer.transform(query)

In [6]:
# Compare the cosine similarity. results[i][j] is the similarity between the
# i-th query row and the j-th document of the collection.
results = cosine_similarity(q_matrix, count_matrix)
# Print one line per query row; print() with a single argument behaves the
# same on Python 2 and Python 3.
for row in results:
    print(row)

[ 0.9701425  1.       ]
[ 0.24253563  0.        ]


## Now let's try the second vectorization method

In [8]:
# Create the collection matrix using TF-IDF weights instead of raw counts.
# Fitting learns both the vocabulary and the IDF weights from `documents`.
tfidfVectorizer = TfidfVectorizer()
tfidf_matrix = tfidfVectorizer.fit_transform(documents)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(tfidf_matrix)

  (0, 1)	0.331498529357
  (0, 0)	0.943455735599
  (1, 0)	1.0


In [9]:
# Use transform(), not fit_transform(): the query must be weighted with the
# vocabulary and IDF values learned from the collection. Refitting on the
# query would recompute both from the query strings alone, so q_matrix_1 would
# no longer be guaranteed to be comparable with tfidf_matrix.
q_matrix_1 = tfidfVectorizer.transform(query)

In [11]:
# Cosine similarity under TF-IDF weighting. results1[i][j] is the similarity
# between the i-th query row and the j-th document of the collection.
results1 = cosine_similarity(q_matrix_1, tfidf_matrix)
# Print one line per query row; print() with a single argument behaves the
# same on Python 2 and Python 3.
for row in results1:
    print(row)

[ 0.94345574  1.        ]
[ 0.33149853  0.        ]
