# Objective: compute the cosine similarity between documents represented in two different ways: 1) bag of words (BoW) and 2) TF-IDF

In [1]:
import pandas as pd

In [2]:
# Toy corpus: two short documents that share the terms "deep" and "learning"
Doc_1, Doc_2 = (
    "the best deep learning course",
    "deep learning is easy",
)

# BoW

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings; it is fitted on the two-document corpus in the next cell
count_vectorizer = CountVectorizer()

In [5]:
# Learn the vocabulary from both documents and encode each one as a row of term counts
count_matrix = count_vectorizer.fit_transform([Doc_1, Doc_2])

# Mapping from each term to its column index in the count matrix
vocabulary = count_vectorizer.vocabulary_

# Dense array with one row per document and one column per vocabulary term
bag = count_matrix.toarray()

# Show the vocabulary, the sparse matrix and the dense array, two blank lines apart
print(vocabulary, count_matrix, bag, sep="\n\n\n")

# Remark: only term counts are kept; the order of the words in each sentence is lost

{'the': 6, 'best': 0, 'deep': 2, 'learning': 5, 'course': 1, 'is': 4, 'easy': 3}


  (0, 6)	1
  (0, 0)	1
  (0, 2)	1
  (0, 5)	1
  (0, 1)	1
  (1, 2)	1
  (1, 5)	1
  (1, 4)	1
  (1, 3)	1


[[1 1 1 0 0 1 1]
 [0 0 1 1 1 1 0]]


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [7]:
# A single row of `bag` is 1-D; reshape(1, -1) turns it into a one-row 2-D array
bag[0].reshape(1, -1).shape

(1, 7)

In [8]:
# Cosine similarity between doc 1 and doc 2 on the BoW counts.
# cosine_similarity expects 2-D inputs (one row per sample); indexing with a
# one-element list keeps each document as a 1-row matrix, so no reshape is needed.
cosine_similarity(bag[[0]], bag[[1]])

array([[0.4472136]])

In [9]:
# Check against the hand calculation from the slides (1/sqrt(5)):
# the documents share 2 terms (dot product = 2), doc 1 has 5 distinct terms
# (norm sqrt(5)) and doc 2 has 4 distinct terms (norm sqrt(4) = 2)
2 / (np.sqrt(5) * np.sqrt(4))

0.4472135954999579

# TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# norm=None switches off row normalisation so the raw (unscaled) tf-idf weights are shown
tfidf_vectorizer = TfidfVectorizer(norm=None)

In [11]:
# Learn the vocabulary from both documents and encode each one as a row of tf-idf weights
tfidf_matrix = tfidf_vectorizer.fit_transform([Doc_1, Doc_2])

# Mapping from each term to its column index in the tf-idf matrix
vocabulary = tfidf_vectorizer.vocabulary_

# Dense array with one row per document and one column per vocabulary term
matrix = tfidf_matrix.toarray()

# Show the vocabulary, the sparse matrix and the dense array, two blank lines apart
print(vocabulary, tfidf_matrix, matrix, sep="\n\n\n")

# Remark: as with BoW, the order of the words in each sentence is lost

{'the': 6, 'best': 0, 'deep': 2, 'learning': 5, 'course': 1, 'is': 4, 'easy': 3}


  (0, 1)	1.4054651081081644
  (0, 5)	1.0
  (0, 2)	1.0
  (0, 0)	1.4054651081081644
  (0, 6)	1.4054651081081644
  (1, 3)	1.4054651081081644
  (1, 4)	1.4054651081081644
  (1, 5)	1.0
  (1, 2)	1.0


[[1.40546511 1.40546511 1.         0.         0.         1.
  1.40546511]
 [0.         0.         1.         1.40546511 1.40546511 1.
  0.        ]]


In [12]:
# Cosine similarity between doc 1 and doc 2 on the un-normalised TF-IDF vectors.
# Row slices of the sparse tfidf_matrix are already 2-D (1 x n_terms), which is the
# shape cosine_similarity expects, so no reshape is needed.
cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

array([[0.29121942]])

In [13]:
# Remark: by default TfidfVectorizer uses norm='l2', which scales every document vector
# to unit length in order to account for the length of a document.
# Does this change the cosine similarity between the 2 documents? It should not:
# cosine similarity already divides by the vector norms, so rescaling a row leaves it unchanged.

tfidf_vectorizer = TfidfVectorizer(norm='l2')

# Learn the vocabulary from both documents and encode each one as a row of tf-idf weights
tfidf_matrix = tfidf_vectorizer.fit_transform([Doc_1, Doc_2])

# Mapping from each term to its column index in the tf-idf matrix
vocabulary = tfidf_vectorizer.vocabulary_

# Dense array with one row per document and one column per vocabulary term
matrix = tfidf_matrix.toarray()

# Show the vocabulary, the sparse matrix and the dense array, two blank lines apart
print(vocabulary, tfidf_matrix, matrix, sep="\n\n\n")

{'the': 6, 'best': 0, 'deep': 2, 'learning': 5, 'course': 1, 'is': 4, 'easy': 3}


  (0, 1)	0.4992213265230509
  (0, 5)	0.35520008546852583
  (0, 2)	0.35520008546852583
  (0, 0)	0.4992213265230509
  (0, 6)	0.4992213265230509
  (1, 3)	0.5761523551647353
  (1, 4)	0.5761523551647353
  (1, 5)	0.40993714596036396
  (1, 2)	0.40993714596036396


[[0.49922133 0.49922133 0.35520009 0.         0.         0.35520009
  0.49922133]
 [0.         0.         0.40993715 0.57615236 0.57615236 0.40993715
  0.        ]]


In [14]:
# Cosine similarity between doc 1 and doc 2 on the L2-normalised TF-IDF vectors.
# Row slices of the sparse tfidf_matrix are already 2-D (1 x n_terms), which is the
# shape cosine_similarity expects, so no reshape is needed.
cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

array([[0.29121942]])