## Cosine Similarity & Euclidean Distance

Consider the following four sentences:

- It was the best of times.
- it was the worst of Times.
- it is the time of stupidity.
- it is the age of foolishness.

In [2]:
# The example corpus: four short sentences, kept in a fixed order
documents = [
    "It was the best of times.",
    "it was the worst of Times.",
    "it is the time of stupidity.",
    "it is the age of foolishness.",
]

# Individual handles for each sentence of the corpus
doc1, doc2, doc3, doc4 = documents

In [3]:
# Scikit Learn using Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create the Document Term Matrix as a dense array
# (one row per document, one column per vocabulary term).
bag_of_words_vectorizer = CountVectorizer()
bag_of_words = bag_of_words_vectorizer.fit_transform(documents).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(bag_of_words_vectorizer, "get_feature_names_out"):
    feature_names = bag_of_words_vectorizer.get_feature_names_out()
else:
    feature_names = bag_of_words_vectorizer.get_feature_names()

# Wrap the dense count matrix in a DataFrame to see the word frequencies.
# Row labels are derived from the number of rows so they always match the corpus size.
df = pd.DataFrame(bag_of_words,
                  columns=feature_names,
                  index=[f"doc{i}" for i in range(1, len(bag_of_words) + 1)])

print(bag_of_words)
display(df)

[[0 1 0 0 1 1 0 1 0 1 1 0]
 [0 0 0 0 1 1 0 1 0 1 1 1]
 [0 0 0 1 1 1 1 1 1 0 0 0]
 [1 0 1 1 1 1 0 1 0 0 0 0]]


Unnamed: 0,age,best,foolishness,is,it,of,stupidity,the,time,times,was,worst
doc1,0,1,0,0,1,1,0,1,0,1,1,0
doc2,0,0,0,0,1,1,0,1,0,1,1,1
doc3,0,0,0,1,1,1,1,1,1,0,0,0
doc4,1,0,1,1,1,1,0,1,0,0,0,0


In [5]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Report each pairwise metric between all document vectors:
# first the cosine similarity, then the Euclidean distance.
for title, pairwise_metric in (
    ("Cosine Similarity", cosine_similarity),
    ("Euclidean Distance", euclidean_distances),
):
    print(title)
    print(pairwise_metric(df, df))

Cosine Similarity
[[1.         0.83333333 0.5        0.5       ]
 [0.83333333 1.         0.5        0.5       ]
 [0.5        0.5        1.         0.66666667]
 [0.5        0.5        0.66666667 1.        ]]
Euclidean Distance
[[0.         1.41421356 2.44948974 2.44948974]
 [1.41421356 0.         2.44948974 2.44948974]
 [2.44948974 2.44948974 0.         2.        ]
 [2.44948974 2.44948974 2.         0.        ]]


## Improvement using Stopword Filtering and Lemmatization

In [7]:
import spacy

# Accumulator for the lemmatized form of each document
lemma_documents = list()

# Load the small spaCy model (en_core_web_sm)
nlp = spacy.load("en_core_web_sm")

# Lemmatize a single document
def lemmatize_docs(x):
    """Return ``x`` lowercased with every token replaced by its lemma.

    The lowercased text is run through the module-level ``nlp`` pipeline.
    A token whose lemma is the ``'-PRON-'`` placeholder keeps its own text
    instead. The resulting pieces are joined with single spaces.
    """
    pieces = []
    for token in nlp(x.lower()):
        if token.lemma_ == '-PRON-':
            # Placeholder lemma: keep the surface form of the token.
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return " ".join(pieces)

# Lemmatize every document, preserving corpus order
lemma_documents.extend(map(lemmatize_docs, documents))

print(lemma_documents)

['it be the good of time .', 'it be the bad of time .', 'it be the time of stupidity .', 'it be the age of foolishness .']


In [9]:
# Scikit Learn using Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create the Document Term Matrix from the lemmatized documents, dropping
# scikit-learn's built-in English stop words. The result is a dense array
# (one row per document, one column per remaining vocabulary term).
bag_of_words_vectorizer = CountVectorizer(stop_words='english')
bag_of_words = bag_of_words_vectorizer.fit_transform(lemma_documents).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(bag_of_words_vectorizer, "get_feature_names_out"):
    feature_names = bag_of_words_vectorizer.get_feature_names_out()
else:
    feature_names = bag_of_words_vectorizer.get_feature_names()

# Wrap the dense count matrix in a DataFrame to see the word frequencies.
# Row labels are derived from the number of rows so they always match the corpus size.
df = pd.DataFrame(bag_of_words,
                  columns=feature_names,
                  index=[f"doc{i}" for i in range(1, len(bag_of_words) + 1)])

print(bag_of_words)
display(df)

[[0 0 0 1 0 1]
 [0 1 0 0 0 1]
 [0 0 0 0 1 1]
 [1 0 1 0 0 0]]


Unnamed: 0,age,bad,foolishness,good,stupidity,time
doc1,0,0,0,1,0,1
doc2,0,1,0,0,0,1
doc3,0,0,0,0,1,1
doc4,1,0,1,0,0,0


In [10]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Report each pairwise metric between all lemmatized, stop-word-filtered
# document vectors: first the cosine similarity, then the Euclidean distance.
for title, pairwise_metric in (
    ("Cosine Similarity", cosine_similarity),
    ("Euclidean Distance", euclidean_distances),
):
    print(title)
    print(pairwise_metric(df, df))

Cosine Similarity
[[1.  0.5 0.5 0. ]
 [0.5 1.  0.5 0. ]
 [0.5 0.5 1.  0. ]
 [0.  0.  0.  1. ]]
Euclidean Distance
[[0.         1.41421356 1.41421356 2.        ]
 [1.41421356 0.         1.41421356 2.        ]
 [1.41421356 1.41421356 0.         2.        ]
 [2.         2.         2.         0.        ]]
