# This Example iPython Notebook Provides Sample Code for Measuring Similarity Between Documents (Cosine Similarity)

## Example 1: Compare 3 short documents that describe political events

In [1]:
# Three short texts about political events; they form the first comparison corpus.
doc_jones = (
    "Mr. Jones became president after winning the political election. "
    "Though he lost the support of some of his allies, "
    "Jones is friends with President Smith"
)

doc_election = (
    "President Jones says Smith did not help with his election campaign. "
    "He says the stories are false and spread by other political parties. "
    "He claimed President Smith just is a friend who had nothing to do with the election"
)

doc_smith = (
    "Post elections, Robert Smith became President of New Zealand. "
    "President Smith had served as the Prime Minister earlier in his political career"
)

# Keep the order stable: later cells label rows by position.
documents = [
    doc_jones,
    doc_election,
    doc_smith,
]

## Import Count Vectorizer for counting words in documents (from SKLearn)

In [2]:
# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create the Document Term Matrix.
# Stop words are kept on purpose (the word-frequency table shown below includes
# words such as "the" and "and"). To drop common English words instead, build
# the vectorizer with CountVectorizer(stop_words='english').
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(documents)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
# toarray() yields a plain ndarray rather than the legacy np.matrix from todense().
doc_term_matrix = sparse_matrix.toarray()

# Newer scikit-learn releases expose the vocabulary via get_feature_names_out();
# fall back to the older get_feature_names() so the cell also runs on old installs.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# One row per document (same order as `documents`), one column per vocabulary term.
df = pd.DataFrame(doc_term_matrix,
                  columns=feature_names,
                  index=['doc_jones', 'doc_election', 'doc_smith'])
df



Unnamed: 0,after,allies,and,are,as,became,by,campaign,career,claimed,...,spread,stories,support,the,though,to,who,winning,with,zealand
doc_jones,1,1,0,0,0,1,0,0,0,0,...,0,0,1,2,1,0,0,1,1,0
doc_election,0,0,1,1,0,0,1,1,0,1,...,1,1,0,2,0,1,1,0,2,0
doc_smith,0,0,0,0,1,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1


## Calculate Cosine Similarity of Document Vectors

In [3]:
# Pairwise cosine similarity between the rows of the word-count table `df`
from sklearn.metrics.pairwise import cosine_similarity

pairwise_scores = cosine_similarity(df, df)
print(pairwise_scores)

[[1.         0.50694478 0.44381268]
 [0.50694478 1.         0.35355339]
 [0.44381268 0.35355339 1.        ]]


## Create 3 more documents in a different topic space (food)

In [4]:
# Three short texts about food; they sit in a different topic space than the political ones.
# Note: doc_soup intentionally keeps its trailing space.
doc_soup = (
    "Soup is a primarily liquid food, generally served warm or hot "
    "(but may be cool or cold), that is made by combining ingredients "
    "of meat or vegetables with stock, juice, water, or another liquid. "
)

doc_noodles = (
    "Noodles are a staple food in many cultures. "
    "They are made from unleavened dough which is stretched, extruded, "
    "or rolled flat and cut into one of a variety of shapes."
)

doc_dosa = (
    "Dosa is a type of pancake from the Indian subcontinent, made from a fermented batter. "
    "It is somewhat similar to a crepe in appearance. "
    "Its main ingredients are rice and black gram."
)

# Combined corpus: the three political documents followed by the three food documents.
documents = [doc_jones, doc_election, doc_smith, doc_soup, doc_noodles, doc_dosa]

## Import GenSim library to evaluate "Soft" Cosine Similarity

In [None]:
import gensim
# upgrade gensim if you can't import softcossim
# NOTE(review): the version recorded below is 3.6.0; other gensim versions may not
# expose `softcossim` at this path — confirm against the gensim release/migration notes.
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess
# Print the installed gensim version so the run can be reproduced.
print(gensim.__version__)
#> '3.6.0'

# Download the FastText model
# The pretrained 'fasttext-wiki-news-subwords-300' vectors are fetched through
# gensim's downloader API and kept in `fasttext_model300` for the later cells.
# NOTE(review): presumably a large download on first use — confirm before running.
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

3.6.0


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Build the token dictionary from the tokenized form of every document in the corpus.
tokenized_corpus = [simple_preprocess(text) for text in documents]
dictionary = corpora.Dictionary(tokenized_corpus)

# Term-to-term similarity matrix derived from the FastText embeddings.
similarity_matrix = fasttext_model300.similarity_matrix(
    dictionary,
    tfidf=None,
    threshold=0.0,
    exponent=2.0,
    nonzero_limit=100,
)

# Bag-of-words vector for each document, in a fixed order:
# political documents first (jones, election, smith), then food (soup, noodles, dosa).
sentences = [
    dictionary.doc2bow(simple_preprocess(text))
    for text in (doc_jones, doc_election, doc_smith, doc_soup, doc_noodles, doc_dosa)
]

# Individual names are kept because later cells refer to them directly.
sent_1, sent_2, sent_3, sent_4, sent_5, sent_6 = sentences

NameError: ignored

## Example: Soft Cosine Similarity for Documents 1 and 2

In [None]:
# Soft cosine similarity between the first two documents (doc_jones vs. doc_election)
jones_vs_election = softcossim(sent_1, sent_2, similarity_matrix)
print(jones_vs_election)


0.5842470477718544


## Soft Cosine Similarity across all 6 documents (matrix)

In [None]:
import numpy as np
import pandas as pd

def create_soft_cossim_matrix(sentences, term_similarity=None, similarity_fn=None):
    """Return a table of pairwise soft cosine similarities, rounded to 2 decimals.

    Cell ``[r, c]`` of the result holds
    ``round(similarity_fn(sentences[r], sentences[c], term_similarity), 2)``,
    so the table has one row and one column per entry of ``sentences`` and is
    labelled with default integer positions.

    Args:
        sentences: Sequence of documents in the form accepted by
            ``similarity_fn`` (in this notebook: bag-of-words vectors produced
            by ``dictionary.doc2bow``).
        term_similarity: Term similarity matrix passed as the third argument to
            ``similarity_fn``. Defaults to the module-level ``similarity_matrix``.
        similarity_fn: Callable ``(doc_a, doc_b, term_similarity) -> number``.
            Defaults to the module-level ``softcossim``.

    Returns:
        A ``pandas.DataFrame`` of shape ``(len(sentences), len(sentences))``;
        an empty DataFrame when ``sentences`` is empty.
    """
    # Resolve the defaults at call time so the function keeps working exactly
    # as before when called with a single argument.
    if term_similarity is None:
        term_similarity = similarity_matrix
    if similarity_fn is None:
        similarity_fn = softcossim

    # Materialize once so any iterable (not only lists) can be scanned twice.
    docs = list(sentences)

    # Rows follow the first argument of similarity_fn, columns the second.
    scores = [
        [round(similarity_fn(row_doc, col_doc, term_similarity), 2) for col_doc in docs]
        for row_doc in docs
    ]
    return pd.DataFrame(scores)

# Build the pairwise soft cosine similarity table for every document in `sentences`
# (the last expression in the cell, so the notebook displays the resulting table).
create_soft_cossim_matrix(sentences)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.58,0.56,0.28,0.34,0.4
1,0.58,1.0,0.54,0.25,0.31,0.43
2,0.56,0.54,1.0,0.19,0.25,0.36
3,0.28,0.25,0.19,1.0,0.5,0.38
4,0.34,0.31,0.25,0.5,1.0,0.56
5,0.4,0.43,0.36,0.38,0.56,1.0
