# Tutorial: calculating the cosine similarity between two strings
Author: Abhishek Singh

In [12]:
#Importing the needed Libraries
import re, math, numpy as np
from collections import Counter

In [13]:
# Matches each run of word characters, so findall() on mixed text yields its word tokens
WORD = re.compile(r'\w+')

In [14]:
#Function takes in 2 vectors & computes their cosine similarity
def get_cosine(vec1, vec2):
    """
    Compute the cosine similarity of two sparse vectors.

    param1: vec1 (First Vector, a mapping of key -> numeric weight)
    param2: vec2 (Second Vector, a mapping of key -> numeric weight)
    return: The Cosine similarity score as a float; 0.0 when either
            vector has zero magnitude (including an empty vector)
    """
    # Only keys present in both vectors contribute to the dot product.
    shared_keys = set(vec1) & set(vec2)
    numerator = sum(vec1[key] * vec2[key] for key in shared_keys)

    sum1 = sum(weight ** 2 for weight in vec1.values())
    sum2 = sum(weight ** 2 for weight in vec2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    # A zero-magnitude vector has no direction; report no similarity
    # instead of dividing by zero.
    if not denominator:
        return 0.0

    score = float(numerator) / denominator

    # Rounding in the two square roots can push the ratio a hair outside
    # [-1, 1] (e.g. sqrt(3) * sqrt(3) < 3), so clamp it back. Explicit
    # comparisons are used so that a NaN score is passed through untouched.
    if score > 1.0:
        return 1.0
    if score < -1.0:
        return -1.0
    return score

In [15]:
#Function to generate a vector dictionary from a string
def text_to_vector(text):
    """
    Build a bag-of-words vector from a string.

    param: text (string to tokenize with the module-level WORD pattern)
    return: Counter mapping each matched token to its occurrence count
    """
    token_counts = Counter()
    # Tally every token in the order the pattern finds it in the text.
    for match in WORD.finditer(text):
        token_counts[match.group()] += 1
    return token_counts



In [16]:
# Three sample sentences to compare against one another
s1, s2, s3 = (
    "This is a great food",
    "I have only got horrible food in winters",
    "Last night I had some great food at a party",
)

In [17]:
# Vectorize the first sample string and show its word counts
vector1 = text_to_vector(s1)
# A parenthesized single-argument print runs on both Python 2 and Python 3
print(vector1)

Counter({'This': 1, 'a': 1, 'is': 1, 'food': 1, 'great': 1})


In [18]:
# Vectorize the remaining two sample strings
vector2, vector3 = text_to_vector(s2), text_to_vector(s3)

In [19]:
# Print the cosine similarity of s1 and s2, rounded to 4 decimal places
# (parenthesized print runs on both Python 2 and Python 3)
print(np.round(get_cosine(vector1, vector2), 4))

0.1581


In [20]:
# Print the remaining pairwise similarities, rounded to 4 decimal places
# (parenthesized print runs on both Python 2 and Python 3)
print(np.round(get_cosine(vector2, vector3), 4))
print(np.round(get_cosine(vector1, vector3), 4))

0.2236
0.4243
