In [13]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [34]:
def calculate_cosine_similarity(text1:str, text2:str)->float:
    """
    Calculates cosine similarity between two text strings.

    Both strings are converted into token-count vectors over one shared
    vocabulary (scikit-learn's ``CountVectorizer`` with its default
    settings), and the cosine similarity of those two vectors is returned.

    Params
    ------
    text1 (str): First string to compare
    text2 (str): Second string to compare

    Returns
    -------
    Returns the cosine similarity score between the two text strings as a
    plain Python float.

    Notes
    -----
    NOTE(review): CountVectorizer is expected to raise ValueError when
    neither string yields any token (e.g. both are empty) -- confirm against
    the installed scikit-learn version before relying on it.
    """

    # Fit on both texts at once so they share a single vocabulary and
    # therefore the same column order in the count matrix.
    count_vectorizer = CountVectorizer()
    vector_matrix = count_vectorizer.fit_transform([text1, text2])

    # The feature names are never needed for the score, so the former
    # get_feature_names() calls (removed in scikit-learn 1.2) and the pandas
    # DataFrame wrapper are dropped; the dense count array is passed directly.
    similarity_matrix = cosine_similarity(vector_matrix.toarray())

    # Row 0 is text1 and row 1 is text2, so [0][1] is their pairwise score.
    return float(similarity_matrix[0][1])


In [50]:
# Test 1: two short sentences that have several words in common
text1, text2 = (
    "Data is the oil of the digital economy",
    "Data is the new oil",
)

calculate_cosine_similarity(text1=text1, text2=text2)

0.7071067811865476

In [51]:
# Test 2: two long, differently worded versions of a similar sentence
text1, text2 = (
    "Come up to me on the mountain and stay here, and I will give you the tablets of stone with the law and commandments I have written for their instruction.",
    "Come up to me on the mountain. Stay there, and I will give you the tablets of stone on which I have inscribed the instructions and commands so you can teach the people.",
)

calculate_cosine_similarity(text1=text1, text2=text2)

0.7679817174694635

In [53]:
# Test 3: two sentences on a related theme that are worded quite differently
text1, text2 = (
    "What is hateful to you, do not do to your fellow: this is the whole Torah; the rest is the explanation; go and learn.",
    "You shall love your neighbor as yourself, and whatever you would not want to happen to you, do not do to another.",
)

calculate_cosine_similarity(text1=text1, text2=text2)

0.4360407996361905

In [54]:
# Test 4: two sentences that have no words in common
text1, text2 = (
    "See Pug run!",
    "The cow jumped over the moon.",
)

calculate_cosine_similarity(text1=text1, text2=text2)

0.0