In [54]:
# importing regex

import re
from collections import Counter
import math

In [2]:
# Text samples

sample1 = "The easiest way to earn points with Fetch Rewards is to just shop for the products you already love. If you have any participating brands on your receipt, you'll get points based on the cost of the products. You don't need to clip any coupons or scan individual barcodes. Just scan each grocery receipt after you shop and we'll find the savings for you."

sample2 = "The easiest way to earn points with Fetch Rewards is to just shop for the items you already buy. If you have any eligible brands on your receipt, you will get points based on the total cost of the products. You do not need to cut out any coupons or scan individual UPCs. Just scan your receipt after you check out and we will find the savings for you."

sample3 = "We are always looking for opportunities for you to earn more points, which is why we also give you a selection of Special Offers. These Special Offers are opportunities to earn bonus points on top of the regular points you earn every time you purchase a participating brand. No need to pre-select these offers, we'll give you the points whether or not you knew about the offer. We just think it is easier that way."

In [3]:
# loading and establishing stop words

with open("stopwords.txt", 'r') as s:
    stop_words = s.read()

In [19]:
# function to pre-process the data

def text_processing(sample):
    """
    A function to pre-process the text

    - Lower the text
    - Remove punctuation from the text
    - remove stop words
    - tokenize the text
    """
    # Can't do accurate lemmetization or stemming without a library or spending months accounting for every vocabulary, So no lemmetization or stemming technique used. 
    
    # lower the text
    sample = sample.lower()

    # use regex to remove anything that is not words and numbers
    sample = re.sub(r"[^a-zA-Z0-9]", " ", sample) 

    # tokenize text
    sample = sample.split()

    # remove stop words
    sample = [word for word in sample if word not in stop_words]

    return sample


In [24]:
# Pre-processing the text

sample1_p = text_processing(sample1)
sample2_p = text_processing(sample2)
sample3_p = text_processing(sample3)

Do TF-IDF calculation for each document sample, Then use this to calculate the cosine similarity number. Which will give us a value of how similar the 2 text documents are. 


In [38]:
#  A function for calculating term frequency

def term_frequency(processed_text):
    """
    This function will calculate the term frequency of every word in the document

    The input is the pre-process text that is already tokenized

    The output is the frequency of each word divided by total number of words 
    """

    total_word_count = len(processed_text)

    # individual word count
    word_count = dict(Counter(processed_text))

    # calculating term frequency (tf)
    tf = {}

    for word, count in word_count.items():
        tf[word] = round(count/total_word_count, 4)

    return tf

In [41]:
# calculating term frequency for each word

tf_s1 = term_frequency(sample1_p)
tf_s2 = term_frequency(sample2_p)
tf_s3 = term_frequency(sample3_p)

In [None]:
# Since this is comparing only 2 texts, the Inverse Document Frequency (IDF) will be for only 2 documents, not all 3 of them. 

In [57]:
# Function to calculate Inverse Document Frequency for 2 documents. 
# doing IDF for all 3 samples doesn't make sense because the final 
# product will calculate IDF for only 2 samples inputed as the "POST Method"

def idf(doc1, doc2):
    """
    Calculate the inverse document frequency assuming only 2 documents
    
    Input is the 2 documents in questions

    Output will be the IDF of the words in both documents
    """

    # convert each document into a set. 
    set_1 = set(doc1)
    set_2 = set(doc2)

    # find word that occurs in both documents
    words_in_both_docs = set_1 & set_2

    # find word that occurs only once in either document
    words_in_only_one_doc = set_1 ^ set_2

        # number of documents. In this case, we know only 2 documents will be used for comparison. No need to account for comparing more than 2 documents. 

    number_of_doc = 2

    # creating a dictionary of the document frequency of the term
    # document frequency is number_of_doc/term_frequency_in_document
    # inverse document frequency is log(document frequency)

    doc_freq = {}

    for word in words_in_both_docs:
        doc_freq[word] = round(math.log(number_of_doc/2), 2)

    for word in words_in_only_one_doc:
        doc_freq[word] = round(math.log(number_of_doc/1), 4)

    return doc_freq




In [59]:
# inverse term frequeny of sample 1 and sample 2
s1_s2_idf = idf(sample1_p, sample2_p)

In [61]:
# inverse term frequeny of sample 1 and sample 3
s1_s3_idf = idf(sample1_p, sample3_p)

In [72]:
# inverse term frequeny of sample 3 and sample 3
s2_s3_idf = idf(sample3_p, sample3_p)

In [73]:
# assuming calculating the TF-IDF for only 2 documents of interest and not the entire sample set. 

def tf_idf(tf, idf):
    
    tf_idf = {}
    
    for word, value in tf.items():
        tf_idf[word] = round(value * idf[word], 4)

    return tf_idf


In [74]:
# for sample 1 & sample 2- tf-idf score 
# Assuming only sample 1 and sample 2 make up the entire corpus

tf_idf1 = tf_idf(tf_s1, s1_s2_idf)

tf_idf2 = tf_idf(tf_s2, s1_s2_idf)


In [77]:
# for sample 1 & sample 3 tf-idf score 
# Assuming only sample 1 and sample 3 make up the entire corpus

tf_idf1_3 = tf_idf(tf_s1, s1_s3_idf)

tf_idf3 = tf_idf(tf_s3, s1_s3_idf)


Use Cosine Similarity to find the similarity between 2 pieces of text. 

Formula: cosθ = v1.v2 / ( |v1| * |v2| )

v1 = tf-idf of sample 1

v2 = tf-idf of sample 2

In [None]:
# find cosine similarity between sample 1 and sample 2

# sample 1 tf-idf assuming only sample 1 and sample 2 in corpus
tf_idf1

# sample 2 tf-idf assuming only sample 1 and sample 2 in corpus
tf_idf2

In [None]:
# find cosine similarity between sample 1 and sample 3

# Assuming only sample 1 and sample 3 make up the entire corpus

# sample 1 tf-idf assuming only sample 1 and sample 3 in corpus
tf_idf1_3 

# sample 3 tf-idf assuming only sample 1 and sample 3 in corpus
tf_idf3 


In [83]:
len(list(tf_idf1.values()))

20

In [84]:
len(list(tf_idf2.values()))

22

In [119]:
# function for calculating cosine similarity

def cosine(tf_idf1, tf_idf2):
    """
    The input value are the 2 tf-idf dictionary for each sample. 

    the values from dictionary values which are the vectors will be used
    to calculate the dot product of both vectors and the magnitude of each 
    vector in order to get the cosine similarity value. 

    The output will be the cosine similarities between the 2 samples
    """
    # get the values from the tf_idf dictionary
    v1 = list(tf_idf1.values())
    v2 = list(tf_idf2.values())

    # calculate dot product from both vectors
    dotproduct=0

    for a,b in zip(v1,v2):
        dotproduct = dotproduct+a*b

    # calculate magnitude of each vector

    # magnitude of v1
    mag_v1 = math.sqrt(sum(pow(element, 2) for element in v1))
    
    # magnitude of v2
    mag_v2 = math.sqrt(sum(pow(element, 2) for element in v2))

    # now combine dot product and magnitude to get cosine similarity value

    cosine_similarity = round(dotproduct/(mag_v1 * mag_v2), 2)

    return cosine_similarity


In [120]:
# similarity score between sample 1 and sample 2
cosine(tf_idf1, tf_idf2)

0.34

In [121]:
# similarity score between sample 1 and sample 3
cosine(tf_idf1, tf_idf3)

0.19