In [1]:
import re
import nltk
import math
import pandas as pd
from nltk.tokenize import word_tokenize

In [2]:
def calculate_jaccard_score(words1, words2):
    set1 = set(words1)
    set2 = set(words2)
    return float(len(set1.intersection(set2)) / float(len(set1.union(set2))))

In [3]:
def calculate_euclidean_distance(v1, v2):
    all_keys = set(v1.keys()).union(set(v2.keys()))
    return math.sqrt(sum([(v1[key] - v2[key]) ** 2 for key in all_keys]))

In [4]:
def calculate_cosine_distance(v1, v2):
    common_keys = set(v1.keys()) & set(v2.keys())
    scalar_product = sum([v1[key] * v2[key] for key in common_keys])

    v1_length = math.sqrt(sum([v1[key] ** 2 for key in v1.keys()]))
    v2_length = math.sqrt(sum([v2[key] ** 2 for key in v2.keys()]))
    vectors_length_product = v1_length * v2_length

    return float(scalar_product) / vectors_length_product

In [5]:
def get_words(text):
    clean_text = re.sub('[^a-z \n]+', '', text)
    words = word_tokenize(clean_text)
    return words

In [6]:
file = open("Text_0.txt")
text0 = file.read()
file.close()

file = open("Text_1.txt")
text1 = file.read()
file.close()

file = open("Text_2.txt")
text2 = file.read()
file.close()

In [7]:
# text0 = "a a a b b a a b a b"
# text1 = "a b b"
# text2 = "a a b"

words0 = get_words(text0)
words1 = get_words(text1)
words2 = get_words(text2)

vector0 = nltk.FreqDist(words0)
vector1 = nltk.FreqDist(words1)
vector2 = nltk.FreqDist(words2)

## Pairwise score for: Text_0 & Text_1

In [8]:
jaccard_score = calculate_jaccard_score(words0, words1)

euclidean_distance = calculate_euclidean_distance(vector0, vector1)

cosine = calculate_cosine_distance(vector0, vector1)

print("Jaccard Score:", jaccard_score)
print("")
print("Euclidean Distance:", euclidean_distance)
print("")
print("Cosine Similarity:", cosine)
print("Inv Cos:", math.acos(cosine))

Jaccard Score: 0.11731843575418995

Euclidean Distance: 28.35489375751565

Cosine Similarity: 0.46550057101165204
Inv Cos: 1.0865962130909104


## Pairwise score for: Text_0 & Text_2

In [9]:
jaccard_score = calculate_jaccard_score(words0, words2)

euclidean_distance = calculate_euclidean_distance(vector0, vector2)

cosine = calculate_cosine_distance(vector0, vector2)

print("Jaccard Score:", jaccard_score)
print("")
print("Euclidean Distance:", euclidean_distance)
print("")
print("Cosine Similarity:", cosine)
print("Inv Cos:", math.acos(cosine))

Jaccard Score: 0.12686567164179105

Euclidean Distance: 44.12482294582042

Cosine Similarity: 0.3779751907941922
Inv Cos: 1.1831880635814898


## Pairwise score for: Text_1 & Text_2

In [10]:
jaccard_score = calculate_jaccard_score(words1, words2)

euclidean_distance = calculate_euclidean_distance(vector1, vector2)

cosine = calculate_cosine_distance(vector1, vector2)

print("Jaccard Score:", jaccard_score)
print("")
print("Euclidean Distance:", euclidean_distance)
print("")
print("Cosine Similarity:", cosine)
print("Inv Cos:", math.acos(cosine))

Jaccard Score: 0.13108614232209737

Euclidean Distance: 37.61648574760805

Cosine Similarity: 0.5953737487026086
Inv Cos: 0.9330655777860906
