# 1 - Euclidean Similarity

In [153]:
# ~~~~~~~~~~~~~~~~~  DATASET  ~~~~~~~~~~~~~~~~~~~~~~
# __________________________________________________
# |RECORD|Measurement_1|Measurement_2|Measurement_3|
# __________________________________________________
# |  A  |       0     |      5      |      9       |
# __________________________________________________
# |  B  |       0     |      6      |      9       |
# __________________________________________________
# |  C  |       1     |      9      |      0       |
# __________________________________________________
# |  D  |       9     |      0      |      2       |
# __________________________________________________

#import functions
import numpy as np
from numpy.linalg import norm

#define vectors
A = np.array([0,5,9])
B = np.array([0,6,9])
C = np.array([1,9,0])
D = np.array([9,0,2])

#calculate Euclidean distance between the two vectors 
print('The Euclidean distance between A-B is ', round(norm(A-B),2))
print('The Euclidean distance between A-C is ', round(norm(A-C),2))
print('The Euclidean distance between A-D is ', round(norm(A-D),2))

The Euclidean distance between A-B is  1.0
The Euclidean distance between A-C is  9.9
The Euclidean distance between A-D is  12.45


# 2 - Manhattan distance similarity

In [150]:
# ~~~~~~~~~~~~~~~~~  DATASET  ~~~~~~~~~~~~~~~~~~~~~~
# __________________________________________________
# |RECORD|Measurement_1|Measurement_2|Measurement_3|
# __________________________________________________
# |  A  |       0     |      5      |      9       |
# __________________________________________________
# |  B  |       0     |      6      |      9       |
# __________________________________________________
# |  C  |       1     |      9      |      0       |
# __________________________________________________
# |  D  |       9     |      0      |      2       |
# __________________________________________________

# import libraries
from math import sqrt

#create function to calculate Manhattan distance 
def manhattan(a, b):
    return sum(abs(val1-val2) for val1, val2 in zip(a,b))
 
#define vectors
A = np.array([0,5,9])
B = np.array([0,6,9])
C = np.array([1,9,0])
D = np.array([9,0,2])

#calculate Manhattan distance between vectors
print('A-B is ', manhattan(A, B))
print('A-C is ', manhattan(A, C))
print('A-D is ', manhattan(A, D))

A-B is  1
A-C is  14
A-D is  21


# 3 - Euclidean Similarity

In [151]:
# ~~~~~~~~~~~~~~~~~  DATASET  ~~~~~~~~~~~~~~~~~~~~~~
# __________________________________________________
# |RECORD|Measurement_1|Measurement_2|Measurement_3|
# __________________________________________________
# |  A  |       0     |      5      |      9       |
# __________________________________________________
# |  B  |       0     |      6      |      9       |
# __________________________________________________
# |  C  |       1     |      9      |      0       |
# __________________________________________________
# |  D  |      100    |      0      |      2       |
# __________________________________________________

#import libraries
import numpy as np

#define vectors
a = np.array([0,5,9])
b = np.array([0,6,9])
c = np.array([1,0,9])
d = np.array([100,0,2])

#Normalise columns.  The normalised value for each xi of column x is (xi – min(x)) / (max(x) – min(x)).
col1 = np.array([0,0,1,100])

for x in col1:
    x = (x - min(col1)) / (max(col1) - min(col1))
    #print(x)
    
col2 = np.array([5,6,0,0])

for x in col2:
    x = (x - min(col2)) / (max(col2) - min(col2))
    #print(x)
    
col3 = np.array([9,9,9,2])

for x in col3:
    x = (x - min(col3)) / (max(col3) - min(col3))
    #print(x)

#normalised vectors
A = np.array([0,0.83,1])
B = np.array([0,1,1])
C = np.array([0.01,0,1])
D = np.array([1,0,0])

#calculate Euclidean distance between the two vectors 
print('The Euclidean distance between A-B is ', round(norm(A-B),2))
print('The Euclidean distance between A-C is ', round(norm(A-C),2))
print('The Euclidean distance between A-D is ', round(norm(A-D),2))

The Euclidean distance between A-B is  0.17
The Euclidean distance between A-C is  0.83
The Euclidean distance between A-D is  1.64


# 4 - Euclidean Similarity

In [152]:
# ~~~~~~~~~~~~~~~~~  DATASET  ~~~~~~~~~~~~~~~~~~~~~~
# __________________________________________________
# |RECORD|Measurement_1|Measurement_2|Measurement_3|
# __________________________________________________
# |  A  |       0     |      1      |      1       |
# __________________________________________________
# |  B  |       1     |      2      |      2       |
# __________________________________________________
# |  C  |       4     |      2      |      3       |
# __________________________________________________
# |  D  |       9     |      1      |      4       |
# __________________________________________________

#import functions
import numpy as np

#define vectors
a = np.array([0,1,1])
b = np.array([1,2,2])
c = np.array([4,2,3])
d = np.array([9,1,4])

#normalise 1st column
col1 = np.array([0,1,4,9])
for x in col1:
    x = (x - min(col1)) / (max(col1) - min(col1))
    #print(x)

#Categories are either equal or not equal to each other so the Euclidean distance is either 0 or 1 for categorical data.
#normalised vectors
A = np.array([0,1,0])
B = np.array([1,2,0])
C = np.array([0.4,2,1])
D = np.array([1,1,1])


#calculate Euclidean distance between the two vectors 
print('The Euclidean distance between A-B is ', round(norm(A-B),2))
print('The Euclidean distance between A-C is ', round(norm(A-C),2))
print('The Euclidean distance between A-D is ', round(norm(A-D),2))

The Euclidean distance between A-B is  1.41
The Euclidean distance between A-C is  1.47
The Euclidean distance between A-D is  1.41


# Extra stuff: Jaccard distance

In [146]:
# ~~~~~~~~~~~~~~~~~  DATASET  ~~~~~~~~~~~~~~~~~~~~~~
# __________________________________________________
# |RECORD|Measurement_1|Measurement_2|Measurement_3|
# __________________________________________________
# |  A  |       0     |      5      |      9       |
# __________________________________________________
# |  B  |       0     |      6      |      9       |
# __________________________________________________
# |  C  |       1     |      9      |      0       |
# __________________________________________________
# |  D  |       9     |      0      |      2       |
# __________________________________________________

#import functions
import numpy as np

#define vectors
A = np.array([0,5,9])
B = np.array([0,6,9])
C = np.array([1,9,0])
D = np.array([9,0,2])

#define Jaccard Similarity function
def jaccard(x, y):
    intersection = len(list(set(x).intersection(y)))
    union = (len(x) + len(y)) - intersection
    return float(intersection) / union

#find Jaccard similarity between the two sets
print('Jaccard similarity between A-B is ', round(jaccard(A,B), 2))
print('Jaccard similarity between A-C is ', round(jaccard(A,C), 2))
print('Jaccard similarity between A-D is ', round(jaccard(A,D), 2))

Jaccard similarity between A-B is  0.5
Jaccard similarity between A-C is  0.5
Jaccard similarity between A-D is  0.5


# Extra stuff: Cosine similarity

In [148]:
# ~~~~~~~~~~~~~~~~~  DATASET  ~~~~~~~~~~~~~~~~~~~~~~
# __________________________________________________
# |RECORD|Measurement_1|Measurement_2|Measurement_3|
# __________________________________________________
# |  A  |       0     |      5      |      9       |
# __________________________________________________
# |  B  |       0     |      6      |      9       |
# __________________________________________________
# |  C  |       1     |      9      |      0       |
# __________________________________________________
# |  D  |       9     |      0      |      2       |
# __________________________________________________

# import libraries
import numpy as np
from numpy.linalg import norm
 
#define vectors
A = np.array([0,5,9])
B = np.array([0,6,9])
C = np.array([1,9,0])
D = np.array([9,0,2])
 
# compute cosine similarity
def cosine(x,y):
    cosine = np.dot(x,y)/(norm(x)*norm(y))
    return cosine

#find cosine similarity between the two sets
print('Cosine similarity between A-B is ', round(cosine(A,B), 2))
print('Cosine similarity between A-C is ', round(cosine(A,C), 2))
print('Cosine similarity between A-D is ', round(cosine(A,D), 2))

Cosine similarity between A-B is  1.0
Cosine similarity between A-C is  0.48
Cosine similarity between A-D is  0.19
