In [None]:
import numpy as np
import pandas as pd
import operator
from IPython.display import display

# Proximity measure for binary attributes
* Binary attributes have only two states - 0 and 1
* _How to compute the dissimilarity between two binary attributes?_ <br>
Using dissimilarity matrix from given binary data. 
* Assumption - All binary attributes are assumed to have same weight and we will be having a 2x2 contingency matrix. 
### Notations used - 
* q := number of attributes that equal 1 for both objects i and j (vector1 and vector2 in our example)
* t := number of attributes that equal 0 for both objects i and j 
* r := number of attributes that equal 1 for object i (vector1) and 0 for object j (vector2)
* s := number of attributes that equal 0 for object i (vector1) and 1 for object j (vector2) 

* Symmetric binary similarity := Each state is considered equally valuable 
* Asymmetric binary similarity := similarity based on asymmetric binary attributes (two states not considered equally important) 


In [None]:
# Two example objects, each described by seven binary attributes.
vector1 = np.array([1, 1, 0, 1, 0, 0, 1])
vector2 = np.array([1, 0, 1, 1, 0, 0, 1])

# Put the two objects side by side (one column per object) for display.
df = pd.DataFrame({'attribute1': vector1, 'attribute2': vector2})
df

Unnamed: 0,attribute1,attribute2
0,1,1
1,1,0
2,0,1
3,1,1
4,0,0
5,0,0
6,1,1


In [None]:
# Element-wise masks describing how the two binary vectors relate to each other.
same_state      = vector1 == vector2
different_state = vector1 != vector2
first_is_one    = vector1 == 1
first_is_zero   = vector1 == 0

# The four cells of the 2x2 contingency table.
q = np.count_nonzero(same_state & first_is_one)        # both vectors hold 1
t = np.count_nonzero(same_state & first_is_zero)       # both vectors hold 0
r = np.count_nonzero(different_state & first_is_one)   # vector1 holds 1, vector2 differs
s = np.count_nonzero(different_state & first_is_zero)  # vector1 holds 0, vector2 differs

# Rows follow vector1 and columns follow vector2, so the cells read q, r / s, t.
row_labels    = ['vector1_1','vector1_0']
column_labels = ['vector2_1','vector2_0']
contingency_table_layout = pd.DataFrame([['q','r'],['s','t']],columns=column_labels,index=row_labels)
contingency_table        = pd.DataFrame([[q,r],[s,t]],columns=column_labels,index=row_labels)

separator = '-----------------------------------------------------------------------------------------------------------------------------------------------------'

print('The contingency table attained is given as \n')
display(contingency_table)
print('\n')

print('The contingency table layout attained is denoted as \n')
display(contingency_table_layout)
print('\n')

# Symmetric case: matches on 0 and on 1 both count, so t stays in the denominator.
print(separator)
mismatches       = r+s
symmetric_dissim = mismatches/(q+t+r+s)
print(f'The symmetric binary dissimilarity between vector1 and vector2  is given as {symmetric_dissim} ')
print('The formula for symmetric binary dissimilarity between vector1 and vector2  is given as (r+s)/(q+t+r+s) \n')
print(separator)

# Asymmetric case: 0-0 matches (t) are dropped from the denominator.
asymmetric_dissim = mismatches/(q+r+s)
print('The formula for asymmetric binary dissimilarity between vector1 and vector2 considering match of 1 to be more significant is given as (r+s)/(q+r+s)')
print(f'The asymmetric binary dissimilarity between the vector1 and vector2 considering match of 1 to be more significant is {asymmetric_dissim}')
print('\n')
print(separator)

# Jaccard coefficient: the similarity counterpart of the asymmetric dissimilarity.
asym_sim = q/(q+r+s)
print(f'The asymmetric similarity between vector 1 and 2 is {asym_sim} (note that it is actually 1-asymmetric dissimilarity)')
print('This asymmetric similarity between vector 1 and 2 is also called Jaccard coefficient')
print('The formula for Jaccard coefficient hence becomes -  q/(q+r+s)')
print(separator)

The contingency table attained is given as 



Unnamed: 0,vector2_1,vector2_0
vector1_1,3,1
vector1_0,1,2




The contingency table layout attained is denoted as 



Unnamed: 0,vector2_1,vector2_0
vector1_1,q,r
vector1_0,s,t




-----------------------------------------------------------------------------------------------------------------------------------------------------
The symmetric binary dissimilarity between vector1 and vector2  is given as 0.2857142857142857 
The formula for symmetric binary dissimilarity between vector1 and vector2  is given as (r+s)/(q+t+r+s) 

-----------------------------------------------------------------------------------------------------------------------------------------------------
The formula for asymmetric binary dissimilarity between vector1 and vector2 considering match of 1 to be more significant is given as (r+s)/(q+r+s)
The asymmetric binary dissimilarity between the vector1 and vector2 considering match of 1 to be more significant is 0.4


-----------------------------------------------------------------------------------------------------------------------------------------------------
The asymmetric similarity between vector 1 and 2 is 0.6 (note that it is ac

## Book example for solving Jaccard similarity (Example 2.18)
Suppose that a patient record table contains attributes gender, name, fever, cough, test1, test2, test3 and test4; where name is object identifier, gender is a symmetric attribute, and remaining attributes are asymmetric binary. 

We assume Yes (Y) and Positive (P) to be 1; the values No (N) and Negative (N) can be set to 0.



In [None]:
# Patient records from the textbook example: identifier, gender, then six asymmetric binary attributes.
jack = np.array(['jack','M','Y','N','P','N','N','N'])
jim  = np.array(['jim','M','Y','Y','N','N','N','N'])
array = np.vstack((jack,jim))
patient_columns = ['name','gender','fever','cough','test1','test2','test3','test4']
df = pd.DataFrame(array,columns = patient_columns)

wide_separator = '-----------------------------------------------------------------------------------------------------------------------------------------------------'

print('-------------------------------------------------------------------------------------------')
print('STEP1')
print('The dataframe having the binary attributes/features given to us is in form ')
display(df)
print('\n')
print('We will first Encode the variables to binary form of 0 and 1')
# Y (yes) and P (positive) become 1, N (no / negative) becomes 0; name and gender are left as they are.
binary_codes = {'Y':1,'N':0,'P':1}
for column in df.columns:
  if column not in ['name','gender']:
    df[column] = df[column].map(binary_codes)
print('The encoded dataframe looks like this')
display(df)
print('-------------------------------------------------------------------------------------------------------------')

print('We will need to find the similarity between the asymmetric attributes of both Jack and Jim')
print('This means we need to exclude both name and gender from the feature set and find the similarity \n')


# Skip the first two positions (name, gender) of each row to keep only the asymmetric attributes.
vector1 = df.loc[0].iloc[2:].values
print(f'The representative vector for Jack is {vector1}')
vector2 = df.loc[1].iloc[2:].values
print(f'The representative vector for Jim is {vector2}')

# Element-wise masks used to fill the 2x2 contingency table.
same_state      = vector1 == vector2
different_state = vector1 != vector2
first_is_one    = vector1 == 1
first_is_zero   = vector1 == 0

q = np.count_nonzero(same_state & first_is_one)        # 1 for both Jack and Jim
t = np.count_nonzero(same_state & first_is_zero)       # 0 for both Jack and Jim
r = np.count_nonzero(different_state & first_is_one)   # 1 for Jack, different for Jim
s = np.count_nonzero(different_state & first_is_zero)  # 0 for Jack, different for Jim

# Rows follow Jack (vector1) and columns follow Jim (vector2).
row_labels    = ['Jack_1','Jack_0']
column_labels = ['Jim_1','Jim_0']
contingency_table_layout = pd.DataFrame([['q','r'],['s','t']],columns=column_labels,index=row_labels)
contingency_table        = pd.DataFrame([[q,r],[s,t]],columns=column_labels,index=row_labels)

print('The contingency table attained is given as \n')
display(contingency_table)
print('\n')

print('The contingency table layout attained is denoted as \n')
display(contingency_table_layout)
print('\n')


print(wide_separator)
mismatches       = r+s
symmetric_dissim = mismatches/(q+t+r+s)
print(f'The symmetric binary dissimilarity between vector1 and vector2  is given as {round(symmetric_dissim,3)} ')
print('The formula for symmetric binary dissimilarity between vector1 and vector2  is given as (r+s)/(q+t+r+s) \n')
print(wide_separator)

# Asymmetric variant: 0-0 matches (t) are left out of the denominator.
asymmetric_dissim = mismatches/(q+r+s)
print('The formula for asymmetric binary dissimilarity between vector1 and vector2 considering match of 1 to be more significant is given as (r+s)/(q+r+s)')
print(f'The asymmetric binary dissimilarity between the vector1 and vector2 considering match of 1 to be more significant is {round(asymmetric_dissim,3)}')
print('\n')
print(wide_separator)

asym_sim = q/(q+r+s)
print(f'The asymmetric similarity between vector 1 and 2 is {round(asym_sim,3)} (note that it is actually 1-asymmetric dissimilarity)')
print('This asymmetric similarity between vector 1 and 2 is also called Jaccard coefficient')
print('The formula for Jaccard coefficient hence becomes -  q/(q+r+s)')
print(wide_separator)
print('\n')
print('Jack and Jim are likely to suffer from different disease because the Jaccard similarity is very low between them')

-------------------------------------------------------------------------------------------
STEP1
The dataframe having the binary attributes/features given to us is in form 


Unnamed: 0,name,gender,fever,cough,test1,test2,test3,test4
0,jack,M,Y,N,P,N,N,N
1,jim,M,Y,Y,N,N,N,N




We will first Encode the variables to binary form of 0 and 1
The encoded dataframe looks like this


Unnamed: 0,name,gender,fever,cough,test1,test2,test3,test4
0,jack,M,1,0,1,0,0,0
1,jim,M,1,1,0,0,0,0


-------------------------------------------------------------------------------------------------------------
We will need to find the similarity between the asymmetric attributes of both Jack and Jim
This means we need to exclude both name and gender from the feature set and find the similarity 

The representative vector for Jack is [1 0 1 0 0 0]
The representative vector for Jim is [1 1 0 0 0 0]
The contingency table attained is given as 



Unnamed: 0,Jim_1,Jim_0
Jack_1,1,1
Jack_0,1,3




The contingency table layout attained is denoted as 



Unnamed: 0,Jim_1,Jim_0
Jack_1,q,r
Jack_0,s,t




-----------------------------------------------------------------------------------------------------------------------------------------------------
The symmetric binary dissimilarity between vector1 and vector2  is given as 0.333 
The formula for symmetric binary dissimilarity between vector1 and vector2  is given as (r+s)/(q+t+r+s) 

-----------------------------------------------------------------------------------------------------------------------------------------------------
The formula for asymmetric binary dissimilarity between vector1 and vector2 considering match of 1 to be more significant is given as (r+s)/(q+r+s)
The asymmetric binary dissimilarity between the vector1 and vector2 considering match of 1 to be more significant is 0.667


-----------------------------------------------------------------------------------------------------------------------------------------------------
The asymmetric similarity between vector 1 and 2 is 0.333 (note that it is actually 1-

In [None]:
def jaccard(vector1,vector2):
  '''
      Compute the Jaccard similarity coefficient q/(q+r+s) between two binary vectors.

      q counts positions where both vectors hold 1, r counts positions where
      vector1 holds 1 and vector2 differs, and s counts positions where vector1
      holds 0 and vector2 differs. Positions where both vectors hold 0 are ignored.

      Parameters
      ----------
      vector1, vector2 : array-like
          Sequences of 0/1 values with identical shape. Plain Python lists are
          accepted; they are converted with np.asarray so the comparisons stay
          element-wise.

      Returns
      -------
      float
          The Jaccard coefficient.

      Raises
      ------
      ValueError
          If the two inputs do not have the same shape.
      ZeroDivisionError
          If q+r+s is 0 (for example when both vectors contain only zeros),
          because the coefficient is undefined in that case.
  '''
  # Lists compare as whole objects with ==, which would silently yield zero counts.
  vector1 = np.asarray(vector1)
  vector2 = np.asarray(vector2)
  if vector1.shape != vector2.shape:
    raise ValueError(f'vector1 and vector2 must have the same shape, got {vector1.shape} and {vector2.shape}')

  same_state      = vector1 == vector2
  different_state = vector1 != vector2

  q = np.count_nonzero(same_state & (vector1==1))
  r = np.count_nonzero(different_state & (vector1==1))
  s = np.count_nonzero(different_state & (vector1==0))

  return q/(q+r+s)

#### Solving a Quiz Question
Assuming all attributes are binary asymmetric; what is the Jaccard
Coefficient for IT and FIN departments from the given table?

In [None]:
# Department records from the quiz: identifier followed by four asymmetric binary attributes.
it    = np.array(['IT','Y','Y','N','N'])
fin   = np.array(['FIN','N','Y','Y','N'])
array = np.vstack((it,fin))
df = pd.DataFrame(array,columns = ['Department','Attribute1','attribute2','attribute3','attribute4'])

print('-------------------------------------------------------------------------------------------')
print('STEP1')
print('The dataframe having the binary attributes/features given to us is in form ')
display(df)
print('\n')
print('We will first Encode the variables to binary form of 0 and 1')
# Y / P become 1 and N becomes 0; the Department identifier is left untouched.
for cols in df.columns:
  if cols in ['Department']:
    continue
  else:
    df[cols] = df[cols].map({'Y':1,'N':0,'P':1})
print('The encoded dataframe looks like this')
display(df)
print('-------------------------------------------------------------------------------------------------------------')

# These two messages were copy-pasted from the Jack/Jim example; they now describe this table.
print('We will need to find the similarity between the asymmetric attributes of both IT and FIN')
print('This means we need to exclude the Department identifier from the feature set and find the similarity \n')


# Drop the first position (Department) of each row to keep only the binary attributes.
vector1 = df.loc[0][1:].values
print(f'The representative vector for IT is {vector1}')
vector2 = df.loc[1][1:].values
print(f'The representative vector for Fin is {vector2}')

q = len(np.where((vector1==vector2) & (vector1==1))[0])
t = len(np.where((vector1==vector2) & (vector1==0))[0])
r = len(np.where((vector1!=vector2) & (vector1==1))[0])
s = len(np.where((vector1!=vector2) & (vector1==0))[0])

# r counts "1 in vector1 (IT), different in vector2 (FIN)", so rows must be labelled
# with IT and columns with FIN (the labels were previously swapped).
contingency_table_layout = pd.DataFrame(np.array(['q','r','s','t']).reshape(2,2),columns=['FIN_1','FIN_0'],index=['IT_1','IT_0'])
contingency_table        = pd.DataFrame(np.array([q,r,s,t]).reshape(2,2),columns=['FIN_1','FIN_0'],index=['IT_1','IT_0'])

print('The contingency table attained is given as \n')
display(contingency_table)
print('\n')

print('The contingency table layout attained is denoted as \n')
display(contingency_table_layout)
print('\n')

print('-------------------------------------------------------------------------------------------------------------')
print(f'The jaccard similarity from the function defined above will be {jaccard(vector1,vector2)}')

-------------------------------------------------------------------------------------------
STEP1
The dataframe having the binary attributes/features given to us is in form 


Unnamed: 0,Department,Attribute1,attribute2,attribute3,attribute4
0,IT,Y,Y,N,N
1,FIN,N,Y,Y,N




We will first Encode the variables to binary form of 0 and 1
The encoded dataframe looks like this


Unnamed: 0,Department,Attribute1,attribute2,attribute3,attribute4
0,IT,1,1,0,0
1,FIN,0,1,1,0


-------------------------------------------------------------------------------------------------------------
We will need to find the similarity between the asymmetric attributes of both Jack and Jim
This means we need to exclude both name and gender from the feature set and find the similarity 

The representative vector for IT is [1 1 0 0]
The representative vector for Fin is [0 1 1 0]
The contingency table attained is given as 



Unnamed: 0,IT_1,IT_0
FIN_1,1,1
FIN_0,1,1




The contingency table layout attained is denoted as 



Unnamed: 0,IT_1,IT_0
FIN_1,q,r
FIN_0,s,t




-------------------------------------------------------------------------------------------------------------
The jaccard similarity from the function defined above will be 0.3333333333333333


# Proximity measures for Nominal attributes
* A nominal attribute can take two or more states (like color) 
* The dissimilarity between two objects i and j can be computed based on ratio of mismatches.
<br>
If m is the number of matches (the number of attributes for which i and j are in same state), p is total number of attributes; the dissimilarity between objects i and j is given by (p-m)/p
* In the example below, we consider an attribute list where each object is described by just one attribute (the list where it is present) so we will set p = 1
* Now d(i,j) := distance between i and j will evaluate to 0 if the match happens and 1 otherwise

### Steps to calculate dissimilarity matrix for nominal attribute
* Step1: Initialize a matrix of ones of size nxn (n is the size of attribute list given)
* Step2: Loop over the upper triangle (i < j) of the matrix of ones initialized above and set the entry (and its mirror entry) to 0 if objects i and j are similar; dissimilar pairs keep the value 1. 
* Step3: Output the similar objects and dissimilarity matrix

In [None]:
def similar_objects(row,test1):
  '''
      List the objects that one row of a dissimilarity table marks as similar.

      Labels 'object_1', 'object_2', ... (one per element of test1) are paired
      position by position with the values of row; a label is kept when its
      paired value equals 0.

      row   : pandas Series (only row.values is read)
      test1 : sized collection; only its length is used, to generate the labels

      Returns the kept labels as a list, in positional order.
  '''
  labels = (f'object_{position+1}' for position in range(len(test1)))
  return [label for label,value in zip(labels,row.values) if value==0]



In [None]:
test1       = ['code A','code B','code C','code A']
n_objects   = len(test1)
separator   = '----------------------------------------------------------------------------------------------------------'

print(separator)
print('We have the following list of nominal attributes')
print(test1)
print('\n')

# Start from "everything is dissimilar" (all ones) and zero out every matching pair.
# Only pairs with i < j are visited; the mirrored entry is written at the same time.
# The diagonal is never visited, so it keeps the value 1.
dis_sim     = np.ones((n_objects,n_objects))
for i in range(n_objects):
  for j in range(i+1,n_objects):
    if test1[i]==test1[j]:
      dis_sim[i][j] = 0
      dis_sim[j][i] = 0

print(separator)
print('The dissimilarity matrix attained where 0 refers to similarity and 1 refers to dissimilarity is given as ')
object_labels = [f'object_{position+1}' for position in range(n_objects)]
dissimilarity = pd.DataFrame(dis_sim,columns = object_labels, index = object_labels)
display(dissimilarity)
print('\n')
print(separator)
print('The similar objects for each object is given as -')
# For every row, collect the labels of the columns holding 0.
dissimilarity['similar_object'] = dissimilarity.apply(lambda row : similar_objects(row,test1),axis=1)
display(dissimilarity)
print(separator)
print('We can clearly see that object 1 and 4 are similar - code A')

----------------------------------------------------------------------------------------------------------
We have the following list of nominal attributes
['code A', 'code B', 'code C', 'code A']


----------------------------------------------------------------------------------------------------------
The dissimilarity matrix attained where 0 refers to similarity and 1 refers to dissimilarity is given as 


Unnamed: 0,object_1,object_2,object_3,object_4
object_1,1.0,1.0,1.0,0.0
object_2,1.0,1.0,1.0,1.0
object_3,1.0,1.0,1.0,1.0
object_4,0.0,1.0,1.0,1.0




----------------------------------------------------------------------------------------------------------
The similar objects for each object is given as -


Unnamed: 0,object_1,object_2,object_3,object_4,similar_object
object_1,1.0,1.0,1.0,0.0,[object_4]
object_2,1.0,1.0,1.0,1.0,[]
object_3,1.0,1.0,1.0,1.0,[]
object_4,0.0,1.0,1.0,1.0,[object_1]


----------------------------------------------------------------------------------------------------------
We can clearly see that object 1 and 4 are similar - code A


### Solving a quiz question based on similarity of nominal attributes

For the given records in the table, is the similarity matrix correct for the Gender attribute?


In [None]:

gender    = np.array(['M','M','F','M'])
test1     = gender.copy()
n_objects = len(test1)
separator = '----------------------------------------------------------------------------------------------------------'

print(separator)
print('We have the following list of nominal attributes')
print(test1)
print('\n')

# Start from "everything is dissimilar" (all ones) and zero out every matching pair.
# Only pairs with i < j are visited; the mirrored entry is written at the same time.
# The diagonal is never visited, so it keeps the value 1.
dis_sim     = np.ones((n_objects,n_objects))
for i in range(n_objects):
  for j in range(i+1,n_objects):
    if test1[i]==test1[j]:
      dis_sim[i][j] = 0
      dis_sim[j][i] = 0

print(separator)
print('The dissimilarity matrix attained where 0 refers to similarity and 1 refers to dissimilarity is given as ')
object_labels = [f'object_{position+1}' for position in range(n_objects)]
dissimilarity = pd.DataFrame(dis_sim,columns = object_labels, index = object_labels)
display(dissimilarity)
print('\n')
print(separator)
print('The similar objects for each object is given as -')
# For every row, collect the labels of the columns holding 0.
dissimilarity['similar_object'] = dissimilarity.apply(lambda row : similar_objects(row,test1),axis=1)
display(dissimilarity)
print(separator)

----------------------------------------------------------------------------------------------------------
We have the following list of nominal attributes
['M' 'M' 'F' 'M']


----------------------------------------------------------------------------------------------------------
The dissimilarity matrix attained where 0 refers to similarity and 1 refers to dissimilarity is given as 


Unnamed: 0,object_1,object_2,object_3,object_4
object_1,1.0,0.0,1.0,0.0
object_2,0.0,1.0,1.0,0.0
object_3,1.0,1.0,1.0,1.0
object_4,0.0,0.0,1.0,1.0




----------------------------------------------------------------------------------------------------------
The similar objects for each object is given as -


Unnamed: 0,object_1,object_2,object_3,object_4,similar_object
object_1,1.0,0.0,1.0,0.0,"[object_2, object_4]"
object_2,0.0,1.0,1.0,0.0,"[object_1, object_4]"
object_3,1.0,1.0,1.0,1.0,[]
object_4,0.0,0.0,1.0,1.0,"[object_1, object_2]"


----------------------------------------------------------------------------------------------------------


# Dissimilarity of Numeric data - Minkowski distance
* Normalize the data before distance calculations
* Euclidean distance - distance as the crow flies
* Manhattan distance - block distance between two points
* Supremum distance  - Chebychev distance (L_max norm) 
* Cosine similarity  - Cosine of the angle between two vectors (their dot product divided by the product of their norms)
<br>
<br>
There will be two vectors present for the distance calculation for numeric data points. We will use sklearn's distance metric over here to compute the distances between the points
<br>
<br>
We will solve problem 2.6 of textbook

In [None]:
from sklearn.neighbors import DistanceMetric

In [None]:
# The two objects whose distances are computed.
vector1 = np.array([22,1,42,10])
vector2 = np.array([20,0,36,8])
separator = '---------------------------------------------------------------------------------------'
print(separator)

print(f'The vectors given are - {vector1} and {vector2}')
# pairwise() works on a matrix of row vectors; entry [0,1] is the distance between the two rows.
X      = np.vstack((vector1,vector2))

print(separator)
print('Calculating Euclidean distance')
dist     = DistanceMetric.get_metric('euclidean')
euc_dist = round(dist.pairwise(X)[0,1],3)
print(f'The Euclidean distance between both the vectors is {euc_dist}')
print('\n')

print(separator)
print('Calculating Supremum/Chebychev distance')
dist     = DistanceMetric.get_metric('chebyshev')
sup_dist = round(dist.pairwise(X)[0,1],3)
print(f'The Supremum(Chebychev) distance between both the vectors is {sup_dist}')
print('\n')

print(separator)
print('Calculating Manhattan distance')
dist     = DistanceMetric.get_metric('manhattan')
man_dist = round(dist.pairwise(X)[0,1],3)
print(f'The Manhattan distance between both the vectors is {man_dist}')
print('\n')

print(separator)
# The order is defined once so the banner, the computation and the result line cannot
# drift apart (the banner previously announced p=4 while p=3 was actually used).
minkowski_p = 3
print(f'Calculating Minkowski distance with p={minkowski_p}')
dist     = DistanceMetric.get_metric('minkowski',p=minkowski_p)
min_dist = round(dist.pairwise(X)[0,1],3)
print(f'The Minkowski distance (at p={minkowski_p}) between both the vectors is {min_dist}')
print('\n')

---------------------------------------------------------------------------------------
The vectors given are - [22  1 42 10] and [20  0 36  8]
---------------------------------------------------------------------------------------
Calculating Euclidean distance
The Euclidean distance between both the vectors is 6.708


---------------------------------------------------------------------------------------
Calculating Supremum/Chebychev distance
The Supremum(Chebychev) distance between both the vectors is 6.0


---------------------------------------------------------------------------------------
Calculating Manhattan distance
The Manhattan distance between both the vectors is 11.0


---------------------------------------------------------------------------------------
Calculating Minkowski distance with p=4
The Minkowski distance (at p=3) between both the vectors is 6.153




### Textbook question 2.8 

In [None]:
# Five 2-D sample points x1..x5 with attributes A1 and A2.
sample_values = np.array([[1.5,1.7],[2,1.9],[1.6,1.8],[1.2,1.5],[1.5,1]])
sample_labels = [f'x{position+1}' for position in range(5)]
df = pd.DataFrame(sample_values,columns=['A1','A2'],index = sample_labels)
print('We have the following 2d Dataset')
display(df)

We have the following 2d Dataset


Unnamed: 0,A1,A2
x1,1.5,1.7
x2,2.0,1.9
x3,1.6,1.8
x4,1.2,1.5
x5,1.5,1.0


#### Solution to 2.8 (a) 
We have ranked the sample points in an order of their distance from the new point

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def _rank_points(scores, score_column, sample_points, reference_point, decimals=3, descending=False):
  '''
      Build a table of sample points ordered by their score against a reference point.

      scores          : dict mapping point label -> numeric score
      score_column    : name of the score column in the returned frame
      sample_points   : dict mapping point label -> coordinates of that point
      reference_point : value repeated in the 'comparison_point' column
      decimals        : number of decimals the scores are rounded to
      descending      : sort from highest to lowest score when True

      Returns a DataFrame with columns 'Points', score_column, 'sample_point'
      and 'comparison_point'.
  '''
  ordered = sorted(scores.items(), key=operator.itemgetter(1), reverse=descending)
  ranked  = pd.DataFrame(ordered, columns=['Points', score_column])
  ranked['sample_point']     = ranked['Points'].map(sample_points)
  ranked['comparison_point'] = [reference_point]*len(ranked)
  # The builtin float is used here: the np.float alias was removed in NumPy 1.24.
  ranked[score_column]       = ranked[score_column].apply(lambda z: round(float(z), decimals))
  return ranked


points_map = dict(zip(df.index,df.values))
separator  = '------------------------------------------------------------------------------------------------------------------'
print(separator)
X          = df.values
print('The existing data point values from the dataframe is given as ')
display(X)
print('\n')
new_point  = [1.4,1.6]
print(f'The new sample point is {new_point}\n')
# The query point is appended as the last row, so row -1 of every pairwise matrix
# holds its distances to the samples (the final entry is its distance to itself).
X_new      = np.append(X,[new_point],0)
print(f'Updated sample after adding the new point is ')
display(X_new)
print('\n')

print(separator)
dist          = DistanceMetric.get_metric('euclidean')
euc_dist      = dist.pairwise(X_new)
distance_val  = euc_dist[-1,:-1]
distance_dict = dict(zip(df.index,distance_val))
similar_df    = _rank_points(distance_dict,'Euclidean_distance',points_map,new_point)
print('Euclidean distance calculated and the dataset points are ranked based on similarty based on Euclidean distance')
display(similar_df)
print('\n')

print(separator)
dist          = DistanceMetric.get_metric('manhattan')
man_dist      = dist.pairwise(X_new)
distance_val  = man_dist[-1,:-1]
distance_dict = dict(zip(df.index,distance_val))
similar_df    = _rank_points(distance_dict,'Manhattan_distance',points_map,new_point)
print('Manhattan distance calculated and the dataset points are ranked based on similarty based on Manhattan distance')
display(similar_df)
print('\n')


print(separator)
dist          = DistanceMetric.get_metric('chebyshev')
sup_dist      = dist.pairwise(X_new)
distance_val  = sup_dist[-1,:-1]
distance_dict = dict(zip(df.index,distance_val))
similar_df    = _rank_points(distance_dict,'Supremum_distance',points_map,new_point)
print('Supremum distance calculated and the dataset points are ranked based on similarty based on Supremum distance')
display(similar_df)
print('\n')


print(separator)
# Cosine similarity of the query point (last row of X_new) against every sample.
# Higher means more similar, hence the descending sort below.
cosine_distance = {}
for i in range(X.shape[0]):
  cosine_distance[f'x{i+1}'] = cosine_similarity(X_new[-1].reshape(1,-1),X_new[i].reshape(1,-1))[0][0]

similar_df    = _rank_points(cosine_distance,'Cosine_distance',points_map,new_point,decimals=5,descending=True)
print('Cosine similarity distance calculated and the dataset points are ranked based on similarty based on cosine similarity')
display(similar_df)

------------------------------------------------------------------------------------------------------------------
The existing data point values from the dataframe is given as 


array([[1.5, 1.7],
       [2. , 1.9],
       [1.6, 1.8],
       [1.2, 1.5],
       [1.5, 1. ]])



The new sample point is [1.4, 1.6]

Updated sample after adding the new point is 


array([[1.5, 1.7],
       [2. , 1.9],
       [1.6, 1.8],
       [1.2, 1.5],
       [1.5, 1. ],
       [1.4, 1.6]])



------------------------------------------------------------------------------------------------------------------
Euclidean distance calculated and the dataset points are ranked based on similarty based on Euclidean distance


Unnamed: 0,Points,Euclidean_distance,sample_point,comparison_point
0,x1,0.141,"[1.5, 1.7]","[1.4, 1.6]"
1,x4,0.224,"[1.2, 1.5]","[1.4, 1.6]"
2,x3,0.283,"[1.6, 1.8]","[1.4, 1.6]"
3,x5,0.608,"[1.5, 1.0]","[1.4, 1.6]"
4,x2,0.671,"[2.0, 1.9]","[1.4, 1.6]"




------------------------------------------------------------------------------------------------------------------
Manhattan distance calculated and the dataset points are ranked based on similarty based on Manhattan distance


Unnamed: 0,Points,Manhattan_distance,sample_point,comparison_point
0,x1,0.2,"[1.5, 1.7]","[1.4, 1.6]"
1,x4,0.3,"[1.2, 1.5]","[1.4, 1.6]"
2,x3,0.4,"[1.6, 1.8]","[1.4, 1.6]"
3,x5,0.7,"[1.5, 1.0]","[1.4, 1.6]"
4,x2,0.9,"[2.0, 1.9]","[1.4, 1.6]"




------------------------------------------------------------------------------------------------------------------
Supremum distance calculated and the dataset points are ranked based on similarty based on Supremum distance


Unnamed: 0,Points,Supremum_distance,sample_point,comparison_point
0,x1,0.1,"[1.5, 1.7]","[1.4, 1.6]"
1,x4,0.2,"[1.2, 1.5]","[1.4, 1.6]"
2,x3,0.2,"[1.6, 1.8]","[1.4, 1.6]"
3,x2,0.6,"[2.0, 1.9]","[1.4, 1.6]"
4,x5,0.6,"[1.5, 1.0]","[1.4, 1.6]"




------------------------------------------------------------------------------------------------------------------
Cosine similarity distance calculated and the dataset points are ranked based on similarty based on cosine similarity


Unnamed: 0,Points,Cosine_distance,sample_point,comparison_point
0,x1,0.99999,"[1.5, 1.7]","[1.4, 1.6]"
1,x3,0.99997,"[1.6, 1.8]","[1.4, 1.6]"
2,x4,0.99903,"[1.2, 1.5]","[1.4, 1.6]"
3,x2,0.99575,"[2.0, 1.9]","[1.4, 1.6]"
4,x5,0.96536,"[1.5, 1.0]","[1.4, 1.6]"


#### Solving 2.8 (b) 
We already have X_new. We will just have to normalize it and find the distance metric as usual in 2.8(a)

In [None]:
def normalize(x):
  """Scale a vector to unit length.

  Parameters
  ----------
  x : numpy array
      Vector to rescale.

  Returns
  -------
  numpy array
      `x` divided by `np.linalg.norm(x)` (the Euclidean norm for a 1-D vector).
  """
  length = np.linalg.norm(x)
  return x/length

# Rescale every row of X_new to unit length and stack the rows back into a 2-D array.
X_new_norm = np.array([normalize(row) for row in X_new])

In [None]:
def _rank_points(scores, column, points_map, comparison_point, descending=False, decimals=3):
  """Build a ranking table from a {point label: score} mapping.

  Parameters
  ----------
  scores : dict
      Maps each point label to its distance / similarity to the comparison point.
  column : str
      Name of the column that holds the score.
  points_map : dict
      Maps each point label to its coordinates (shown in `sample_point`).
  comparison_point : numpy array
      The point every sample is compared against (shown in `comparison_point`).
  descending : bool
      Sort from largest to smallest score when True (used for similarities).
  decimals : int
      Number of decimals the score is rounded to.

  Returns
  -------
  pandas.DataFrame with columns Points, <column>, sample_point, comparison_point.
  """
  ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=descending)
  ranked_df = pd.DataFrame(ranked, columns=['Points', column])
  ranked_df['sample_point'] = ranked_df['Points'].map(points_map)
  ranked_df['comparison_point'] = [comparison_point]*len(ranked_df)
  # The built-in float replaces np.float, which was removed in NumPy 1.24.
  ranked_df[column] = ranked_df[column].apply(lambda z: round(float(z), decimals))
  return ranked_df


separator = '------------------------------------------------------------------------------------------------------------------'

# All rows but the last are the dataset points; the last row is the point they are compared with.
ndf = pd.DataFrame(X_new_norm[:-1],columns = df.columns,index=df.index)
points_map = dict(zip(ndf.index,ndf.values))
query_point = X_new_norm[-1]
print(separator)
X          = df.values
print('The normalized data is given as  ')
display(X_new_norm)
print('\n')

# Same procedure for every distance metric: pairwise distances, keep the last row
# (distances from the comparison point to each dataset point) and rank ascending.
pairwise_by_metric = {}
for metric, column, label in (
    ('euclidean', 'Euclidean_distance', 'Euclidean'),
    ('manhattan', 'Manhattan_distance', 'Manhattan'),
    ('chebyshev', 'Supremum_distance', 'Supremum'),
):
  print(separator)
  dist          = DistanceMetric.get_metric(metric)
  pairwise_by_metric[metric] = dist.pairwise(X_new_norm)
  distance_val  = pairwise_by_metric[metric][-1,:-1]
  distance_dict = dict(zip(df.index,distance_val))
  similar_df    = _rank_points(distance_dict, column, points_map, query_point)
  print(f'{label} distance calculated and the dataset points are ranked based on similarty based on {label} distance')
  display(similar_df)
  print('\n')

# Keep the individual pairwise matrices available under their original names.
euc_dist = pairwise_by_metric['euclidean']
man_dist = pairwise_by_metric['manhattan']
sup_dist = pairwise_by_metric['chebyshev']


print(separator)
# Cosine similarity is a similarity, so the ranking is descending (most similar first).
# Labels come from df.index, exactly as for the distance metrics above.
cosine_distance = {}
for point_label, point in zip(df.index, X_new_norm[:-1]):
  cosine_distance[point_label] = cosine_similarity(query_point.reshape(1,-1),point.reshape(1,-1))[0][0]

similar_df    = _rank_points(cosine_distance, 'Cosine_distance', points_map, query_point, descending=True, decimals=5)
print('Cosine similarity distance calculated and the dataset points are ranked based on similarty based on cosine similarity')
display(similar_df)

------------------------------------------------------------------------------------------------------------------
The normalized data is given as  


array([[0.66162164, 0.74983786],
       [0.72499943, 0.68874946],
       [0.66436384, 0.74740932],
       [0.62469505, 0.78086881],
       [0.83205029, 0.5547002 ],
       [0.65850461, 0.75257669]])



------------------------------------------------------------------------------------------------------------------
Euclidean distance calculated and the dataset points are ranked based on similarty based on Euclidean distance


Unnamed: 0,Points,Euclidean_distance,sample_point,comparison_point
0,x1,0.004,"[0.6616216370868464, 0.7498378553650925]","[0.658504607868518, 0.7525766947068778]"
1,x3,0.008,"[0.6643638388299198, 0.7474093186836597]","[0.658504607868518, 0.7525766947068778]"
2,x4,0.044,"[0.6246950475544242, 0.7808688094430303]","[0.658504607868518, 0.7525766947068778]"
3,x2,0.092,"[0.7249994335944139, 0.6887494619146931]","[0.658504607868518, 0.7525766947068778]"
4,x5,0.263,"[0.8320502943378437, 0.5547001962252291]","[0.658504607868518, 0.7525766947068778]"




------------------------------------------------------------------------------------------------------------------
Manhattan distance calculated and the dataset points are ranked based on similarty based on Manhattan distance


Unnamed: 0,Points,Manhattan_distance,sample_point,comparison_point
0,x1,0.006,"[0.6616216370868464, 0.7498378553650925]","[0.658504607868518, 0.7525766947068778]"
1,x3,0.011,"[0.6643638388299198, 0.7474093186836597]","[0.658504607868518, 0.7525766947068778]"
2,x4,0.062,"[0.6246950475544242, 0.7808688094430303]","[0.658504607868518, 0.7525766947068778]"
3,x2,0.13,"[0.7249994335944139, 0.6887494619146931]","[0.658504607868518, 0.7525766947068778]"
4,x5,0.371,"[0.8320502943378437, 0.5547001962252291]","[0.658504607868518, 0.7525766947068778]"




------------------------------------------------------------------------------------------------------------------
Supremum distance calculated and the dataset points are ranked based on similarty based on Supremum distance


Unnamed: 0,Points,Supremum_distance,sample_point,comparison_point
0,x1,0.003,"[0.6616216370868464, 0.7498378553650925]","[0.658504607868518, 0.7525766947068778]"
1,x3,0.006,"[0.6643638388299198, 0.7474093186836597]","[0.658504607868518, 0.7525766947068778]"
2,x4,0.034,"[0.6246950475544242, 0.7808688094430303]","[0.658504607868518, 0.7525766947068778]"
3,x2,0.066,"[0.7249994335944139, 0.6887494619146931]","[0.658504607868518, 0.7525766947068778]"
4,x5,0.198,"[0.8320502943378437, 0.5547001962252291]","[0.658504607868518, 0.7525766947068778]"




------------------------------------------------------------------------------------------------------------------
Cosine similarity distance calculated and the dataset points are ranked based on similarty based on cosine similarity


Unnamed: 0,Points,Cosine_distance,sample_point,comparison_point
0,x1,0.99999,"[0.6616216370868464, 0.7498378553650925]","[0.658504607868518, 0.7525766947068778]"
1,x3,0.99997,"[0.6643638388299198, 0.7474093186836597]","[0.658504607868518, 0.7525766947068778]"
2,x4,0.99903,"[0.6246950475544242, 0.7808688094430303]","[0.658504607868518, 0.7525766947068778]"
3,x2,0.99575,"[0.7249994335944139, 0.6887494619146931]","[0.658504607868518, 0.7525766947068778]"
4,x5,0.96536,"[0.8320502943378437, 0.5547001962252291]","[0.658504607868518, 0.7525766947068778]"


### Solving a Quiz question
Given the following vectors, find the pair with maximum cosine similarity

In [None]:
v1  = np.array([2,7,1,4])
v2  = np.array([3,8,1,4])
v3  = np.array([4,14,2,8])
X   = np.array([v1,v2,v3])
# One label per vector (row of X), so the table adapts if vectors are added or removed.
vector_labels = [f'X{i+1}' for i in range(X.shape[0])]
print('-------------------------------------------------------------------')
sim = cosine_similarity(X,X)
print('We have calculated the pairwise vector cosine similarity as below')
display(sim)
print('\n')

print('We set all diagonal values to 0')
# Every vector has similarity 1 with itself; zero the diagonal so a vector is never
# reported as its own best match. sim is (n_vectors x n_vectors), so the diagonal is
# addressed on sim itself rather than through the shape of X.
np.fill_diagonal(sim, 0)
print('-------------------------------------------------------------------')
simdf = pd.DataFrame(sim,columns = vector_labels,index = vector_labels)
max_val  = simdf.max(axis=1)
max_ind  = simdf.idxmax(axis=1)
simdf['max_dist'] = max_val
simdf['max_ind']  = max_ind
print('We get the following similarity matrix - \n')
display(simdf.sort_values(by=['max_dist'],ascending=False))
print('\n')
print('-------------------------------------------------------------------')
# Report the pair with the largest off-diagonal similarity (1-based, smaller index first).
best_pair = np.unravel_index(np.argmax(sim), sim.shape)
first, second = sorted(int(k)+1 for k in best_pair)
print(f'{first} and {second} are similar')

-------------------------------------------------------------------
We have calculated the pairwise vector cosine similarity as below


array([[1.        , 0.99530645, 1.        ],
       [0.99530645, 1.        , 0.99530645],
       [1.        , 0.99530645, 1.        ]])



We set all diagonal values to 0
-------------------------------------------------------------------
We get the following similarity matrix - 



Unnamed: 0,X1,X2,X3,max_dist,max_ind
X1,0.0,0.995306,1.0,1.0,X3
X3,1.0,0.995306,0.0,1.0,X1
X2,0.995306,0.0,0.995306,0.995306,X1




-------------------------------------------------------------------
1 and 3 are similar


# Proximity measures for Ordinal attributes

* Step1 - We first label encode the given list using the order excellent > good > fair, assigning 3 to excellent, 2 to good and 1 to fair
* Step2 - Find the distinct number of values in the encoded list to find M (number of states)
* Step3 - We perform data normalization on test2_enc <br>
The formula is z = (r-1)/(M-1), where r is a value from test2_enc
* Step4 - Find the distance using any of the distance measures implemented earlier for numerical attributes

In [None]:
# Ordinal ratings used in the worked example below.
test2 = [
  'excellent',
  'fair',
  'good',
  'excellent',
]

In [None]:
separator = '--------------------------------------------------------------------------------------------------------------------------------------------------------'

print(separator)
# Step 1: map every ordinal state to its rank (fair < good < excellent).
label_encode_ordinal = {'excellent':3,'good':2,'fair':1}
print('Step1')
print(f'The dictionary to be used for label encoding is {label_encode_ordinal}')
print('\n')

test2_enc            = [label_encode_ordinal[i] for i in test2]
print(f'The label encoded value for {test2} will be {test2_enc}')
print('\n')


print(separator)
print('Step 2')
print('We find the distinct number of values in encoded list ie., M')
# NOTE: M counts the distinct ranks that occur in the data. It equals the number of
# states of the scale only when every state appears at least once (true here).
M                    = len(set(test2_enc))
print(f'The value of M is {M}')
print(separator)

print('Step3')
# Map rank r onto [0, 1] with z = (r-1)/(M-1).
# dtype=float replaces the np.float_ alias, which was removed in NumPy 2.0.
ordinal_list_norm    = np.array([(val-1)/(M-1) for val in test2_enc], dtype=float)
print(f'The ordinal list {test2} which was label encoded to be {test2_enc} is now normalized using the formula mentioned to get {ordinal_list_norm} \n')
print(separator)

print('Step4')
# Pairwise distance between normalized ranks; for a single attribute the
# Euclidean distance sqrt((a-b)**2) is simply |a-b|.
dissim               = np.abs(ordinal_list_norm[:, None] - ordinal_list_norm[None, :])
print('We calculated the distance between successive points to get the distance matrix as ')
display(pd.DataFrame(dissim,columns = test2,index=test2))

--------------------------------------------------------------------------------------------------------------------------------------------------------
Step1
The dictionary to be used for label encoding is {'excellent': 3, 'good': 2, 'fair': 1}


The label encoded value for ['excellent', 'fair', 'good', 'excellent'] will be [3, 1, 2, 3]


--------------------------------------------------------------------------------------------------------------------------------------------------------
Step 2
We find the distinct number of values in encoded list ie., M
The value of M is 3
--------------------------------------------------------------------------------------------------------------------------------------------------------
Step3
The ordinal list ['excellent', 'fair', 'good', 'excellent'] which was label encoded to be [3, 1, 2, 3] is now normalized using the formula mentioned to get [1.  0.  0.5 1. ] 

----------------------------------------------------------------------------------

Unnamed: 0,excellent,fair,good,excellent.1
excellent,0.0,1.0,0.5,0.0
fair,1.0,0.0,0.5,1.0
good,0.5,0.5,0.0,0.5
excellent,0.0,1.0,0.5,0.0


# Dissimilarity for attributes of Mixed Types
* The textbook example below contains 3 attributes: test1 is a nominal attribute, test2 is an ordinal attribute and test3 is a numeric attribute.
* We will evaluate the dissimilarity for each attribute separately and aggregate them at the end.


In [None]:
# Raw columns of the mixed-type example: identifier, nominal, ordinal and numeric attribute.
obj   = [1,2,3,4]
test1 = ['code A','code B','code C','code A']
test2 = ['excellent','fair','good','excellent']
test3 = [45,22,64,28]

# Assemble the table in one step; the column order is fixed explicitly.
df    = pd.DataFrame(
  {'Object Identifier': obj, 'test1': test1, 'test2': test2, 'test3': test3},
  columns = ['Object Identifier','test1','test2','test3'],
)
df

Unnamed: 0,Object Identifier,test1,test2,test3
0,1,code A,excellent,45
1,2,code B,fair,22
2,3,code C,good,64
3,4,code A,excellent,28


We will calculate the dissimilarity in nominal attributes (we have calculated it previously and I pasted the segment as it is) 

In [None]:
separator = '----------------------------------------------------------------------------------------------------------'

test1 = df['test1'].values
print(separator)
print('We have the following list of nominal attributes')
print(df['test1'].values)
print('\n')

# Nominal dissimilarity: 0 when two objects carry the same code, 1 otherwise.
# Comparing every value with every other value also yields 0 on the diagonal,
# i.e. each object has zero dissimilarity with itself.
nominal_codes = np.asarray(test1, dtype=object)
dis_sim       = (nominal_codes[:, None] != nominal_codes[None, :]).astype(float)


print(separator)
print('The dissimilarity matrix attained where 0 refers to similarity and 1 refers to dissimilarity is given as ')
object_labels = [f'object_{i+1}' for i in range(len(test1))]
dissimilarity_nominal = pd.DataFrame(dis_sim,columns = object_labels, index = object_labels)
display(dissimilarity_nominal)


----------------------------------------------------------------------------------------------------------
We have the following list of nominal attributes
['code A' 'code B' 'code C' 'code A']


----------------------------------------------------------------------------------------------------------
The dissimilarity matrix attained where 0 refers to similarity and 1 refers to dissimilarity is given as 


Unnamed: 0,object_1,object_2,object_3,object_4
object_1,1.0,1.0,1.0,0.0
object_2,1.0,1.0,1.0,1.0
object_3,1.0,1.0,1.0,1.0
object_4,0.0,1.0,1.0,1.0


We will now solve the dissimilarity of ordinal attributes (again solved previously and we are just pasting it as it is) 

In [None]:
separator = '--------------------------------------------------------------------------------------------------------------------------------------------------------'

test2      = df['test2'].values
print(separator)
# Step 1: map every ordinal state to its rank (fair < good < excellent).
label_encode_ordinal = {'excellent':3,'good':2,'fair':1}
print('Step1')
print(f'The dictionary to be used for label encoding is {label_encode_ordinal}')
print('\n')

test2_enc            = [label_encode_ordinal[i] for i in test2]
print(f'The label encoded value for {test2} will be {test2_enc}')
print('\n')


print(separator)
print('Step 2')
print('We find the distinct number of values in encoded list ie., M')
# NOTE: M counts the distinct ranks that occur in the data. It equals the number of
# states of the scale only when every state appears at least once (true here).
M                    = len(set(test2_enc))
print(f'The value of M is {M}')
print(separator)

print('Step3')
# Map rank r onto [0, 1] with z = (r-1)/(M-1).
# dtype=float replaces the np.float_ alias, which was removed in NumPy 2.0.
ordinal_list_norm    = np.array([(val-1)/(M-1) for val in test2_enc], dtype=float)
print(f'The ordinal list {test2} which was label encoded to be {test2_enc} is now normalized using the formula mentioned to get {ordinal_list_norm} \n')
print(separator)

print('Step4')
# Pairwise distance between normalized ranks; for a single attribute the
# Euclidean distance sqrt((a-b)**2) is simply |a-b|.
dissimilarity_ordinal          = np.abs(ordinal_list_norm[:, None] - ordinal_list_norm[None, :])
print('We calculated the distance between successive points to get the distance matrix as ')
display(pd.DataFrame(dissimilarity_ordinal,columns = test2,index=test2))

--------------------------------------------------------------------------------------------------------------------------------------------------------
Step1
The dictionary to be used for label encoding is {'excellent': 3, 'good': 2, 'fair': 1}


The label encoded value for ['excellent' 'fair' 'good' 'excellent'] will be [3, 1, 2, 3]


--------------------------------------------------------------------------------------------------------------------------------------------------------
Step 2
We find the distinct number of values in encoded list ie., M
The value of M is 3
--------------------------------------------------------------------------------------------------------------------------------------------------------
Step3
The ordinal list ['excellent' 'fair' 'good' 'excellent'] which was label encoded to be [3, 1, 2, 3] is now normalized using the formula mentioned to get [1.  0.  0.5 1. ] 

----------------------------------------------------------------------------------------

Unnamed: 0,excellent,fair,good,excellent.1
excellent,0.0,1.0,0.5,0.0
fair,1.0,0.0,0.5,1.0
good,0.5,0.5,0.0,0.5
excellent,0.0,1.0,0.5,0.0


We now calculate the distance corresponding to the numeric attribute - test3. <br>
Following the textbook, test3 is min-max normalized using maximum value = 64 and minimum value = 22.

In [None]:
test3      = df['test3'].values
# Normalization bounds. They are deliberately NOT named max/min: assigning to those
# names would shadow the Python builtins for every cell executed afterwards.
test3_max  = 64
test3_min  = 22
# Min-max normalization of the numeric attribute.
norm_test3 = [(val-test3_min)/(test3_max-test3_min) for val in test3]
print('--------------------------------------------------------------------------------------------------------------------------------------------------------')

# Pairwise distance between normalized values; for a single attribute the
# Euclidean distance sqrt((a-b)**2) is simply |a-b|.
norm_test3_arr                   = np.asarray(norm_test3, dtype=float)
dissimilarity_numerical          = np.abs(norm_test3_arr[:, None] - norm_test3_arr[None, :])
print('We calculated the distance between successive points to get the distance matrix as ')
display(pd.DataFrame(dissimilarity_numerical,columns = test3,index=test3))

--------------------------------------------------------------------------------------------------------------------------------------------------------
We calculated the distance between successive points to get the distance matrix as 


Unnamed: 0,45,22,64,28
45,0.0,0.547619,0.452381,0.404762
22,0.547619,0.0,1.0,0.142857
64,0.452381,1.0,0.0,0.857143
28,0.404762,0.142857,0.857143,0.0


Aggregating all the matrices

In [None]:
# Show the three per-attribute matrices that are aggregated in the next cell.
# They hold dissimilarities (0 = identical), so they are labelled as such.
for attribute_kind, attribute_name, attribute_matrix in (
    ('nominal', 'test1', dissimilarity_nominal.values),
    ('ordinal', 'test2', dissimilarity_ordinal),
    ('numerical', 'test3', dissimilarity_numerical),
):
  print('----------------------------------------------------------------------')
  print(f'We have dissimilarity matrix corresponding to {attribute_kind} attribute - {attribute_name}')
  display(attribute_matrix)
  print('\n')

----------------------------------------------------------------------
We have similarity matrix corresponding to nominal attribute - test1


array([[1., 1., 1., 0.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [0., 1., 1., 1.]])



----------------------------------------------------------------------
We have similarity matrix corresponding to ordinal attribute - test2


array([[0. , 1. , 0.5, 0. ],
       [1. , 0. , 0.5, 1. ],
       [0.5, 0.5, 0. , 0.5],
       [0. , 1. , 0.5, 0. ]])



----------------------------------------------------------------------
We have similarity matrix corresponding to numerical attribute - test3


array([[0.        , 0.54761905, 0.45238095, 0.4047619 ],
       [0.54761905, 0.        , 1.        , 0.14285714],
       [0.45238095, 1.        , 0.        , 0.85714286],
       [0.4047619 , 0.14285714, 0.85714286, 0.        ]])





In [None]:
# Mixed-type dissimilarity: the plain average of the per-attribute dissimilarity
# matrices (every attribute contributes with equal weight).
attribute_matrices = [dissimilarity_numerical, dissimilarity_ordinal, dissimilarity_nominal.values]
total_sim = np.sum(attribute_matrices, axis=0)
print('Overall dissimilarity matrix for attributes of mixed type is -')
# Divide by the number of matrices actually summed instead of a hard-coded 3.
findf = pd.DataFrame(total_sim/len(attribute_matrices),columns = df['Object Identifier'].values,index = df['Object Identifier'].values)
display(findf)
print('--------------------------------------------------------------------------------------------------------')

Overall similarity matrix for attributes of mixed type is -


Unnamed: 0,1,2,3,4
1,0.333333,0.849206,0.650794,0.134921
2,0.849206,0.333333,0.833333,0.714286
3,0.650794,0.833333,0.333333,0.785714
4,0.134921,0.714286,0.785714,0.333333


--------------------------------------------------------------------------------------------------------
