In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the colour-histogram feature table; the first CSV column (the image
# file name) becomes the index. Forward slashes are accepted on Windows as
# well as on POSIX systems, so this relative path is portable.
data = pd.read_csv('../features/color_hist.csv', index_col=0)
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
1222__pool_table__0.9999995.jpg,178,51,43,49,37,40,54,57,57,54,...,8,5,12,9,13,14,12,12,7,51
1328__coil__0.99999607.jpg,47,39,66,118,112,134,143,164,194,230,...,97,114,127,188,211,172,121,90,61,186
134__zebra__0.9999949.jpg,0,0,1,1,4,4,7,5,12,8,...,34,17,40,14,25,12,2,4,2,13
2377471__pizza__0.9999988.jpg,4,0,3,4,4,11,8,5,14,9,...,6,7,7,5,1,6,3,6,2,12
2377620__zebra__0.9999882.jpg,16,26,9,20,17,25,25,31,37,33,...,0,0,0,0,1,0,0,1,0,1


In [3]:
# Raw NumPy matrix backing the feature table (one row per index entry).
values = data.to_numpy()
values

array([[ 178,   51,   43, ...,   12,    7,   51],
       [  47,   39,   66, ...,   90,   61,  186],
       [   0,    0,    1, ...,    4,    2,   13],
       ...,
       [ 332,  141,  240, ...,  886, 2000, 5279],
       [   8,    6,    3, ...,  557,  504, 3941],
       [   0,    1,    0, ...,    0,    0,    0]], dtype=int64)

In [4]:
# Development shortcut: keep only the first N_ROWS feature vectors.
# NOTE(review): presumably this keeps the pairwise (quadratic) loop further
# down cheap while prototyping -- raise or drop the cap for a full run.
N_ROWS = 10
values = values[:N_ROWS]
values.shape

(10, 768)

### Similarity functions

In [5]:
def euclidean_distance(a, b, same=False):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    a, b : array-like
        Numeric vectors of equal length. NumPy arrays as well as plain
        lists/tuples are accepted.
    same : bool, optional
        Pass True when the caller already knows both arguments are the same
        item; the distance is then 0 and nothing is computed.

    Returns
    -------
    float
        ``||a - b||``, or the integer 0 when ``same`` is true.
    """
    if same:
        return 0
    # Converting to float arrays lets plain Python sequences be subtracted
    # (lists do not support `-`) and prevents unsigned-integer wraparound
    # when an element of `b` is larger than the matching element of `a`.
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.linalg.norm(diff)

In [7]:
def cosine_similarity(a, b, same=False):
    if (same): return 1
    else: return np.dot(a, b)/(np.linalg.norm(a) * np.linalg.norm(b))

### Create the similarity matrix (Mx)

In [8]:
def choose_mx(choice):
    """Return the pairwise scoring function registered under ``choice``.

    Parameters
    ----------
    choice : str
        ``'euclidean'`` selects ``euclidean_distance``; ``'cosine'`` selects
        ``cosine_similarity``.

    Returns
    -------
    callable
        A function with the signature ``f(a, b, same=False)``.

    Raises
    ------
    ValueError
        If ``choice`` is not one of the supported names.
    """
    if choice == 'euclidean':
        return euclidean_distance
    if choice == 'cosine':
        return cosine_similarity
    # `Error` was never defined, so the old `raise Error` surfaced as a
    # confusing NameError; report the actual problem instead.
    raise ValueError(
        "Unknown similarity choice {!r}; expected 'euclidean' or 'cosine'".format(choice)
    )

In [9]:
# Allocate a square matrix of zeros with one row and one column per
# feature vector.
length = len(values)
sim_matrix = np.zeros((length, length))

In [10]:
# Pick the pairwise scoring function; 'cosine' maps to cosine_similarity.
sim_func = choose_mx('cosine')

In [11]:
# Score every unordered pair of rows once (j starts at i) and mirror the
# result across the diagonal, since sim_func is symmetric in its arguments.
length = values.shape[0]
for i, row_i in enumerate(values):
    for j in range(i, length):
        # The third argument tells sim_func it is comparing a row with itself.
        sim_score = sim_func(row_i, values[j], i == j)
        sim_matrix[i, j] = sim_matrix[j, i] = sim_score

In [12]:
sim_matrix

array([[1.        , 0.73668802, 0.11148996, 0.78493477, 0.29103062,
        0.2321349 , 0.40635593, 0.77727341, 0.67882152, 0.44607259],
       [0.73668802, 1.        , 0.22754743, 0.79384255, 0.43003351,
        0.28843563, 0.5253357 , 0.8565287 , 0.54395178, 0.61415831],
       [0.11148996, 0.22754743, 1.        , 0.15235756, 0.10311544,
        0.12501576, 0.2003216 , 0.1177235 , 0.1967508 , 0.14471311],
       [0.78493477, 0.79384255, 0.15235756, 1.        , 0.59570489,
        0.51031294, 0.64085015, 0.74201007, 0.59055086, 0.77517463],
       [0.29103062, 0.43003351, 0.10311544, 0.59570489, 1.        ,
        0.33001234, 0.55108348, 0.3599177 , 0.29006012, 0.69788931],
       [0.2321349 , 0.28843563, 0.12501576, 0.51031294, 0.33001234,
        1.        , 0.60855725, 0.27571136, 0.25826553, 0.68338503],
       [0.40635593, 0.5253357 , 0.2003216 , 0.64085015, 0.55108348,
        0.60855725, 1.        , 0.4849384 , 0.44390175, 0.7131532 ],
       [0.77727341, 0.8565287 , 0.1177235

In [52]:
# Label rows and columns with the index entries (image file names) of the
# rows the matrix was computed from. Sizing the slice from the matrix itself,
# rather than a literal 10, keeps the labels aligned if the number of rows
# in `values` changes.
labels = data.index[:sim_matrix.shape[0]]
sim_mx_df = pd.DataFrame(sim_matrix, index=labels, columns=labels)
sim_mx_df.head(5)

Unnamed: 0,1222__pool_table__0.9999995.jpg,1328__coil__0.99999607.jpg,134__zebra__0.9999949.jpg,2377471__pizza__0.9999988.jpg,2377620__zebra__0.9999882.jpg,2377698__zebra__0.9999999.jpg,2378170__zebra__0.9999902.jpg,2378358__park_bench__0.99999833.jpg,2378523__banana__0.99999785.jpg,2379086__zebra__0.9999975.jpg
1222__pool_table__0.9999995.jpg,0.0,6285.621051,14593.537679,5772.380618,11441.225721,11880.166834,9123.23835,5983.605101,7356.368941,8848.581129
1328__coil__0.99999607.jpg,6285.621051,0.0,12651.202235,4392.810718,9290.19031,10232.351538,6772.767824,4410.541237,7820.760705,6133.401177
134__zebra__0.9999949.jpg,14593.537679,12651.202235,0.0,13183.485427,15001.600315,14800.066284,12944.626144,14127.997947,13790.634141,13337.084014
2377471__pizza__0.9999988.jpg,5772.380618,4392.810718,13183.485427,0.0,8017.563096,8686.188347,5939.279418,5724.228682,7469.735471,4720.819844
2377620__zebra__0.9999882.jpg,11441.225721,9290.19031,15001.600315,8017.563096,0.0,11442.310955,8428.155077,10472.219631,11316.109844,7097.954353


In [60]:
def calc_step(step):
    """Return 1 + 2 + ... + step (the ``step``-th triangular number).

    Parameters
    ----------
    step : int
        A positive whole number.

    Returns
    -------
    int
        ``step * (step + 1) // 2``.

    Raises
    ------
    ValueError
        If ``step`` is not a whole number >= 1. The recursive version never
        reached its base case for such inputs and died with RecursionError.
    """
    count = int(step)
    if count != step or count < 1:
        raise ValueError("step must be a positive integer, got {!r}".format(step))
    # Closed form: same result as the recursion, but independent of the
    # interpreter's recursion limit for large steps.
    return count * (count + 1) // 2

In [61]:
calc_step(198)

19701

In [81]:
# Two toy record lists. Each record is a 3-item list; later cells use the
# label at position 1 and the numeric value at position 2.
class1 = [
    [0, 'c1', 0.1],
    [1, 'c2', 0.7],
    [2, 'c3', 0.2],
]
class2 = [
    [0, 'c1', 0.05],
    [1, 'c2', 0.85],
    [2, 'c3', 0.1],
]
class1

[[0, 'c1', 0.1], [1, 'c2', 0.7], [2, 'c3', 0.2]]

In [82]:
# Concatenate both record lists into one new flat list (the inner records
# are shared with class1/class2, not copied).
data1 = class1 + class2
data1

[[0, 'c1', 0.1],
 [1, 'c2', 0.7],
 [2, 'c3', 0.2],
 [0, 'c1', 0.05],
 [1, 'c2', 0.85],
 [2, 'c3', 0.1]]

In [83]:
# NOTE(review): redundant -- the next cell rebinds data2 to a fresh dict
# before filling it, so this cell can be removed.
data2 = {}

In [84]:
# Sum the numeric value (position 2) of every record, keyed by its label
# (position 1).
data2 = {}
for element in data1:
    label, amount = element[1], element[2]
    try:
        data2[label] += amount
    except KeyError:
        # First time this label is seen: start its total at this value.
        data2[label] = amount

In [85]:
import math

In [86]:
# Euclidean (L2) norm of the per-label totals: square root of the sum of
# their squares.
result = sum((v * v for v in data2.values()), 0.0)
math.sqrt(result)

1.5858751527153703

In [13]:
# Sanity check on plain lists: two vectors pointing in nearly opposite
# directions should score close to -1.
cosine_similarity([3, 4, 3335],[-2, -3, -5])

-0.9864876556434157