# Cosine Similarity Implementation

In [1]:
# imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import cv2
from ast import literal_eval

import keras
import tensorflow as tf

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Build the feature-data directory relative to the current working directory.
# NOTE(review): the final four characters of the cwd are sliced off — presumably
# a sub-directory suffix of the project root; confirm this matches your layout.
path = os.getcwd()[:-4]
data_dir = f"{path}/data/features/"

In [3]:
# Load the match / mismatch pair tables for the train and test splits.
matchPairsTrain = pd.read_csv(data_dir + "matchPairsTrain.csv")
mismatchPairsTrain = pd.read_csv(data_dir + "mismatchPairsTrain.csv")
matchPairsTest = pd.read_csv(data_dir + "matchPairsTest.csv")
mismatchPairsTest = pd.read_csv(data_dir + "mismatchPairsTest.csv")

In [4]:
# Remove the 'Unnamed: 0' column from every table
# (presumably an index column written when the CSVs were saved — confirm).
matchPairsTrain = matchPairsTrain.drop(columns='Unnamed: 0')
matchPairsTest = matchPairsTest.drop(columns='Unnamed: 0')
mismatchPairsTrain = mismatchPairsTrain.drop(columns='Unnamed: 0')
mismatchPairsTest = mismatchPairsTest.drop(columns='Unnamed: 0')

In [5]:
# Sanity check: dimensions of the match-pair train/test tables and the test table's column names.
matchPairsTrain.shape, matchPairsTest.shape, matchPairsTest.columns

((1100, 7),
 (500, 7),
 Index(['name', 'imagenum1', 'imagenum2', 'image1', 'image2', 'image1Features',
        'image2Features'],
       dtype='object'))

In [6]:
# Sanity check: dimensions of the mismatch-pair train/test tables and the test table's column names.
mismatchPairsTrain.shape, mismatchPairsTest.shape, mismatchPairsTest.columns

((1100, 8),
 (500, 8),
 Index(['name', 'imagenum1', 'name.1', 'imagenum2', 'image1', 'image2',
        'image1Features', 'image2Features'],
       dtype='object'))

In [7]:
def fix_feature_column(X, col):
    """Parse a column of stringified feature vectors into lists of floats.

    Each string entry of ``X[col]`` is treated as a bracketed,
    whitespace-separated run of numbers, e.g. ``"[0.1 0.2\\n 0.3]"``.
    Entries that are not strings (for example vectors already parsed by an
    earlier run of the calling cell) are converted element-wise to floats, so
    calling this twice on the same column is harmless.

    Args:
        X: Object indexable by ``col`` that yields an iterable of entries
            (a DataFrame in this notebook).
        col: Name of the column to parse.

    Returns:
        A list with one list of floats per entry, in the original order.

    Raises:
        ValueError: If a token cannot be converted to ``float``.
    """
    fixed_col = []
    for raw in X[col]:
        if isinstance(raw, str):
            # split() with no argument treats any run of whitespace, newlines
            # included, as a single separator. Deleting newlines beforehand is
            # unnecessary and could fuse two numbers that were separated only
            # by a line break.
            tokens = raw.strip().strip("[]").split()
        else:
            # Not a string: take the entry's elements as they are.
            tokens = raw
        fixed_col.append([float(t) for t in tokens])
    return fixed_col

In [8]:
# Replace both stringified feature columns with parsed float lists in every table.
feature_cols = ["image1Features", "image2Features"]
for frame in (matchPairsTrain, matchPairsTest, mismatchPairsTrain, mismatchPairsTest):
    for c in feature_cols:
        frame[c] = fix_feature_column(frame, c)

In [9]:
def sim(x1, x2):
    """Return the cosine similarity between two feature vectors.

    Each input is reshaped to a single row (shape ``(1, -1)``) before being
    passed to scikit-learn's ``cosine_similarity``; the value returned is
    whatever that call yields for one row against one row.
    """
    row_a = tf.reshape(x1, (1, -1))
    row_b = tf.reshape(x2, (1, -1))
    return cosine_similarity(row_a, row_b)

# Spot-check the similarity on the first row of the match and mismatch training tables.
print("Test Calculations")
first_match, first_mismatch = matchPairsTrain.iloc[0], mismatchPairsTrain.iloc[0]
f1, f2 = first_match.image1Features, first_match.image2Features
g1, g2 = first_mismatch.image1Features, first_mismatch.image2Features
print(f"Match: {sim(f1, f2)}")
print(f"Mismatch: {sim(g1, g2)}")

Test Calculations
Match: [[0.9637308]]
Mismatch: [[1.2940537e-08]]


Cosine similarity $\approx 1 \Rightarrow$ Match<br>
Cosine similarity $\approx 0 \Rightarrow$ Mismatch

In [10]:
def sim_cols(X, threshold, match=True):
    """Score every image pair in ``X`` and mark whether the thresholded call is right.

    A pair is predicted to be a match when its similarity exceeds ``threshold``.
    The prediction counts as correct when it agrees with ``match``, the known
    label shared by every row of ``X``.

    Args:
        X: Table with ``image1Features`` and ``image2Features`` columns.
        threshold: Similarity above which a pair is predicted to match.
        match: Truthy if all rows of ``X`` are true matches, falsy otherwise.

    Returns:
        ``(score, correct)``: the per-row values returned by ``sim`` and a
        parallel list of booleans.
    """
    score, correct = [], []
    for f1, f2 in zip(X.image1Features, X.image2Features):
        s = sim(f1, f2)
        score.append(s)
        predicted_match = bool(s > threshold)
        # Correct exactly when the prediction agrees with the known label.
        correct.append(predicted_match == bool(match))
    return score, correct

In [11]:
# Decision threshold on the cosine similarity.
thr = 0.5
# Score each table; the flag says whether its rows are true matches.
for frame, is_match in (
    (matchPairsTrain, True),
    (matchPairsTest, True),
    (mismatchPairsTrain, False),
    (mismatchPairsTest, False),
):
    frame["CosineSimilarity"], frame["Results"] = sim_cols(frame, thr, match=is_match)

In [12]:
print(f"Accuracies w/ Threshold {thr}")
print(f"{'-'*30}")
# Fraction of rows whose "Results" flag is True, per table, to two decimals.
for label, frame in (
    ("matchPairsTrain", matchPairsTrain),
    ("matchPairsTest", matchPairsTest),
    ("mismatchPairsTrain", mismatchPairsTrain),
    ("mismatchPairsTest", mismatchPairsTest),
):
    n_correct = int((frame["Results"] == True).sum())
    print(f'{label}: \t{n_correct / len(frame.Results):0.2f}')

Accuracies w/ Threshold 0.5
------------------------------
matchPairsTrain: 	0.62
matchPairsTest: 	0.63
mismatchPairsTrain: 	0.40
mismatchPairsTest: 	0.36
