## Finding the odd one out

In [6]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Load the word2vec embeddings from the binary-format file in the working directory.
word_vectors = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [3]:
# Fetch the embeddings of two sample words and show their shapes.
v_apple, v_mango = (word_vectors[token] for token in ('apple', 'mango'))

print(v_apple.shape, v_mango.shape)

(300,) (300,)


In [4]:
# Cosine similarity of the two embeddings; each vector is passed as a one-row 2-D input.
cosine_similarity(v_apple.reshape(1, -1), v_mango.reshape(1, -1))

array([[0.57518554]], dtype=float32)

In [7]:
def odd_one_out(words):
    """Return the word least similar to the average embedding of `words`.

    Each word is looked up in the module-level `word_vectors` mapping, the
    element-wise mean of all looked-up vectors is computed, and every word's
    vector is compared to that mean with `cosine_similarity`. One line per
    word is printed with its similarity to the mean.

    Parameters
    ----------
    words : iterable of str
        Words to compare. Any iterable is accepted (it is materialised once).

    Returns
    -------
    str or None
        The word with the lowest cosine similarity to the mean vector; on a
        tie the earliest such word wins. `None` when `words` is empty.

    Notes
    -----
    Any exception raised by the `word_vectors[w]` lookup propagates unchanged.
    """
    # Materialise once so generators/iterators are not exhausted by the
    # vector lookup before the comparison loop runs.
    words = list(words)
    if not words:
        # Nothing to compare; also avoids taking the mean of an empty list.
        return None

    all_word_vectors = [word_vectors[w] for w in words]
    avg_vector = np.mean(all_word_vectors, axis=0)

    outlier = None
    # Start from +inf rather than 1.0: a similarity of exactly 1.0 (e.g. a
    # single-word list) must still be able to win the strict `<` comparison.
    min_similarity = float('inf')

    # Reuse the vectors fetched above instead of looking each word up again.
    for w, vector in zip(words, all_word_vectors):
        # cosine_similarity returns a 2-D (1 x 1) result; pull out the scalar
        # so the comparison and the '%.2f' formatting work on a plain float.
        sim = float(cosine_similarity([vector], [avg_vector])[0][0])
        if sim < min_similarity:
            min_similarity = sim
            outlier = w

        print('Similarity between %s and average vector is %.2f' %(w,sim))

    return outlier

In [8]:
# Word groups passed to odd_one_out in the cells that follow.
input_1 = 'apple mango juice party orange'.split()
input_2 = 'music dance sleep dancer food'.split()
input_3 = 'match player football cricket dancer'.split()
input_4 = 'india paris russia france germany'.split()

In [9]:
# Run the odd-one-out search on the first word group.
odd_one_out(words=input_1)

Similarity between apple and average vector is 0.78
Similarity between mango and average vector is 0.76
Similarity between juice and average vector is 0.71
Similarity between party and average vector is 0.36
Similarity between orange and average vector is 0.65


'party'

In [10]:
# Run the odd-one-out search on the second word group.
odd_one_out(words=input_2)

Similarity between music and average vector is 0.66
Similarity between dance and average vector is 0.81
Similarity between sleep and average vector is 0.51
Similarity between dancer and average vector is 0.72
Similarity between food and average vector is 0.52


'sleep'

In [11]:
# Run the odd-one-out search on the third word group.
odd_one_out(words=input_3)

Similarity between match and average vector is 0.58
Similarity between player and average vector is 0.68
Similarity between football and average vector is 0.72
Similarity between cricket and average vector is 0.70
Similarity between dancer and average vector is 0.53


'dancer'

In [12]:
# Run the odd-one-out search on the fourth word group.
odd_one_out(words=input_4)

Similarity between india and average vector is 0.81
Similarity between paris and average vector is 0.75
Similarity between russia and average vector is 0.79
Similarity between france and average vector is 0.81
Similarity between germany and average vector is 0.84


'paris'