## Finding odd one out

In [12]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [13]:
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [3]:
v_apple = word_vectors['apple']
v_mango = word_vectors['mango']

print(v_apple.shape, v_mango.shape)

(300,) (300,)


In [4]:
cosine_similarity([v_apple], [v_mango])

array([[0.57518554]], dtype=float32)

In [7]:
def odd_one_out(words):
    
    all_word_vectors = [word_vectors[w] for w in words]
    
    avg_vector = np.mean(all_word_vectors, axis=0)
    
    odd_one_out = None
    min_similarity = 1.0
    
    for w in words:
        sim = cosine_similarity([word_vectors[w]], [avg_vector])
        if sim < min_similarity:
            min_similarity = sim
            odd_one_out = w
            
        print('Similarity between %s and average vector is %.2f' %(w,sim))
            
    return odd_one_out

In [8]:
input_1 = ['apple', 'mango', 'juice', 'party', 'orange']
input_2 = ['music', 'dance', 'sleep', 'dancer', 'food']
input_3 = ['match', 'player', 'football', 'cricket', 'dancer']
input_4 = ['india', 'paris', 'russia', 'france', 'germany']

In [9]:
odd_one_out(input_1)

Similarity between apple and average vector is 0.78
Similarity between mango and average vector is 0.76
Similarity between juice and average vector is 0.71
Similarity between party and average vector is 0.36
Similarity between orange and average vector is 0.65


'party'

In [10]:
odd_one_out(input_2)

Similarity between music and average vector is 0.66
Similarity between dance and average vector is 0.81
Similarity between sleep and average vector is 0.51
Similarity between dancer and average vector is 0.72
Similarity between food and average vector is 0.52


'sleep'

In [11]:
odd_one_out(input_3)

Similarity between match and average vector is 0.58
Similarity between player and average vector is 0.68
Similarity between football and average vector is 0.72
Similarity between cricket and average vector is 0.70
Similarity between dancer and average vector is 0.53


'dancer'

In [12]:
odd_one_out(input_4)

Similarity between india and average vector is 0.81
Similarity between paris and average vector is 0.75
Similarity between russia and average vector is 0.79
Similarity between france and average vector is 0.81
Similarity between germany and average vector is 0.84


'paris'

## Word Analogies

In [14]:
def predict_word(a, b, c, word_vectors):
    
    a,b,c = a.lower(), b.lower(), c.lower()
    
    max_similarity = -100
    d = None
    
    words = word_vectors.vocab.keys()
    wa, wb, wc = word_vectors[a], word_vectors[b], word_vectors[c]
    
    for w in words:
        if w in [a,b,c]:
            continue
        
        wv = word_vectors[w]
        sim = cosine_similarity([wb-wa], [wv-wc])
        
        if sim > max_similarity:
            max_similarity = sim
            d = w
            
    return d

In [15]:
triad_1 = ('man', 'woman', 'prince')
predict_word(*triad_1, word_vectors)

'princess'

## Using the Most Similar Method

In [None]:
word_vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

## Word Analogies - Part 2

### Data Preparation

In [40]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
import nltk
from nltk.corpus import stopwords

In [42]:
stopw = set(stopwords.words('english'))

In [43]:
def readFile(file):
    f = open(file, 'r', encoding='utf-8')
    text = f.read()
    
    #Tokenization
    sentences = nltk.sent_tokenize(text)
    
    data = []
    for s in sentences:
        words = nltk.word_tokenize(s)
        words = [w.lower() for w in words if len(w)>2 and w not in stopw]
        data.append(words)
    
    return data

In [44]:
text = readFile('bollywood.txt')
print(text)

[['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018'], ['the', 'deepika', 'ranveer', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple'], ['from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'deepika', 'ranveer', 'wedding', 'style', 'file'], ['not', 'ambanis', 'deepika', 'ranveer', 'priyanka', 'nick'], ['man', 'proves', 'wedding', 'the', 'year', 'this', 'year', 'year', 'big', 'fat', 'lavish', 'extravagant', 'weddings'], ['from', 'isha', 'ambani', 'anand', 'piramal', 'deepika', 'padukone', 'ranveer', 'singh', 'priyanka', 'chopra', 'nick', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', '2018', 'saw', 'many', 'grand', 'weddings'], ['but', 'nothing', 'beats', 'man', 'wedding', 'the', 'year', 'award', 'social', 'media'], ['priyanka', 'also', 'shared', 'video', 'featuring', 'nick', 'jonaswas', 'also', 'celebrating',

### Create Model

In [45]:
from gensim.models import Word2Vec

model = Word2Vec(text, size=300, window=10, min_count=1)
print(model)

Word2Vec(vocab=116, size=300, alpha=0.025)


In [46]:
words = list(model.wv.vocab)
print(words)

['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018', 'the', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple', 'from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'style', 'file', 'not', 'ambanis', 'priyanka', 'nick', 'man', 'proves', 'year', 'this', 'big', 'fat', 'lavish', 'extravagant', 'weddings', 'isha', 'ambani', 'anand', 'piramal', 'chopra', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', 'saw', 'many', 'grand', 'but', 'nothing', 'beats', 'award', 'social', 'media', 'shared', 'video', 'featuring', 'jonaswas', 'celebrating', 'family', 'first', 'celebrated', 'christmas', 'london', 'pictures', 'new', 'outstanding', 'glimpses', 'celebration', 'verbier', 'switzerland', 'married', 'december', 'three', 'receptions', 'delhi', 'mumbai', 'jaggo', 'night', 'made', 'even', 'special', 'industry', 'friends', 'long', '

### Create Analogies

In [47]:
def predict_actor(a, b, c, word_vectors):
    
    a,b,c = a.lower(), b.lower(), c.lower()
    max_similarity = -100
    d = None
    
    wa,wb,wc = word_vectors[a], word_vectors[b], word_vectors[c]
    options = ['ranveer', 'deepika', 'padukone', 'singh', 'nick', 'jonas', 'chopra', 'priyanka', 'virat', 'anushka', 'ginni']
    
    for w in options:
        if w in [a,b,c]:
            continue
            
        wv = word_vectors[w]
        sim = cosine_similarity([wb-wa], [wv-wc])
        
        if sim>max_similarity:
            max_similarity = sim
            d = w
    
    return d

### Test your model

In [48]:
triad = ('nick', 'priyanka', 'virat')
predict_actor(*triad, model.wv)

'ranveer'

In [49]:
triad = ('ranveer', 'deepika', 'priyanka')
predict_actor(*triad, model.wv)

'padukone'

In [50]:
triad = ('ranveer', 'singh', 'deepika')
predict_actor(*triad, model.wv)

'nick'