# Analogy Using Embeddings

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np

In [2]:
# Directory that holds the pre-trained GloVe text file.
glove_dir = './temp'
# Maps each token to its embedding vector (1-D float32 array).
embedding_index = {}
# Pin the encoding: without it open() falls back to the platform's locale
# encoding, so the same notebook could fail or mis-decode tokens on another OS.
# (Assumes the GloVe file is UTF-8 encoded.)
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), 'r', encoding='utf-8') as file:
    # Iterate the file object directly so lines are streamed instead of
    # materialising the whole file in memory with readlines().
    for line in file:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        # First field is the token, the remaining fields are the vector components.
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

## Cosine Similarity
In data analysis, **cosine similarity** is a measure of similarity between two non-zero vectors defined in an inner product space. Cosine similarity is the cosine of the angle between the vectors; that is, it is the dot product of the vectors divided by the product of their lengths. It follows that the cosine similarity does not depend on the magnitudes of the vectors, but only on their angle. The cosine similarity always belongs to the interval $[-1, 1]$. For example, two proportional vectors have a cosine similarity of $1$, two orthogonal vectors have a similarity of $0$, and two opposite vectors have a similarity of $-1$. In some contexts, the component values of the vectors cannot be negative, in which case the cosine similarity is bounded in $[0, 1]$.

### Definition
The cosine of two non-zero vectors can be derived by using the Euclidean dot product formula:
$$
\begin{equation}
    A \cdot B = \lVert A \rVert \lVert B \rVert \cos(\theta)
\end{equation}
$$
Given two $n$-dimensional vectors of attributes, $\mathbf{A}$ and $\mathbf{B}$, the cosine similarity, $\cos(\theta)$, is represented using a dot product and magnitude as:
$$
\begin{align}
    \text{Cosine Similarity} = S_C(A, B) &= \cos(\theta)\\
    &= \frac{\mathbf{A} \cdot \mathbf{B}}{\lVert \mathbf{A} \rVert \lVert \mathbf{B} \rVert}\\
    &= \frac{\sum_{i=1}^{n}A_iB_i}{\sqrt{\sum_{i=1}^{n}A_i^2} \cdot \sqrt{\sum_{i = 1}^{n}B_i^2}}
\end{align}
$$

In [3]:
def similarity(u, v):
    """Cosine similarity between vectors, or between the rows of matrices.

    Supported shapes (``d`` is the vector dimension):

    * ``u`` (d,),   ``v`` (d,)   -> scalar
    * ``u`` (N, d), ``v`` (d,)   -> (N,)   one value per row of ``u``
    * ``u`` (d,),   ``v`` (M, d) -> (M,)   one value per row of ``v``
    * ``u`` (N, d), ``v`` (M, d) -> (N, M) every row of ``u`` against every row of ``v``

    Parameters
    ----------
    u : array-like
        Vector or matrix of row vectors.
    v : numpy.ndarray
        Vector or matrix of row vectors (``v.T`` is used, so it must be an array).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Cosine similarity values. A small epsilon is added to the denominator,
        so a zero vector yields 0 instead of a division by zero.
    """
    dot = np.dot(u, v.T)
    norm_u = np.sqrt(np.sum(np.square(u), axis=-1))
    norm_v = np.sqrt(np.sum(np.square(v), axis=-1))
    if np.ndim(norm_u) == 1 and np.ndim(norm_v) == 1:
        # Matrix vs. matrix: `dot` is (N, M), so the denominator must be the
        # outer product of the row norms. An element-wise product would either
        # fail to broadcast or normalise with the wrong pairs of norms.
        length_prod = np.multiply.outer(norm_u, norm_v)
    else:
        # At least one side is a single vector, so its norm is a scalar and
        # plain broadcasting already matches the shape of `dot`.
        length_prod = norm_u * norm_v
    return dot / (length_prod + 1e-12)

In [4]:
# Fetch the embeddings used by the sanity checks below.
father, mother = embedding_index['father'], embedding_index['mother']
ball, crocodile = embedding_index['ball'], embedding_index['crocodile']
france, tehran = embedding_index['france'], embedding_index['tehran']
paris, iran = embedding_index['paris'], embedding_index['iran']

# Each entry is (message, left vector, right vector); the last one compares
# the two country-minus-capital offset vectors.
checks = [
    ('Cosine similarity of (father, mother) is', father, mother),
    ('Cosine similarity of (ball, crocodile) is', ball, crocodile),
    ('Cosine similarity of (france - paris, iran - tehran) is', france - paris, iran - tehran),
]
for message, left, right in checks:
    print(message, similarity(left, right))

Cosine similarity of (father, mother) is 0.8656660919003184
Cosine similarity of (ball, crocodile) is 0.1520657243738079
Cosine similarity of (france - paris, iran - tehran) is 0.6854124336246552


## Word Analogy Task
In the word analogy task, we complete the sentence <font color="red">"a is to b as c is to ____"</font>. An example is <font color="red">"man is to woman as king is to queen"</font>. In detail, we are trying to find a word $d$ such that the associated word vectors $e_a, e_b, e_c, e_d$ are related in the following manner: $e_b - e_a \approx e_d - e_c$. We will measure the similarity between $e_b - e_a$ and $e_d - e_c$ using cosine similarity.

In [5]:
def analogy(word_a, word_b, word_c, embedding_index):
    """Complete the analogy "word_a is to word_b as word_c is to ____".

    Picks the vocabulary word ``d`` whose offset ``e_d - e_c`` has the highest
    cosine similarity to ``e_b - e_a``. The three input words themselves are
    never returned.

    Parameters
    ----------
    word_a, word_b, word_c : str
        Query words; they are lower-cased before lookup.
    embedding_index : dict
        Maps each word to its embedding vector.

    Returns
    -------
    str
        The best-matching word.

    Raises
    ------
    KeyError
        If any of the (lower-cased) query words is not in ``embedding_index``.
    """
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()
    missing = [w for w in (word_a, word_b, word_c) if w not in embedding_index]
    if missing:
        raise KeyError('word(s) not found in embedding_index: {}'.format(missing))
    e_a, e_b, e_c = embedding_index[word_a], embedding_index[word_b], embedding_index[word_c]

    # Keep the vocabulary as a plain list: a NumPy unicode array would pad every
    # entry to the longest word, and indexing it would return np.str_ not str.
    # Dict keys and values iterate in the same order, so row i of the matrix
    # belongs to words[i].
    words = list(embedding_index)
    embedding_matrix = np.array(list(embedding_index.values()), dtype='float32')

    # Compare the target offset (b - a) with the offset (d - c) of every candidate d.
    target_offset = e_b - e_a
    candidate_offsets = embedding_matrix - e_c
    similarities = similarity(candidate_offsets, target_offset)

    # Rule out the query words so the answer cannot be word_a, word_b or word_c.
    excluded = {word_a, word_b, word_c}
    excluded_rows = [row for row, word in enumerate(words) if word in excluded]
    similarities[excluded_rows] = -np.inf

    # Select the most similar remaining word.
    return words[int(np.argmax(similarities))]

In [6]:
# "china is to chinese as iran is to ____"
analogy('china', 'chinese', 'iran', embedding_index)

'iranian'

In [7]:
# "india is to delhi as iran is to ____"
analogy('india', 'delhi', 'iran', embedding_index)

'tehran'

In [8]:
# "man is to woman as boy is to ____"
analogy('man', 'woman', 'boy', embedding_index)

'girl'

In [9]:
# "small is to smaller as big is to ____"
analogy('small', 'smaller', 'big', embedding_index)

'bigger'

In [10]:
# "man is to hardworking as women is to ____"
# NOTE(review): word_a is singular ('man') while word_c is plural ('women') — confirm this pairing is intended.
analogy('man', 'hardworking', 'women', embedding_index) 

'low-income'

In [11]:
# "iran is to farsi as canada is to ____"
analogy('iran', 'farsi', 'canada', embedding_index) 

'inuktitut'