In [2]:
import numpy as np

def softmax(x_):
    """Compute a numerically stable softmax along axis 1.

    The input is promoted to at least two dimensions, so a 1-D vector is
    treated as a single row.

    Args:
        x_: Array-like of scores. Each slice along axis 1 is normalised
            independently.

    Returns:
        A NumPy array with at least two dimensions whose entries along
        axis 1 are non-negative and sum to 1.
    """
    x = np.atleast_2d(x_)
    if x.shape[1] > 0:
        # Softmax is invariant to adding a constant to every score, so shift
        # each row by its maximum: the largest exponent becomes exp(0) = 1 and
        # np.exp can no longer overflow to inf (which would produce nan).
        # The guard keeps empty rows working, since max of an empty axis raises.
        x = x - np.max(x, axis=1, keepdims=True)
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

# Let's consider a limited vocabulary of 9 words.
# Every word gets its own 1x3 embedding, initialised to zeros. Each array is a
# separate object so that it can later be updated in place independently.
word_vects = {
    word: np.zeros((1, 3))
    for word in (
        'yankees', 'bears', 'braves',
        'red', 'sox', 'lose',
        'defeat', 'beat', 'tie',
    )
}

# Classification layer: weight matrix that predicts the next word from a
# sentence vector of size 3 (one output column per vocabulary word).
sent2output = np.random.rand(3, len(word_vects))

# Transition weights (initially the identity matrix)
identity = np.eye(3)

In [3]:
# Show the randomly initialised classification weights (3 rows, one column per vocabulary word).
print(sent2output)

[[0.01636224 0.08583247 0.15471846 0.85304228 0.62501893 0.48109461
  0.55273576 0.50447677 0.56604863]
 [0.22439237 0.75110089 0.67995706 0.1248642  0.89321913 0.85559151
  0.83948413 0.01708117 0.19405993]
 [0.32043313 0.14318351 0.83059157 0.34954823 0.08050018 0.18747478
  0.74834337 0.15884757 0.10447104]]


In [4]:

# Forward pass over the sentence "red sox defeat"; the word to predict is "yankees".
# The sentence vector starts as the first word's embedding. Note that layer_0 is
# the very same array object as word_vects['red'] (no copy is made).
layer_0 = word_vects['red']
# Each further step multiplies the running sentence vector by the transition
# matrix and then adds the next word's embedding.
layer_1 = np.dot(layer_0, identity) + word_vects['sox']
layer_2 = np.dot(layer_1, identity) + word_vects['defeat']

# Score every vocabulary word from the final sentence vector and turn the
# scores into a probability distribution.
pred = softmax(np.dot(layer_2, sent2output))

print(pred)

[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111]]


<img src="./images/sentence_embedding.png" width=400 height=400 />

### Image credit: Grokking Deep Learning

In [7]:
# One-hot target for the expected output "Yankees" (index 0 of the classification layer)
y = np.array([1,0,0,0,0,0,0,0,0])

# Backpropagation
pred_delta = pred - y   # Output error: prediction minus target

# Gradient arriving at the final sentence vector.
layer_2_delta = pred_delta.dot(sent2output.T)

# layer_2 = layer_1.dot(identity) + word_vects['defeat']: an addition passes the
# same gradient to both operands, so 'defeat' receives layer_2_delta unchanged
# while the other branch goes back through the transition matrix.
defeat_delta = layer_2_delta * 1
layer_1_delta = layer_2_delta.dot(identity.T)

# Same pattern one step earlier for 'sox' and layer_0.
sox_delta = layer_1_delta * 1
layer_0_delta = layer_1_delta.dot(identity.T)

alpha = 0.01  # learning rate

# Update the weight matrices FIRST. layer_0 is the same array object as
# word_vects['red'], so the in-place embedding update below also changes
# layer_0. Doing the embedding update before this point would make the
# transition-matrix gradient use the already-updated embedding instead of the
# activation that was actually used in the forward pass.

# Modifying the transition matrix
identity -= np.outer(layer_0, layer_1_delta) * alpha
identity -= np.outer(layer_1, layer_2_delta) * alpha

# Modifying the classification layer
sent2output -= np.outer(layer_2, pred_delta) * alpha

# Modifying the word embeddings based on the gradient
word_vects['red'] -= layer_0_delta * alpha
word_vects['sox'] -= sox_delta * alpha
word_vects['defeat'] -= defeat_delta * alpha


[[0.41023    0.28446878 0.00438836]]


### When you add two vectors together during forward propagation, you backpropagate the same gradient into both sides of the addition. When you generate **layer_2_delta**, you’ll backpropagate it twice: once across the identity matrix to create **layer_1_delta**, and again to **word_vects['defeat']**.