## softmax

In [12]:
import numpy as np
import random

In [13]:
def softmax(x):
    """Compute a numerically stable softmax without modifying the input.

    For an array with more than one dimension the softmax is taken along
    axis 1 (row-wise for a 2-D matrix); otherwise it is taken over the
    whole vector.

    Example: x = np.array([[1001, 1002], [3, 4]]) gives
    [[0.26894142, 0.73105858], [0.26894142, 0.73105858]].

    x: numpy array (anything accepted by np.asarray also works).
    Returns a new array of the same shape whose entries along the
    normalised axis sum to 1.
    """
    x = np.asarray(x)

    # Subtracting the maximum leaves the result unchanged mathematically
    # but keeps np.exp from overflowing on large scores.  The subtraction
    # builds a new array, so the caller's data is left untouched.
    if x.ndim > 1:
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=1, keepdims=True)

    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

## gradcheck

In [14]:
def gradcheck_naive(f, x):
    """Compare an analytic gradient with a central-difference estimate.

    For every element of x, the derivative returned by f is checked
    against the numerical approximation [f(x + h) - f(x - h)] / (2h).

    f: callable taking x and returning a pair (cost, gradient), where
       gradient can be indexed with the same multi-index as x.
    x: numpy array at which to check the gradient.  Elements are perturbed
       in place one at a time and restored to their exact original value.

    Prints "Gradient check passed", or a report of the first mismatching
    index.  Returns None in both cases.
    """
    # f may consume random numbers (for example to sample a minibatch).
    # Restoring this saved generator state before every call makes each
    # evaluation of f see the same random draws, so cost differences come
    # only from the perturbation of x.
    rndstate = random.getstate()
    _, grad = f(x)
    h = 1e-4

    # Walk over every index of x, whatever its number of dimensions.
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])

    while not it.finished:
        ix = it.multi_index
        # Keep the original value so it can be put back exactly; adding
        # and subtracting h does not round-trip in floating point.
        old_value = x[ix]

        x[ix] = old_value + h
        random.setstate(rndstate)
        fxh, _ = f(x)

        x[ix] = old_value - h
        random.setstate(rndstate)
        fxnh, _ = f(x)

        x[ix] = old_value
        numgrad = (fxh - fxnh) / 2 / h

        # Relative difference between the numerical and analytic gradients.
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print("Gradient check failed.")
            print("First gradient error found at index %s" % str(ix))
            print("Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad))
            return

        it.iternext()

    print("Gradient check passed")

## word2vec

In [16]:
def normalizeRows(x):
    """Scale every row of x to unit Euclidean length, in place.

    x: 2-D numpy float array; it is modified and also returned.
    A tiny constant is added to each norm so an all-zero row does not
    cause a division by zero.
    """
    row_count = x.shape[0]
    norms = np.sqrt((x ** 2).sum(axis=1))
    x /= norms.reshape((row_count, 1)) + 1e-30
    return x


def test_normalize_rows():
    """Check normalizeRows against a small hand-computed example."""
    # Single-argument print calls behave the same on Python 2 and 3.
    print("Testing normalizeRows...")
    x = normalizeRows(np.array([[3.0, 4.0], [1, 2]]))
    print(x)
    expected = np.array([[0.6, 0.8], [0.4472136, 0.89442719]])
    assert (np.amax(np.fabs(x - expected)) <= 1e-6)
    print("")

### Softmax cost function for word2vec models

In [17]:
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """Softmax cross-entropy cost and gradients for one predicted vector.

    predicted:     vector for the predicted (center) word; treated as 1-D
                   with D = predicted.shape[0] entries.
    target:        index of the target word among the rows of outputVectors.
    outputVectors: matrix with one output vector per row.
    dataset:       not used by this cost function.

    Returns (cost, gradPred, grad): the cost, its gradient with respect
    to predicted, and its gradient with respect to outputVectors.
    """
    scores = predicted.dot(outputVectors.T)
    probs = softmax(scores)
    cost = -np.log(probs[target])

    # Turn the probabilities into (probabilities - one-hot target), the
    # derivative of the cost with respect to the scores.
    probs[target] -= 1

    num_words = probs.shape[0]
    dim = predicted.shape[0]
    as_column = probs.reshape((num_words, 1))
    as_row = probs.reshape((1, num_words))

    grad = as_column * predicted.reshape((1, dim))
    gradPred = as_row.dot(outputVectors).flatten()

    return cost, gradPred, grad

### Skip-gram model in word2vec

In [18]:
def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
             dataset, word2vecCostAndGradient = softmaxCostAndGradient):
    """Skip-gram cost and gradients for one center word and its context.

    currentWord:   the center word (a key of tokens).
    C:             context size; not used inside this function.
    contextWords:  iterable of context words (keys of tokens).
    tokens:        mapping from word to row index in the vector matrices.
    inputVectors:  matrix of input (center) word vectors, one per row.
    outputVectors: matrix of output word vectors, one per row.
    dataset:       passed through to word2vecCostAndGradient.
    word2vecCostAndGradient: function returning (cost, gradPred, grad)
                   for one (predicted, target) pair.

    Returns (cost, gradIn, gradOut), each summed over all context words.
    Only the center word's row of gradIn is non-zero.
    """
    center_idx = tokens[currentWord]
    center_vec = inputVectors[center_idx, :]

    gradIn = np.zeros(inputVectors.shape)
    gradOut = np.zeros(outputVectors.shape)
    total_cost = 0.0

    for word in contextWords:
        word_cost, grad_center, grad_output = word2vecCostAndGradient(
            center_vec, tokens[word], outputVectors, dataset)
        total_cost += word_cost
        gradOut += grad_output
        gradIn[center_idx, :] += grad_center

    return total_cost, gradIn, gradOut

###  word2vec_sgd_wrapper

In [19]:
def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C, word2vecCostAndGradient = softmaxCostAndGradient):
    """Average a word2vec model's cost and gradient over a random batch.

    word2vecModel: model function (e.g. skipgram) called as
                   word2vecModel(centerword, C1, context, tokens,
                   inputVectors, outputVectors, dataset,
                   word2vecCostAndGradient); returns (cost, gradIn, gradOut).
    tokens:        mapping from word to row index.
    wordVectors:   matrix whose first half of rows are the input vectors
                   and second half the output vectors.
    dataset:       object providing getRandomContext(C1), which returns
                   (centerword, context).
    C:             maximum context size; each sample draws its own size
                   uniformly from 1..C.
    word2vecCostAndGradient: cost function forwarded to the model.

    Returns (cost, grad), where grad has the shape of wordVectors.
    """
    batchsize = 50
    cost = 0.0
    grad = np.zeros(wordVectors.shape)   # one gradient entry per element of wordVectors

    # Floor division keeps the split point an integer on both Python 2 and 3;
    # a float index would make the slices below fail on Python 3.
    half = wordVectors.shape[0] // 2
    inputVectors = wordVectors[:half, :]
    outputVectors = wordVectors[half:, :]

    for _ in range(batchsize):
        C1 = random.randint(1, C)
        centerword, context = dataset.getRandomContext(C1)

        c, gin, gout = word2vecModel(centerword, C1, context, tokens, inputVectors, outputVectors, dataset, word2vecCostAndGradient)

        # Accumulate the batch average.
        cost += c / batchsize
        grad[:half, :] += gin / batchsize
        grad[half:, :] += gout / batchsize

    return cost, grad

## test_word2vec

In [20]:
def test_word2vec():
    """Gradient-check skip-gram on a tiny dummy dataset and print one result."""
    # Minimal stand-in for the dataset interface the models expect.
    dataset = type('dummy', (), {})()

    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        # The center word is drawn first, then 2*C context words, so the
        # sequence of random draws stays fixed for a given seed.
        return tokens[random.randint(0, 4)], [tokens[random.randint(0, 4)]
                                              for i in range(2 * C)]

    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    # Fixed seeds make the check and the printed result reproducible.
    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10, 3))
    dummy_tokens = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}

    print("==== Gradient check for skip-gram ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5), dummy_vectors)

    print("\n=== Results ===")
    print(skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset))

# Run the word2vec test only when this file is executed as a script.
if __name__ == "__main__":
    test_word2vec()

==== Gradient check for skip-gram ====
Gradient check passed

=== Results ===
(11.166109001533981, array([[ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [-1.26947339, -1.36873189,  2.45158957],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ]]), array([[-0.41045956,  0.18834851,  1.43272264],
       [ 0.38202831, -0.17530219, -1.33348241],
       [ 0.07009355, -0.03216399, -0.24466386],
       [ 0.09472154, -0.04346509, -0.33062865],
       [-0.13638384,  0.06258276,  0.47605228]]))
