## softmax

In [1]:
import numpy as np
import random

In [2]:
def softmax(x):
    """Compute a numerically stable softmax along the last axis of ``x``.

    For a 1-D input the softmax is taken over the whole vector; for a 2-D
    input it is taken independently over each row (e.g.
    ``[[1001, 1002], [3, 4]]`` is normalised row by row). Higher-rank inputs
    are normalised over their last axis.

    The maximum of each slice is subtracted before exponentiating so that
    large scores such as 1001/1002 do not overflow ``np.exp``; this shift
    does not change the result.

    Arguments:
    x -- array-like of scores. It is NOT modified; a new array is returned.

    Returns:
    A floating-point array of the same shape as ``x`` whose entries along the
    last axis are non-negative and sum to 1.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Integer scores must be promoted, otherwise the division below
        # could not be represented.
        x = x.astype(float)

    # A 0-d input has no axis to reduce over; reduce over everything instead.
    axis = -1 if x.ndim > 0 else None

    # `x - max` allocates a fresh array, so the caller's data stays untouched.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

## gradcheck

In [3]:
def gradcheck_naive(f, x):
    """Check an analytic gradient against a central finite difference.

    For every element ``x[ix]`` the numerical derivative
    ``(f(x + h*e_ix) - f(x - h*e_ix)) / (2*h)`` is compared with the
    gradient returned by ``f``. The check stops at the first element whose
    relative difference exceeds 1e-5.

    Arguments:
    f -- callable taking ``x`` and returning ``(cost, gradient)``, where
         ``gradient`` can be indexed like ``x``
         (e.g. ``lambda x: (np.sum(x ** 2), x * 2)``)
    x -- floating-point numpy array at which to check the gradient; each
         element is perturbed in place and then restored to its exact
         original value

    Returns:
    None. The outcome is reported on standard output.
    """
    # f may draw from Python's `random` module (negative sampling, for
    # example, samples token indices). Rewinding the generator to the same
    # state before every evaluation makes all calls of f see identical random
    # draws, so the finite difference is not polluted by sampling noise.
    rndstate = random.getstate()
    _, grad = f(x)
    h = 1e-4

    # Iterate over every index of x, whatever its number of dimensions.
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        # Keep the original value so it can be restored exactly; adding and
        # subtracting h would leave a small rounding error behind.
        original_value = x[ix]

        x[ix] = original_value + h
        random.setstate(rndstate)
        fxh, _ = f(x)

        x[ix] = original_value - h
        random.setstate(rndstate)
        fxnh, _ = f(x)

        x[ix] = original_value
        numgrad = (fxh - fxnh) / (2 * h)

        # Relative difference between the numerical and analytic gradients;
        # the denominator is clamped to 1 so tiny gradients are compared
        # on an absolute scale.
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print("Gradient check failed.")
            print("First gradient error found at index %s" % str(ix))
            print("Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad))
            return

        it.iternext()

    print("Gradient check passed")

## sigmoid

In [4]:
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), applied element-wise.

    Arguments:
    x -- a scalar or numpy array

    Returns:
    The sigmoid of ``x``, with the same shape as ``x``.
    """
    return 1. / (1 + np.exp(-x))

def sigmoid_grad(f):
    """Return the derivative of the sigmoid, given the sigmoid's output.

    Arguments:
    f -- the value sigmoid(x) for the original input x (NOT x itself)

    Returns:
    f * (1 - f), element-wise.
    """
    return (1 - f) * f

## Neural Network

In [5]:
N = 20                                      # number of examples
dimensions = [10, 5, 10]                    # sizes of the input, hidden and output (label) layers
data = np.random.randn(N, dimensions[0])    # shape (20, 10)
labels = np.zeros((N, dimensions[2]))       # shape (20, 10)
for i in range(N):
    # One-hot target: each row gets a single 1 at a randomly chosen column.
    labels[i, random.randint(0, dimensions[2] - 1)] = 1

# Flat parameter vector holding W1, b1, W2 and b2 back to back:
# (10 + 1) * 5 + (5 + 1) * 10 = 115 values.
params = np.random.randn((dimensions[0] + 1) * dimensions[1] +
                         (dimensions[1] + 1) * dimensions[2], )


In [6]:
# Slice the flat parameter vector into W1, b1, W2, b2 (in that order).
# `ofs` tracks where the next block of values starts.
Dx, H, Dy = dimensions[0], dimensions[1], dimensions[2]
ofs = 0

W1 = params[ofs:ofs + Dx * H].reshape((Dx, H))      # input -> hidden weights, shape (10, 5)
ofs += Dx * H
b1 = params[ofs:ofs + H].reshape((1, H))            # hidden bias, shape (1, 5)
ofs += H
W2 = params[ofs:ofs + H * Dy].reshape((H, Dy))      # hidden -> output weights, shape (5, 10)
ofs += H * Dy
b2 = params[ofs:ofs + Dy].reshape((1, Dy))          # output bias, shape (1, 10)

In [7]:
W1

array([[-0.086359  , -1.7893894 ,  1.72879812, -0.37633982, -0.61258456],
       [-0.22882234,  0.73135113,  0.77769584, -1.07391963,  0.73834674],
       [ 1.50172915, -0.7012838 , -0.5635071 , -0.37266293,  1.74841039],
       [-1.50710725,  0.32849134, -1.49382086,  0.25151469, -0.28410388],
       [-0.9832677 , -0.49509124, -1.660077  ,  0.10368796,  0.24700598],
       [ 0.45922949,  1.02942231, -0.11449204,  0.58113958, -0.63698562],
       [-0.38942537,  1.46078631, -2.53980613, -0.02312754, -1.79198136],
       [-0.34912294,  0.57365517, -0.04883836,  0.0150068 , -0.4150821 ],
       [-1.22877365, -0.89924689, -0.31146598, -1.59621439,  0.65435119],
       [ 0.03179186,  0.69071231,  1.69296701, -0.57461252,  0.09830557]])

In [8]:
data.dot(W1)

array([[ 1.74739421, -2.52766907,  4.63602874, -0.99422554,  1.18788305],
       [ 0.06737661, -1.42986099,  0.65388524,  1.34774694,  1.54353151],
       [-0.29748813, -1.18121227,  2.68004763, -1.44619471, -0.90652675],
       [ 1.73578308,  2.66167759,  5.60514929, -1.50578679, -1.33226899],
       [ 1.49060587, -3.27278625, -3.20716246,  0.52243271,  4.69049248],
       [-2.88377063, -2.578496  , -6.28608908,  1.13592949,  0.0573391 ],
       [ 1.8516827 ,  0.45395863,  5.42309698, -4.06832975,  2.83581701],
       [-2.75855323,  2.56198846,  1.43315557, -2.35535709, -2.74202105],
       [-0.05986138, -2.45954458,  5.07502389, -5.28570851,  4.12189691],
       [ 1.25845335, -1.67623477,  7.0670415 , -0.26483213,  1.20555565],
       [-4.30714775,  7.82231375, -4.76031918, -0.283028  , -3.91621723],
       [ 2.87494752,  0.52854677,  6.69335065, -0.40067715, -0.08428161],
       [ 1.43050942,  2.30168206, -2.46841334, -0.63324945,  1.85357011],
       [ 3.91095724, -0.9530142 ,  3.2

In [9]:
b1

array([[-1.0605455 , -0.38391412,  0.06158494,  0.50079202, -0.84483032]])

In [10]:
data.dot(W1) + b1

array([[ 0.68684871, -2.91158319,  4.69761367, -0.49343352,  0.34305273],
       [-0.99316888, -1.81377511,  0.71547018,  1.84853896,  0.69870119],
       [-1.35803362, -1.56512639,  2.74163256, -0.94540269, -1.75135707],
       [ 0.67523758,  2.27776347,  5.66673422, -1.00499477, -2.17709931],
       [ 0.43006037, -3.65670036, -3.14557752,  1.02322473,  3.84566217],
       [-3.94431612, -2.96241012, -6.22450414,  1.63672151, -0.78749121],
       [ 0.7911372 ,  0.07004452,  5.48468191, -3.56753773,  1.9909867 ],
       [-3.81909873,  2.17807434,  1.49474051, -1.85456507, -3.58685137],
       [-1.12040688, -2.8434587 ,  5.13660883, -4.78491649,  3.27706659],
       [ 0.19790786, -2.06014889,  7.12862644,  0.23595989,  0.36072533],
       [-5.36769324,  7.43839963, -4.69873425,  0.21776402, -4.76104755],
       [ 1.81440202,  0.14463265,  6.75493559,  0.10011487, -0.92911192],
       [ 0.36996393,  1.91776794, -2.4068284 , -0.13245743,  1.0087398 ],
       [ 2.85041174, -1.33692832,  3.3

In [12]:
# Forward pass, hidden layer: affine transform (the (1, H) bias b1 is broadcast
# over the rows) followed by the element-wise sigmoid.
hidden = sigmoid(data.dot(W1) + b1)

In [13]:
hidden

array([[ 0.66526554,  0.05158393,  0.99096536,  0.37908505,  0.58493188],
       [ 0.27028662,  0.14018249,  0.67160874,  0.86395547,  0.66789975],
       [ 0.20456008,  0.17291227,  0.93943905,  0.27981032,  0.14787611],
       [ 0.66267495,  0.9070186 ,  0.99655278,  0.26796052,  0.10182591],
       [ 0.60588808,  0.02516779,  0.04126589,  0.73560026,  0.97907497],
       [ 0.0189966 ,  0.04915324,  0.00197639,  0.83708834,  0.31270761],
       [ 0.68807546,  0.51750397,  0.9958673 ,  0.02745047,  0.87984749],
       [ 0.02147622,  0.89826323,  0.81678873,  0.1353378 ,  0.02693953],
       [ 0.24593582,  0.05502043,  0.99415676,  0.0082856 ,  0.96363363],
       [ 0.5493161 ,  0.1130309 ,  0.99919882,  0.55871779,  0.589216  ],
       [ 0.00464322,  0.99941212,  0.00902461,  0.55422688,  0.00848405],
       [ 0.85989306,  0.53609526,  0.99883624,  0.52500783,  0.28310492],
       [ 0.59145026,  0.87188932,  0.08265348,  0.46693397,  0.73277345],
       [ 0.94533996,  0.20801565,  0.9

In [15]:
# Forward pass, output layer: affine transform of the hidden activations,
# then a row-wise softmax so each row is a probability distribution.
prediction = softmax(hidden.dot(W2) + b2)

In [16]:
prediction

array([[ 0.37591471,  0.02592661,  0.01899812,  0.00541773,  0.0157326 ,
         0.00483973,  0.14582068,  0.2522755 ,  0.09127206,  0.06380227],
       [ 0.28367236,  0.03853217,  0.01910914,  0.01931276,  0.0236569 ,
         0.00814624,  0.29496255,  0.11442976,  0.16611075,  0.03206737],
       [ 0.24680205,  0.03131329,  0.06243686,  0.009442  ,  0.0340785 ,
         0.01374054,  0.20746283,  0.23752421,  0.07748739,  0.07971232],
       [ 0.12291278,  0.00670877,  0.06067634,  0.0044787 ,  0.02071233,
         0.01098134,  0.12260692,  0.52175974,  0.06575338,  0.0634097 ],
       [ 0.27505427,  0.0534861 ,  0.00850434,  0.03578428,  0.01315435,
         0.00517647,  0.27086285,  0.04977741,  0.25494855,  0.03325139],
       [ 0.12875119,  0.0570546 ,  0.03399454,  0.05375922,  0.04759961,
         0.01366274,  0.39188392,  0.03231193,  0.20989546,  0.0310868 ],
       [ 0.33433522,  0.01699837,  0.01389638,  0.00606421,  0.00453218,
         0.00630411,  0.1068146 ,  0.38944567

In [17]:
# Cross-entropy cost summed over all examples. `labels` is one-hot, so the
# element-wise product keeps only the log-probability of each true class.
cost = -np.sum(np.log(prediction) * labels)

In [18]:
cost

68.768467527309085

In [19]:
np.log(prediction)

array([[-0.97839301, -3.65248544, -3.96341549, -5.21807823, -4.15202049,
        -5.33089692, -1.92537764, -1.37723352, -2.39391059, -2.75196644],
       [-1.25993536, -3.25626191, -3.95758829, -3.94698935, -3.74410038,
        -4.81019863, -1.2209069 , -2.1677941 , -1.79510054, -3.43991625],
       [-1.39916869, -3.46371277, -2.77359946, -4.66258694, -3.37908846,
        -4.28740455, -1.57280307, -1.43748573, -2.55764007, -2.5293311 ],
       [-2.0962803 , -5.00433904, -2.80220138, -5.40842331, -3.87702616,
        -4.51155793, -2.09877178, -0.65054806, -2.7218442 , -2.7581385 ],
       [-1.29078686, -2.92833343, -4.76717903, -3.33024647, -4.33100295,
        -5.2636322 , -1.30614269, -3.00019409, -1.36669353, -3.40365861],
       [-2.04987347, -2.86374663, -3.38155534, -2.92324009, -3.04493075,
        -4.29308289, -0.93678961, -3.43231879, -1.5611457 , -3.47097203],
       [-1.09561115, -4.07463795, -4.27612691, -5.1053517 , -5.3965521 ,
        -5.06655269, -2.23666069, -0.94303092

In [21]:
labels

array([[ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.

In [22]:
np.log(prediction) * labels

array([[-0.        , -0.        , -0.        , -0.        , -4.15202049,
        -0.        , -0.        , -0.        , -0.        , -0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.        ,
        -0.        , -0.        , -0.        , -1.79510054, -0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.        ,
        -0.        , -0.        , -1.43748573, -0.        , -0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.        ,
        -4.51155793, -0.        , -0.        , -0.        , -0.        ],
       [-0.        , -0.        , -4.76717903, -0.        , -0.        ,
        -0.        , -0.        , -0.        , -0.        , -0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.        ,
        -4.29308289, -0.        , -0.        , -0.        , -0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.        ,
        -0.        , -0.        , -0.        

In [24]:
# Backward pass, output layer: error of the softmax output w.r.t. the labels.
delta = prediction - labels
gradW2 = np.dot(hidden.T, delta)
gradb2 = delta.sum(axis=0)

# Backward pass, hidden layer: push the error back through W2 and the sigmoid.
# `delta` is deliberately rebound to the hidden-layer error here.
delta = np.dot(delta, W2.T) * sigmoid_grad(hidden)
gradW1 = np.dot(data.T, delta)
gradb1 = delta.sum(axis=0)

# Flatten in the same order the parameters are stored: W1, b1, W2, b2.
grad = np.concatenate(
    (gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten()))

In [34]:
grad.shape

(115,)

# word2vec

### some initialization

In [35]:
def normalizeRows(x):
    """Scale every row of a 2-D float array to unit Euclidean length.

    The normalisation is done IN PLACE: ``x`` itself is modified and the same
    array object is returned.

    Arguments:
    x -- 2-D floating-point numpy array

    Returns:
    ``x``, with each row divided by its L2 norm.
    """
    row_norms = np.sqrt((x ** 2).sum(axis=1, keepdims=True))
    # The tiny constant keeps an all-zero row from causing a division by zero.
    x /= row_norms + 1e-30
    return x

In [36]:
# Quick sanity check: the rows [3, 4] and [1, 2] should come back with unit length.
x = normalizeRows(np.array([[3.0, 4.0], [1, 2]]))
print(x)

[[ 0.6         0.8       ]
 [ 0.4472136   0.89442719]]


In [56]:
# Build an empty class named 'dummy' on the fly and instantiate it. The
# instance is a bare attribute holder standing in for a real dataset object.
dataset = type('dummy', (), {})()

In [59]:
def dummySampleTokenIdx():
    """Return a random token index between 0 and 4 (both inclusive)."""
    lowest, highest = 0, 4
    return random.randint(lowest, highest)

In [60]:
random.randint(0, 4)

3

In [62]:
def getRandomContext(C):
    """Return a random center word and 2*C random context words.

    Arguments:
    C -- context size; the context list has 2*C entries

    Returns:
    A tuple ``(center, context)`` where ``center`` is one of the five dummy
    tokens and ``context`` is a list of 2*C tokens, each drawn uniformly and
    independently (repeats are possible).
    """
    tokens = ["a", "b", "c", "d", "e"]
    # The center word is drawn first, then the context words, so the order of
    # random draws is fixed for a given seed.
    center = tokens[random.randint(0, 4)]
    context = [tokens[random.randint(0, 4)] for _ in range(2 * C)]
    return center, context

In [69]:
# Attach the sampler as a plain attribute of the instance (not of its class),
# so it is called as dataset.sampleTokenIdx() with no implicit `self`.
dataset.sampleTokenIdx = dummySampleTokenIdx

In [70]:
dataset.sampleTokenIdx

<function __main__.dummySampleTokenIdx>

In [71]:
# Attach the context generator as a plain attribute of the instance, so it is
# called as dataset.getRandomContext(C) with no implicit `self`.
dataset.getRandomContext = getRandomContext

In [72]:
dataset.getRandomContext

<function __main__.getRandomContext>

In [73]:
# Seed Python's `random` module, which the dummy dataset functions draw from.
random.seed(31415)

In [75]:
# Seed NumPy's global random generator, which np.random.randn draws from.
np.random.seed(9265)

In [78]:
np.random.randn(10,3)

array([[-0.70587231,  0.25710111, -0.35533466],
       [ 0.31069259, -2.04599152, -0.56811484],
       [ 1.36743115, -1.5807286 ,  0.88547839],
       [ 0.70812613,  0.06703542, -0.06339527],
       [ 1.93164664, -0.39561042, -1.8199931 ],
       [ 0.02396553,  1.12269695, -0.06042891],
       [-0.24787301, -0.57285093,  1.13467606],
       [-0.62011217, -1.66330779, -1.63206995],
       [ 0.35296059,  0.66356772,  2.07439977],
       [-0.47973227, -0.16808006,  0.47249777]])

In [79]:
# 10 random 3-dimensional vectors, each row scaled to unit length.
# word2vec_sgd_wrapper treats the first half of the rows as input vectors
# and the second half as output vectors.
dummy_vectors = normalizeRows(np.random.randn(10,3))

In [81]:
# Map each dummy token to its row index in the word-vector matrices.
dummy_tokens = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}

In [82]:
dummy_tokens

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}

### Softmax cost function for word2vec models

In [83]:
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """Softmax cost function for word2vec models.

    Arguments:
    predicted -- 1-D vector for the predicted word, length D
    target -- integer row index of the expected word in ``outputVectors``
    outputVectors -- matrix with one output vector per row, shape (N, D)
    dataset -- not used here; accepted so the signature matches
               negSamplingCostAndGradient

    Returns:
    cost -- cross-entropy cost of the softmax prediction for ``target``
    gradPred -- gradient of the cost w.r.t. ``predicted``, shape (D,)
    grad -- gradient of the cost w.r.t. ``outputVectors``, shape (N, D)
    """
    scores = np.dot(predicted, outputVectors.T)
    probs = softmax(scores)
    cost = -np.log(probs[target])

    # Gradient w.r.t. the scores is (probs - one_hot(target)). The cost has
    # already been read out, so the probability vector is reused in place.
    probs[target] -= 1
    err = probs
    numWords = err.shape[0]
    dim = predicted.shape[0]

    # Outer product: row i of grad is err[i] * predicted.
    grad = err.reshape((numWords, 1)) * predicted.reshape((1, dim))
    gradPred = np.dot(err.reshape((1, numWords)), outputVectors).flatten()

    return cost, gradPred, grad


### Negative sampling cost function for word2vec models

In [84]:
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """Negative sampling cost function for word2vec models.

    Arguments:
    predicted -- 1-D vector for the predicted word, length D
    target -- integer row index of the expected word in ``outputVectors``
    outputVectors -- matrix with one output vector per row, shape (N, D)
    dataset -- object providing ``sampleTokenIdx()``, which returns a random
               row index into ``outputVectors``
    K -- number of negative samples to draw

    Returns:
    cost -- negative-sampling cost for ``target`` and the K drawn samples
    gradPred -- gradient of the cost w.r.t. ``predicted``, shape (D,)
    grad -- gradient of the cost w.r.t. ``outputVectors``, shape (N, D);
            rows that were neither the target nor sampled stay zero

    Note: samples equal to ``target`` are re-drawn, so this loops forever if
    ``dataset.sampleTokenIdx()`` can only ever return ``target``.
    """
    # Position 0 holds the true target; positions 1..K hold negative samples.
    indices = [target]
    for _ in range(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == target:
            newidx = dataset.sampleTokenIdx()
        indices.append(newidx)

    # +1 for the target, -1 for every negative sample.
    labels = np.array([1] + [-1] * K)
    vecs = outputVectors[indices, :]

    t = sigmoid(vecs.dot(predicted) * labels)
    cost = -np.sum(np.log(t))

    delta = labels * (t - 1)
    gradPred = delta.dot(vecs)

    # The same word may be sampled more than once, so contributions must be
    # accumulated per row; np.add.at adds every entry even for repeated indices
    # (a plain `grad[indices] += ...` would silently drop duplicates).
    grad = np.zeros(outputVectors.shape)
    np.add.at(grad, indices, np.outer(delta, predicted))

    return cost, gradPred, grad

### Skip-gram model in word2vec

In [85]:
def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
        dataset, word2vecCostAndGradient = softmaxCostAndGradient):
    """Skip-gram model in word2vec.

    The center word's input vector is used to predict every context word; the
    costs and gradients of the individual predictions are summed.

    Arguments:
    currentWord -- the center word (a key of ``tokens``)
    C -- context size; not used by this function
    contextWords -- iterable of context words (keys of ``tokens``)
    tokens -- mapping from word to row index in the vector matrices
    inputVectors -- matrix of input word vectors, one per row
    outputVectors -- matrix of output word vectors, one per row
    dataset -- passed through unchanged to ``word2vecCostAndGradient``
    word2vecCostAndGradient -- function returning (cost, gradPred, grad) for a
                               single (predicted, target) pair

    Returns:
    cost -- summed cost over all context words
    gradIn -- gradient w.r.t. ``inputVectors`` (non-zero only in the center row)
    gradOut -- gradient w.r.t. ``outputVectors``
    """
    centerIdx = tokens[currentWord]
    centerVec = inputVectors[centerIdx, :]

    totalCost = 0.0
    gradIn = np.zeros(inputVectors.shape)
    gradOut = np.zeros(outputVectors.shape)

    for targetIdx in (tokens[word] for word in contextWords):
        stepCost, stepGradPred, stepGradOut = word2vecCostAndGradient(
            centerVec, targetIdx, outputVectors, dataset)
        totalCost += stepCost
        gradOut += stepGradOut
        # Every prediction used the same center vector, so all input-side
        # gradients accumulate in that one row.
        gradIn[centerIdx, :] += stepGradPred

    return totalCost, gradIn, gradOut

### CBOW model in word2vec

In [86]:
def cbow(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
        dataset, word2vecCostAndGradient = softmaxCostAndGradient):
    """CBOW model in word2vec.

    The input vectors of all context words are summed and that sum is used to
    predict the center word.

    Arguments:
    currentWord -- the center word (a key of ``tokens``)
    C -- context size; not used by this function
    contextWords -- iterable of context words (keys of ``tokens``)
    tokens -- mapping from word to row index in the vector matrices
    inputVectors -- matrix of input word vectors, one per row
    outputVectors -- matrix of output word vectors, one per row
    dataset -- passed through unchanged to ``word2vecCostAndGradient``
    word2vecCostAndGradient -- function returning (cost, gradPred, grad) for a
                               single (predicted, target) pair

    Returns:
    cost -- cost of predicting the center word from the summed context
    gradIn -- gradient w.r.t. ``inputVectors``
    gradOut -- gradient w.r.t. ``outputVectors``
    """
    contextIdx = [tokens[word] for word in contextWords]

    # Sum (not average) of the context words' input vectors.
    predicted = np.zeros((inputVectors.shape[1], ))
    for row in contextIdx:
        predicted += inputVectors[row, :]

    cost, gp, gradOut = word2vecCostAndGradient(
        predicted, tokens[currentWord], outputVectors, dataset)

    # Each occurrence of a context word contributed once to the sum, so it
    # receives the gradient once per occurrence (repeats accumulate).
    gradIn = np.zeros(inputVectors.shape)
    for row in contextIdx:
        gradIn[row, :] += gp

    return cost, gradIn, gradOut

## word2vec_sgd_wrapper

In [93]:

def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C, word2vecCostAndGradient = softmaxCostAndGradient):
    """Average the cost and gradient of a word2vec model over a small batch.

    Arguments:
    word2vecModel -- model function such as ``skipgram`` or ``cbow``
    tokens -- mapping from word to row index in the vector matrices
    wordVectors -- matrix stacking the input vectors (first half of the rows)
                   on top of the output vectors (second half of the rows)
    dataset -- object providing ``getRandomContext(C)``; also passed through
               to the model
    C -- maximum context size; each batch item draws its own size from 1..C
    word2vecCostAndGradient -- per-pair cost/gradient function handed to the
                               model

    Returns:
    cost -- cost averaged over the batch
    grad -- gradient w.r.t. ``wordVectors`` averaged over the batch, same shape
            as ``wordVectors``
    """
    batchsize = 5
    cost = 0.0
    grad = np.zeros(wordVectors.shape)

    # Floor division keeps the split point an integer, which slicing requires.
    half = wordVectors.shape[0] // 2
    inputVectors = wordVectors[:half, :]
    outputVectors = wordVectors[half:, :]

    for _ in range(batchsize):
        C1 = random.randint(1, C)
        centerword, context = dataset.getRandomContext(C1)

        c, gin, gout = word2vecModel(centerword, C1, context, tokens,
                                     inputVectors, outputVectors, dataset,
                                     word2vecCostAndGradient)
        cost += c / batchsize
        grad[:half, :] += gin / batchsize
        grad[half:, :] += gout / batchsize

    return cost, grad

In [89]:
dummy_vectors

array([[-0.61331763,  0.77875956, -0.13181438],
       [-0.474378  , -0.55783081,  0.68102151],
       [-0.30087076, -0.54142512, -0.78507046],
       [ 0.02253624, -0.68215815,  0.73085729],
       [-0.61861899, -0.02709019, -0.78522396],
       [ 0.24199906, -0.02173036,  0.97003312],
       [ 0.07424959,  0.73251154, -0.67669332],
       [ 0.67872663, -0.66531069, -0.31095313],
       [ 0.7732178 ,  0.41869739,  0.4762633 ],
       [-0.47861833, -0.74233518,  0.46889549]])

In [94]:
word2vec_sgd_wrapper(skipgram, dummy_tokens, dummy_vectors, dataset, 5)

(12.016628332220138, array([[ 0.        ,  0.        ,  0.        ],
        [-0.46457538, -0.11173218,  0.16258121],
        [-0.1913188 , -0.84644701, -0.49131664],
        [ 0.1904991 , -0.24900435,  0.69459972],
        [ 0.        ,  0.        ,  0.        ],
        [-0.15872594, -0.37973335, -0.04377402],
        [-0.06560221,  0.26280499, -0.35817857],
        [-0.11211205, -0.24245679, -0.7811653 ],
        [ 0.44945344,  0.5126397 ,  0.88405186],
        [-0.11301324, -0.15325455,  0.29906603]]))

In [95]:
dummy_vectors.shape[0]

10

In [104]:
random.randint(1,5)

2

In [105]:
def getRandomContext(C):
    """Return a random center word and 2*C random context words.

    NOTE(review): this repeats the definition of getRandomContext from an
    earlier cell of this notebook; whichever cell runs last wins.

    Arguments:
    C -- context size; the context list has 2*C entries

    Returns:
    A tuple ``(center, context)`` where ``center`` is one of the five dummy
    tokens and ``context`` is a list of 2*C tokens, each drawn uniformly and
    independently (repeats are possible).
    """
    tokens = ["a", "b", "c", "d", "e"]
    # The center word is drawn first, then the context words, so the order of
    # random draws is fixed for a given seed.
    center = tokens[random.randint(0, 4)]
    context = [tokens[random.randint(0, 4)] for _ in range(2 * C)]
    return center, context

getRandomContext(3)

('d', ['d', 'd', 'd', 'e', 'a', 'd'])