Source : 

https://www.geeksforgeeks.org/implement-your-own-word2vecskip-gram-model-in-python/

In [11]:
import numpy as np 
import string 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
def softmax(x): 
	"""Compute softmax values for each sets of scores in x."""
	e_x = np.exp(x - np.max(x)) 
	return e_x / e_x.sum() 

In [3]:
class word2vec(object): 
	def __init__(self): 
		self.N = 10
		self.X_train = [] 
		self.y_train = [] 
		self.window_size = 2
		self.alpha = 0.001
		self.words = [] 
		self.word_index = {} 

	def initialize(self,V,data): 
		self.V = V 
		self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N)) 
		self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V)) 
		
		self.words = data 
		for i in range(len(data)): 
			self.word_index[data[i]] = i 

	
	def feed_forward(self,X): 
		self.h = np.dot(self.W.T,X).reshape(self.N,1) 
		self.u = np.dot(self.W1.T,self.h) 
		#print(self.u) 
		self.y = softmax(self.u) 
		return self.y 
		
	def backpropagate(self,x,t): 
		e = self.y - np.asarray(t).reshape(self.V,1) 
		# e.shape is V x 1 
		dLdW1 = np.dot(self.h,e.T) 
		X = np.array(x).reshape(self.V,1) 
		dLdW = np.dot(X, np.dot(self.W1,e).T) 
		self.W1 -=  self.alpha*dLdW1 
		self.W -= self.alpha*dLdW 
		
	def train(self,epochs): 
		for x in range(1,epochs+1):		 
			self.loss = 0
			for j in range(len(self.X_train)): 
				self.feed_forward(self.X_train[j]) 
				self.backpropagate(self.X_train[j],self.y_train[j]) 
				C = 0
				for m in range(self.V): 
					if(self.y_train[j][m]): 
						self.loss += -1*self.u[m][0] 
						C += 1
				self.loss += C*np.log(np.sum(np.exp(self.u))) 
			print("epoch ",x, " loss = ",self.loss) 
			self.alpha *= 1/( (1+self.alpha*x) ) 
			
	def predict(self,word,number_of_predictions): 
		if word in self.words: 
			index = self.word_index[word] 
			X = [0 for i in range(self.V)] 
			X[index] = 1
			prediction = self.feed_forward(X) 
			output = {} 
			for i in range(self.V): 
				output[prediction[i][0]] = i 
			
			top_context_words = [] 
			for k in sorted(output,reverse=True): 
				top_context_words.append(self.words[output[k]]) 
				if(len(top_context_words)>=number_of_predictions): 
					break
	
			return top_context_words 
		else: 
			print("Word not found in dicitonary") 

In [4]:
def preprocessing(corpus): 
	stop_words = set(stopwords.words('english'))	 
	training_data = [] 
	sentences = corpus.split(".") 
	for i in range(len(sentences)): 
		sentences[i] = sentences[i].strip() 
		sentence = sentences[i].split() 
		x = [word.strip(string.punctuation) for word in sentence if word not in stop_words] 
		x = [word.lower() for word in x] 
		training_data.append(x) 
	return training_data 

In [5]:
def prepare_data_for_training(sentences,w2v): 
	data = {} 
	for sentence in sentences: 
		for word in sentence: 
			if word not in data: 
				data[word] = 1
			else: 
				data[word] += 1
	V = len(data) 
	data = sorted(list(data.keys())) 
	vocab = {} 
	for i in range(len(data)): 
		vocab[data[i]] = i 
	
	for sentence in sentences: 
		for i in range(len(sentence)): 
			center_word = [0 for x in range(V)] 
			center_word[vocab[sentence[i]]] = 1
			context = [0 for x in range(V)] 
			
			for j in range(i-w2v.window_size,i+w2v.window_size): 
				if i!=j and j>=0 and j<len(sentence): 
					context[vocab[sentence[j]]] += 1
			w2v.X_train.append(center_word) 
			w2v.y_train.append(context) 
	w2v.initialize(V,data) 

	return w2v.X_train,w2v.y_train 

In [12]:
corpus = "The earth revolves around the sun. The moon revolves around the earth"
epochs = 1000

In [13]:
training_data = preprocessing(corpus) 

In [14]:
training_data

[['the', 'earth', 'revolves', 'around', 'sun'],
 ['the', 'moon', 'revolves', 'around', 'earth']]

In [15]:
w2v = word2vec() 

In [16]:
prepare_data_for_training(training_data,w2v) 

([[0, 0, 0, 0, 0, 1],
  [0, 1, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 0],
  [1, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 1, 0],
  [0, 0, 0, 0, 0, 1],
  [0, 0, 1, 0, 0, 0],
  [0, 0, 0, 1, 0, 0],
  [1, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0]],
 [[0, 1, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 1],
  [1, 1, 0, 0, 0, 1],
  [0, 1, 0, 1, 1, 0],
  [1, 0, 0, 1, 0, 0],
  [0, 0, 1, 0, 0, 0],
  [0, 0, 0, 1, 0, 1],
  [1, 0, 1, 0, 0, 1],
  [0, 1, 1, 1, 0, 0],
  [1, 0, 0, 1, 0, 0]])

In [17]:
w2v.train(epochs) 

epoch  1  loss =  45.35307881336559
epoch  2  loss =  45.266765911717016
epoch  3  loss =  45.181172258272035
epoch  4  loss =  45.09637324984151
epoch  5  loss =  45.01244095545078
epoch  6  loss =  44.92944375624252
epoch  7  loss =  44.8474460289852
epoch  8  loss =  44.766507876512314
epoch  9  loss =  44.68668490736191
epoch  10  loss =  44.60802806581595
epoch  11  loss =  44.530583512488874
epoch  12  loss =  44.454392554621094
epoch  13  loss =  44.37949162432353
epoch  14  loss =  44.305912302217905
epoch  15  loss =  44.233681383241624
epoch  16  loss =  44.162820980845616
epoch  17  loss =  44.093348665414446
epoch  18  loss =  44.0252776324753
epoch  19  loss =  43.9586168961339
epoch  20  loss =  43.89337150316502
epoch  21  loss =  43.82954276328339
epoch  22  loss =  43.7671284913082
epoch  23  loss =  43.706123257196005
epoch  24  loss =  43.646518640233786
epoch  25  loss =  43.58830348404174
epoch  26  loss =  43.53146414941574
epoch  27  loss =  43.475984762432404
ep

epoch  329  loss =  40.854432061833215
epoch  330  loss =  40.85339883615712
epoch  331  loss =  40.85237186879757
epoch  332  loss =  40.85135110339865
epoch  333  loss =  40.85033648427509
epoch  334  loss =  40.849327956402455
epoch  335  loss =  40.84832546540733
epoch  336  loss =  40.84732895755788
epoch  337  loss =  40.84633837975437
epoch  338  loss =  40.84535367952008
epoch  339  loss =  40.84437480499206
epoch  340  loss =  40.84340170491242
epoch  341  loss =  40.842434328619454
epoch  342  loss =  40.84147262603909
epoch  343  loss =  40.840516547676515
epoch  344  loss =  40.83956604460768
epoch  345  loss =  40.83862106847136
epoch  346  loss =  40.83768157146096
epoch  347  loss =  40.83674750631672
epoch  348  loss =  40.83581882631794
epoch  349  loss =  40.83489548527538
epoch  350  loss =  40.83397743752365
epoch  351  loss =  40.83306463791404
epoch  352  loss =  40.83215704180708
epoch  353  loss =  40.83125460506553
epoch  354  loss =  40.830357284047366
epoch  

epoch  680  loss =  40.678933600419
epoch  681  loss =  40.67869312539011
epoch  682  loss =  40.67845335958421
epoch  683  loss =  40.678214299872835
epoch  684  loss =  40.6779759431458
epoch  685  loss =  40.677738286311204
epoch  686  loss =  40.67750132629522
epoch  687  loss =  40.67726506004193
epoch  688  loss =  40.67702948451324
epoch  689  loss =  40.67679459668877
epoch  690  loss =  40.67656039356568
epoch  691  loss =  40.67632687215862
epoch  692  loss =  40.67609402949942
epoch  693  loss =  40.675861862637205
epoch  694  loss =  40.67563036863814
epoch  695  loss =  40.67539954458527
epoch  696  loss =  40.67516938757853
epoch  697  loss =  40.67493989473447
epoch  698  loss =  40.67471106318629
epoch  699  loss =  40.67448289008356
epoch  700  loss =  40.67425537259226
epoch  701  loss =  40.67402850789456
epoch  702  loss =  40.67380229318871
epoch  703  loss =  40.67357672568898
epoch  704  loss =  40.67335180262552
epoch  705  loss =  40.67312752124421
epoch  706  

In [18]:
w2v.predict("around",3)

['the', 'moon', 'around']

In [13]:
w2v.predict("around",5)

['revolves', 'moon', 'the', 'earth', 'sun']

In [14]:
w2v.predict("sun",3)

['sun', 'moon', 'around']

In [15]:
w2v.predict("earth",3)

['the', 'earth', 'sun']

In [16]:
w2v.predict("jupiter",3)

Word not found in dicitonary
