In [1]:
import numpy as np 
import string 
from nltk.corpus import stopwords  

In [2]:
def softmax(x): 
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x)) 
    return e_x / e_x.sum() 

In [3]:
class word2vec(object): 
    def __init__(self): 
        self.N = 10
        self.X_train = [] 
        self.y_train = [] 
        self.window_size = 2
        self.alpha = 0.001 # learning rate
        self.words = [] 
        self.word_index = {} 
   
    def initialize(self,V,data): 
        self.V = V #number of words(nodes): 6
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V)) # weight matrix_1
           
        self.words = data # words in sentences(nodes in graph)
        for i in range(len(data)): # store index by word in word_index dictionary
            self.word_index[data[i]] = i 
   
       
    def feed_forward(self,X): 
        self.h = np.dot(self.W.T,X).reshape(self.N,1) #compute hidden layer
        self.u = np.dot(self.W1.T,self.h) #compute output layer
        #print(self.u) 
        self.y = softmax(self.u) #finally apply softmax to the output (34,2) 
        return self.y 
           
    def backpropagate(self,x,t): #train by backpropagating
        e = self.y - np.asarray(t).reshape(self.V,1) #(34,2) - (34,2)
        # e.shape is V x 1 
        dLdW1 = np.dot(self.h,e.T) 
        X = np.array(x).reshape(self.V,1) 
        dLdW = np.dot(X, np.dot(self.W1,e).T) 
        self.W1 = self.W1 - self.alpha*dLdW1 
        self.W = self.W - self.alpha*dLdW 
           
    def train(self,epochs): #training process with given epochs
        for x in range(1,epochs):         
            self.loss = 0 #initialize loss
            for j in range(len(self.X_train)): 
                self.feed_forward(self.X_train[j]) 
                self.backpropagate(self.X_train[j],self.y_train[j]) 
                C = 0
                for m in range(self.V): 
                    if(self.y_train[j][m]): 
                        self.loss += -1*self.u[m][0] 
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u))) #update loss
            print("epoch ",x, " loss = ",self.loss) 
            self.alpha *= 1/( (1+self.alpha*x) ) 
              
    def predict(self,word,number_of_predictions): # predict context words that appear within window length
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X) 
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i 
               
            top_context_words = [] 
            for k in sorted(output,reverse=True): 
                top_context_words.append(self.words[output[k]]) 
                if(len(top_context_words)>=number_of_predictions): 
                    break
       
            return top_context_words 
        else: 
            print("Word not found in dicitonary")   

In [4]:
#Data cleaning and preprocessing
def preprocessing(corpus): #input : "The earth revolves around the sun. The moon revolves around the earth"
    stop_words = set(stopwords.words('english'))#eliminate stop words   
    training_data = [] 
    sentences = corpus.split(".")#split whole input sentence based on "."
    for i in range(len(sentences)): 
        sentences[i] = sentences[i].strip()#remove unwanted spaces("", "\n") 
        sentence = sentences[i].split()#split each sentence to words 
        x = [word.strip(string.punctuation) for word in sentence 
                                     if word not in stop_words]#remove any punctuation in word x
        x = [word.lower() for word in x]#convert all characters in word x to lowercase 
        training_data.append(x)#append refined sentence list x to traning_data list 
    return training_data#return refined traing_data (each element is refined word list) 
    #output: [['the', 'earth', 'revolves', 'around', 'sun'], ['the', 'moon', 'revolves', 'around', 'earth']]


def prepare_data_for_training(sentences,w2v):#receive (prepocessed data, word2vec object) 
    data = {}#data dictionary(key = each word(vertex), value = number of the word in data) 
    for sentence in sentences:# count the number of each word(vertex) in input data and save it to data dictionary 
        for word in sentence: 
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
    V = len(data)#number of types of words(vertices) 
    data = sorted(list(data.keys()))#sort words in the data list

    vocab = {}#vocabulary dictionary(key = word, value = the word's index in data list)
    for i in range(len(data)): 
        vocab[data[i]] = i 
    # vocab : {'around': 0, 'earth': 1, 'moon': 2, 'revolves': 3, 'sun': 4, 'the': 5}
    
    #for i in range(len(words)): 
    for sentence in sentences: #convert each word to one-hot vector
        for i in range(len(sentence)): 
            center_word = [0 for x in range(V)]#[0,0,0,0,0,0] e.g., list of 6 number of zeros
            center_word[vocab[sentence[i]]] = 1

            context = [0 for x in range(V)]#[0,0,0,0,0,0] e.g., list of 6 number of zeros 

            #slide window over random walk(sentence)
            for j in range(i-w2v.window_size,i+w2v.window_size): #window of length 2w+1
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1 # words that appear within window length
                    
            #assign data for training      
            w2v.X_train.append(center_word) 
            w2v.y_train.append(context) 
    
    print("X_train:{}".format(w2v.X_train))
    print("\ny_train{}".format(w2v.y_train))
    #assign number of words in sentences:6
    #list of words in sentence:['around', 'earth', 'moon', 'revolves', 'sun', 'the']
    w2v.initialize(V,data)

    return w2v.X_train,w2v.y_train 

In [5]:
#call functions
corpus = "" 
corpus += "The earth revolves around the sun. The moon revolves around the earth"
epochs = 1000
  
training_data = preprocessing(corpus)#refiend data 
w2v = word2vec()#word2vec object 
  
prepare_data_for_training(training_data,w2v) 
w2v.train(epochs)  
  
print(w2v.predict("around",3))  

X_train:[[0, 0, 0, 0, 0, 1], [0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]]

y_train[[0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 1], [1, 1, 0, 0, 0, 1], [0, 1, 0, 1, 1, 0], [1, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 1], [1, 0, 1, 0, 0, 1], [0, 1, 1, 1, 0, 0], [1, 0, 0, 1, 0, 0]]
V
6
epoch  1  loss =  48.05080323198996
epoch  2  loss =  47.9477926745004
epoch  3  loss =  47.84565284543995
epoch  4  loss =  47.74447317049176
epoch  5  loss =  47.64433907129731
epoch  6  loss =  47.54533154106408
epoch  7  loss =  47.447526772358145
epoch  8  loss =  47.350995840954845
epoch  9  loss =  47.25580444836354
epoch  10  loss =  47.16201272437006
epoch  11  loss =  47.069675089694805
epoch  12  loss =  46.97884017768838
epoch  13  loss =  46.889550812910635
epoch  14  loss =  46.80184404349491
epoch  15  loss =  46.71575122340437
epoch  16  loss =  46.6312981

epoch  234  loss =  42.854831926278436
epoch  235  loss =  42.852400961685504
epoch  236  loss =  42.849990453504944
epoch  237  loss =  42.84760014752916
epoch  238  loss =  42.84522979369746
epoch  239  loss =  42.84287914601279
epoch  240  loss =  42.840547962460455
epoch  241  loss =  42.83823600492879
epoch  242  loss =  42.83594303913166
epoch  243  loss =  42.83366883453281
epoch  244  loss =  42.831413164271936
epoch  245  loss =  42.82917580509256
epoch  246  loss =  42.82695653727154
epoch  247  loss =  42.82475514455008
epoch  248  loss =  42.82257141406658
epoch  249  loss =  42.820405136290816
epoch  250  loss =  42.81825610495975
epoch  251  loss =  42.81612411701475
epoch  252  loss =  42.8140089725402
epoch  253  loss =  42.8119104747036
epoch  254  loss =  42.80982842969695
epoch  255  loss =  42.80776264667948
epoch  256  loss =  42.80571293772153
epoch  257  loss =  42.80367911775004
epoch  258  loss =  42.801661004494775
epoch  259  loss =  42.79965841843608
epoch  

epoch  505  loss =  42.54811099340684
epoch  506  loss =  42.54758908823371
epoch  507  loss =  42.54706925303111
epoch  508  loss =  42.54655147553967
epoch  509  loss =  42.54603574359635
epoch  510  loss =  42.545522045133616
epoch  511  loss =  42.54501036817835
epoch  512  loss =  42.54450070085105
epoch  513  loss =  42.543993031364934
epoch  514  loss =  42.543487348024904
epoch  515  loss =  42.5429836392268
epoch  516  loss =  42.54248189345644
epoch  517  loss =  42.541982099288774
epoch  518  loss =  42.54148424538704
epoch  519  loss =  42.5409883205019
epoch  520  loss =  42.54049431347057
epoch  521  loss =  42.54000221321609
epoch  522  loss =  42.539512008746364
epoch  523  loss =  42.53902368915345
epoch  524  loss =  42.53853724361273
epoch  525  loss =  42.53805266138218
epoch  526  loss =  42.537569931801386
epoch  527  loss =  42.53708904429102
epoch  528  loss =  42.53660998835197
epoch  529  loss =  42.5361327535645
epoch  530  loss =  42.53565732958761
epoch  53

epoch  751  loss =  42.46178347097366
epoch  752  loss =  42.46154836099265
epoch  753  loss =  42.461313878813236
epoch  754  loss =  42.461080021927096
epoch  755  loss =  42.460846787839245
epoch  756  loss =  42.460614174067906
epoch  757  loss =  42.460382178144464
epoch  758  loss =  42.460150797613444
epoch  759  loss =  42.45992003003223
epoch  760  loss =  42.45968987297121
epoch  761  loss =  42.45946032401348
epoch  762  loss =  42.45923138075494
epoch  763  loss =  42.45900304080411
epoch  764  loss =  42.458775301782076
epoch  765  loss =  42.45854816132237
epoch  766  loss =  42.45832161707094
epoch  767  loss =  42.45809566668606
epoch  768  loss =  42.45787030783822
epoch  769  loss =  42.45764553821008
epoch  770  loss =  42.45742135549634
epoch  771  loss =  42.45719775740374
epoch  772  loss =  42.45697474165092
epoch  773  loss =  42.45675230596838
epoch  774  loss =  42.45653044809834
epoch  775  loss =  42.45630916579473
epoch  776  loss =  42.45608845682313
epoch

epoch  967  loss =  42.4223400653265
epoch  968  loss =  42.42219857957222
epoch  969  loss =  42.42205738723222
epoch  970  loss =  42.421916487395336
epoch  971  loss =  42.42177587915421
epoch  972  loss =  42.421635561605214
epoch  973  loss =  42.421495533848415
epoch  974  loss =  42.42135579498765
epoch  975  loss =  42.421216344130386
epoch  976  loss =  42.421077180387826
epoch  977  loss =  42.42093830287473
epoch  978  loss =  42.42079971070959
epoch  979  loss =  42.42066140301444
epoch  980  loss =  42.42052337891495
epoch  981  loss =  42.420385637540335
epoch  982  loss =  42.42024817802341
epoch  983  loss =  42.420110999500494
epoch  984  loss =  42.41997410111144
epoch  985  loss =  42.419837481999636
epoch  986  loss =  42.41970114131188
epoch  987  loss =  42.41956507819853
epoch  988  loss =  42.41942929181333
epoch  989  loss =  42.419293781313485
epoch  990  loss =  42.41915854585964
epoch  991  loss =  42.41902358461576
epoch  992  loss =  42.41888889674928
epoc