In [4]:
import numpy as np
import string

In [7]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

class skip_gram(object):
    def __init__(self):
        self.N = 10
        self.X_train = []
        self.Y_train = []
        self.window_size = 2
        self.alpha = 0.001
        self.words = []
        self.word_index = {}
        
    def initialize(self,V, data, word_index):
        self.V = V
        self.W1 = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W2 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        self.words = data
        self.word_index = word_index
            
    def feed_forward(self, X):
        self.h = np.dot(self.W1.T, X).reshape(self.N, 1)
        self.u = np.dot(self.W2.T, self.h)
        self.y = softmax(self.u)
        return self.y
    
    def backpropagate(self, x, truth):
        EI = self.y - np.asarray(truth).reshape(self.V, 1)
        grad_W2 = np.dot(self.h, EI.T)
        X = np.array(x).reshape(self.V,1)
        grad_W1 = np.dot(X, np.dot(self.W2, EI).T)
        self.W2 = self.W2 - self.alpha*grad_W2
        self.W1 = self.W1 - self.alpha*grad_W1
        
    def train(self, epochs):
        for x in range(1, epochs):
            self.loss = 0
            for j in range(len(self.X_train)):
                self.feed_forward(self.X_train[j])
                self.backpropagate(self.X_train[j], self.Y_train[j])
                C = 0
                for m in range(self.V):
                    if (self.Y_train[j][m]):
                        self.loss  += -1*self.u[m][0]
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u)))
            if x%100 ==0:
                print("epoch:", x, "loss =",self.loss)
            self.alpha *= 1/(1+self.alpha*x)
            
    def predict(self, word, number_of_predictions):
        if word in self.words:
            index = self.word_index[word]
            X = [0 for i in range(self.V)]
            X[index] = 1
            prediction = self.feed_forward(X)
#             print(prediction)
            output = {}
            for i in range(self.V):
                output[prediction[i][0]] = i
            top_context_words = []
            for k in sorted(output, reverse=True):
                top_context_words.append(self.words[output[k]])
                if (len(top_context_words)>=number_of_predictions):
                    break
            return top_context_words
        else:
            print("Word not found in dictionary")
    
def preprocessing(corpus):
    training_data = []
    sentences = corpus.split('.')
    for i in range(len(sentences)):
        sentences[i] = sentences[i].strip()
        sentence = sentences[i].split()
        x = [word.strip(string.punctuation) for word in sentence]
        x = [word.lower() for word in x]
        training_data.append(x)
    return training_data

def prepare_data_for_train(sentences, w2v):
    data = {}
    for sentence in sentences:
        for word in sentence:
            #counter the number of times each word appears
            if word not in data:
                data[word] = 1
            else:
                data[word] += 1
    V = len(data) # number of word(no repeatition)
    data = sorted(list(data.keys()))# sord the dict by word name
#     print(data) 
    # all words' index
    word_index = {}
    for i in range(len(data)):
        word_index[data[i]] = i
            
    for sentence in sentences:
        for i in range(len(sentence)):
            center_word_vector = [0 for x in range(V)]
            center_word_vector[word_index[sentence[i]]] = 1
            
            context_vector = [0 for x in range(V)]
            for j in range(i-w2v.window_size, i+w2v.window_size):
                if i!=j and j>=0 and j<len(sentence):
                    context_vector[word_index[sentence[j]]] += 1

            w2v.X_train.append(center_word_vector)
            w2v.Y_train.append(context_vector) 
    w2v.initialize(V,data, word_index)

    return w2v.X_train, w2v.Y_train

In [17]:
corpus = "The earth revolves around the sun. The moon revolves around the earth"
epochs = 1000
training_data = preprocessing(corpus)
w2v = skip_gram()
prepare_data_for_train(training_data, w2v)
w2v.train(epochs)
print(w2v.predict('revolves',4))

epoch: 100 loss = 53.70853911883946
epoch: 200 loss = 53.1490172151716
epoch: 300 loss = 52.956798492552544
epoch: 400 loss = 52.860690743075125
epoch: 500 loss = 52.80317680569317
epoch: 600 loss = 52.764931992077365
epoch: 700 loss = 52.73767315698624
epoch: 800 loss = 52.717265415278916
epoch: 900 loss = 52.701416113031655
['the', 'around', 'earth', 'revolves']


In [11]:
corpus = "Computational processes are abstract beings that inhabit computers."
epochs = 5000
training_data = preprocessing(corpus)
w2v = skip_gram()
prepare_data_for_train(training_data, w2v)
w2v.train(epochs)
print(w2v.predict('abstract',4))

epoch: 100 loss = 40.55970966760545
epoch: 200 loss = 40.1616897683346
epoch: 300 loss = 40.021185696120504
epoch: 400 loss = 39.95018228488393
epoch: 500 loss = 39.90744737344812
epoch: 600 loss = 39.87892786683252
epoch: 700 loss = 39.8585505305099
epoch: 800 loss = 39.84326726716397
epoch: 900 loss = 39.83138152076275
epoch: 1000 loss = 39.82187437890899
epoch: 1100 loss = 39.81409712625049
epoch: 1200 loss = 39.80761718771815
epoch: 1300 loss = 39.80213506516579
epoch: 1400 loss = 39.79743683366302
epoch: 1500 loss = 39.79336562478859
epoch: 1600 loss = 39.78980379820715
epoch: 1700 loss = 39.78666140360757
epoch: 1800 loss = 39.78386848799607
epoch: 1900 loss = 39.78136983212742
epoch: 2000 loss = 39.77912126606311
epoch: 2100 loss = 39.777087037541925
epoch: 2200 loss = 39.775237898188685
epoch: 2300 loss = 39.77354968908049
epoch: 2400 loss = 39.77200228001148
epoch: 2500 loss = 39.77057876340658
epoch: 2600 loss = 39.76926483431321
epoch: 2700 loss = 39.76804830821802
epoch: 28

In [16]:
print(w2v.predict('are',4))

['computers', 'abstract', 'that', 'beings']
