In [4]:
import numpy as np
import string
import nltk
from nltk.corpus import stopwords   

In [44]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

class skip_gram(object):
    def __init__(self):
        self.N = 10
        self.X_train = []
        self.Y_train = []
        self.window_size = 2
        self.alpha = 0.001
        self.words = []
        self.word_index = {}
        
    def initialize(self,V, data, word_index):
        self.V = V
        self.W1 = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W2 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        self.words = data
        self.word_index = word_index
            
    def feed_forward(self, X):
        self.h = np.dot(self.W1.T, X).reshape(self.N, 1)
        self.u = np.dot(self.W2.T, self.h)
        self.y = softmax(self.u)
        return self.y
    
    def backpropagate(self, x, truth):
        EI = self.y - np.asarray(truth).reshape(self.V, 1)
        grad_W2 = np.dot(self.h, EI.T)
        X = np.array(x).reshape(self.V,1)
        grad_W1 = np.dot(X, np.dot(self.W2, EI).T)
        self.W2 = self.W2 - self.alpha*grad_W2
        self.W1 = self.W1 - self.alpha*grad_W1
        
    def train(self, epochs):
        for x in range(1, epochs):
            self.loss = 0
            for j in range(len(self.X_train)):
                self.feed_forward(self.X_train[j])
                self.backpropagate(self.X_train[j], self.Y_train[j])
                C = 0
                for m in range(self.V):
                    if (self.Y_train[j][m]):
                        self.loss += -1*self.u[m][0]
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u)))
            print("epoch:", x, "loss =",self.loss)
            self.alpha *= 1/(1+self.alpha*x)
            
    def predict(self, word, number_of_predictions):
        if word in self.words:
            index = self.word_index[word]
            X = [0 for i in range(self.V)]
            X[index] = 1
            prediction = self.feed_forward(X)
            print(prediction)
            output = {}
            for i in range(self.V):
                output[prediction[i][0]] = i
            top_context_words = []
            for k in sorted(output, reverse=True):
                top_context_words.append(self.words[output[k]])
                if (len(top_context_words)>=number_of_predictions):
                    break
            return top_context_words
        else:
            print("Word not found in dictionary")
    
def preprocessing(corpus):
    stop_words = set(stopwords.words('english')) 
    training_data = []
    sentences = corpus.split('.')
    for i in range(len(sentences)):
        sentences[i] = sentences[i].strip()
        sentence = sentences[i].split()
        x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
        x = [word.lower() for word in x]
        training_data.append(x)
    return training_data

def prepare_data_for_train(sentences, w2v):
    data = {}
    for sentence in sentences:
        for word in sentence:
            #counter the number of times each word appears
            if word not in data:
                data[word] = 1
            else:
                data[word] += 1
    V = len(data) # number of word(no repeatition)
    data = sorted(list(data.keys()))# sord the dict by word name
#     print(data) 
    # all words' index
    word_index = {}
    for i in range(len(data)):
        word_index[data[i]] = i
            
    for sentence in sentences:
        for i in range(len(sentence)):
            center_word_vector = [0 for x in range(V)]
            center_word_vector[word_index[sentence[i]]] = 1
            
            context_vector = [0 for x in range(V)]
            for j in range(i-w2v.window_size, i+w2v.window_size):
                if i!=j and j>=0 and j<len(sentence):
                    context_vector[word_index[sentence[j]]] += 1

            w2v.X_train.append(center_word_vector)
            w2v.Y_train.append(context_vector) 
    w2v.initialize(V,data, word_index)

    return w2v.X_train, w2v.Y_train

In [45]:
corpus = "The earth revolves around the sun. The moon revolves around the earth"
epochs = 1000
training_data = preprocessing(corpus)
w2v = skip_gram()
prepare_data_for_train(training_data, w2v)
w2v.train(epochs)
print(w2v.predict('around',5))

['around', 'earth', 'moon', 'revolves', 'sun', 'the']
epoch: 1 loss = 41.018992341776936
epoch: 2 loss = 40.946585219173144
epoch: 3 loss = 40.87816452368678
epoch: 4 loss = 40.81353447416799
epoch: 5 loss = 40.75250478694059
epoch: 6 loss = 40.69489100692434
epoch: 7 loss = 40.6405147803915
epoch: 8 loss = 40.58920407077605
epoch: 9 loss = 40.540793319659606
epoch: 10 loss = 40.495123555662914
epoch: 11 loss = 40.45204245447018
epoch: 12 loss = 40.41140435360211
epoch: 13 loss = 40.37307022583714
epoch: 14 loss = 40.3369076153604
epoch: 15 loss = 40.30279054080606
epoch: 16 loss = 40.27059936935948
epoch: 17 loss = 40.24022066601048
epoch: 18 loss = 40.211547021910775
epoch: 19 loss = 40.184476865597986
epoch: 20 loss = 40.15891426061749
epoch: 21 loss = 40.13476869281092
epoch: 22 loss = 40.1119548502603
epoch: 23 loss = 40.09039239858307
epoch: 24 loss = 40.07000575397957
epoch: 25 loss = 40.05072385614214
epoch: 26 loss = 40.0324799428533
epoch: 27 loss = 40.01521132783198
epoch: 2

epoch: 232 loss = 39.62228291408279
epoch: 233 loss = 39.622179561610245
epoch: 234 loss = 39.622077373074845
epoch: 235 loss = 39.621976330092316
epoch: 236 loss = 39.62187641464742
epoch: 237 loss = 39.62177760908514
epoch: 238 loss = 39.62167989610193
epoch: 239 loss = 39.62158325873733
epoch: 240 loss = 39.6214876803657
epoch: 241 loss = 39.621393144688376
epoch: 242 loss = 39.621299635725734
epoch: 243 loss = 39.621207137809876
epoch: 244 loss = 39.62111563557711
epoch: 245 loss = 39.62102511396096
epoch: 246 loss = 39.620935558185145
epoch: 247 loss = 39.62084695375688
epoch: 248 loss = 39.62075928646033
epoch: 249 loss = 39.62067254235015
epoch: 250 loss = 39.62058670774539
epoch: 251 loss = 39.620501769223324
epoch: 252 loss = 39.62041771361368
epoch: 253 loss = 39.62033452799282
epoch: 254 loss = 39.62025219967822
epoch: 255 loss = 39.620170716223
epoch: 256 loss = 39.6200900654107
epoch: 257 loss = 39.62001023525005
epoch: 258 loss = 39.61993121397
epoch: 259 loss = 39.619852

epoch: 543 loss = 39.61100055261483
epoch: 544 loss = 39.610988524828606
epoch: 545 loss = 39.61097655040636
epoch: 546 loss = 39.61096462900339
epoch: 547 loss = 39.610952760277854
epoch: 548 loss = 39.61094094389077
epoch: 549 loss = 39.610929179506016
epoch: 550 loss = 39.61091746679031
epoch: 551 loss = 39.61090580541314
epoch: 552 loss = 39.610894195046704
epoch: 553 loss = 39.610882635365996
epoch: 554 loss = 39.61087112604867
epoch: 555 loss = 39.610859666775035
epoch: 556 loss = 39.6108482572281
epoch: 557 loss = 39.61083689709347
epoch: 558 loss = 39.610825586059285
epoch: 559 loss = 39.61081432381634
epoch: 560 loss = 39.610803110057944
epoch: 561 loss = 39.61079194447985
epoch: 562 loss = 39.6107808267804
epoch: 563 loss = 39.610769756660346
epoch: 564 loss = 39.61075873382288
epoch: 565 loss = 39.61074775797361
epoch: 566 loss = 39.61073682882056
epoch: 567 loss = 39.61072594607409
epoch: 568 loss = 39.6107151094469
epoch: 569 loss = 39.610704318654044
epoch: 570 loss = 39.

epoch: 834 loss = 39.6088826001301
epoch: 835 loss = 39.60887823599897
epoch: 836 loss = 39.60887388393394
epoch: 837 loss = 39.6088695438858
epoch: 838 loss = 39.60886521580567
epoch: 839 loss = 39.60886089964493
epoch: 840 loss = 39.608856595355164
epoch: 841 loss = 39.60885230288828
epoch: 842 loss = 39.608848022196426
epoch: 843 loss = 39.60884375323196
epoch: 844 loss = 39.60883949594754
epoch: 845 loss = 39.60883525029608
epoch: 846 loss = 39.608831016230695
epoch: 847 loss = 39.6088267937048
epoch: 848 loss = 39.608822582672026
epoch: 849 loss = 39.60881838308625
epoch: 850 loss = 39.608814194901626
epoch: 851 loss = 39.60881001807249
epoch: 852 loss = 39.60880585255349
epoch: 853 loss = 39.60880169829946
epoch: 854 loss = 39.60879755526546
epoch: 855 loss = 39.60879342340683
epoch: 856 loss = 39.608789302679135
epoch: 857 loss = 39.608785193038145
epoch: 858 loss = 39.60878109443988
epoch: 859 loss = 39.6087770068406
epoch: 860 loss = 39.60877293019678
epoch: 861 loss = 39.6087

In [39]:
['moon', 'revolves', 'earth', 'around', 'the']

TypeError: list indices must be integers or slices, not float

In [27]:
print(s)

[array([1, 4, 2, 3]), array([0, 3, 1, 2]), array([0, 3, 1, 2]), array([1, 4, 2, 3])]


In [28]:
EI = np.sum(s, axis=0)

In [29]:
print(EI)

[ 2 14  6 10]


In [34]:
print(y_pred-w_c)

TypeError: unsupported operand type(s) for -: 'list' and 'list'