In [1]:
import numpy as np
import tensorflow as tf
from collections import defaultdict

In [3]:
def load_ferdosi_for_wrod2vec(dir):
    """Load a verse file and return its half-verses as lists of words.

    Every non-blank line must contain exactly one comma separating two
    half-verses. Each half-verse is split on single spaces and empty tokens
    are discarded, so one input line contributes two consecutive entries to
    the result. Blank (whitespace-only) lines are skipped.

    Args:
        dir: Path of the text file to read. The file is decoded as UTF-8.

    Returns:
        A 1-D NumPy object array whose elements are ``list`` objects of word
        strings, in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly one comma.
    """
    sentences = []
    # Decode explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(dir, 'r', encoding='utf-8') as f:
        for line in f:
            # Remove only the line terminator. Unconditionally dropping the
            # last character would eat a letter when the final line of the
            # file has no trailing newline.
            line = line.rstrip('\n')
            if not line.strip():
                continue
            m1, m2 = line.split(",")
            for half_verse in (m1, m2):
                sentences.append([tok for tok in half_verse.split(" ") if tok != ''])

    # Fill a pre-allocated object array instead of calling np.array() on the
    # ragged list: recent NumPy versions refuse to build ragged arrays
    # implicitly, and equal-length rows would otherwise collapse into a 2-D
    # array of strings.
    result = np.empty(len(sentences), dtype=object)
    for position, tokens in enumerate(sentences):
        result[position] = tokens
    return result

In [4]:
# NOTE(review): machine-specific absolute path; the notebook will not run on
# another machine until this points at a local copy of ferdosi.txt.
dataset_dir = '/Users/Alireza/Desktop/Current Semester/Deep Learning/Assignments/DL_HW4/ferdosi.txt'
# Parse the file into an array of tokenised half-verses.
sentences = load_ferdosi_for_wrod2vec(dataset_dir)
# Sanity check: show the first two parsed half-verses.
print (sentences[0])
print (sentences[1])

['به', 'نام', 'خداوند', 'جان', 'و', 'خرد']
['کزین', 'برتر', 'اندیشه', 'برنگذرد']


In [5]:
# Hyperparameters; the cells below read these as module-level globals.
# Number of positions on each side of a word that generate_training_data
# treats as its context.
window_size = 5
# Width of the hidden layer: columns of W and rows of W1 in word2vec.
hidden_layer = 25
# Epoch count handed to word2vec.train.
epochs = 50
# Step size applied to the gradients in word2vec.backpropagate.
learning_rate = 0.01

In [6]:
def word2onehot(word, v_count, word_index):
    """Encode ``word`` as a one-hot vector of length ``v_count``.

    Args:
        word: Key to look up in ``word_index``.
        v_count: Length of the returned vector.
        word_index: Mapping from word to its integer position.

    Returns:
        A float NumPy vector of zeros with a single 1 at ``word_index[word]``.

    Raises:
        KeyError: If ``word`` is not a key of ``word_index``.
    """
    one_hot = np.zeros(v_count)
    hot_position = word_index[word]
    one_hot[hot_position] = 1
    return one_hot
def softmax(x):
    """Return the softmax of ``x`` taken over all of its elements.

    The maximum is subtracted before exponentiating so that large inputs do
    not overflow; this shift does not change the result.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum()
    return exponentials / normaliser

In [50]:
def generate_training_data(sentences):
    word_counts = defaultdict(int)
    for row in sentences:
        for word in row:
            word_counts[word] += 1
    v_count = len(word_counts.keys())
    words_list = list(word_counts.keys())
    word_index = dict((word, i) for i, word in enumerate(words_list))
    index_word = dict((i, word) for i, word in enumerate(words_list))

    training_data = np.zeros((v_count, v_count))
    print (training_data.shape)
    for sentence in sentences:
        sent_len = len(sentence)
        for i, word in enumerate(sentence):
            for j in range(max(i - window_size, 0), i + window_size + 1):
                if j != i and j <= sent_len-1 and j >= 0 and word_index[word] != word_index[sentence[j]]:
                    training_data[word_index[word], word_index[sentence[j]]] += 1
    return training_data, words_list, word_index, index_word

In [51]:
# Build the co-occurrence count matrix and the vocabulary lookups from the
# loaded sentences. The matrix is a dense V x V float array, so its memory
# footprint grows quadratically with the vocabulary size.
training_data, words_list, word_index, index_word = generate_training_data(sentences)

(17658, 17658)


In [52]:
# Inspect the co-occurrence counts of word id 0 and the words behind the
# first two ids.
print (training_data[0])
print (index_word[0])
print (index_word[1])

[  0. 160.  25. ...   1.   0.   0.]
به
نام


In [53]:
class word2vec(object):
    """Two-layer softmax word-embedding model implemented with NumPy.

    ``W`` (V x hidden_layer) holds one input vector per word and ``W1``
    (hidden_layer x V) maps a hidden vector to one score per word. Each
    training sample is a word id together with that word's row of a
    co-occurrence count matrix.

    The class reads ``hidden_layer``, ``learning_rate`` and ``softmax`` from
    module scope; ``train`` additionally falls back to the module-level
    ``training_data`` when no matrix is passed in.

    Attributes set by ``train``: ``words``, ``V``, ``word_index``, ``W``,
    ``W1``, ``learning_rate`` and ``loss``. ``feed_forward`` caches ``h``,
    ``u`` and ``y`` for the most recent input.
    """

    def __init__(self):
        self.words = []
        self.V = None
        self.word_index = None
        # Current step size. Initialised from the module-level
        # `learning_rate` by `train` and decayed after every epoch.
        self.learning_rate = None

    def feed_forward(self, X_index):
        """Compute the output distribution for the word with id ``X_index``.

        Args:
            X_index: Integer row index into ``W`` (a word id).

        Returns:
            A ``(V, 1)`` column of softmax probabilities. The hidden vector,
            raw scores and probabilities are cached on ``self.h``,
            ``self.u`` and ``self.y``.
        """
        # Copy the row so the in-place update of W in `backpropagate` cannot
        # change the cached hidden activations.
        self.h = self.W[X_index].reshape(-1, 1).copy()
        self.u = np.dot(self.W1.T, self.h).reshape(self.V, 1)
        self.y = softmax(self.u).reshape(self.V, 1)
        return self.y

    def backpropagate(self, x_index, t):
        """Apply one gradient step using the activations cached by ``feed_forward``.

        Args:
            x_index: Word id that was fed forward; only this row of ``W`` is
                updated.
            t: Target vector with ``V`` entries (a row of the co-occurrence
                matrix).

        NOTE(review): ``t`` holds raw co-occurrence counts, whereas the loss
        reported by ``train`` treats the context as a 0/1 set scaled by the
        number of context words, so this step is not the exact gradient of
        that loss. Confirm which objective is intended before changing it.
        """
        if self.learning_rate is not None:
            step = self.learning_rate
        else:
            # `train` has not run yet: use the notebook-level value.
            step = learning_rate
        e = self.y - np.asarray(t).reshape(self.V, 1)
        dLdW1 = np.dot(self.h, e.T)
        # Must be computed from W1 as it was before this step's update.
        dLdW_row = np.dot(self.W1, e).flatten()
        self.W1 = self.W1 - step * dLdW1
        # Only one row of W has a non-zero gradient; update it directly
        # instead of materialising a V x hidden matrix of zeros.
        self.W[x_index] -= step * dLdW_row

    def train(self, sentences, epochs, cooccurrence=None):
        """Build the vocabulary from ``sentences`` and fit ``W`` and ``W1``.

        Args:
            sentences: Sequence of sentences, each an iterable of words. Word
                ids follow first-occurrence order.
            epochs: Number of passes over the co-occurrence rows.
            cooccurrence: Optional ``(V, V)`` count matrix whose row and
                column order matches the word ids derived from ``sentences``.
                When omitted, the module-level ``training_data`` is used.

        Raises:
            ValueError: If the co-occurrence matrix is not ``(V, V)``.

        Side effects:
            Prints the shape of ``sentences`` and one loss line per epoch;
            ``self.learning_rate`` is decayed after each epoch.
        """
        self.words = list(dict.fromkeys(word for row in sentences for word in row))
        self.V = len(self.words)
        self.word_index = {word: i for i, word in enumerate(self.words)}

        if cooccurrence is None:
            # Backward-compatible default: the matrix built earlier in the
            # notebook by generate_training_data.
            cooccurrence = training_data
        cooccurrence = np.asarray(cooccurrence)
        if cooccurrence.shape != (self.V, self.V):
            raise ValueError(
                "co-occurrence matrix has shape %s but the vocabulary has %d words"
                % (cooccurrence.shape, self.V)
            )

        self.W = np.random.uniform(-0.8, 0.8, (self.V, hidden_layer))
        self.W1 = np.random.uniform(-0.8, 0.8, (hidden_layer, self.V))
        # Per-instance copy so the decay below reaches `backpropagate`
        # without rebinding the module-level setting.
        self.learning_rate = learning_rate
        self.loss = 0
        print (getattr(sentences, 'shape', (len(sentences),)))
        # Epochs are numbered 1..epochs; the number feeds the decay schedule.
        for x in range(1, epochs + 1):
            self.loss = 0
            for w_target_index, w_context in enumerate(cooccurrence):
                self.feed_forward(w_target_index)
                self.backpropagate(w_target_index, w_context)
                # Loss for this sample, from the scores computed before the
                # update: -sum of context scores + C * log-sum-exp(scores).
                is_context = w_context != 0
                C = np.count_nonzero(is_context)
                scores = self.u[:, 0]
                top_score = scores.max()
                # Shifted log-sum-exp avoids overflow for large scores.
                log_norm = top_score + np.log(np.sum(np.exp(scores - top_score)))
                self.loss += -scores[is_context].sum() + C * log_norm
            print("epoch ",x, " loss = ",self.loss)
            self.learning_rate *= 1 / (1 + self.learning_rate * x)

    def predict(self, word, number_of_predictions):
        """Return the words with the highest output probability for ``word``.

        Args:
            word: Query word.
            number_of_predictions: Maximum number of words to return.

        Returns:
            A list of at most ``number_of_predictions`` words ordered by
            decreasing probability (ties keep vocabulary order), or ``None``
            after printing a message when ``word`` is not in the vocabulary.
        """
        if self.word_index is None or word not in self.word_index:
            print("Word not found in dicitonary")
            return None

        # feed_forward expects the word id, not a one-hot vector.
        probabilities = self.feed_forward(self.word_index[word])[:, 0]
        # Rank ids directly: keying a dict by probability would merge words
        # that share the same value.
        ranking = np.argsort(-probabilities, kind='stable')
        keep = max(number_of_predictions, 0)
        return [self.words[i] for i in ranking[:keep]]

In [None]:
# Create the model and train it on the loaded half-verses.
w2v = word2vec()
w2v.train(sentences, epochs)

(99218,)
