In [0]:
# Corpus Cell

import tensorflow as tf
import numpy as np
import re

# Number of positions looked at on each side of a target word when
# (target, context) pairs are collected from a sentence.
window_length = 5
# Width of every word vector, i.e. the number of columns of the embedding matrices.
embedding_dimension = 25


def tokenize(data):
    """Normalise raw corpus text into a single lowercase string.

    Line breaks and commas both end up as '.' separators, tabs and carriage
    returns become spaces, and runs of spaces are collapsed to one space.

    Args:
        data: the raw text as a ``str``.

    Returns:
        The cleaned text as a ``str`` (not a list of tokens).
    """
    # Order matters: a newline is first rewritten as ' , ' so that the comma
    # rule further down turns it into a '.' separator surrounded by spaces.
    substitutions = (
        ('\n', ' , '),
        ('\t', ' '),
        ('\r', ' '),
        (',', '.'),
    )
    for old, new in substitutions:
        data = data.replace(old, new)
    return re.sub(' +', ' ', data.lower())

# Read the whole corpus. The context manager closes the file handle as soon as
# the text is in memory instead of leaving it open for the life of the kernel.
with open('ferdosi.txt', 'r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

# tokenize() lowercases the text, collapses whitespace and marks line breaks
# and commas with '.'.
data = tokenize(data)
# Lowercasing again is a no-op after tokenize(); kept so corpus_raw is
# lowercase regardless of how tokenize() evolves.
corpus_raw = data.lower()

# Split the corpus into sentences (one per '.'-separated segment) and every
# sentence into whitespace-delimited tokens. Empty segments yield empty lists.
raw_sentences = corpus_raw.split('.')
sentences = [sentence.split() for sentence in raw_sentences]

# The vocabulary is taken from the tokenised sentences themselves, so every
# token that is later looked up through word_to_index is guaranteed to have an
# index (a token such as 'a.b' is split into 'a' and 'b' above and would
# otherwise be missing from a vocabulary built from whitespace tokens).
words = {word for sentence in sentences for word in sentence}
word_count = len(words)

# Sort before numbering: iteration order of a set of strings changes from one
# Python process to the next, which would make embedding rows saved in one
# kernel session refer to different words after a restart.
words_list = sorted(words)

word_to_index = {word: index for index, word in enumerate(words_list)}
index_to_word = dict(enumerate(words_list))

    
    
  
    
    
# Occurrence count of every vocabulary word over all tokenised sentences.
unigram = np.zeros(word_count)
token_indices = np.array(
    [word_to_index[word] for mesra in sentences for word in mesra],
    dtype=np.intp,
)
# np.add.at accumulates correctly when the same index appears many times.
np.add.at(unigram, token_indices, 1)

# Turn counts into relative frequencies.
unigram = unigram / unigram.sum()

# Frequencies raised to the 3/4 power and renormalised to sum to 1.
smoothed_unigram = np.power(unigram, 0.75)
smoothed_unigram = smoothed_unigram / smoothed_unigram.sum()




# co_occurrence_matrix[t, c] counts how often word c appears within
# window_length positions of word t inside the same sentence.
co_occurrence_matrix = np.zeros((word_count, word_count))

for mesra in sentences:
  # Resolve the indices once per sentence instead of once per pair.
  mesra_indices = [word_to_index[word] for word in mesra]
  mesra_length = len(mesra_indices)
  for target_position, target_index in enumerate(mesra_indices):
    window_start = max(0, target_position - window_length)
    window_stop = min(mesra_length, target_position + window_length + 1)
    for context_position in range(window_start, window_stop):
      if context_position == target_position:
        continue  # a word is not its own context
      co_occurrence_matrix[target_index, mesra_indices[context_position]] += 1

# Negative-sampling distribution, one row per target word: a word that never
# co-occurs with the target keeps its unigram frequency raised to the 0.75
# power, a word that does co-occur gets probability 0; each row is then
# normalised to sum to 1.
never_co_occurring = (co_occurrence_matrix == 0)
smoothed_negative_samples_unigram = never_co_occurring * np.power(unigram, 0.75).reshape(1, -1)

row_totals = smoothed_negative_samples_unigram.sum(axis=1, keepdims=True)

# A target that co-occurs with every word it could be paired with has no
# admissible negatives: its row sums to 0, the division below would fill it
# with NaN, and np.random.choice would later reject it. Fall back to the
# unmasked smoothed unigram distribution for such rows.
exhausted_rows = (row_totals[:, 0] == 0)
if exhausted_rows.any():
    smoothed_negative_samples_unigram[exhausted_rows] = np.power(unigram, 0.75)
    row_totals = smoothed_negative_samples_unigram.sum(axis=1, keepdims=True)

smoothed_negative_samples_unigram = smoothed_negative_samples_unigram / row_totals



In [0]:
#Model Definition Cell

# Training hyper-parameters.
batch_size = 128
negative_sample_count = 10
learning_rate = 0.001

# Two embedding tables with one row per vocabulary word:
# W is indexed by target words, W_prim by context words and negative samples.
W      = tf.Variable(tf.random_normal([word_count, embedding_dimension], stddev=0.00001))
W_prim = tf.Variable(tf.random_normal([word_count, embedding_dimension], stddev=0.00001))

# Inputs: word indices for a batch of (target, context) pairs plus
# negative_sample_count negative word indices per pair.
target           = tf.placeholder(tf.int32, shape=(None,))
context          = tf.placeholder(tf.int32, shape=(None,))
negative_samples = tf.placeholder(tf.int32, shape=(None, negative_sample_count,))

# Row look-ups. tf.gather accepts index tensors of any rank, so the 2-D
# negative-sample indices give a (batch, negative_sample_count, dim) tensor
# directly, without the reshape + gather_nd detour.
v_target           = tf.gather(W, indices=target, axis=0)
u_context          = tf.gather(W_prim, indices=context, axis=0)
u_negative_samples = tf.gather(W_prim, indices=negative_samples, axis=0)

# Dot products between the target vector and its context / negative vectors.
positive_logits = tf.reduce_sum(tf.multiply(u_context, v_target), axis=1)
negative_logits = tf.reduce_sum(
    tf.multiply(u_negative_samples, tf.reshape(v_target, [-1, 1, embedding_dimension])),
    axis=2,
)

# log(sigmoid(x)) is computed with tf.log_sigmoid: the two-step
# tf.log(tf.sigmoid(x)) underflows to log(0) = -inf for large negative x,
# which turns the loss and its gradients into inf/NaN.
positive_score = tf.log_sigmoid(positive_logits)
negative_score = tf.reduce_sum(tf.log_sigmoid(-negative_logits), axis=1)

# Negative-sampling objective, averaged over the batch.
loss = -tf.reduce_mean(positive_score + negative_score, axis=0)


train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)


In [0]:
#Training Cell
# Each run of this cell trains the model for one epoch, i.e. one pass over the corpus.

# Integer buffers, matching the int32 placeholders they are fed into.
target_batch = np.zeros((batch_size,), dtype=np.int32)
context_batch = np.zeros((batch_size,), dtype=np.int32)
negative_samples_batch = np.zeros((batch_size, negative_sample_count,), dtype=np.int32)

vocabulary_indices = np.arange(word_count)  # hoisted: identical for every draw

sample_counter = 0
batch_counter = 0
for mesra in sentences:
  for target_iterator in range(len(mesra)):
    target_index = word_to_index[mesra[target_iterator]]
    for context_iterator in range(  max(0,target_iterator-window_length), min(len(mesra),target_iterator+window_length+1)   ):
      if context_iterator != target_iterator:
        target_batch[sample_counter]           = target_index
        context_batch[sample_counter]          = word_to_index[mesra[context_iterator]]
        # Negatives are drawn from the target word's own row of the
        # negative-sampling distribution.
        negative_samples_batch[sample_counter] = np.random.choice(vocabulary_indices, negative_sample_count, p=smoothed_negative_samples_unigram[target_index])
        sample_counter += 1
        # Train only once the buffer is completely filled. Comparing against
        # batch_size-1 would leave the last row unwritten and feed a bogus
        # all-zero sample with every batch.
        if sample_counter == batch_size:
          batch_feed = {target: target_batch, context: context_batch, negative_samples: negative_samples_batch}
          sess.run(train_step, feed_dict=batch_feed)
          if batch_counter%200==0:
            print("batch : ", batch_counter,"   ,   loss : ",sess.run(loss, feed_dict=batch_feed))
          sample_counter = 0
          batch_counter += 1

# Train on the samples left in the buffer so the tail of the corpus is not
# dropped; the placeholders accept any batch length.
if sample_counter > 0:
  sess.run(train_step, feed_dict={target: target_batch[:sample_counter], context: context_batch[:sample_counter], negative_samples: negative_samples_batch[:sample_counter]})
  sample_counter = 0
  batch_counter += 1
  


In [0]:
# Save parameters Cell
# Evaluate both embedding matrices in one session call, then write each of
# them to its own plain-text file.
W_value, W_prim_value = sess.run([W, W_prim])
np.savetxt('W_saved.txt', W_value)
np.savetxt('W_prim_saved.txt', W_prim_value)


In [0]:
# Load Parameters Cell

# Read the two saved embedding matrices back as float64 arrays.
W_loaded = np.loadtxt(fname='W_saved.txt', dtype=np.float64)
W_prim_loaded = np.loadtxt(fname='W_prim_saved.txt', dtype=np.float64)


In [56]:
# Test cell

target_word='خردمند'
embedding=W_loaded
neighbor_count = 6  # how many nearest neighbours to report

target_index = word_to_index[target_word]
target_vector = embedding[target_index]

# Cosine similarity: scale every row and the query vector to unit length,
# then take row-wise dot products.
normalized_embedding = embedding / np.sqrt(np.power(embedding, 2).sum(axis=1).reshape(-1, 1))
target_vector_normalized = target_vector / np.sqrt(target_vector.reshape(1, -1) @ target_vector.reshape(-1, 1))

context_scores = (normalized_embedding * target_vector_normalized.reshape(1, -1)).sum(axis=1)

# Rank on a copy with the query word removed explicitly. Dropping "whatever
# sorts last" assumes the query itself is the top hit, which fails when another
# row ties with it or when a zero-length row produces NaN (NaN sorts last).
ranking_scores = np.where(np.isnan(context_scores), -np.inf, context_scores)
ranking_scores[target_index] = -np.inf
reported_count = min(neighbor_count, len(ranking_scores) - 1)
best_neighbor_indices = np.argsort(ranking_scores)[::-1][:reported_count]

print(context_scores[best_neighbor_indices])
best_neighbor_words = [index_to_word[t] for t in best_neighbor_indices]
print(best_neighbor_words)

[0.90475746 0.89054388 0.88912665 0.88815269 0.86909583 0.85987825]
['دانا', 'خوی', 'جهاندار', 'هوش', 'دستور', 'رای']
