In [132]:
import os
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from tensorflow.contrib.tensorboard.plugins import projector


warnings.simplefilter('ignore')

In [106]:
def read_doc(path='corpus_text.txt'):
    """Read a text corpus and build the vocabulary lookups used for training.

    The text is lower-cased, split into sentences with ``nltk.sent_tokenize``,
    stripped of the characters ``! ? . ^`` and then split on whitespace.

    Args:
        path: Location of the corpus file. Defaults to ``'corpus_text.txt'``.

    Returns:
        A 5-tuple ``(word2int, int2word, sentences, vocab_size, words)``:
        ``word2int`` maps each word to an integer id, ``int2word`` is the
        inverse mapping, ``sentences`` is a list of token lists (one per
        sentence), ``vocab_size`` is the number of distinct words and
        ``words`` is the set of distinct words.
    """
    # The context manager closes the handle even if reading fails.
    with open(path) as corpus_file:
        text = corpus_file.read().lower()

    # One translation table removes every unwanted mark in a single pass.
    strip_marks = str.maketrans('', '', '!?.^')
    new_text = [sent.translate(strip_marks) for sent in nltk.sent_tokenize(text)]

    sentences = [sent.split() for sent in new_text]
    words = {word for sentence in sentences for word in sentence}
    vocab_size = len(words)

    # Ids are assigned in sorted order: iterating a set of strings directly
    # gives an order that varies between interpreter runs, which would make
    # the word <-> id mapping (and any saved embedding) irreproducible.
    word2int = {word: index for index, word in enumerate(sorted(words))}
    int2word = {index: word for word, index in word2int.items()}

    return (word2int, int2word, sentences, vocab_size, words)
    
def make_training_data(sentences, window_size=2):
    """Build (target, context) word pairs from tokenised sentences.

    For every word, each word at most ``window_size`` positions to its left
    or right within the same sentence is paired with it.

    Args:
        sentences: Iterable of token lists.
        window_size: Number of neighbours taken on each side of the target
            word. Defaults to 2.

    Returns:
        A list of ``[word, neighbour]`` two-element lists.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError('window_size must be non-negative, got {}'.format(window_size))

    data = []
    for sentence in sentences:
        for word_index, word in enumerate(sentence):
            start = max(word_index - window_size, 0)
            # Slicing clamps an out-of-range stop to the sentence end.
            stop = word_index + window_size + 1
            for nb_word in sentence[start:stop]:
                # Comparison is by value: this drops the target itself and
                # also any repeated occurrence of the same word in the window.
                if nb_word != word:
                    data.append([word, nb_word])
    return data
                    
def to_one_hot(data_point_index,vocab_size):
    """Return a float vector of length ``vocab_size`` that is all zeros
    except for a 1 at position ``data_point_index``."""
    one_hot = np.zeros(shape=vocab_size, dtype=np.float64)
    one_hot[data_point_index] = 1.0
    return one_hot

def make_vectors(data,word2int,vocab_size):
    """One-hot encode (input word, output word) pairs into two matrices.

    Args:
        data: Iterable of two-element sequences ``[input_word, output_word]``.
        word2int: Mapping from word to integer id.
        vocab_size: Length of each one-hot row.

    Returns:
        ``(x_train, y_train)``: two float64 arrays of shape
        ``(number of pairs, vocab_size)``. Row ``k`` of ``x_train`` encodes
        the input word of pair ``k`` and row ``k`` of ``y_train`` its output
        word. Empty ``data`` yields arrays of shape ``(0, vocab_size)``.

    Raises:
        KeyError: If a word in ``data`` is missing from ``word2int``.
    """
    pairs = list(data)
    row_ids = np.arange(len(pairs))
    input_ids = np.array([word2int[pair[0]] for pair in pairs], dtype=np.intp)
    output_ids = np.array([word2int[pair[1]] for pair in pairs], dtype=np.intp)

    # Allocate each matrix once and set the hot entries with fancy indexing
    # rather than stacking one temporary vector per pair; this also keeps the
    # result two-dimensional when there are no pairs at all.
    x_train = np.zeros((len(pairs), vocab_size))
    y_train = np.zeros((len(pairs), vocab_size))
    x_train[row_ids, input_ids] = 1
    y_train[row_ids, output_ids] = 1

    return (x_train, y_train)

In [141]:
def view_results(x_train,y_train):
    """Print both training arrays, each with its label, followed by their shapes."""
    for label, array in (("X_train:", x_train), ("Y_train:", y_train)):
        print(label, array)
    shapes = (x_train.shape, y_train.shape)
    print(*shapes)
    
def create_and_train_model(x_train,y_train,vocab_size,
                           embedding_dim=5,n_iters=100000,learning_rate=0.1):
    """Train a two-layer softmax network and return the learned embeddings.

    The network maps a one-hot input through a linear hidden layer
    (``W1``, ``b1``) and a linear output layer (``W2``, ``b2``), and is
    trained with full-batch gradient descent on the cross-entropy between
    the softmax of the output and ``y_train``. The loss is printed every
    500 iterations.

    Args:
        x_train: Input rows fed to a ``(None, vocab_size)`` float32 placeholder.
        y_train: Target rows fed to a ``(None, vocab_size)`` float32 placeholder.
        vocab_size: Width of the input and output layers.
        embedding_dim: Width of the hidden layer. Defaults to 5.
        n_iters: Number of gradient-descent steps. Defaults to 100000.
        learning_rate: Gradient-descent step size. Defaults to 0.1.

    Returns:
        The evaluated ``W1 + b1`` array of shape ``(vocab_size, embedding_dim)``.
    """
    # A private graph keeps repeated calls (e.g. re-running a notebook cell)
    # from piling more and more ops into the process-wide default graph.
    with tf.Graph().as_default():
        x=tf.placeholder(tf.float32,shape=(None,vocab_size))
        y_label=tf.placeholder(tf.float32,shape=(None,vocab_size))

        W1=tf.Variable(tf.random_normal([vocab_size,embedding_dim]))
        b1=tf.Variable(tf.random_normal([embedding_dim]))
        hidden_representation=tf.add(tf.matmul(x,W1),b1)
        W2=tf.Variable(tf.random_normal([embedding_dim,vocab_size]))
        b2=tf.Variable(tf.random_normal([vocab_size]))
        logits=tf.add(tf.matmul(hidden_representation,W2),b2)

        # log_softmax is mathematically log(softmax(logits)) but does not
        # evaluate log(0) when a class probability underflows to zero.
        cross_entropy_loss=tf.reduce_mean(
            -tf.reduce_sum(y_label*tf.nn.log_softmax(logits),axis=1))
        train_step=tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy_loss)
        embeddings=W1+b1

        # Build the initializer last so it covers every variable in the graph.
        init=tf.global_variables_initializer()
        feed={x:x_train,y_label:y_train}

        # The context manager releases the session's resources on exit.
        with tf.Session() as sess:
            sess.run(init)
            for i in range(n_iters):
                sess.run(train_step,feed_dict=feed)
                # The loss is only evaluated when it is actually reported.
                if(i%500==0):
                    print('Loss after {}th iteration is : {}'.format(i,sess.run(cross_entropy_loss,feed_dict=feed)))

            vectors=sess.run(embeddings)

    return(vectors)
    

In [108]:
def euclidean_distance(vec1,vec2):
    """Return the Euclidean (L2) distance between two equal-shaped vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same shape as ``vec1``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    difference = np.subtract(vec1, vec2)
    # Square each component before summing; squaring the sum instead lets
    # positive and negative differences cancel out.
    return np.sqrt(np.sum(np.square(difference)))

def find_closest(word_index,vectors):
    """Return the row index of the vector nearest to ``vectors[word_index]``.

    Distance is Euclidean. The query row itself is excluded by position, so
    another row holding an identical vector is a valid answer (distance 0).
    Ties resolve to the lowest index.

    Args:
        word_index: Row index of the query vector.
        vectors: Sequence/array with one vector per row.

    Returns:
        The integer index of the nearest other row, or -1 when there is no
        other row to compare against.

    Raises:
        IndexError: If ``word_index`` is out of range for ``vectors``.
    """
    points = np.asarray(vectors, dtype=float)
    query_vector = points[word_index]
    if len(points) < 2:
        return -1

    # Compute every distance in one vectorised pass.
    differences = (points - query_vector).reshape(len(points), -1)
    distances = np.sqrt(np.sum(np.square(differences), axis=1))
    # An infinite distance removes the query row from the running without
    # relying on a fixed numeric ceiling for "far away".
    distances[word_index] = np.inf
    return int(np.argmin(distances))

In [137]:
def plot(vectors,words,word2int):
    """Project word vectors to 2-D and draw each word at its position.

    The vectors are reduced to two components with t-SNE
    (``random_state=0``), each resulting row is scaled to unit L2 norm, and
    every word is printed with its y-coordinate and annotated on a new
    figure, which is then shown.

    Args:
        vectors: Array with one embedding per row, indexed by word id.
        words: Iterable of the words to draw.
        word2int: Mapping from word to its row index in ``vectors``.
    """
    model=TSNE(n_components=2,random_state=0)
    np.set_printoptions(suppress=True)
    projected=model.fit_transform(vectors)
    projected=Normalizer(norm='l2').fit_transform(projected)

    fig,ax=plt.subplots()
    # Annotations do not trigger autoscaling, and unit-norm rows have
    # coordinates in [-1, 1]; set the view explicitly so labels with
    # negative coordinates are not left outside the default 0..1 axes.
    ax.set_xlim(-1.5,1.5)
    ax.set_ylim(-1.5,1.5)
    for word in words:
        x_coord,y_coord=projected[word2int[word]]
        print(word,y_coord)
        ax.annotate(word,(x_coord,y_coord))

    plt.show()

In [139]:
def main():
    """Run the whole pipeline: read the corpus, build training pairs, train
    the model, print the nearest neighbour of a few probe words and plot the
    learned vectors."""
    word2int,int2word,sentences,vocab_size,words=read_doc()
    data=make_training_data(sentences)
    x_train,y_train=make_vectors(data,word2int,vocab_size)

    vectors=create_and_train_model(x_train,y_train,vocab_size)

    for probe in ('sister','monica','eat'):
        print('Word closest to {} : '.format(probe),int2word[find_closest(word2int[probe], vectors)])

    # No pyplot tick setup here: range() rejects float bounds (TypeError), and
    # calling plt.xticks/plt.yticks before plot() would act on a separate,
    # implicitly created figure rather than the one plot() draws.
    plot(vectors,words,word2int)

In [None]:
# Entry point: run the pipeline only when this code executes as the main
# module (script or notebook cell), not when it is imported.
if __name__=="__main__":
    main()

Loss after 0th iteration is : 9.192723274230957
Loss after 500th iteration is : 6.555208683013916
Loss after 1000th iteration is : 5.929463863372803
Loss after 1500th iteration is : 5.591761589050293
Loss after 2000th iteration is : 5.371921062469482
Loss after 2500th iteration is : 5.212832450866699
Loss after 3000th iteration is : 5.08939266204834
Loss after 3500th iteration is : 4.98776388168335
Loss after 4000th iteration is : 4.900006294250488
Loss after 4500th iteration is : 4.8215718269348145
Loss after 5000th iteration is : 4.7497239112854
Loss after 5500th iteration is : 4.682715892791748
Loss after 6000th iteration is : 4.6193976402282715
Loss after 6500th iteration is : 4.558995723724365
Loss after 7000th iteration is : 4.500997066497803
Loss after 7500th iteration is : 4.445059776306152
Loss after 8000th iteration is : 4.390960216522217
Loss after 8500th iteration is : 4.338551044464111
Loss after 9000th iteration is : 4.287731647491455
Loss after 9500th iteration is : 4.23