In [1]:
import tensorflow as tf
import numpy as np
import math
from tensorflow import nn
from collections import Counter
import random

In [2]:
def subsample(data):
    threshold=1e-5
    data=data.split()
    data=data[:len(data)//10]
    word_count=Counter(data)
    total_count=len(data)
    freq={word:count/total_count for word,count in word_count.items()}
    p_drop={word:1-np.sqrt(threshold/freq[word]) for word in word_count}
    subsampled=[word for word in data if random.random() <1 -p_drop[word]]
    return subsampled

In [3]:
def make_dictionary(data):
    words=set(data) ## gets unique words only
    word2int={}
    int2word={}
    vocab_size=len(words)
    
    for i, word in enumerate(words):
        word2int[word]=i
        int2word[i]=word
    return word2int,int2word,vocab_size

In [4]:
def read_data(filename):
    with open(filename,'r') as f:
        return f.read()

url='/home/farrukh/Courses GIST/Codes and Books/Word2Vec/datasets/Word2Vec/text8'
data=read_data(url)
subsampled=subsample(data)
print(len(data.split()))
print(len(subsampled))


17005207
478894


In [30]:
word2int,int2word,vocabulari_size=make_dictionary(data.split())
embedding_size=128
print(vocabulari_size)

253854


In [6]:
## used for checking and debuggin 
#data='the quick brown fox jumped over the lazy dog'
#word2int,int2word,vocabulari_size=make_dictionary(data)
#embedding_size=5

In [7]:
len(word2int)

70889

In [45]:
## makes stacked (context,target) tuples using window size of 1
def make_context_target_pairs(data,window_size=1):
    dataset=data
    left=[dataset[i-window_size:i] for i in range(window_size,len(dataset)-window_size)]
    right=[dataset[i+1:i+window_size+1] for i in range(window_size,len(dataset)-window_size)]
    middle=[dataset[i] for i in range(window_size,len(dataset)-window_size)]
    dataset=[]
    for i in range(len(left)):
        combined=left[i]
        combined.extend(right[i])
        dataset.append((combined,middle[i]))
    return dataset

#makes unstacked (target_i,context_i_j) pairwise tuples
## where (target,context) is the same as (input,output) datasets for skip-grammar model
def make_pair(data):
    dataset=[]
    for i in range(len(data)):
        dataset.extend([[data[i][1],data[i][0][j]] for j in range(len(data[i][0])) ])
    return dataset
lngth=len(data.split())
preprocessed=make_context_target_pairs(data.split()[:lngth//10],1)
invrtd_data=make_pair(preprocessed)
preprocessed,invrtd_data
print(len(preprocessed))

1700518


In [46]:
def replace_data_with_codes(data,word2int):
    dataset=[[word2int[inpt],word2int[outpt]] for inpt,outpt in data]
    return dataset
train_data=np.array(replace_data_with_codes(invrtd_data,word2int))
train_data

array([[167993,  10207],
       [167993, 137436],
       [137436, 167993],
       ...,
       [103693, 227854],
       [227854, 103693],
       [227854,  12372]])

In [47]:
embeddings=tf.Variable(tf.random_uniform([vocabulari_size,embedding_size],-1.0,1.0))
nce_weights=tf.Variable(tf.truncated_normal([vocabulari_size,embedding_size]))
nce_biases=tf.Variable(tf.zeros([vocabulari_size]))

In [48]:
norm=tf.sqrt(tf.reduce_sum(tf.square(embeddings),1,keepdims=True))
normalized_embeddings=embeddings/norm
#valid_embeddings=tf.nn.embedding_lookup(normalized_embeddings,valid_dataset)

In [49]:
batch_size=10000
train_input=tf.placeholder(tf.int32,shape=[batch_size])
train_labels=tf.placeholder(tf.int32,shape=[batch_size,1])

In [50]:
embed=tf.nn.embedding_lookup(embeddings,train_input)

In [51]:
loss=tf.reduce_mean(nn.sampled_softmax_loss(weights=nce_weights,
                                  biases=nce_biases,
                                  labels=train_labels,
                                  inputs=embed,
                                  num_classes=vocabulari_size,
                                  num_sampled=5,partition_strategy='div'))


In [52]:
optimizer=tf.train.GradientDescentOptimizer(learning_rate=1.0)
training_op=optimizer.minimize(loss)


In [53]:
def generate_batch(data,batch_size):
    indecis=np.random.permutation(len(data))
    n_batches=len(data)//batch_size
    for ind in np.array_split(indecis,n_batches):
        return data[ind[:batch_size],:1].reshape(batch_size), data[ind[:batch_size],1:]

In [54]:
len(train_data)

3401036

In [None]:
init=tf.global_variables_initializer()
saver=tf.train.Saver()
with tf.Session() as sess:
    init.run()
    print(embeddings.eval(),'\n\n')

    for epoch in range(10000):
        for i in range(len(train_data)//batch_size):
            x_batch,y_batch=generate_batch(train_data,batch_size)
            feed_dict={train_input:x_batch,train_labels:y_batch}
            _,cur_loss=sess.run([training_op,loss], feed_dict=feed_dict)
        if(epoch%10==0):
            print('Epoch: %d out of 10000'%epoch)
            saver.save(sess,'/home/farrukh/Courses GIST/Codes and Books/Word2Vec/datasets/Word2Vec/negModelNoSubsample.ckpt')
    print(embeddings.eval(),'\n\n\n\n')
    final_embeddings=normalized_embeddings.eval()

[[-0.56302524  0.787374   -0.9346087  ...  0.6997583  -0.9952462
  -0.6744714 ]
 [ 0.03886008  0.69121027 -0.14256096 ...  0.27742195  0.28424072
  -0.0429678 ]
 [ 0.8495772   0.93993044 -0.82224655 ...  0.72667575  0.85389876
   0.53896046]
 ...
 [-0.6574106  -0.64742184  0.7485759  ... -0.98803973  0.16143823
   0.8789139 ]
 [-0.80495524  0.9255545  -0.9140315  ... -0.5067208   0.7236531
   0.38112855]
 [-0.9307692  -0.59663033  0.05232549 ... -0.85570717 -0.71479464
   0.30171013]] 


Epoch: 0 out of 10000
Epoch: 10 out of 10000
Epoch: 20 out of 10000
Epoch: 30 out of 10000
Epoch: 40 out of 10000
Epoch: 50 out of 10000
Epoch: 60 out of 10000
Epoch: 70 out of 10000
Epoch: 80 out of 10000
Epoch: 90 out of 10000
Epoch: 100 out of 10000
Epoch: 110 out of 10000
Epoch: 120 out of 10000
Epoch: 130 out of 10000
Epoch: 140 out of 10000
Epoch: 150 out of 10000
Epoch: 160 out of 10000
Epoch: 170 out of 10000
Epoch: 180 out of 10000
Epoch: 190 out of 10000
Epoch: 200 out of 10000
Epoch: 210 out

In [None]:
final_embeddings

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
def plot_with_labels(low_dim_embs,labels):
    plt.figure(figsize=(35,35))
    for i, label in enumerate(labels):
        x,y=low_dim_embs[i,:]
        plt.scatter(x,y)
        plt.annotate(label,
                   xy=(x,y),
                   xytext=(5,2),
                   textcoords='offset points',
                   ha='right',
                   va='bottom')
    plt.savefig('/home/farrukh/Courses GIST/Codes and Books/Word2Vec/datasets/Word2Vec/pic2.png')

In [None]:
tsne=TSNE(perplexity=32,n_components=2,init='pca',n_iter=5000,
         method='exact')
plot_only=1500
low_dim_embs=tsne.fit_transform(final_embeddings[:plot_only,:])
labels=[int2word[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs,labels)