In [1]:
import pandas as pd
from pandas import Series, DataFrame
from IPython.display import display, HTML
import numpy as np





# Reading processed data

In [2]:
delimiter='$'
columns = ['first_char_id', 'second_char_id', 'movie_id', 'source_sentence', 'target_sentence']
source_df = pd.read_csv('../../data/cornell_corpus/clean_data.csv',delimiter=delimiter,names=columns,skiprows=[0])

In [3]:
display(source_df)

Unnamed: 0,first_char_id,second_char_id,movie_id,source_sentence,target_sentence
0,u0,u2,m0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,u0,u2,m0,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,u0,u2,m0,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,u0,u2,m0,You're asking me out. That's so cute. What's ...,Forget it.
4,u0,u2,m0,"No, no, it's my fault -- we didn't have a prop...",Cameron.
5,u0,u2,m0,Cameron.,"The thing is, Cameron -- I'm at the mercy of a..."
6,u0,u2,m0,"The thing is, Cameron -- I'm at the mercy of a...",Seems like she could get a date easy enough...
7,u0,u2,m0,Why?,Unsolved mystery. She used to be really popul...
8,u0,u2,m0,Unsolved mystery. She used to be really popul...,That's a shame.
9,u0,u2,m0,"Gosh, if only we could find Kat a boyfriend...",Let me see what I can do.


# Sentence Preprocessing

## Get word dictionary

In [4]:
import string
# This function is used to remove punctuation, espace in each sentence, and do the tokenization
def clean_and_tokenization(sentence):
    if not sentence or sentence == '':
        return
    
    clean_sentence = "".join(char for char in str(sentence).strip().lower() if char not in string.punctuation)
    
    return clean_sentence.split(' ')


# Unit test
sentence = "   Choose your targets men. That's right Watch for you good!!   "
print(clean_and_tokenization(sentence))

['choose', 'your', 'targets', 'men', 'thats', 'right', 'watch', 'for', 'you', 'good']


In [5]:
# Now we use this function to get word list first

word_list = []
for row in source_df.iterrows():
    try: 
        series = row[1]
        
        source_sentence = series['source_sentence']
        target_sentence = series['target_sentence']
        
        source_words = clean_and_tokenization(source_sentence)
        target_words = clean_and_tokenization(target_sentence)
        
        word_list.extend(source_words)
        word_list.extend(target_words)
        
    except Exception as e:
        print(e)
        pass

    

In [9]:
word_set = list(set(word_list))

In [10]:
# To know the metrics

print("Total words: {}".format(len(word_list)))
print("Unique words: {}".format(len(word_set)))

Total words: 4749484
Unique words: 66501


In [16]:
# Check content

word_set[:25]

['',
 'flatlines',
 'nonot',
 'winder',
 'interdimensional',
 'any',
 'thisthere',
 'cargo',
 'switchin',
 'rustlers',
 'cheery',
 'strangeits',
 'rass',
 'grassi',
 'chink',
 'valet',
 'mmmmmmmmmm',
 'virgins',
 'backtrack',
 'thug',
 'inplease',
 'rode',
 'bowdoin',
 'thorwalds',
 'room�s']

In [49]:
# Create word to int dic and int to word dic

word_to_int={}
int_to_word={}
index = 1

for word in word_set:
    word_to_int[word] = index
    int_to_word[str(index)] = word
    
    index = index+1


### Quick test

print(word_to_int['stay'])
print(int_to_word['4791'])




64338
respected


In [18]:
n_vocab = len(int_to_word)

In [19]:
# Convert the word list to index list using the word_to_int dictionary

word_index_list = [word_to_int[word] for word in word_list]




## Subsampling

### Some words like 'a', 'the', 'this' has no significant meaning, should remove them from word list in order to gain a better preformance

In [20]:
from collections import Counter

threshold=1e-5
word_counts = Counter(word_index_list)
total_count = len(word_index_list)

frequence = {word: count/total_count for word, count in word_counts.items()}
drop_prob = {word: 1 - np.sqrt(threshold / frequence[word]) for word in word_counts}

In [21]:
# Use a random to decide if we pick a word into training word
import random

trainning_word_index_list=[word for word in word_index_list if random.random() < (1 - drop_prob[word])]

In [22]:
# An unit test

test_list = trainning_word_index_list[:30]

for index in test_list:
    print(int_to_word[str(index)])

quick
roxanne
korrine
barrett
incredibly
horrendous
public
break
quad
wed
pronunciation
pronunciation
hacking
gagging
spitting
part
hacking
gagging
spitting
cuisine
saturday
night
asking
cute
proper
introduction
cameron
cameron
cameron
mercy


## Making data into small batches

In [23]:
# Now we should make data into small batches in order to use skip-gram model

# The batch size C is the size of training context, if c is larger, normally we can extract more patterns. But the sarcrifice is training time



In [24]:
# First thing, should have a function to get target words

def get_target_words(word_index_list, index, window_size=5):
    
    R = np.random.randint(1, window_size+1)
    
    start = index - R if (index - R) > 0 else 0
    end = index + R
    
    target_word_indexes = set(word_index_list[start: index] + word_index_list[index+1: end+1])
    
    return target_word_indexes


# A quick unit test
index = 125
print('The input word is {}'.format(int_to_word[str(trainning_word_index_list[index])]))

print('The target words are: ')
for target in get_target_words(trainning_word_index_list, index, 5):
    print(int_to_word[str(target)])
    

The input word is shed
The target words are: 
dating
pretty
dip
likes
smokes


In [62]:
# Then we try to make the training index word list into samll batches

def get_batches(word_index_list, batch_size, window_size=5):
    n_batches = len(word_index_list) //batch_size
    
    # We keep only the n_batches of data
    rest_word_index_list = word_index_list[:batch_size * n_batches]
    
    for idx in range(0, len(word_index_list), batch_size):
        x, y =[],[]
        
        batch = word_index_list[idx:idx+batch_size]
        
        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target_words(batch, ii, window_size)
            
            y.extend(batch_y)
            x.extend([batch_x] * len(batch_y))
        
        yield x,y
        
    

# A quick unit test for this function

test_list = trainning_word_index_list[:100]
batch_size =10

for x, y  in get_batches(test_list, batch_size):
    print("input x, bath length is {}".format(len(x)))
    for x_in in x:
        print(int_to_word[str(x_in)])
        
    
    print("***************")    
        
    print("input y, bath lenght is {}".format(len(y)))
    for y_in in y:
        print(int_to_word[str(y_in)])
    
    break

    
    

input x, bath length is 52
quick
quick
quick
quick
quick
roxanne
roxanne
roxanne
roxanne
korrine
korrine
korrine
korrine
korrine
korrine
korrine
barrett
barrett
barrett
barrett
barrett
barrett
incredibly
incredibly
incredibly
incredibly
incredibly
incredibly
incredibly
incredibly
incredibly
horrendous
horrendous
horrendous
horrendous
horrendous
horrendous
public
public
public
public
break
break
break
break
break
quad
quad
quad
quad
wed
wed
***************
input y, bath lenght is 52
barrett
horrendous
incredibly
roxanne
korrine
korrine
barrett
incredibly
quick
barrett
break
horrendous
incredibly
public
roxanne
quick
quick
horrendous
incredibly
public
roxanne
korrine
wed
barrett
break
quick
horrendous
public
roxanne
quad
korrine
barrett
break
incredibly
public
quad
korrine
horrendous
incredibly
quad
break
wed
horrendous
incredibly
public
quad
horrendous
public
wed
break
quad
break


In [26]:
# The length of training data

print("The length of training data is {0}".format(len(trainning_word_index_list)))

The length of training data is 909014


# Graph building

## Build the tensorflow to train the word2vec model

In [50]:
# Define the graph
import tensorflow as tf


train_graph = tf.Graph()


In [51]:
# Define input layer

with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, [None], name='inputs')
    labels = tf.placeholder(tf.int32, [None, None], name = "labels")
    
    
    

In [52]:
# Define embedding layer

num_words = len(int_to_word)
num_embedding = 200

with train_graph.as_default():
    embedding=tf.Variable(tf.random_uniform((num_words, num_embedding), -1, 1))
    embed = tf.nn.embedding_lookup(embedding,inputs)


In [53]:
# Define nagative sampling

n_sampled = 200
with train_graph.as_default():
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, num_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_vocab))
    
     # Calculate the loss using negative sampling
    #loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, 
                                      #labels, embed,
                                      #n_sampled, n_vocab,partition_strategy="div")
            
    loss = tf.nn.nce_loss(weights=softmax_w,biases=softmax_b,labels=labels,inputs=embed,num_sampled=n_sampled,num_classes=n_vocab, partition_strategy="div")        
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

In [54]:
# validation


with train_graph.as_default():
    ## From Thushan Ganegedara's implementation
    valid_size = 16 # Random set of words to evaluate similarity on.
    valid_window = 100
    # pick 8 samples from (0,100) and (1000,1100) each ranges. lower id implies more frequent 
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples, 
                               random.sample(range(1000,1000+valid_window), valid_size//2))

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

## Start training

In [55]:
epochs = 10
batch_size = 1000
window_size = 10

valid_size = 16

In [56]:
import time

with train_graph.as_default():
    saver = tf.train.Saver()
    
with tf.Session(graph=train_graph) as sess:
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())
    
    
    for e in range(1, epochs+1):
        batches = get_batches(trainning_word_index_list, batch_size, window_size)
        start = time.time()

   
        for x, y in batches:
            try:
                feed = {inputs: x, labels: np.array(y)[:, None]}
                train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)

                loss += train_loss

                if iteration % 100 == 0: 
                    end = time.time()
                    print("Epoch {}/{}".format(e, epochs),
                          "Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(loss/100),
                          "{:.4f} sec/batch".format((end-start)/100))
                    loss = 0
                    start = time.time()

                if iteration % 1000 == 0:
                    # note that this is expensive (~20% slowdown if computed every 500 steps)
                    sim = similarity.eval()
                    for i in range(valid_size):
                        valid_word = int_to_word[str(valid_examples[i])]
                        top_k = 8 # number of nearest neighbors
                        nearest = (-sim[i, :]).argsort()[1:top_k+1]
                        log = 'Nearest to %s:' % valid_word

                        for k in range(top_k):
                            close_word = int_to_word[str(nearest[k])]
                            log = '%s %s,' % (log, close_word)
                        print(log)

                iteration = iteration +1
            except Exception as err:
                print(err)
                pass 
                
                
    save_path = saver.save(sess, "checkpoints/text8.ckpt")
    embed_mat = sess.run(normalized_embedding)           
                

indices[4845] = 66501 is not in [0, 66501)
	 [[Node: nce_loss/embedding_lookup_1 = GatherV2[Taxis=DT_INT32, Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@Adam/update_Variable_2/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Variable_2/read, nce_loss/concat, nce_loss/embedding_lookup_1/axis)]]

Caused by op 'nce_loss/embedding_lookup_1', defined at:
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/

indices[1327] = 66501 is not in [0, 66501)
	 [[Node: nce_loss/embedding_lookup_1 = GatherV2[Taxis=DT_INT32, Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@Adam/update_Variable_2/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Variable_2/read, nce_loss/concat, nce_loss/embedding_lookup_1/axis)]]

Caused by op 'nce_loss/embedding_lookup_1', defined at:
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/

Epoch 1/10 Iteration: 100 Avg. Training loss: 781.1411 0.3148 sec/batch
indices[3247] = 66501 is not in [0, 66501)
	 [[Node: nce_loss/embedding_lookup_1 = GatherV2[Taxis=DT_INT32, Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@Adam/update_Variable_2/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Variable_2/read, nce_loss/concat, nce_loss/embedding_lookup_1/axis)]]

Caused by op 'nce_loss/embedding_lookup_1', defined at:
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/shuhanLin/anaconda/envs/tensorflow/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instanc

KeyboardInterrupt: 