# Word2Vec

In [2]:
import time
import numpy as np
import tensorflow as tf
import utils
from random import randint

Load the [text8 dataset](http://mattmahoney.net/dc/textdata.html), a file of cleaned up Wikipedia articles from Matt Mahoney. The next cell will download the data set to the `data` folder. Then you can extract it and delete the archive file to save storage space.

### Load the dataset

In [3]:
with open('data/text8') as f:
    text = f.read()

### Preprocessing

In [8]:
words = utils.preprocess(text)
print(words[:30])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst']


In [9]:
# Shape
print("Total words: {}".format(len(words)))
print("Unique words: {}".format(len(set(words))))

Total words: 16680599
Unique words: 63641


### Create lookup table

A look table will contain index for each word to convert word into index and index converted into word

In [10]:
vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
int_words = [vocab_to_int[word] for word in words]

In [12]:
vocab_to_int

{'dominant': 2316,
 'regeneration': 16202,
 'blanche': 20564,
 'reggie': 22153,
 'courted': 34947,
 'cosmogony': 33800,
 'fortean': 58112,
 'swabian': 28629,
 'distilling': 33801,
 'libera': 58113,
 'powerplants': 58114,
 'aqueducts': 33802,
 'fora': 36074,
 'eli': 12743,
 'wayward': 44611,
 'tar': 10712,
 'habilitation': 38364,
 'heenan': 49753,
 'louisbourg': 33556,
 'nascar': 14778,
 'melvin': 18497,
 'nigra': 63375,
 'salicylic': 44612,
 'designer': 3845,
 'certify': 37430,
 'conducive': 19932,
 'asimov': 5603,
 'exponents': 17626,
 'confiscate': 53722,
 'communications': 1664,
 'titanium': 18498,
 'reeder': 58117,
 'egalitarianism': 19222,
 'erudite': 42432,
 'royce': 14777,
 'megahertz': 50096,
 'signal': 1470,
 'anscombe': 44613,
 'backtracking': 47134,
 'hybrid': 6187,
 'jati': 42433,
 'ascension': 9825,
 'spawned': 8889,
 'morecambe': 58118,
 'soulful': 40561,
 'immortalised': 34948,
 'gesner': 44614,
 'irc': 4924,
 'brevet': 53723,
 'luftwaffe': 9291,
 'woo': 13902,
 'summari

In [13]:
int_to_vocab

{0: 'the',
 1: 'of',
 2: 'and',
 3: 'one',
 4: 'in',
 5: 'a',
 6: 'to',
 7: 'zero',
 8: 'nine',
 9: 'two',
 10: 'is',
 11: 'as',
 12: 'eight',
 13: 'for',
 14: 's',
 15: 'five',
 16: 'three',
 17: 'was',
 18: 'by',
 19: 'that',
 20: 'four',
 21: 'six',
 22: 'seven',
 23: 'with',
 24: 'on',
 25: 'are',
 26: 'it',
 27: 'from',
 28: 'or',
 29: 'his',
 30: 'an',
 31: 'be',
 32: 'this',
 33: 'which',
 34: 'at',
 35: 'he',
 36: 'also',
 37: 'not',
 38: 'have',
 39: 'were',
 40: 'has',
 41: 'but',
 42: 'other',
 43: 'their',
 44: 'its',
 45: 'first',
 46: 'they',
 47: 'some',
 48: 'had',
 49: 'all',
 50: 'more',
 51: 'most',
 52: 'can',
 53: 'been',
 54: 'such',
 55: 'many',
 56: 'who',
 57: 'new',
 58: 'used',
 59: 'there',
 60: 'after',
 61: 'when',
 62: 'into',
 63: 'american',
 64: 'time',
 65: 'these',
 66: 'only',
 67: 'see',
 68: 'may',
 69: 'than',
 70: 'world',
 71: 'i',
 72: 'b',
 73: 'would',
 74: 'd',
 75: 'no',
 76: 'however',
 77: 'between',
 78: 'about',
 79: 'over',
 80: 'year

In [14]:
int_words

[5243,
 3082,
 11,
 5,
 194,
 1,
 3136,
 45,
 58,
 155,
 127,
 741,
 476,
 10626,
 133,
 0,
 27617,
 1,
 0,
 102,
 854,
 2,
 0,
 15170,
 60089,
 1,
 0,
 150,
 854,
 3583,
 0,
 194,
 10,
 190,
 58,
 4,
 5,
 10727,
 214,
 6,
 1325,
 104,
 454,
 19,
 58,
 2733,
 362,
 6,
 3674,
 0,
 708,
 1,
 371,
 26,
 40,
 36,
 53,
 539,
 97,
 11,
 5,
 1425,
 2757,
 18,
 567,
 686,
 7090,
 0,
 247,
 5243,
 10,
 1052,
 27,
 0,
 320,
 248,
 45385,
 2878,
 792,
 186,
 5243,
 11,
 5,
 200,
 602,
 10,
 0,
 1135,
 19,
 2622,
 25,
 9027,
 2,
 279,
 31,
 4151,
 141,
 59,
 25,
 6443,
 4186,
 1,
 153,
 32,
 362,
 5243,
 36,
 1137,
 6,
 447,
 344,
 1818,
 19,
 4861,
 0,
 6758,
 1,
 7574,
 1774,
 566,
 0,
 93,
 0,
 247,
 11127,
 11,
 51,
 7090,
 89,
 26,
 270,
 37,
 5959,
 4867,
 20420,
 28,
 57278,
 41,
 317,
 5,
 25636,
 527,
 7574,
 371,
 4,
 258,
 1,
 153,
 25,
 1206,
 11,
 7574,
 200,
 1579,
 2,
 15286,
 332,
 1774,
 7090,
 4861,
 344,
 764,
 160,
 406,
 5695,
 755,
 1,
 4115,
 1132,
 4336,
 1536,
 2,
 567,
 8

## Subsampling

Words that show up often such as "the", "of", and "for" don't provide much context to the nearby words. If we discard some of them, we can remove some of the noise from our data and in return get faster training and better representations. This process is called subsampling by Mikolov. For each word $w_i$ in the training set, we'll discard it with probability given by 

$$ P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}} $$

where $t$ is a threshold parameter and $f(w_i)$ is the frequency of word $w_i$ in the total dataset.

I'm going to leave this up to you as an exercise. This is more of a programming challenge, than about deep learning specifically. But, being able to prepare your data for your network is an important skill to have. Check out my solution to see how I did it.


In [16]:
from collections import Counter
import random

threshold = 1e-5
number_of_words = len(int_words)
word_counter = Counter(int_words)
frequencies = dict()
drop_probabilities = dict()
train_words = []

droped_count = 0
for word,count in word_counter.items():
    frequency = count/number_of_words
    frequencies[word] = frequency
    drop_probabilities[word] = 1 - np.sqrt(threshold/frequency)
    
for word in int_words:
    if drop_probabilities[word] < 0.85:
        train_words.append(word)
        
print(len(train_words))

7852711


# Making the batch

Now that our data is in good shape, we need to get it into the proper form to pass it into our network. With the skip-gram architecture, for each word in the text, we want to grab all the words in a window around that word, with size $C$. 

From [Mikolov et al.](https://arxiv.org/pdf/1301.3781.pdf): 

"Since the more distant words are usually less related to the current word than those close to it, we give less weight to the distant words by sampling less from those words in our training examples... If we choose $C = 5$, for each training word we will select randomly a number $R$ in range $< 1; C >$, and then use $R$ words from history and $R$ words from the future of the current word as correct labels."

In [20]:
def get_target(words, idx, window_size=5):
    ''' Get a list of words in a window around an index. '''
    
    # Your code here
    random_count = randint(1,window_size) #the R number described in the description
    
    if idx - random_count < 0:
        start_word = 0
    else:
        start_word = idx - random_count
        
    if idx + random_count > len(words) - 1:
        end_word = len(words)  
    else:
        end_word = idx + random_count + 1
 
    return list(set(words[start_word:idx]+words[idx+1:end_word]))

# test
print(get_target([0,1,2,3,4,5,6,7,8,9],4,3)) #returns a list of the words around the given index

[2, 3, 5, 6]


Here's a function that returns batches for our network. The idea is that it grabs `batch_size` words from a words list. Then for each of those words, it gets the target words in the window. I haven't found a way to pass in a random number of target words and get it to work with the architecture, so I make one row per input-target pair. This is a generator function by the way, helps save memory.

In [21]:
def get_batches(words, batch_size, window_size=5):
    ''' Create a generator of word batches as a tuple (inputs, targets) '''
    
    n_batches = len(words)//batch_size
    
    # only full batches
    words = words[:n_batches*batch_size]
    
    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx:idx+batch_size]
        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target(batch, ii, window_size)
            y.extend(batch_y)
            x.extend([batch_x]*len(batch_y))
        yield x, y
    

# Building the graph

In [22]:
train_graph = tf.Graph()
with train_graph.as_default():
    inputs = tf.placeholder(tf.int32,shape=[None],name = "inputs")
    labels = tf.placeholder(tf.int32,shape = [None,1],name = "labels")