In [55]:
import argparse
import math
import struct
import sys
import time
import warnings
import numpy as np
import itertools 
import time 

In [56]:
class VocabItem:
    def __init__(self, word):
        self.word = word
        self.count = 0

In [57]:
class Vocabulary:
    def __init__(self, fi, min_count):
        vocab_items = []
        vocab_hash = {}
        word_count = 0
        fi = open(fi, 'r')
        # Add special tokens <bol> (beginning of line) and <eol> (end of line)
        for token in ['<bol>', '<eol>']:
            vocab_hash[token] = len(vocab_items)
            vocab_items.append(VocabItem(token))
        for line in fi:
            tokens = line.split()
            #print("\rReading line %s" %tokens)
            for token in tokens:
                if token not in vocab_hash:
                    vocab_hash[token] = len(vocab_items)
                    #print ("\r\r token %s" %token)
                    #print ("\t\t token value",vocab_hash[token])
                    vocab_items.append(VocabItem(token))
                #assert vocab_items[vocab_hash[token]].word == token, 'Wrong vocab_hash index'
                vocab_items[vocab_hash[token]].count += 1
                word_count += 1
                if word_count % 10000 == 0:
                    sys.stdout.write("\rReading word %d" % word_count)
                    sys.stdout.flush()

            # Add special tokens <bol> (beginning of line) and <eol> (end of line)
            vocab_items[vocab_hash['<bol>']].count += 1
            vocab_items[vocab_hash['<eol>']].count += 1
            word_count += 2
        self.vocab_items = vocab_items # List of VocabItem objects
        self.vocab_hash = vocab_hash  # Mapping from each token to its index in vocab
        self.word_count = word_count # Total number of words in train file
        # Add special token <unk> (unknown),
        # merge words occurring less than min_count into <unk>, and
        # sort vocab in descending order by frequency in train file
        self.__sort(min_count)
        print ('Total words in training file: %d' % self.word_count)
        #print ('Total bytes in training file: %d' % self.bytes)
        print ('Vocab size: %d' % len(self))
    def __getitem__(self, i):
        return self.vocab_items[i]

    def __len__(self):
        return len(self.vocab_items)

    def __iter__(self):
        return iter(self.vocab_items)

    def __contains__(self, key):
        return key in self.vocab_hash

    def __sort(self, min_count):
        tmp = []
        tmp.append(VocabItem('<unk>'))
        unk_hash = 0
        
        count_unk = 0
        for token in self.vocab_items:
            if token.count < min_count:
                count_unk += 1
                tmp[unk_hash].count += token.count
                #print("word setting as unknow:",token.word)
            else:
                tmp.append(token)

        tmp.sort(key=lambda token : token.count, reverse=True)

        # Update vocab_hash
        vocab_hash = {}
        for i, token in enumerate(tmp):
            vocab_hash[token.word] = i

        self.vocab_items = tmp
        self.vocab_hash = vocab_hash
        #print ("printing vocab_hash")
        #for key,value in vocab_hash.items():
         #   print (key,value)
        #print ('Unknown vocab size:', count_unk)

    def indices(self, tokens):
        return [self.vocab_hash[token] if token in self else self.vocab_hash['<unk>'] for token in tokens]

In [58]:
class paraItem:
    def __init__(self,par):
        self.label = par
        self.wc = 0
        self.filename = None
        self.dmvec = np.random.uniform(low=-0.5/100, high=0.5/100, size=(100)) #for every phrase size of dim
        self.words = []

In [59]:
class paragrahps:
    def __init__(self,fi,max_sentence_length):
        paras =[]
        local_w = 0
        sen_count = 0 
        curr_sen =[]
        total_wc = 0
        fi = open(fi,'r')
        for line in fi:
            tokens = line.split()
            print (len(tokens))
            for token in tokens:
                if local_w <= max_sentence_length:
                    curr_sen.append(token)
                    local_w +=1 
                    total_wc+=1
                    #print ("in if loop")
                else:
                    local_w = 0 #reset the word count for sentence
                    sys.stdout.write("\rRead word %d and updating %d para object" % (total_wc,sen_count))
                    sys.stdout.flush()
                    paras.append(paraItem(sen_count)) #list of objects with sentence index as lable.
                    paras[len(paras)-1].words = curr_sen #add the words into objects. 
                    paras[len(paras)-1].wc = len(curr_sen) # upated word count in the para
                    sen_count+=1 #increase the sentence count==> index to paraItems
                    curr_sen=[] #clear the words to add new words.
        self.paras =paras
    def __getlist__(self):
        return self.paras

In [60]:
class UnigramTable:
    """
    A list of indices of tokens in the vocab following a power law distribution,
    used to draw negative samples.
    """
    def __init__(self, vocab):
        vocab_size = len(vocab)
        power = 0.75
        norm = sum([math.pow(t.count, power) for t in vocab]) # Normalizing constant
        print (norm)
        table_size = 1e8 # Length of the unigram table depends on vocab
        #print table_size
        table = np.zeros(table_size, dtype=np.uint32)

        print ('Filling unigram table')
        p = 0 # Cumulative probability
        i = 0
        old_i = 0 
        for j, unigram in enumerate(vocab):
            #print "j",j
            #print "unigram",unigram
            
            p += float(math.pow(unigram.count, power))/norm
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
            old_i = i - old_i
            sys.stdout.write("\r propability for word '%s' is %f, kept it  %d times" %(unigram.word,p,old_i))
            sys.stdout.flush()
            #print("propability for word %s is %f, kept it  %d times" %(unigram.word,p,old_i))
        self.table = table
    def sample(self, count):
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]

In [61]:
vocab = Vocabulary("text8",5)

Reading word 17000000Total words in training file: 17005209
Vocab size: 71291


In [62]:
para_list = paragrahps("text8",20000)

17005207
Read word 17000850 and updating 849 para object

In [63]:
pa = para_list.__getlist__()

In [64]:
table = UnigramTable(vocab)

2253145.7750768745
Filling unigram table
 propability for word 'ngurunderi' is 0.999927, kept it  49587746 times



In [65]:
def initialize(dim, vocab_size):
    # Init input words with random numbers from a uniform distribution on the interval [-0.5, 0.5]/dim
    tmp = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(vocab_size, dim))
    input_word = tmp 
    # Init weights with zeros
    tmp = np.zeros(shape=(vocab_size, dim))
    weights = tmp
  

    return (input_word,weights)

In [66]:
def sigmoid(z): #sigmoid function goes from -6 to +6
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1 / (1 + math.exp(-z))

In [67]:
input_word,weights = initialize(100,len(vocab))

In [None]:
start = time.clock()
starting_alpha=0.025
current_word_count=0  
alpha_count = 0
last_alpha_count = 0
win=5 
for i in pa:
    current_p = i
    sentence = current_p.words
    ind = vocab.indices(sentence)
    for sent_pos, token in enumerate(ind):
        neu1e = np.zeros(100)
        if current_word_count % 10000 == 0:
            alpha_count += (current_word_count - last_alpha_count)
            last_alpha_count = current_word_count
            alpha = starting_alpha * (1 - float(alpha_count) / 17020851)
            if alpha < starting_alpha * 0.0001: alpha = starting_alpha * 0.0001
            sys.stdout.write("\rAlpha: %f  para Progress: %d of %d (%.2f%%)" %
                                 (alpha, current_p.label, len(pa),
                                  float(current_p.label* 100/len(pa))))
            sys.stdout.flush()
        current_win = np.random.randint(low=1, high=win+1)
        context_start = max(sent_pos - current_win, 0)
        context_end = min(sent_pos + current_win + 1, len(ind))
        context = ind[context_start:sent_pos] + ind[sent_pos+1:context_end]
        for context_word in context:
            neu1e = np.zeros(100)
            classifiers = [(token, 1)] + [(target, 0) for target in table.sample(10)]
            for target, label in classifiers:
                z = np.dot(input_word[context_word],weights[target])
                p = sigmoid(z)
                g = alpha * (label - p)
                neu1e+=g*weights[target]
                weights[target] +=g* input_word[context_word]
            input_word[context_word] +=neu1e
            current_p.dmvec+=neu1e
            current_word_count += 1    
print ('time taken',time.clock() - start)                

Alpha: 0.000003  para Progress: 504 of 850 (59.29%)