## Geo-Vec Model 
- basic geo-vec model
- auxilliary task models

#### Imports:

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as ss

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

  return f(*args, **kwds)


#### Helper Functions

In [2]:
# Load tokenized reuters
word2id = np.load('data/reuters/reuters_word2id.npy').item(0)
id2word = np.load('data/reuters/reuters_id2word.npy').item(0)
tokenized = list(np.load('data/reuters/reuters_tokenized.npy'))

#### Helper Functions

In [3]:
import numpy as np
from gensim.corpora.wikicorpus import WikiCorpus
from gensim import utils
# from scipy.sparse import coo_matrix

def get_adj(tokenized_docs, word2id):
    for docidx in tokenized_docs:
        adj_i = np.vstack((docidx[:-1], docidx[1:]))
        adj_o = np.flip(adj_i, axis=0)
        sp_adj_i = ss.coo_matrix((np.ones(adj_i.shape[1]), (adj_i[0, :], adj_i[1, :])), 
                                 (len(word2id), len(word2id)))
        sp_adj_o = ss.coo_matrix((np.ones(adj_o.shape[1]), (adj_o[0, :], adj_o[1, :])), 
                                 (len(word2id), len(word2id)))
        yield sp_adj_o, sp_adj_i

def get_lapl(tokenized_docs, word2id, renorm_trick=True):
    for A_o, A_i in get_adj(tokenized_docs, word2id):
        if renorm_trick == True:
            _A_i = A_i + ss.eye(A_i.shape[0])
            _A_o = A_o + ss.eye(A_o.shape[0])
        D_inv_sqrt_i = ss.diags(np.power(np.array(_A_i.sum(1)), -0.5).flatten())
        D_inv_sqrt_o = ss.diags(np.power(np.array(_A_o.sum(1)), -0.5).flatten())
        L_i = _A_i.dot(D_inv_sqrt_i).transpose().dot(D_inv_sqrt_i).tocoo()
        L_o = _A_o.dot(D_inv_sqrt_o).transpose().dot(D_inv_sqrt_o).tocoo()
        
        yield A_o, A_i, L_o, L_i
        
Ls = []
for A_o, A_i, L_o, L_i in get_lapl(tokenized[:10], word2id):
     Ls.append([A_o, A_i, L_o, L_i])

In [4]:
class WikiCorpusExtended(WikiCorpus):
    """
        Extension on the WikiCorpus from gensim
    
    """
    def __init__(self, args):
        super().__init__(args)
        
    def get_docidx(self):
        for doc in self.get_texts():
            doc = [word if isinstance(word, str) else str(word, 'utf-8') for word in doc]
            yield np.array([wiki.dictionary.token2id.get(word) for word in doc])
    
    def get_adj(self):
        for docidx in self.get_docidx():
            adj_i = np.vstack((docidx[:-1], docidx[1:]))
            adj_o = np.flip(np.vstack((docidx[:-1], docidx[1:])), axis=0)
            sp_adj_i = ss.coo_matrix((np.ones(adj_i.shape[1]), (adj_i[0, :], adj_i[1, :])), 
                                     (len(self.dictionary), len(self.dictionary)))
            sp_adj_o = ss.coo_matrix((np.ones(adj_o.shape[1]), (adj_o[0, :], adj_o[1, :])), 
                                     (len(self.dictionary), len(self.dictionary)))
            yield sp_adj_i, sp_adj_o
            
    def get_lapl(self, renorm_trick=True):
        for A_i, A_o in self.get_adj():
            if renorm_trick == True:
                A_i += ss.eye(A_i.shape[0])
                A_o += ss.eye(A_o.shape[0])
            D_inv_sqrt_i = ss.diags(np.power(np.array(A_i.sum(1)), -0.5).flatten())
            D_inv_sqrt_o = ss.diags(np.power(np.array(A_o.sum(1)), -0.5).flatten())
            L_i = A_i.dot(D_inv_sqrt_i).transpose().dot(D_inv_sqrt_i).tocoo()
            L_o = A_o.dot(D_inv_sqrt_o).transpose().dot(D_inv_sqrt_o).tocoo()
            yield L_i, L_o
            
            
# wiki = WikiCorpusExtended('process/enwiki-latest-pages-articles1.xml-p10p30302.bz2')


In [5]:
class Doc2Graph():
    """Convert tokenized document to weighted Adjacency matrix
    and graph Laplacian"""
    def __init__(self, doc, doc_id=-1):
        self.doc = doc
        self.doc_id = doc_id
        
    def doc2graph(self):
        g = self.load()
            
        if not g:
            As = self.get_As()
            Ls = self.get_Ls(As)
            g = As + Ls
            self.save(g)

        return g
    
    def get_As(self):
        """Get the weighted adjacency matrices of incoming
        and outcoming edges"""
        As = []
        e1 = np.vstack((self.doc[:-1], self.doc[1:])).T
        e2 = np.flip(e1, 1)
        for a in [e2, e1]:
            rc, cooc = np.unique(a, return_counts=True, axis=0)
            As.append(ss.coo_matrix((cooc, (rc[:,0], rc[:,1])), 
                                   tuple((np.max(a)+1, np.max(a)+1))))
        return As
    
    def get_Ls(self, As, renorm_trick=False):
        """Create graph Laplacians from adjacency matrices"""
        Ls = []
        for A in As:
            A = ss.coo_matrix(A)
            if renorm_trick:
                A_ = A + ss.eye(A.shape[0])
            D_inv_sqrt = ss.diags(np.power(np.array(A.sum(1)), -0.5).flatten())
            L = A.dot(D_inv_sqrt).transpose().dot(D_inv_sqrt).tocoo()
            Ls.append(L)
            
        return Ls    
    
    def save(self, g):
        """Save graph to folder for reuse"""
        print('save: implement me!')
        pass
    
    def load(self):
        print('load: implement me!')
        return None
            
def sparse_to_tuple(sparse_mx):
    if not ss.isspmatrix_coo(sparse_mx):
        sparse_mx = sparse_mx.tocoo()
    coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
    values = sparse_mx.data
    shape = sparse_mx.shape
    return coords, values, shape

def sp2tf(sp_t, shape=None):
    t = sparse_to_tuple(sp_t)
#     tensor = tf.SparseTensor(t[0],t[1].astype(np.float32),t[2])
    if shape is not None:
        t[2] == shape
    tensor = tf.SparseTensorValue(t[0],t[1].astype(np.float32),t[2])
    return tensor

def dropout_sparse(x, keep_prob, num_nonzero_elems):
    """Dropout for sparse tensors. Currently fails for very large sparse tensors (>1M elements)
    """
    noise_shape = [num_nonzero_elems]
    random_tensor = keep_prob
    random_tensor += tf.random_uniform(noise_shape)
    dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool)
    pre_out = tf.sparse_retain(x, dropout_mask)
    return pre_out * (1./keep_prob)

#### Geo-Vec

In [26]:
import random
import time

class GeoVec():
    def __init__(self, corpus=None, vocab_size=10, h_layers = [8, 4], 
                 act = tf.nn.relu, dropout=0.0, learning_rate = 1e-3):
        """Geo-Vec model as described in the report model section."""
        
        self.corpus = corpus
        self.vocab_size = vocab_size
        self.h_layers = h_layers
        self.act = act
        self.dropout = dropout
        self.learning_rate = learning_rate
        
        # use for plotting
        self._loss_vals, self._acc_vals = [], []
        
        #placeholders
        s = [self.vocab_size, self.vocab_size]
        self.placeholders = {
            'A_o': tf.sparse_placeholder(tf.float32),
            'L_o': tf.sparse_placeholder(tf.float32),
            'A_i': tf.sparse_placeholder(tf.float32),
            'L_i': tf.sparse_placeholder(tf.float32),
            'idx_i': tf.placeholder(tf.int64),
            'idx_o': tf.placeholder(tf.int64),
            'val_i': tf.placeholder(tf.float32),
            'val_o': tf.placeholder(tf.float32),
            'dropout': tf.placeholder_with_default(0., shape=())
        }
        
        # model
        self.aux_losses = None
        dummy = sp2tf(ss.eye(self.vocab_size))
        self.init_model(x=dummy)

        #optimizer
        self.init_optimizer()
        
        #sess
        self.trained = 0
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        
    def init_model(self, x, aux_tasks = None):
        """geo-vec model with variable number of gcn layers. Optional aux_taks
        param is now unimplemented to specify which tasks to add. All aux losses
        should be gathered in a self.aux_losses variable to gather later on."""
        for i, h_layer in enumerate(self.h_layers):
            if i == 0:
                h = self.gcn(x, self.vocab_size, self.h_layers[0], self.act, layer=i,sparse=True)  
            elif (i+1) < len(self.h_layers):
                h = self.gcn(h, self.h_layers[i-1], h_layer, self.act, layer=i, )
            else:
                self.emb_o, self.emb_i = self.gcn(h, self.h_layers[i-1], 
                                             h_layer, act=lambda x: x, layer=i,separate=True)
                
        # here we can left multiply the last layer h
        # and perform auxilliary tasks.
        posneg_samples_o = tf.gather(self.emb_o, tf.transpose(self.placeholders['idx_o']))
        posneg_samples_i = tf.gather(self.emb_i, tf.transpose(self.placeholders['idx_i']))
        
        self.recon_o = self.decode(posneg_samples_o)
        self.recon_i = self.decode(posneg_samples_i)
    
    def gcn(self, x, dim_in, dim_out, act, layer, sparse=False, separate=False):
        """basic graph convolution using a split up adjacency matrix.
        The separation param is to create the final embeddings to reconstruct."""
        w1 = tf.get_variable('w1_{}'.format(layer), shape=[dim_in, dim_out], 
                             initializer=tf.contrib.layers.xavier_initializer())
        w2 = tf.get_variable('w2_{}'.format(layer), shape=[dim_in, dim_out], 
                             initializer=tf.contrib.layers.xavier_initializer())

        if sparse:
            x1 = tf.sparse_tensor_dense_matmul(x, w1)
            x2 = tf.sparse_tensor_dense_matmul(x, w2)
        else:
            x1 = tf.matmul(x, w1)
            x2 = tf.matmul(x, w2)
            
        x1 = tf.sparse_tensor_dense_matmul(self.placeholders['L_o'], x1)
        x2 = tf.sparse_tensor_dense_matmul(self.placeholders['L_i'], x2)
        
        if separate:
            return self.act(x1), self.act(x2)
        
        return self.act(x1 + x2)
    
    def decode(self, x, cap = 1000):
        """simple innerproduct decoder with sigmoid activation to scale
        the edged between 0-1000 (assuming more co-occurances are unlikely)."""
#         print(x)
#         print(x.shape)
#         a_t = x
#         idx = tf.where(tf.not_equal(a_t, 0))
#         # Use tf.shape(a_t, out_type=tf.int64) instead of a_t.get_shape() if tensor shape is dynamic
#         x = tf.SparseTensor(idx, tf.gather_nd(a_t, idx), a_t.get_shape())
        
        x = tf.nn.dropout(x, 1-self.dropout)
        
#         zero = tf.constant(0, dtype=tf.float32)
#         A_rows = tf.sparse_reduce_sum(tf.sparse_add(self.placeholders['A_o'], sp2tf(-ss.eye(self.vocab_size))), 0)

#         where = tf.not_equal(A_rows, zero)
#         indices = tf.where(where)
#         x = tf.gather_nd(x, tf.transpose(indices))
        
        x = tf.reshape(tf.matmul(x, tf.transpose(x)), [-1])
        

        return tf.nn.relu(x)
        
    def init_optimizer(self):
        """initializes optimizer and computes loss + accuracy. The loss function
        is currently a MSE, due to the fact we are dealing with weighted edges.
        This does not seem ideal, and should be thought about."""
        labels_o = self.recon_o
        labels_i = self.recon_i
#         labels_o = tf.reshape(tf.sparse_tensor_to_dense(
#                                 tf.gather(self.placeholders['A_i'], tf.transpose(self.placeholders['idx_i'])),
#                                 validate_indices=False), [-1])
#         labels_i = tf.reshape(tf.sparse_tensor_to_dense(
#                                 tf.gather(self.placeholders['A_i'], tf.transpose(self.placeholders['idx_i'])),
#                                 validate_indices=False), [-1])
        
        emb_or = tf.gather(self.emb_o, self.placeholders['idx_o'][:, 0])
        emb_oc = tf.gather(self.emb_o, self.placeholders['idx_o'][:, 1])
    
        emb_ir = tf.gather(self.emb_i, self.placeholders['idx_i'][:, 0])
        emb_ic = tf.gather(self.emb_i, self.placeholders['idx_i'][:, 1])
        
        self.recon_o = tf.reduce_sum(tf.multiply(emb_or, emb_oc), 1)
        self.recon_i = tf.reduce_sum(tf.multiply(emb_ir, emb_ic), 1)
        
        loss_o = tf.losses.mean_squared_error(self.recon_o, self.placeholders['val_o'])
        loss_i = tf.losses.mean_squared_error(self.recon_i, self.placeholders['val_i']) 
        self.loss = loss_o + loss_i
        
        # gather aux losses and add to total loss
        if self.aux_losses:
            self.loss += self.aux_losses
        
        # optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.opt_op = optimizer.minimize(self.loss)

        cp_o = tf.equal(tf.cast(self.recon_o, tf.int32), tf.cast(self.placeholders['val_o'], tf.int32))
        cp_i = tf.equal(tf.cast(self.recon_i, tf.int32), tf.cast(self.placeholders['val_i'], tf.int32))
        correct_prediction = tf.concat([cp_o, cp_i], 0)
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    def get_feed_dict(self, A_o, A_i, L_o, L_i, idx_i, idx_o, val_o, val_i):
        feed_dict = {self.placeholders['A_o']: A_o,
                     self.placeholders['A_i']: A_i,
                     self.placeholders['L_o']: L_o,
                     self.placeholders['L_i']: L_i,
                     self.placeholders['idx_o']: idx_o,
                     self.placeholders['idx_i']: idx_i,
                     self.placeholders['val_o']: val_o,
                     self.placeholders['val_i']: val_i}
        return feed_dict
    
    def get_sample(self, batch_size=64, ratio=1.0):
        """get random sample from corpus graph cache"""
        dummy = random.choice(Ls).copy()
        
        pos_idx_o = np.random.choice(range(len(dummy[0].row)), batch_size)
        pos_idx_i = np.random.choice(range(len(dummy[1].row)), batch_size)
        
        idx_o = np.array(list(zip(dummy[0].row, dummy[0].col)))[pos_idx_o, :]
        idx_i = np.array(list(zip(dummy[1].row, dummy[1].col)))[pos_idx_i, :]
        val_o = dummy[0].data[pos_idx_o]
        val_i = dummy[1].data[pos_idx_i]
        
        for i, d in enumerate(dummy):
            dummy[i] = sp2tf(d)

        return dummy, idx_o, idx_i, val_o, val_i
    
    def train(self, num_epochs = 100, print_freq=50):
        """train op that can be invoked multiple times."""
        tf.set_random_seed(42)
        np.random.seed(42)

        for e in range(num_epochs):
            self.trained += 1
            (A_o, A_i, L_o, L_i), idx_o, idx_i, val_o, val_i = self.get_sample()
            
            feed_dict = self.get_feed_dict(A_o, A_i, L_o, L_i, idx_o, idx_i, val_o, val_i)
            
#             idx = np.random.choice(self.placeholders['A_o'].indices[:,0], size=(10,1))
#             idx = tf.multinomial(self.placeholders['A_o'].indices[:,0])
#             pos_idx = np.random.choice(idx)
#             x = tf.sparse_slice(self.placeholders['A_o'], self.placeholders['A_o'].indices[:,0], tf.ones(self.placeholders['A_o'].indices[:,0].shape[1]))
#             x = tf.gather(self.emb_o, idx_o)
#             o = self.sess.run([x], feed_dict=feed_dict)
            
            outs = self.sess.run([self.opt_op, self.loss, self.accuracy], feed_dict=feed_dict)
            avg_loss, avg_acc = outs[1], outs[2]
            self._loss_vals.append(avg_loss)
            self._acc_vals.append(avg_acc)
            
            print('\r epoch: %d/%d \t loss: %.3f \t avg_acc: %.3f' 
                      % (e+1, num_epochs, avg_loss, avg_acc), end='')
            if (e + 1) % print_freq == 0:
                print('')
        else:
            print('----> done training: {} epochs'.format(self.trained))
        
    def plot(self):
        """Plotting loss function"""
        plt.figure(figsize=(12, 6))
        plt.plot(self._loss_vals, color='red')
        plt.plot(self._acc_vals, color='blue')
        
        plt.legend(handles=[mpatches.Patch(color='red', label='loss'),
                            mpatches.Patch(color='blue', label='acc')],
                   bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()
        
    def get_reconstruction(self, doc = None):
        if doc:
            A_o, A_i, L_o, L_i = Doc2Graph(doc, doc_id).doc2graph()
        else:
            (A_o, A_i, L_o, L_i), idx_o, idx_i, val_o, val_i = self.get_sample()
#             A_o, A_i, L_o, L_i = self.get_sample()
            
        feed_dict = self.get_feed_dict(A_o, A_i, L_o, L_i, idx_o, idx_i, val_o, val_i)
#         feed_dict = self.get_feed_dict(A_o, A_i, L_o, L_i)
        recon_o, recon_i = self.sess.run([self.recon_o, self.recon_i], feed_dict=feed_dict)
        return A_o, A_i, recon_o, recon_i
    
    def get_embeddings(self, doc = None, doc_id = None):
        if doc:
            A_o, A_i, L_o, L_i = Doc2Graph(doc, doc_id).doc2graph()
        else:
            (A_o, A_i, L_o, L_i), idx_o, idx_i, val_o, val_i = self.get_sample()
#             A_o, A_i, L_o, L_i = self.get_sample()
            
        feed_dict = self.get_feed_dict(A_o, A_i, L_o, L_i, idx_o, idx_i, val_o, val_i)
        
#         feed_dict = self.get_feed_dict(A_o, A_i, L_o, L_i, )
        emb_o, emb_i = self.sess.run([self.emb_o, self.emb_i], feed_dict=feed_dict)
        return A_o, A_i, emb_o, emb_i

In [28]:
tf.reset_default_graph()
# fake_doc = np.asarray([1, 2, 3, 4, 1, 5, 6, 4, 1, 7, 9])
geo_vec_model = GeoVec(vocab_size=len(word2id), h_layers = [6, 4])
geo_vec_model.train(100, 10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


 epoch: 10/100 	 loss: 1.999 	 avg_acc: 0.000
 epoch: 20/100 	 loss: 1.998 	 avg_acc: 0.000
 epoch: 30/100 	 loss: 1.989 	 avg_acc: 0.000
 epoch: 40/100 	 loss: 1.986 	 avg_acc: 0.000
 epoch: 50/100 	 loss: 1.969 	 avg_acc: 0.000
 epoch: 60/100 	 loss: 1.910 	 avg_acc: 0.000
 epoch: 70/100 	 loss: 1.918 	 avg_acc: 0.000
 epoch: 80/100 	 loss: 1.789 	 avg_acc: 0.000
 epoch: 90/100 	 loss: 1.561 	 avg_acc: 0.000
 epoch: 100/100 	 loss: 1.591 	 avg_acc: 0.000
----> done training: 100 epochs


In [None]:
geo_vec_model.plot()

In [8]:
len(word2id)

59575

In [74]:
tf.__version__

'1.4.0'

In [29]:
geo_vec_model.get_reconstruction()

(SparseTensorValue(indices=array([[33169,  6109],
       [30987, 33169],
       [ 6415, 30987],
       [ 6088,  6415],
       [29294,  6088],
       [ 6028, 29294],
       [22762,  6028],
       [37201, 22762],
       [12533, 37201],
       [ 7861, 12533],
       [34136,  7861],
       [32907, 34136],
       [16842, 32907],
       [54644, 16842],
       [48010, 54644],
       [46525, 48010],
       [41315, 46525],
       [57342, 41315],
       [54662, 57342],
       [50502, 54662],
       [55186, 50502],
       [50502, 55186],
       [48019, 50502],
       [41194, 48019],
       [56296, 41194],
       [38428, 56296],
       [59071, 38428],
       [57552, 59071],
       [37790, 57552],
       [38397, 37790],
       [37880, 38397],
       [50600, 37880],
       [49698, 50600],
       [55186, 49698],
       [57552, 55186],
       [31434, 57552],
       [22762, 31434],
       [35308, 22762],
       [50502, 35308],
       [20057, 50502],
       [32850, 20057],
       [45539, 32850],
       