In [1]:
import mxnet as mx
import mxnet.gluon
from mxnet import nd, autograd
import scipy.sparse as sp
import networkx as nx
from mxnet.gluon import Block

  from numpy.testing.decorators import setastest


In [2]:
from mxnet import init

In [4]:
from graph_utils import *

Constructing homo graphs....
The homo graph for users.....
The homo graph for items.....


In [5]:
def sparse_to_tuple(sparse_mx):
    '''
    Convert a sparse matrix (or a list of them) to tuple representation.

    Each matrix becomes ``(coords, values, shape)`` where ``coords`` is an
    (nnz, 2) array of (row, col) pairs, ``values`` the matching non-zero
    entries and ``shape`` the matrix shape.

    When a list is passed, its entries are replaced in place and the same
    list object is returned.
    '''
    def _as_triplet(matrix):
        # Reuse the matrix when it is already COO, otherwise convert it.
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        coords = np.vstack((coo.row, coo.col)).transpose()
        return coords, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return _as_triplet(sparse_mx)

    for idx, matrix in enumerate(sparse_mx):
        sparse_mx[idx] = _as_triplet(matrix)
    return sparse_mx

def normalize_adj(adj):
    '''
    Symmetrically normalize adjacency matrix.
    norm_adj = D^{-0.5}*adj*D^{-0.5}, where D is the diagonal matrix of the
    row sums of adj.

    adj: any 2-D input accepted by scipy.sparse.coo_matrix (sparse or dense).

    Rows whose sum is zero (isolated nodes) get a scaling factor of 0, so the
    corresponding rows and columns of the result are all zero.

    return the normalized adjacency matrix in coo format.
    '''
    adj = sp.coo_matrix(adj)
    degree = np.asarray(adj.sum(axis=1)).flatten()

    # 0 ** -0.5 evaluates to inf; that is expected for isolated nodes, so the
    # divide-by-zero warning is silenced and the inf entries are zeroed below.
    with np.errstate(divide='ignore'):
        d_inv_sqrt = np.power(degree, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

    # Multiply as D * adj * D. The earlier (adj * D)^T * D form gives the same
    # result for symmetric input but returns the transpose for directed graphs.
    return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo()

In [6]:
# Device used for every NDArray and parameter created below.
# NOTE(review): this requires a GPU; presumably mx.cpu() is the fallback — confirm.
ctx = mx.gpu()

In [7]:
G = nx.convert_node_labels_to_integers(gul.G) # the heter graph

In [8]:
G.number_of_nodes(), G.number_of_edges()

(7178, 17699)

In [9]:
H = nx.disjoint_union(gul.M, gul.N) # the homo graph

In [10]:
H.number_of_nodes(), H.number_of_edges()

(7178, 2920058)

In [11]:
adj = nx.adjacency_matrix(H)

In [12]:
adj_dd = normalize_adj(adj)

In [13]:
adj_d = adj_dd.tocsr()

In [14]:
adj_hat = nd.sparse.csr_matrix(adj_d, ctx=ctx,dtype=np.float32)

In [15]:
print(np.min(adj.data), np.max(adj.data))

1.0 18240.0


In [16]:
# Filter out the homogeneous-graph edges whose weight is at or below a threshold value.

In [None]:
# Copy to new_homo_M.txt only the rows of homo_M.txt whose weight (third
# space-separated column) is greater than 30.
# Both files live in one `with`, so they are closed even if a row fails to
# parse; the input is opened first so a missing input file does not truncate
# an existing output file.
with open('homo_M.txt') as fin, open('new_homo_M.txt', 'w') as fout:
    fout.write('user item weight neg1 neg2 neg3 neg4 neg5\n')

    # Skip the input's own header line (tolerating a completely empty file).
    next(fin, None)
    for line in fin:
        vec = line.strip().split(' ')
        rating = float(vec[2])
        if rating > 30:
            fout.write(line)

In [None]:
# Copy to new_homo_N.txt only the rows of homo_N.txt whose weight (third
# space-separated column) is greater than 5.
# Both files live in one `with`, so they are closed even if a row fails to
# parse; the input is opened first so a missing input file does not truncate
# an existing output file.
with open('homo_N.txt') as fin, open('new_homo_N.txt', 'w') as fout:
    fout.write('user item weight neg1 neg2 neg3 neg4 neg5\n')

    # Skip the input's own header line (tolerating a completely empty file).
    next(fin, None)
    for line in fin:
        vec = line.strip().split(' ')
        rating = float(vec[2])
        if rating > 5:
            fout.write(line)

# Construct the model

In [17]:
# Number of nodes in the homogeneous graph H; used as the row count of the embedding table.
num_nodes = H.number_of_nodes()
# Width of every node representation (columns of the embedding table and of weight1).
rep_size = 128

In [None]:
#embedding = nd.random.normal(loc=0, scale=0.01, shape = (num_nodes, rep_size), ctx=ctx)
#embedding_out = nd.random.normal(loc=0, scale=0.01, shape = (num_nodes, rep_size), ctx=ctx)

In [None]:
# Dropout is not applied here,
# since it is only a technique for avoiding overfitting.

In [18]:
from mxnet.gluon import nn

In [18]:
class graph_conv(Block):
    '''
    One graph-convolution layer that owns its input node embeddings.

    forward() takes no input and returns
    relu(adj_hat . (embedding . weight1)).

    The layer reads the notebook-level names `rep_size`, `num_nodes` and
    `adj_hat` when it is constructed, so they must be defined beforehand.
    '''
    def __init__(self,  **kwargs):
        super().__init__(**kwargs)
        # Registration order (weight1, then embedding) is kept on purpose so
        # that parameter initialisation order is unchanged.
        self.weight1 = self.params.get('weight1', shape=(rep_size, rep_size))
        self.adj_hat = adj_hat
        self.embedding = self.params.get('embedding', shape=(num_nodes, rep_size))

    def forward(self):
        # Linear projection of the embedding table ...
        projected = nd.dot(self.embedding.data(), self.weight1.data())
        # ... followed by propagation over the normalised adjacency matrix.
        propagated = nd.dot(self.adj_hat, projected)
        return nd.relu(propagated)

In [19]:
#net3 = nn.Sequential()
net1= graph_conv()
net1.initialize(init=init.Xavier(rnd_type='gaussian'), ctx=ctx)

In [20]:
net1()


[[0.         0.00428166 0.         ... 0.00135739 0.         0.        ]
 [0.         0.001183   0.         ... 0.00259045 0.00067428 0.        ]
 [0.00152674 0.         0.         ... 0.00459627 0.00042417 0.        ]
 ...
 [0.         0.         0.         ... 0.00663018 0.         0.00519713]
 [0.00135232 0.         0.         ... 0.         0.00808461 0.01524705]
 [0.00750169 0.         0.00338491 ... 0.         0.01123093 0.        ]]
<NDArray 7178x128 @gpu(0)>

In [21]:
net2 = graph_conv()
net2.initialize(init=init.Xavier(rnd_type='gaussian'), ctx=ctx)

In [22]:
net2()


[[0.         0.         0.00189501 ... 0.00376245 0.00096783 0.        ]
 [0.         0.0006559  0.00307435 ... 0.00286185 0.00133105 0.00092116]
 [0.         0.0027033  0.00283865 ... 0.         0.00079506 0.00429375]
 ...
 [0.         0.00679323 0.         ... 0.00387856 0.01485468 0.        ]
 [0.         0.         0.         ... 0.01082747 0.         0.        ]
 [0.01528536 0.         0.00116018 ... 0.         0.         0.        ]]
<NDArray 7178x128 @gpu(0)>

# Loading data

In [23]:
import pandas as pd

In [24]:
homo_n = pd.read_table('new_homo_N.txt', sep = ' ')

In [25]:
homo_m = pd.read_table('new_homo_M.txt', sep = ' ')

In [26]:
heter_u = pd.read_table('heter_u.txt', sep = ' ')

In [27]:
heter_v = pd.read_table('heter_i.txt', sep = ' ')

In [28]:
size_n = homo_n.shape[0]
size_m = homo_m.shape[0]
size_u = heter_u.shape[0]
size_v = heter_v.shape[0]


In [29]:
# Map every node label to an integer index: the entries of gul.node_u come first,
# followed by the entries of gul.node_v.
# NOTE(review): these indices are later used as row numbers into the network output,
# so they presumably match the node order of H / adj_hat — confirm.
all_nodes = gul.node_u + gul.node_v
look_up = dict(zip(all_nodes, np.arange(len(all_nodes))))

In [30]:
def data_iter(df, batch_size = 300):
    '''
    Yield shuffled mini-batches of node indices from an edge table.

    df: DataFrame whose column 0 holds head nodes, column 1 tail nodes and
        columns 3 onwards negative samples; every label must be a key of
        the global `look_up`.
    batch_size: number of rows per batch. Rows left over after the last
        full batch are dropped.

    Yields (h_ind, t_ind, neg_ind): h_ind and t_ind are lists of one-element
    lists, neg_ind is a list with one list of indices per row.
    '''
    shuffled = df.sample(frac=1).reset_index(drop=True)

    n_batches = len(shuffled) // batch_size
    for batch_no in range(n_batches):
        start = batch_no * batch_size
        chunk = shuffled.iloc[start:start + batch_size]

        heads = chunk.iloc[:, 0].values
        tails = chunk.iloc[:, 1].values
        negatives = chunk.iloc[:, 3:].values

        h_ind = [[look_up[node]] for node in heads]
        t_ind = [[look_up[node]] for node in tails]
        neg_ind = [[look_up[node] for node in row] for row in negatives]

        yield h_ind, t_ind, neg_ind

# Defining loss

In [31]:
def compute_first_loss(h,t,neg):
    '''
    Negative-sampling loss in which heads, tails and negatives are all looked
    up in the output of net1.

    h, t: lists of one-element index lists, as yielded by data_iter.
    neg: list with one list of negative-sample indices per row.

    Returns the summed loss
    -sum(log sigmoid(h.t)) - sum(log sigmoid(-h.neg)) as an NDArray.
    '''
    res_emb = net1()

    h_vec = res_emb.take(nd.array(h, ctx=ctx))
    t_vec = res_emb.take(nd.array(t, ctx=ctx))
    neg_vec = res_emb.take(nd.array(neg, ctx=ctx))

    pos_score = nd.sum(h_vec*t_vec, axis=2)
    neg_score = nd.sum(h_vec*neg_vec, axis=2)

    # -log(sigmoid(x)) equals softplus(-x) = log(1 + exp(-x)). The softplus
    # form ('softrelu') avoids log(0) = -inf, which log(sigmoid(x)) produces
    # once the sigmoid underflows for scores of large magnitude.
    loss_pos = nd.sum(nd.Activation(-pos_score, act_type='softrelu'))
    loss_neg = nd.sum(nd.Activation(neg_score, act_type='softrelu'))

    loss = loss_pos + loss_neg

    return loss

In [32]:
def compute_second_loss(h,t,neg):
    '''
    Negative-sampling loss in which heads are looked up in the output of net1
    while tails and negatives are looked up in the output of net2.

    h, t: lists of one-element index lists, as yielded by data_iter.
    neg: list with one list of negative-sample indices per row.

    Returns the summed loss
    -sum(log sigmoid(h.t)) - sum(log sigmoid(-h.neg)) as an NDArray.
    '''
    res_emb = net1()
    res_ctx = net2()

    h_vec = res_emb.take(nd.array(h, ctx=ctx))
    t_vec = res_ctx.take(nd.array(t, ctx=ctx))
    neg_vec = res_ctx.take(nd.array(neg, ctx=ctx))

    pos_score = nd.sum(h_vec*t_vec, axis=2)
    neg_score = nd.sum(h_vec*neg_vec, axis=2)

    # -log(sigmoid(x)) equals softplus(-x) = log(1 + exp(-x)). The softplus
    # form ('softrelu') avoids log(0) = -inf, which log(sigmoid(x)) produces
    # once the sigmoid underflows for scores of large magnitude.
    loss_pos = nd.sum(nd.Activation(-pos_score, act_type='softrelu'))
    loss_neg = nd.sum(nd.Activation(neg_score, act_type='softrelu'))

    loss = loss_pos + loss_neg

    return loss

# Defining optimizer

In [33]:
trainer1 = mx.gluon.Trainer(net1.collect_params(), 'adam', {'learning_rate': 0.015, 'wd': 0.0005})

In [34]:
trainer2 = mx.gluon.Trainer(net2.collect_params(), 'adam', {'learning_rate': 0.015, 'wd': 0.0005})

# Training model

In [35]:
num_epoch = 5

In [36]:
# Combined number of rows drawn per training step (one tenth of each of the four
# edge tables); this value is passed to trainer1.step / trainer2.step in the training loop.
batch_size = int(0.1*size_m) + int(0.1*size_n) +  int(0.1*size_u) + int(0.1*size_v)

In [37]:
for epoch in range(num_epoch):
    # Every table is cut into batches of one tenth of its rows, so an epoch
    # runs roughly ten steps; zip stops at the shortest of the four iterators.
    batch_m = data_iter(homo_m, int(0.1*size_m))
    batch_n = data_iter(homo_n, int(0.1*size_n))
    batch_u = data_iter(heter_u, int(0.1*size_u))
    batch_v = data_iter(heter_v, int(0.1*size_v))

    for (h1, t1, neg1), (h2, t2, neg2), (h3, t3, neg3), (h4, t4, neg4) in zip(
            batch_m, batch_n, batch_u, batch_v):
        with autograd.record():
            # First loss on the two heter_* tables, second loss on the two
            # homo_* tables.
            l1_u = compute_first_loss(h3, t3, neg3)
            l1_v = compute_first_loss(h4, t4, neg4)
            l2_m = compute_second_loss(h1, t1, neg1)
            l2_n = compute_second_loss(h2, t2, neg2)
            loss = l1_u + l1_v + l2_m + l2_n
        loss.backward()

        # One optimiser update per network.
        trainer1.step(batch_size)
        trainer2.step(batch_size)
    # Report the loss of the last step of the epoch.
    print('current loss: ', loss)

current loss:  
[30588.71]
<NDArray 1 @gpu(0)>
current loss:  
[30588.635]
<NDArray 1 @gpu(0)>
current loss:  
[30588.594]
<NDArray 1 @gpu(0)>
current loss:  
[30588.586]
<NDArray 1 @gpu(0)>
current loss:  
[30588.586]
<NDArray 1 @gpu(0)>


In [42]:
len(gul.node_u)

6001

In [43]:
embed = net1.embedding.data().asnumpy()

In [44]:
# save embeddings
def save_embeddings(file1, file2):
    '''
    Write net1's embedding parameter to two text files, one line per node.

    file1: output path for the nodes in gul.node_u (embedding rows
           0 .. len(gul.node_u) - 1).
    file2: output path for the nodes in gul.node_v (the rows that follow).

    Each line has the form "<node label> <v_0> <v_1> ...", space separated.
    '''
    # A single `with` guarantees both files are flushed and closed even when
    # fetching the embedding or writing a row raises.
    with open(file1, 'w') as fout_u, open(file2, 'w') as fout_v:
        num_u = len(gul.node_u)
        # NOTE(review): this saves the raw embedding table, not the output of
        # net1() — confirm that this is the intended representation.
        embedding = net1.embedding.data().asnumpy()

        for row, node in enumerate(gul.node_u):
            fout_u.write("{} {}\n".format(node, ' '.join(str(x) for x in embedding[row])))

        # Rows for gul.node_v start right after the gul.node_u rows.
        for offset, node in enumerate(gul.node_v):
            fout_v.write("{} {}\n".format(node, ' '.join(str(x) for x in embedding[num_u + offset])))

In [45]:
save_embeddings('u.txt', 'v.txt')

In [None]:
net1.embedding.data().asnumpy()