In [1]:
import os
import dgl
import json
import pickle
import numpy as np

import torch
from scipy import io as sio
from scipy.sparse import coo_matrix
from dgl.sampling import sample_neighbors
from torch.utils.data import TensorDataset, Dataset
from sklearn.model_selection import train_test_split

Using backend: pytorch


In [2]:
config_path = './'

data_config = {
    'data_path': os.path.join(config_path, 'data'),
    'dataset': 'ACM', # ACM, DBLP, IMDB, AIFB
    'data_name': 'ACM.mat', # ACM.mat, DBLP.mat, IMDB.mat, AIFB.mat
    'primary_type': 'p', # p, a, m, Personen            # primary node type
    'task': ['CF', 'CL'],
    'K_length': 4, # Context path length K
    'resample': False, # Whether resample the training and testing dataset
    'random_seed': 123,
    'test_ratio': 0.8
}

model_config = {
    'primary_type': data_config['primary_type'],
    'auxiliary_embedding': 'non_linear',  # auxiliary embedding generating method: non_linear, linear, emb
    'K_length': data_config['K_length'],
    'embedding_dim': 128,
    'in_dim': 128,
    'out_dim': 128,
    'num_heads': 8,
    'merge': 'linear',  # Multi head Attention merge method: linear, mean, stack
    'g_agg_type': 'mean',  # Graph representation encoder: mean, sum
    'drop_out': 0.3,
    'cgnn_non_linear': True,  # Enable non linear activation function for CGNN
    'multi_attn_linear': False,  # Enable atten K/Q-linear for each type
    'graph_attention': True,
    'kq_linear_out_dim': 128,
    'path_attention': False,  # Enable Context path attention
    'c_linear_out_dim': 8,
    'enable_bilinear': False,  # Enable Bilinear for context attention
    'gru': True,
    'add_init': False
}

train_config = {
    'continue': False,
    'lr': 0.05,
    'l2': 0,
    'factor': 0.2,
    'total_epoch': 10000000,
    'batch_size': 1024 * 20,
    'pos_num_for_each_hop': [20, 20, 20, 20, 20, 20, 20, 20, 20],
    'neg_num_for_each_hop': [3, 3, 3, 3, 3, 3, 3, 3, 3],
    'sample_workers': 8,
    'patience': 15,
    'checkpoint_path': os.path.join(config_path, 'checkpoint', data_config['dataset'])
}

evaluate_config = {
    'method': 'LR',
    'save_heat_map': True,
    'result_path': os.path.join('result', data_config['dataset']),
    'random_state': 123,
    'max_iter': 500,
    'n_jobs': 1,
}

In [3]:
class EdgesDataset(Dataset):
    def __init__(self, hg, pos_sample_num, neg_sample_num):
        self.hg = hg
        self.pos_sample_num = pos_sample_num
        self.neg_sample_num = neg_sample_num

    def __len__(self):
        return self.hg.number_of_nodes()

    def __getitem__(self, idx):
        pos_graph = sample_neighbors(self.hg, [idx], self.pos_sample_num, edge_dir='out')
        pos_src, pos_dst = pos_graph.edges()
        neg_src = pos_src.repeat(self.neg_sample_num)
        neg_dst = torch.randint(0, self.hg.number_of_nodes(), neg_src.shape, dtype=torch.long)
        return pos_src, pos_dst, neg_src, neg_dst

    @staticmethod
    def collate(batches):
        pos_src = []
        pos_dst = []
        neg_src = []
        neg_dst = []
        for item in batches:
            pos_src.append(item[0])
            pos_dst.append(item[1])
            neg_src.append(item[2])
            neg_dst.append(item[3])
        return torch.cat(pos_src), torch.cat(pos_dst), torch.cat(neg_src), torch.cat(neg_dst)

class GraphDataLoader(object):
    def __init__(self, data_config, remove_self_loop):
        self.data_config = data_config
        self.base_data_path = os.path.join(data_config['data_path'], data_config['dataset'])
        self.data_path = os.path.join(data_config['data_path'], data_config['dataset'], data_config['data_name'])
        self.train_data_path = os.path.join(data_config['data_path'], data_config['dataset'], 'train_data')
        if not os.path.exists(self.train_data_path):
            os.mkdir((self.train_data_path))
        self.k_hop_graph_path = os.path.join(self.train_data_path, 'graph')
        if not os.path.exists(self.k_hop_graph_path):
            os.mkdir(self.k_hop_graph_path)

    def load_raw_matrix(self):
        raise NotImplementedError("Not Implement load_raw_matrix")

    def load_k_hop_train_data(self):
        raise NotImplementedError("Not Implement load_k_hop_train_data method")

    def load_classification_data(self):
        raise NotImplementedError("Not Implement load_classification_data method")

    def load_links_prediction_data(self):
        raise NotImplementedError("Not Implement load_links_prediction_data method")

    def _load_k_hop_graph(self, hg, k, primary_type):
        '''
        Return k hop neighbors graph
        :param hg: DGLHeteroGraph
        :param k: hop neighbors
        :param primary_type: primary_type
        :return: k-hop graph of primary type graph, DGLHeteroGraph
        '''
        print('Process: {} hop graph'.format(k))
        k_hop_graph_path = os.path.join(self.k_hop_graph_path,
                                        '{}_{}_hop_graph.pkl'.format(primary_type, k))
        if not os.path.exists(k_hop_graph_path):
            ntype = hg.ntypes
            primary_type_id = ntype.index(primary_type)
            homo_g = dgl.to_homo(hg)
            p_nodes_id = homo_g.filter_nodes(
                lambda nodes: (nodes.data['_TYPE'] == primary_type_id))  # Find the primary nodes ID
            min_p = torch.min(p_nodes_id).item()
            max_p = torch.max(p_nodes_id).item()
            raw_adj = homo_g.adjacency_matrix()  # It is a square matrix
            # Speed up with torch
            raw_adj = raw_adj.to_dense().float()
            adj_k = torch.matrix_power(raw_adj, k)  # K-hop neighbors
            p_adj = adj_k[min_p:max_p, min_p:max_p].cpu()  # Get primary sub graph
            row, col = torch.nonzero(p_adj, as_tuple=True)
            p_g = dgl.graph((row, col))
            with open(k_hop_graph_path, 'wb') as f:
                pickle.dump(p_g, f, protocol=4)
        else:
            with open(k_hop_graph_path, 'rb') as f:
                p_g = pickle.load(f)
        return p_g

    def load_train_k_context_edges(self, hg, K, primary_type, pos_num_for_each_hop, neg_num_for_each_hop):
        edges_data_dict = {}
        for k in range(1, K + 2):
            k_hop_primary_graph = self._load_k_hop_graph(hg, k, primary_type)
            k_hop_edge = EdgesDataset(k_hop_primary_graph, pos_num_for_each_hop[k], neg_num_for_each_hop[k])
            edges_data_dict[k] = k_hop_edge
        return edges_data_dict

class ACMDataLoader(GraphDataLoader):
    def __init__(self, data_config, remote_self_loop):
        super(ACMDataLoader, self).__init__(data_config, remote_self_loop)

        self.heter_graph, self.raw_matrix = self.load_raw_matrix()

    def load_raw_matrix(self):
        data = sio.loadmat(self.data_path)
        '''
        ['__header__', '__version__', '__globals__', 'TvsP', 'PvsA', 'PvsV', 'AvsF', 'VvsC', 'PvsL', 'PvsC', 'A', 'C', 'F', 'L', 'P', 'T', 'V', 'PvsT', 'CNormPvsA', 'RNormPvsA', 'CNormPvsC', 'RNormPvsC', 'CNormPvsT', 'RNormPvsT', 'CNormPvsV', 'RNormPvsV', 'CNormVvsC', 'RNormVvsC', 'CNormAvsF', 'RNormAvsF', 'CNormPvsL', 'RNormPvsL', 'stopwords', 'nPvsT', 'nT', 'CNormnPvsT', 'RNormnPvsT', 'nnPvsT', 'nnT', 'CNormnnPvsT', 'RNormnnPvsT', 'PvsP', 'CNormPvsP', 'RNormPvsP']
        P: Paper
        A：Author
        F: Facility
        C: Conference
        L: Subject
        '''

        # EDGES
        p_vs_l = data['PvsL']  # paper-Subject
        p_vs_p = data['PvsP']  # paper-paper
        p_vs_a = data['PvsA']  # paper-author
        a_vs_f = data['AvsF']  # author-facility

        # src_type, edge_type, dst_type
        hg = dgl.heterograph({
            ('p', 'pa', 'a'): p_vs_a,
            ('a', 'ap', 'p'): p_vs_a.transpose(),
            ('p', 'pp', 'p'): p_vs_p,  # P cite P
            ('p', 'ps', 's'): p_vs_l,
            ('s', 'sp', 'p'): p_vs_l.transpose(),
            ('a', 'af', 'f'): a_vs_f,
            ('f', 'fa', 'a'): a_vs_f.transpose(),
        })

        return hg, data

In [4]:
dataloader = ACMDataLoader(data_config, remote_self_loop=False)
hg = dataloader.heter_graph

In [5]:
edges_data_dict = dataloader.load_train_k_context_edges(hg, 
                                                        data_config['K_length'],
                                                        data_config['primary_type'],
                                                        train_config['pos_num_for_each_hop'],
                                                        train_config['neg_num_for_each_hop']
                                                        )

Process: 1 hop graph
Process: 2 hop graph
Process: 3 hop graph
Process: 4 hop graph




RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 4046740996 bytes.