In [61]:
from matplotlib.pyplot import delaxes
import torch
import random
import os
import pandas as pd
import numpy as np
import torch.nn as nn
import time
import scipy.sparse as sparse

class HyperG:
    """Hypergraph described by an incidence matrix, optional node features and hyperedge weights.

    Derived matrices (degree matrices, their inverses, Theta and the Laplacian)
    are computed lazily on first access and cached on the instance. The
    ``update_*`` methods reset the caches that depend on the changed input.
    """

    def __init__(self, H, X=None, w=None):
        """Store the incidence matrix, node feature matrix and hyperedge weight vector.

        :param H: scipy sparse matrix of shape (n_nodes, n_edges)
        :param X: numpy array of shape (n_nodes, n_features) (optional)
        :param w: array-like of shape (n_edges,) (optional, defaults to all ones)
        """
        assert sparse.issparse(H)
        assert H.ndim == 2

        self._H = H
        self._n_nodes = self._H.shape[0]
        self._n_edges = self._H.shape[1]

        if X is not None:
            assert isinstance(X, np.ndarray) and X.ndim == 2
            self._X = X
        else:
            self._X = None

        if w is not None:
            # np.asarray lets callers pass a plain list as well as an ndarray.
            self.w = np.asarray(w).reshape(-1)
            assert self.w.shape[0] == self._n_edges
        else:
            self.w = np.ones(self._n_edges)

        # Lazily computed caches; None means "not computed yet".
        self._DE = None
        self._DV = None
        self._INVDE = None
        self._DV2 = None
        self._THETA = None
        self._L = None

    def num_edges(self):
        """Return the number of hyperedges (columns of H)."""
        return self._n_edges

    def num_nodes(self):
        """Return the number of nodes (rows of H)."""
        return self._n_nodes

    def incident_matrix(self):
        """Return the incidence matrix H as it was supplied."""
        return self._H

    def hyperedge_weights(self):
        """Return the 1-D hyperedge weight vector."""
        return self.w

    def node_features(self):
        """Return the node feature matrix, or None if none was supplied."""
        return self._X

    def node_degrees(self):
        """Return the diagonal node degree matrix DV, with diagonal H @ w."""
        if self._DV is None:
            H = self._H.tocsr()
            dv = H.dot(self.w.reshape(-1, 1)).reshape(-1)
            self._DV = sparse.diags(dv, shape=(self._n_nodes, self._n_nodes))
        return self._DV

    def edge_degrees(self):
        """Return the diagonal hyperedge degree matrix DE, with diagonal = column sums of H."""
        if self._DE is None:
            H = self._H.tocsr()
            # np.asarray handles both the np.matrix returned by sparse matrices
            # and the ndarray returned by sparse arrays (which has no ``.A``).
            de = np.asarray(H.sum(axis=0)).reshape(-1)
            self._DE = sparse.diags(de, shape=(self._n_edges, self._n_edges))
        return self._DE

    def inv_edge_degrees(self):
        """Return the diagonal matrix DE^-1.

        NOTE: a hyperedge with degree 0 produces ``inf`` on the diagonal
        (``np.power(0., -1.)``); no epsilon is added here, unlike in
        ``inv_square_node_degrees``.
        """
        if self._INVDE is None:
            self.edge_degrees()
            inv_de = np.power(self._DE.data.reshape(-1), -1.)
            self._INVDE = sparse.diags(inv_de, shape=(self._n_edges, self._n_edges))
        return self._INVDE

    def inv_square_node_degrees(self):
        """Return the diagonal matrix (DV + 1e-6)^-1/2."""
        if self._DV2 is None:
            self.node_degrees()
            # The 1e-6 offset keeps isolated nodes (degree 0) finite.
            dv2 = np.power(self._DV.data.reshape(-1)+1e-6, -0.5)
            self._DV2 = sparse.diags(dv2, shape=(self._n_nodes, self._n_nodes))
        return self._DV2

    def theta_matrix(self):
        """Return Theta = DV^-1/2 · H · W · DE^-1 · H^T · DV^-1/2."""
        if self._THETA is None:
            self.inv_square_node_degrees()
            self.inv_edge_degrees()

            W = sparse.diags(self.w)
            self._THETA = self._DV2.dot(self._H).dot(W).dot(self._INVDE).dot(self._H.T).dot(self._DV2)

        return self._THETA

    def laplacian(self):
        """Return the hypergraph Laplacian L = I - Theta."""
        if self._L is None:
            self.theta_matrix()
            self._L = sparse.eye(self._n_nodes) - self._THETA
        return self._L

    def update_hyedge_weights(self, w):
        """Replace the hyperedge weights and reset every cache that depends on them.

        :param w: numpy array or list with exactly n_edges entries
        :raises AssertionError: if ``w`` has the wrong type or length; in that
            case the previously stored weights are left untouched.
        """
        assert isinstance(w, (np.ndarray, list)), \
            "The hyperedge array should be a numpy.ndarray or list"

        # Validate the converted array (a list has no ``.shape``) and only
        # commit it once the length check has passed.
        new_w = np.array(w).reshape(-1)
        assert new_w.shape[0] == self._n_edges
        self.w = new_w

        # DE and DE^-1 depend only on H, so they stay cached.
        self._DV = None
        self._DV2 = None
        self._THETA = None
        self._L = None

    def update_incident_matrix(self, H):
        """Replace the incidence matrix (same shape required) and reset all caches.

        :param H: scipy sparse matrix of shape (n_nodes, n_edges)
        """
        assert sparse.issparse(H)
        assert H.ndim == 2
        assert H.shape[0] == self._n_nodes
        assert H.shape[1] == self._n_edges

        # TODO: reset hyperedge weights?

        self._H = H
        self._DE = None
        self._DV = None
        self._INVDE = None
        self._DV2 = None
        self._THETA = None
        self._L = None

def gen_attribute_hg(n_nodes, attr_dict, X=None):
    """Build an attribute hypergraph: one hyperedge per attribute value.

    :param n_nodes: int, number of nodes (rows of the incidence matrix)
    :param attr_dict: dict mapping an attribute to the zero-based indices of the
        nodes that carry it, e.g. {'attri_1': [node_idx_1, node_idx_2, ...], 'attri_2': [...]}
    :param X: numpy array, shape = (n_samples, n_features) (optional)
    :return: instance of HyperG
    """
    if X is not None:
        assert n_nodes == X.shape[0]

    rows = []
    cols = []
    # Hyperedge ids follow the iteration order of attr_dict.
    for edge_id, members in enumerate(attr_dict.values()):
        ordered = sorted(members)
        rows += ordered
        cols += [edge_id] * len(ordered)

    rows = np.asarray(rows)
    cols = np.asarray(cols)
    ones = np.ones(rows.shape[0])

    incidence = sparse.coo_matrix(
        (ones, (rows, cols)),
        shape=(n_nodes, len(attr_dict)),
    )
    return HyperG(incidence, X=X)


class Classifier(nn.Module):
    """Single linear layer followed by a log-softmax over the last dimension."""

    def __init__(self, n_hid, n_out):
        """
        :param n_hid: size of the input features
        :param n_out: number of output classes
        """
        super().__init__()
        self.n_hid, self.n_out = n_hid, n_out
        self.linear = nn.Linear(n_hid, n_out)

    def forward(self, x):
        """Project ``x``, drop every size-1 dimension, and return log-probabilities."""
        logits = self.linear(x).squeeze()
        return torch.log_softmax(logits, dim=-1)

    def __repr__(self):
        return f'{type(self).__name__}(n_hid={self.n_hid}, n_out={self.n_out})'


def set_random_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and switch PyTorch to deterministic mode.

    :param seed: int seed applied to ``random``, ``numpy.random`` and ``torch``
        (CPU and all CUDA devices).

    Side effects: sets the ``PYTHONHASHSEED`` environment variable, sets
    ``CUBLAS_WORKSPACE_CONFIG`` if it is not already set, disables cuDNN and
    enables ``torch.use_deterministic_algorithms``.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Each generator is seeded once; manual_seed_all covers every CUDA device.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Only affects interpreters started after this point; the hash seed of the
    # running process is fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # os.environ['CUDA_LAUNCH_BLOCKING'] = str(1)
    # use_deterministic_algorithms(True) makes cuBLAS operations raise on CUDA
    # unless this variable is set; setdefault keeps a user-provided value.
    os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8')

    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms(True)

# def construct_attr_dict():

In [62]:
import numpy as np
import json
# Financial statement matrix (loaded here, not used further in this cell).
fin = np.load('data/financial_statement.npy')
idx = [1, 0]

# Index <-> code mapping dictionaries.
file = 'data/mapper_dicts.json'
with open(file, "r") as f:
    maps = json.load(f)
map1 = maps['mapper_idx2code']

# Every node id that appears on either end of an investment relation.
invest_cc = np.load('data/relation_invest_cc.npy')
s1, s2 = zip(*invest_cc)
s = set(s1) | set(s2)

# Count the ids at or above 5321.
cnt = 0
for v in s:
    cnt += int(v >= 5321)



In [63]:
# ### 划分数据集

# from torch.utils.data import Dataset, DataLoader
# import torch

# class MyDataSet(Dataset):
#         def __init__(self, loaded_data):
#             self.data = loaded_data['data']
#             self.labels = loaded_data['labels']
    
#         def __len__(self):
#             return len(self.data)
        
#         def __getitem__(self, idx):
#             data = self.data[idx]
#             label = self.labels[idx]
#             return data,label

# data_mat = np.load('data/features_100.npy') #经过离散化处理
# labels = np.load('data/risk_label.npy')

# def load_data():

#     num_has_fin_comp = 5321

#     data1 = MyDataSet({'data':data_mat[:num_has_fin_comp],'labels':labels[:num_has_fin_comp]})
#     data2 = MyDataSet({'data':data_mat[num_has_fin_comp:],'labels':labels[num_has_fin_comp:]})


#     train_dataset1, validate_dataset1, test_dataset1 = torch.utils.data.random_split(data1, [0.7, 0.2, 0.1])
#     train_dataset2, validate_dataset2, test_dataset2 = torch.utils.data.random_split(data2, [0.7, 0.2, 0.1])

#     train_data1 = DataLoader(train_dataset1)
#     train_data2 = DataLoader(train_dataset2)

#     validate_data1 = DataLoader(validate_dataset1)
#     validate_data2 = DataLoader(validate_dataset2)

#     test_data1 = DataLoader(test_dataset1)
#     test_data2 = DataLoader(test_dataset2)

#     train_data_list = [iter(train_data1),iter(train_data2)]
#     validate_data_list = [iter(validate_data1),iter(validate_data2)]
#     test_data_list = [iter(test_data1),iter(test_data2)]

#     return train_data_list, validate_data_list, test_data_list

In [2]:
import dgl
import torch as th
import numpy as np

def load_hete_graph():
    """Build the listed-company heterogeneous graph and attach node features.

    Reads three relation files (invest / provide / sale) and two feature files
    from ``data/listed_comp/``, creates a DGL heterograph with a single node
    type ``'company'`` and three edge types, and stores the concatenated
    features under ``g.nodes['company'].data['feature']``.

    Each relation file is unpacked with ``zip(*rel)``, so it is presumably an
    array of (src, dst) pairs — confirm against the data files.

    :return: tuple ``(g, feats, dict_node_feats)`` where ``feats`` is the
        concatenated feature tensor and ``dict_node_feats`` is ``{'enterprise': feats}``.
    """
    rel_bc2bc = np.load('data/listed_comp/relation_invest_bc2bc.npy')
    rel_provide_bc2bc = np.load('data/listed_comp/relation_provide_bc2bc.npy')
    rel_sale_bc2bc = np.load('data/listed_comp/relation_sale_bc2bc.npy')

    feature_fin = np.load('data/listed_comp/features_norm_bc.npy')
    feature_basic = np.load('data/listed_comp/features_basic_bc.npy')
    # fin_feature_ttl = np.load('data/features_100.npy')

    src, tgt = zip(*rel_bc2bc)
    src_p, tgt_p = zip(*rel_provide_bc2bc)
    src_s, tgt_s = zip(*rel_sale_bc2bc)

    graph_data = {('company', 'invest_bc2bc', 'company'): (src, tgt),
                  ('company', 'provide_bc2bc', 'company'): (src_p, tgt_p),
                  ('company', 'sale_bc2bc', 'company'): (src_s, tgt_s)}

    g = dgl.heterograph(graph_data)

    num = g.num_nodes()

    # Keep only the first `num` feature rows so they line up with the graph's nodes.
    feats_fin = th.tensor(feature_fin[:num])
    feats_bsc = th.tensor(feature_basic[:num])
    feats = th.cat([feats_fin, feats_bsc], 1)

    g.nodes['company'].data['feature'] = feats

    # Torch-native NaN check (the label printed means "null values:").
    has_nan = bool(th.isnan(feats).any())
    print("空值：", has_nan)

    # NOTE(review): the key is 'enterprise' while the graph's node type is
    # 'company'; kept as-is because callers may rely on this key — confirm.
    dict_node_feats = {'enterprise': feats}

    return g, feats, dict_node_feats


In [7]:


from sklearn.model_selection import train_test_split
import torch.utils.data as D

#   train_idx = train_data.indices
def split_data(train_frac=0.6, val_frac=0.2, seed=None):
    """Randomly split the node feature tensor into train / validation / test subsets.

    The graph and features come from ``load_hete_graph()``. Only the features
    are split; labels are not part of the returned subsets. Each returned
    subset exposes the selected node indices as ``.indices``.

    :param train_frac: fraction of nodes for training (default 0.6)
    :param val_frac: fraction of nodes for validation (default 0.2); the
        remaining nodes form the test set
    :param seed: optional int; when given, the split uses a dedicated
        ``torch.Generator`` seeded with it, otherwise the global torch RNG is used
    :return: ``(train_data, val_data, test_data)`` as ``torch.utils.data.Subset`` objects
    """
    g, feats, _ = load_hete_graph()
    num_nodes = g.num_nodes()

    train_size = int(num_nodes * train_frac)
    val_size = int(num_nodes * val_frac)
    test_size = num_nodes - train_size - val_size
    assert test_size >= 0, "train_frac + val_frac must not exceed 1"
    lengths = [train_size, val_size, test_size]

    if seed is None:
        train_data, val_data, test_data = D.random_split(feats, lengths)
    else:
        rng = th.Generator().manual_seed(seed)
        train_data, val_data, test_data = D.random_split(feats, lengths, generator=rng)
    return train_data, val_data, test_data


In [11]:
# dict形式： { 'industry': {'K01':[0,1,2], ... } }
import json

def load_hyper_graph():
    """Read ``data/dicts_hyper.json`` and return the parsed object.

    The file is expected to look like ``{'industry': {'K01': [0, 1, 2], ...}}``.
    """
    with open('data/dicts_hyper.json', "r") as handle:
        return json.load(handle)

# def load_train_hyper_graph(train_data):
#     hyper_graph = load_hyper_graph()
#     train_idx = train_data.indices
#     dicts_industry = hyper_graph['industry']
#     dicts_train = { _key :[] for _key in dicts_industry}
#     for idx in train_idx:
#         for key in dicts_industry:
#             value = dicts_industry[key]
#             if idx in value:
#                 dicts_train[key].append(idx)
#     return dicts_train

def load_sub_hyper_graph(hyper_graph_data, hyper_graph=None):
    """Restrict the 'industry' hyperedges to the nodes of one data split.

    :param hyper_graph_data: object exposing ``.indices``, the node indices of
        the split (e.g. a subset returned by ``split_data``)
    :param hyper_graph: optional pre-loaded hypergraph dict of the form
        ``{'industry': {'K01': [0, 1, 2], ...}}``; when omitted it is read with
        ``load_hyper_graph()``
    :return: dict mapping every industry key to the list of split indices that
        belong to it, in the order they appear in ``.indices``; industries
        with no node in the split map to an empty list
    """
    if hyper_graph is None:
        hyper_graph = load_hyper_graph()

    # Normalise to plain ints so set membership matches the JSON-loaded ints.
    split_idx = [int(i) for i in hyper_graph_data.indices]
    dicts_industry = hyper_graph['industry']

    dicts_sub_hyper_graph = {}
    for key, members in dicts_industry.items():
        # Set lookup replaces a linear scan of the member list per index.
        member_set = set(members)
        dicts_sub_hyper_graph[key] = [i for i in split_idx if i in member_set]
    return dicts_sub_hyper_graph

# def get_sub_hete_graph(train_data):
    


    
# Split the node features into train / validation / test subsets.
train_data, val_data,test_data = split_data()

# Industry hyperedges restricted to the node indices of the training subset.
sub_hyp_graph = load_sub_hyper_graph(train_data)



In [1]:
# Requires the cell that defines load_hete_graph() to have been executed first;
# the recorded output below is the NameError raised when it had not been.
g, feats, dict_node_feats = load_hete_graph()


NameError: name 'load_hete_graph' is not defined

In [9]:

# relation1 = g.edge_type_subgraph([''])

In [38]:
from collections import defaultdict
import scipy.sparse as sp
def sparse_to_adjlist(sp_matrix):
    """
    Convert a square sparse matrix into an undirected adjacency list with self loops.

    :param sp_matrix: the sparse matrix
    :return: defaultdict(set) mapping each node to the set of its neighbours
    """
    n_nodes = sp_matrix.shape[0]
    print(n_nodes)

    # Adding the identity gives every node a self loop.
    with_loops = sp_matrix + sp.eye(n_nodes)

    adj_lists = defaultdict(set)
    rows, cols = with_loops.nonzero()
    for u, v in zip(rows, cols):
        # Record the edge in both directions.
        adj_lists[u].add(v)
        adj_lists[v].add(u)

    return adj_lists

In [None]:
def converse_sp_to_adlist(adj1, shape=None):
    """Convert a sparse adjacency object into an adjacency list.

    :param adj1: sparse adjacency exposing ``indices()``, ``val`` and ``shape``
        (presumably the matrix returned by DGL's ``adjacency_matrix()`` — confirm)
    :param shape: optional (n, n) shape of the intermediate scipy matrix;
        defaults to ``adj1.shape`` instead of a hard-coded node count. It must
        be square because ``sparse_to_adjlist`` adds an identity matrix.
    :return: the adjacency list built by ``sparse_to_adjlist``
    """
    if shape is None:
        shape = tuple(adj1.shape)

    indices1 = np.array(adj1.indices())
    values1 = np.array(adj1.val)
    aj1 = sp.coo_matrix((values1, (indices1[0], indices1[1])), shape=shape)
    return sparse_to_adjlist(aj1)

In [42]:
import dgl
import scipy.sparse as sp


# Load the saved listed-company graph with its node features and labels.
g = dgl.load_graphs('data/lst_comps.dgl')[0][0]
feat_data = g.ndata['feature']
labels = g.ndata['label']

# One subgraph per relation type.
relation1, relation2, relation3 = (
    g.edge_type_subgraph([etype])
    for etype in ('invest_bc2bc', 'provide_bc2bc', 'sale_bc2bc')
)

# Sparse adjacency of each relation.
adj1, adj2, adj3 = (
    rel.adjacency_matrix() for rel in (relation1, relation2, relation3)
)

# Adjacency lists; the manual conversion that used to be inlined here is what
# converse_sp_to_adlist does.
ajlist1, ajlist2, ajlist3 = (
    converse_sp_to_adlist(adj) for adj in (adj1, adj2, adj3)
)



5317
5317
5317


In [5]:
import dgl

# Load the saved graph and report the share of positive (label == 1) samples.
g = dgl.load_graphs('data/lst_comps.dgl')[0][0]
feat_data = g.ndata['feature']
labels = g.ndata['label']

n_fraud = len(labels[labels == 1])
n_total = len(labels)

# Printed labels: fraud sample count / total sample count / fraud ratio.
print(f'欺诈样本数量：{n_fraud}')
print(f'样本总数量：{n_total}')
print(f'欺诈样本比例：{n_fraud / n_total}')

欺诈样本数量：559
样本总数量：5317
欺诈样本比例：0.10513447432762836
