In [2]:
# SVM模型
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import torch as th
from sklearn.metrics import classification_report # 结果评估

data_mat = np.load('data/features_100.npy') #经过离散化处理
labels = np.load('data/risk_label.npy')

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_mat[:5321], labels[:5321], test_size = 0.2, random_state = 0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.svm import SVC
classifier = SVC(kernel = 'linear',probability=True, random_state = 0)
classifier.fit(X_train, y_train)

# 预测
y_pred = classifier.predict(X_test)
y_score = classifier.predict_proba(X_test)[:, 1]




In [3]:
print(classification_report(y_test,y_pred,digits=4))

              precision    recall  f1-score   support

           0     0.6927    0.9277    0.7932       678
           1     0.6879    0.2791    0.3971       387

    accuracy                         0.6920      1065
   macro avg     0.6903    0.6034    0.5951      1065
weighted avg     0.6910    0.6920    0.6492      1065



In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class MyDataSet(Dataset):
        def __init__(self, loaded_data):
            self.data = loaded_data['data']
            self.labels = loaded_data['labels']
    
        def __len__(self):
            return len(self.data)
        
        def __getitem__(self, idx):
            data = self.data[idx]
            label = self.labels[idx]
            return data,label

custom_data = MyDataSet({'data':data_mat,'labels':labels})


train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(custom_data, [0.5, 0.3, 0.2])

train_loader = DataLoader(train_dataset,shuffle=False)

In [None]:
# hypergnn + hetegnn

from utils import *
from models import HyperSTGNN
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import recall_score as rec
from sklearn.metrics import precision_score as pre
from sklearn.metrics import f1_score as f1
from sklearn.metrics import roc_auc_score as roc



n_epoch = 200
clip = 0.25

g, features, dict_node_feats = load_hete_graph()

labels_ttl = np.load('data/risk_label.npy')
num_nodes = g.num_nodes()
labels = torch.tensor(labels_ttl[:num_nodes])

input_dim = 49
output_dim = 20
total_company_num = g.num_nodes()
rel_num = 1
com_initial_emb = features

best_acc = 0
best_f1 =0 

device=torch.device("cpu")
# set_random_seed(14)

criterion = torch.nn.CrossEntropyLoss()

# todo:
train_data,val_data,test_data = split_data()

train_idx = train_data.indices
valid_idx = val_data.indices

train_hyp_graph = load_sub_hyper_graph(train_data)
val_hyp_graph = load_sub_hyper_graph(val_data)
test_hyp_graph = load_sub_hyper_graph(test_data)

def train():
    
    gnn = HyperSTGNN(input_dim,output_dim,
                     total_company_num,rel_num,
                     device,com_initial_emb,g,dict_node_feats,
                     num_heads=1,dropout=0.2,norm=True)

    classifier = Classifier(output_dim, 2).to(device)
    model = nn.Sequential(gnn, classifier)

    optimizer = torch.optim.Adam(model.parameters(),lr=0.01)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 20, eta_min=1e-6)

    ['industry', 'area', 'qualify']
    train_hyp=[]
    for i in ['industry']:
        train_hyp+=[gen_attribute_hg(total_company_num, train_hyp_graph[i], X=None)]
    valid_hyp=[]
    for i in ['industry']:
        valid_hyp+=[gen_attribute_hg(total_company_num, val_hyp_graph[i], X=None)]
    test_hyp=[]
    for i in ['industry']:
        test_hyp+=[gen_attribute_hg(total_company_num, test_hyp_graph[i], X=None)]


    for epoch in np.arange(n_epoch):

        st=time.time()

        '''
            Train 
        '''
        model.train()
        train_losses = []
        # torch.cuda.empty_cache()

        # TODO 
        company_emb=gnn.forward(g,dict_node_feats,train_hyp,train_idx)

        res = classifier.forward(company_emb)

        loss = criterion(res, torch.LongTensor(train_label))
        optimizer.zero_grad()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        train_losses += [loss.cpu().detach().tolist()]
        # train_step += 1
        scheduler.step()
        del res, loss

        '''
            Valid 
        '''
        model.eval()
        with torch.no_grad():
            company_emb=gnn.forward(g,dict_node_feats,valid_hyp,valid_idx)

            res = classifier.forward(company_emb)
            valid_label = labels[valid_idx]
            loss = criterion(res,torch.LongTensor(valid_label) )

            pred=res.argmax(dim=1)
            ac=acc(valid_label,pred)
            pr=pre(valid_label,pred)
            re=rec(valid_label,pred)
            f=f1(valid_label,pred)
            rc=roc(valid_label,res[:,1])

            if ac > best_acc and f>best_f1:
                best_acc = ac
                best_f1=f
                torch.save(model, './model_save/%s.pkl'%('best_model'))

                print('UPDATE!!!')


            et = time.time()
            print(("Epoch: %d (%.1fs)  LR: %.5f Train Loss: %.2f  Valid Loss: %.2f  Valid Acc: %.4f Valid Pre: %.4f  Valid Recall: %.4f Valid F1: %.4f  Valid Roc: %.4f"  ) % \
                (epoch, (et - st), optimizer.param_groups[0]['lr'], np.average(train_losses), \
                loss.cpu().detach().tolist(), ac,pr,re,f,rc))

            del res, loss

            if epoch+1==n_epoch:
                company_emb=gnn.forward(g,dict_node_feats,test_hyp,test_idx)
                # gnn.forward(g,dict_node_feats,valid_hyp,valid_idx)
                test_label = labels[test_idx]
                res = classifier.forward(company_emb)

                pred=res.argmax(dim=1)
                ac=acc(test_label,pred)
                pr=pre(test_label,pred)
                re=rec(test_label,pred)
                f=f1(test_label,pred)
                rc=roc(test_label,res[:,1])
                
                print('Last Test Acc: %.4f Last Test Pre: %.4f Last Test Recall: %.4f Last Test F1: %.4f Last Test ROC: %.4f' % (ac,pr,re,f,rc))


In [15]:


def split_data():
    g, feats, dict_node_features = load_hete_graph()
    labels_ttl = np.load('data/risk_label.npy')
    num_nodes = g.num_nodes()
    labels = torch.tensor(labels_ttl[:num_nodes])
    train_size = int(num_nodes * 0.6)
    val_size = int(num_nodes * 0.2)
    test_size = num_nodes - train_size - val_size
    train_data, val_data, test_data = torch.utils.data.random_split(feats,[train_size,val_size,test_size])
    return train_data, val_data, test_data

train_data,val_data,test_data = split_data()

def load_sub_hyper_graph(hyper_graph_data): # hyper_graph_data : dict
    hyper_graph = load_hyper_graph()
    train_idx = hyper_graph_data.indices
    dicts_industry = hyper_graph['industry']
    dicts_sub_hyper_graph = { _key :[] for _key in dicts_industry}
    for idx in train_idx:
        for key in dicts_industry:
            value = dicts_industry[key]
            if idx in value:
                dicts_sub_hyper_graph[key].append(idx)
    return dicts_sub_hyper_graph

train_idx = train_data.indices

train_hyp_graph = load_sub_hyper_graph(train_data)

In [16]:
# 随机森林 特征值重要性 评估
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import dgl as dl

def feature_importance(X, y):
    model = RandomForestClassifier()
    model.fit(X, y)
    print(model.estimators_)
    importances = model.feature_importances_
    return importances

# 数据
g = dl.load_graphs('data/lst_comps7.dgl')[0][0]
feats = g.nodes['company'].data['feature'] 
labels = np.load('data/listed_comp/labels_listed_comp.npy')

# 计算特征重要性
importances = feature_importance(feats, labels)
print("Feature Importances:", importances)
feats_sort_idx = np.argsort(importances)
print(feats_sort_idx)


[DecisionTreeClassifier(max_features='sqrt', random_state=1937093881), DecisionTreeClassifier(max_features='sqrt', random_state=558624822), DecisionTreeClassifier(max_features='sqrt', random_state=1651469672), DecisionTreeClassifier(max_features='sqrt', random_state=30501663), DecisionTreeClassifier(max_features='sqrt', random_state=961235350), DecisionTreeClassifier(max_features='sqrt', random_state=1329811137), DecisionTreeClassifier(max_features='sqrt', random_state=2047088086), DecisionTreeClassifier(max_features='sqrt', random_state=1550603074), DecisionTreeClassifier(max_features='sqrt', random_state=1805770369), DecisionTreeClassifier(max_features='sqrt', random_state=1635701129), DecisionTreeClassifier(max_features='sqrt', random_state=1584289669), DecisionTreeClassifier(max_features='sqrt', random_state=1859110834), DecisionTreeClassifier(max_features='sqrt', random_state=1042605351), DecisionTreeClassifier(max_features='sqrt', random_state=1359248650), DecisionTreeClassifier(

In [17]:
threshold = 0.02
x_selected = feats[:, importances > threshold]
x_selected.shape

torch.Size([5317, 26])