In [1]:
import os
import traceback
import openke
from openke.config import Trainer, Tester
from openke.module.model import TransH
from openke.module.loss import MarginLoss
from openke.module.strategy import NegativeSampling
from openke.data import TrainDataLoader, TestDataLoader
# Restrict the CUDA devices visible to this kernel to the one with index "2".
# NOTE(review): this is set after the openke imports above — presumably still
# before any CUDA initialisation happens; confirm if the wrong GPU gets used.
os.environ['CUDA_VISIBLE_DEVICES'] = "2"

# DIM=50

In [2]:
# --- Experiment configuration ------------------------------------------------
# Embedding model and vector size; DIM is the knob to tune (50, 100 or 150).
MODEL_NAME = 'transH'
DIM = 150

# Input directory. Note the trailing slash: file names are appended to it
# directly by the code that reads from it.
INPUT_PATH = './data/3_openKE/synthesize/'

# Output locations: the embedding directory and the checkpoint directory below it.
OUTPUT_EMBEDDING_PATH = './data/4_embedding/synthesize'
OUTPUT_MODEL_PATH = OUTPUT_EMBEDDING_PATH + '/model'

# File name of the exported embedding, e.g. "transH_150.vec.json".
EMBEDDING_NAME = '{}_{}.vec.json'.format(MODEL_NAME, DIM)

In [3]:
# Training data loader, built over the files in INPUT_PATH.
# NOTE(review): the meaning of the sampling options (bern_flag, filter_flag,
# neg_ent, neg_rel, ...) is defined by OpenKE's TrainDataLoader — confirm
# against its documentation before tuning them.
print('Preparing train_dataloader...')
train_dataloader = TrainDataLoader(
    in_path=INPUT_PATH,
    nbatches=500,
    threads=os.cpu_count(),  # one worker thread per reported CPU
    sampling_mode="normal",
    bern_flag=1,
    filter_flag=1,
    neg_ent=25,
    neg_rel=0,
)

Preparing train_dataloader...


In [4]:
# Create the checkpoint directory *before* the multi-hour training run, so
# that save_checkpoint below cannot fail afterwards on a missing directory.
os.makedirs(OUTPUT_MODEL_PATH, exist_ok=True)

# define the model
print('Defining the model...')
transh = TransH(
	ent_tot = train_dataloader.get_ent_tot(),
	rel_tot = train_dataloader.get_rel_tot(),
	dim = DIM, 
	p_norm = 1, 
	norm_flag = True)

# wrap the model with a margin loss for negative-sampling training
model_h = NegativeSampling(
	model = transh, 
	loss = MarginLoss(margin = 5.0),
	batch_size = train_dataloader.get_batch_size())

# train transh (500 passes on the GPU), then persist the trained weights
trainer = Trainer(model = model_h, data_loader = train_dataloader, train_times = 500, alpha = 1.0, use_gpu = True)
trainer.run()
transh.save_checkpoint(f'{OUTPUT_MODEL_PATH}/{MODEL_NAME}_{DIM}.ckpt')
print('Finish training the model')

Defining the model...
Finish initializing...


Epoch 499 | loss: 0.496723: 100%|██████████| 500/500 [8:14:03<00:00, 59.29s/it]  


Finish training the model


In [5]:
# save the embeddings
print('Saving embedding...')
# Make sure the target directory exists so the export cannot fail on a
# missing path after training has already completed.
os.makedirs(OUTPUT_EMBEDDING_PATH, exist_ok=True)
transh.save_parameters(f"{OUTPUT_EMBEDDING_PATH}/{EMBEDDING_NAME}")

Saving embedding...


### Loss and Time
##### dim = 100
Epoch 499 | loss: 1.202724: 100%|██████████| 500/500 [6:15:39<00:00, 45.08s/it]  
##### dim = 150
Epoch 499 | loss: 0.496723: 100%|██████████| 500/500 [8:14:03<00:00, 59.29s/it]  


### Testing

In [6]:
import json
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [7]:
def get_entity_labels():
    """Read the entity type labels from ``entity2id_objType.txt`` under INPUT_PATH.

    The first line of the file is skipped. Every following non-blank line
    must consist of exactly two tab-separated fields; the first field is
    kept as the label and the second one is ignored.

    Returns:
        list[str]: one label per entity line, in file order.

    Raises:
        ValueError: if a non-blank line does not contain exactly two
            tab-separated fields.
    """
    labels = []
    with open(f'{INPUT_PATH}entity2id_objType.txt') as f:
        next(f, None)  # skip the first line (also safe on an empty file)
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no entity; skip
            # them instead of failing the two-field unpacking below.
            if not line.strip():
                continue
            dst_type, _entity_id = line.split('\t')
            labels.append(dst_type)
    return labels

def dimension_reduce(data):
    """Project entity and relation embeddings onto two PCA components each.

    The shapes of both matrices are printed before and after the reduction.

    Args:
        data: mapping with the keys 'ent_embeddings.weight' and
            'rel_embeddings.weight'; each value is converted with np.array
            (presumably a nested list of floats, one row per item — confirm
            against the caller).

    Returns:
        dict: keys 'ent_embeddings' and 'rel_embeddings', each a NumPy array
        holding the two-component projection of the corresponding matrix.
    """
    entity_matrix = np.array(data['ent_embeddings.weight'])
    relation_matrix = np.array(data['rel_embeddings.weight'])
    for matrix in (entity_matrix, relation_matrix):
        print(matrix.shape)

    # A single estimator is reused; fit_transform is called separately on
    # each matrix, entities first and relations second.
    reducer = PCA(n_components=2)
    projected = {
        'ent_embeddings': reducer.fit_transform(entity_matrix),
        'rel_embeddings': reducer.fit_transform(relation_matrix),
    }
    for matrix in projected.values():
        print(matrix.shape)

    # The returned arrays are rebuilt from plain Python lists.
    return {key: np.array(matrix.tolist()) for key, matrix in projected.items()}

def draw_ent_embedding(X, labels, model, dim):
    """Scatter-plot 2-D entity embeddings, coloured by entity type, in two panels.

    Both panels show the same points. The left panel is clipped to the
    1st-99th percentile of each coordinate, the right panel to the
    0th-99.999th percentile. Each panel gets its own legend and axis labels,
    and the figure gets one shared title.

    Args:
        X: 2-D NumPy array; columns 0 and 1 are used as plot coordinates.
        labels: sequence with one type label per row of X. Only the types
            'process', 'file', 'network' and 'registry' are drawn.
        model: model name shown in the figure title.
        dim: embedding dimension shown in the figure title.
    """
    colors = {'process': 'red', 'file': 'green', 'network': 'blue', 'registry': 'purple'}
    markers = {'process': '+', 'file': 'x', 'network': '1', 'registry': '2'}
    print_cat = ['process', 'file', 'network', 'registry']
    x1 = X[:, 0]
    x2 = X[:, 1]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))
    for cat in print_cat:
        # Boolean row mask selecting the entities of this type.
        mask = np.array([label == cat for label in labels], dtype=bool)
        legend_label = f'{cat}({int(mask.sum())})'
        for ax in (ax1, ax2):
            ax.scatter(x1[mask], x2[mask], color=colors[cat], marker=markers[cat], label=legend_label)

    # Left panel: zoomed view without the extreme 1% on either side.
    ax1.set_xlim([np.percentile(x1, 1), np.percentile(x1, 99)])
    ax1.set_ylim([np.percentile(x2, 1), np.percentile(x2, 99)])
    # Right panel: (almost) the full range of the data.
    ax2.set_xlim([np.percentile(x1, 0), np.percentile(x1, 99.999)])
    ax2.set_ylim([np.percentile(x2, 0), np.percentile(x2, 99.999)])

    # Decorate every panel explicitly: the pyplot-level title/legend/xlabel/
    # ylabel calls only act on the current (last created) axes.
    for ax in (ax1, ax2):
        ax.set_xlabel('pca1')
        ax.set_ylabel('pca2')
        ax.legend()
    fig.suptitle(f'Entity Embedding({model}, dim={dim})')

    # Lay out last, once all titles and labels exist.
    fig.tight_layout()
    plt.show()
def draw_rel_embedding(X, y, model, dim):
    """Scatter-plot 2-D relation embeddings, grouped by category and annotated.

    Every point is annotated with its relation name from ``y``.

    This function reads two names that are neither parameters nor locals and
    are therefore looked up as globals at call time:
      * ``label_type`` - indexed with each relation name to get its category;
      * ``colors`` - indexed with each category, including 'other'.
    NOTE(review): neither is defined in the cells visible here (the ``colors``
    dict inside draw_ent_embedding is local and has no 'other' entry) -
    confirm that they are defined before this function is called.

    Args:
        X: 2-D NumPy array; columns 0 and 1 are used as plot coordinates.
        y: sequence of relation names, one per row of X.
        model: model name shown in the title.
        dim: embedding dimension shown in the title.
    """
    xs = X[:, 0]
    ys = X[:, 1]
    fig, ax = plt.subplots(figsize=(10, 10))

    # One scatter call per category so that each gets its own legend entry.
    for category in ('process', 'file', 'network', 'registry', 'other'):
        selected = [label_type[rel] == category for rel in y]
        ax.scatter(
            xs[selected],
            ys[selected],
            color=colors[category],
            label=f'{category}({sum(selected)})',
        )

    # Write the relation name next to its point.
    for rel_name, px, py in zip(y, xs, ys):
        ax.annotate(rel_name, (px, py))

    plt.title(f'Relation Embedding({model}, dim={dim})')
    plt.legend()
    plt.xlabel('pca1')
    plt.ylabel('pca2')
    plt.show()