In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

### Load Data


`torvalds/linux` 같은 리포지토리와 관련된 활동을 한 유저들만 우선 추출하고, 그들의 활동 내역을 Knowledge Graph 형식으로 구성한 데이터입니다. 이 Knowledge Graph 데이터를 이용해 Graph Embedding이 유효하게 동작하는지 확인해 보겠습니다.

In [2]:
def load_github_kg_dataset(name='linux'):
    """Load one GitHub knowledge graph as a DataFrame of triples.

    The playground HDF5 file is obtained through Keras ``get_file`` and
    currently holds three graphs, selected by ``name``:
    ``linux``, ``tensorflow`` or ``vim``.

    The returned frame uses the triple column names ``subject`` (renamed from
    ``actor_id``), ``relation`` (renamed from ``type``) and ``object``
    (renamed from ``repo_name``).
    """
    from tensorflow.keras.utils import get_file

    h5_path = get_file(
        "github-playground.h5",
        "https://storage.googleapis.com/github-playground/playground.h5")

    events = pd.read_hdf(h5_path, key=name)
    type_lookup = pd.read_hdf(h5_path, key='type')['type'].to_dict()
    repositories = pd.read_hdf(h5_path, key='repository')

    # Translate the stored `type` values through the lookup table.
    events['type'] = events['type'].map(type_lookup)

    # No explicit key is given: the merge joins on the columns both frames share.
    merged = pd.merge(events, repositories)

    triple_columns = {
        "actor_id": 'subject',
        "type": 'relation',
        "repo_name": "object",
    }
    return merged.rename(columns=triple_columns)

In [3]:
# 'tensorflow' and 'vim' are also available as dataset names.
df = load_github_kg_dataset(name='linux')

***caution***: 아래와 같은 에러가 발생하면, 링크의 안내에 따라 수정해 주세요.

* [ValueError: cannot set WRITEABLE flag to True of this array](https://github.com/pandas-dev/pandas/issues/24839)

## 데이터 파이프라인 구축

----

#### 불필요한 Event 제거하기

In [4]:
def trim_relations(df, event_types):
    """Return the rows of ``df`` whose ``relation`` value is in ``event_types``."""
    keep_mask = df["relation"].isin(event_types)
    return df.loc[keep_mask]

In [5]:
# Relation (event) types that are kept for embedding training.
event_types = ('WatchEvent','PushEvent','IssuesEvent')

df = trim_relations(df, event_types)

#### K-Core Sampling 수행하기

In [6]:
def kcore_sampling(df, k_core=5, max_iter=100,
                   sub_col_name='subject', obj_col_name='object'):
    """Repeatedly drop rows whose subject or object is too rare.

    On every pass, rows are kept only if their subject value and their object
    value each occur in at least ``k_core`` rows of the current frame.
    Removing rows can push other values below the threshold, so the filter is
    re-applied until the row count stops changing.

    Args:
        df: DataFrame holding one triple per row.
        k_core: minimum number of rows a subject/object must appear in.
        max_iter: upper bound on the number of filtering passes. If it is
            reached before the row count stabilises, the frame filtered so
            far is returned as-is.
        sub_col_name: name of the subject column.
        obj_col_name: name of the object column.

    Returns:
        The filtered DataFrame (the input frame is not modified).

    Raises:
        ValueError: if the frame is empty at the start of a pass.
    """
    for _ in range(max_iter):
        prev_counts = len(df)
        if prev_counts == 0:
            raise ValueError("No data remains")

        sub_counts = df[sub_col_name].value_counts()
        obj_counts = df[obj_col_name].value_counts()
        frequent_subs = sub_counts[sub_counts >= k_core].index
        frequent_objs = obj_counts[obj_counts >= k_core].index

        df = df[df[sub_col_name].isin(frequent_subs)
                & df[obj_col_name].isin(frequent_objs)]

        if len(df) == prev_counts:
            # Nothing was removed in this pass: the frame is stable.
            return df

    return df

In [7]:
# Keep only subjects/objects that occur in at least 5 rows; `df` is rebound to the filtered frame.
df = kcore_sampling(df, k_core=5)

#### Negative Sampling 함수

> Both datasets contain only positive triplets. As in Bordes et al, we generated negatives using the local closed world assumption. That is, for a triple, we randomly change either the subject or the object at random, to form a negative example. ***This negative sampling is performed at runtime for each batch of training positive examples.***

배치 단위로 Negative Sampling하는 것이 핵심

- Complex Embeddings for Simple Link Prediction(paper)


In [9]:
import logging
from sklearn.utils import shuffle
# Passed as the second argument of Dataset.map(...) by the pipeline builder below.
AUTOTUNE = tf.data.experimental.AUTOTUNE

class GraphDataPipelineBuilder:
    """Turn a (subject, relation, object) DataFrame into tf.data pipelines.

    On construction the builder
      * assigns an integer id to every distinct subject/object value
        (``node2id``, ``node_index``, ``num_nodes``),
      * assigns an integer id to every distinct relation value
        (``edge2id``, ``edge_index``, ``num_edges``),
      * stores the id-encoded, shuffled triples in ``self.dataset``.

    ``create_complex_dataset`` and ``create_transe_dataset`` add shuffling,
    batching and per-batch negative sampling on top of that dataset.
    """
    logger = logging.getLogger("reco_training_service")

    # Names advertised by dir(builder). ``__dir__`` has to be callable:
    # binding a plain list to it makes dir(instance) raise
    # "TypeError: 'list' object is not callable".
    _public_api = ("create_complex_dataset", "create_transe_dataset")

    def __dir__(self):
        """Return the names of the dataset factory methods."""
        return list(self._public_api)

    def __init__(self,
                 graph_dataframe:pd.DataFrame,
                 sub_col_name='subject',
                 rel_col_name='relation',
                 obj_col_name='object'):
        """Index nodes and relations of ``graph_dataframe`` and build the base dataset.

        Args:
            graph_dataframe: one triple per row.
            sub_col_name: column holding the subject of each triple.
            rel_col_name: column holding the relation of each triple.
            obj_col_name: column holding the object of each triple.

        Raises:
            AssertionError: if one of the three columns is missing.
        """
        self.df = graph_dataframe
        for col_name in (sub_col_name, rel_col_name, obj_col_name):
            assert col_name in self.df.columns, (
                "column %r not found in graph_dataframe" % (col_name,))

        self._initialize_node_index(sub_col_name, obj_col_name)
        self._initialize_edge_index(rel_col_name)
        self._initialize_dataset(sub_col_name, rel_col_name, obj_col_name)

    def _initialize_node_index(self, subject_col_name, object_col_name):
        """Build ``node2id`` / ``node_index`` over the union of subjects and objects."""
        self.logger.info("start to initialize node index")
        # Materialise the set once so that both lookup directions are derived
        # from the same ordering. Ids follow the set's iteration order.
        nodes = list(set(self.df[subject_col_name].unique())
                     | set(self.df[object_col_name].unique()))

        self.node2id = {node: i for i, node in enumerate(nodes)}
        self.num_nodes = len(nodes)

        # One-column frame: row i holds the node whose id is i.
        self.node_index = pd.Series(dict(enumerate(nodes))).to_frame()

    def _initialize_edge_index(self, relation_col_name):
        """Build ``edge2id`` / ``edge_index`` over the distinct relation values."""
        self.logger.info("start to initialize edge index")
        edges = list(set(self.df[relation_col_name].unique()))

        self.edge2id = {edge: i for i, edge in enumerate(edges)}
        self.num_edges = len(edges)

        # One-column frame: row i holds the relation whose id is i.
        self.edge_index = pd.Series(dict(enumerate(edges))).to_frame()

    def _initialize_dataset(self,
                            sub_col_name,
                            rel_col_name,
                            obj_col_name):
        """Encode the triples as ids, shuffle them once and wrap them in a Dataset."""
        self.logger.info("start to initialize dataset")
        subs = self.df[sub_col_name].map(self.node2id).values
        rels = self.df[rel_col_name].map(self.edge2id).values
        objs = self.df[obj_col_name].map(self.node2id).values

        # The three arrays are shuffled together so that rows stay aligned.
        subs, rels, objs = shuffle(subs, rels, objs)
        self.dataset = tf.data.Dataset.from_tensor_slices({
                "subject":subs, "object":objs, "relation":rels})

    def create_complex_dataset(self, batch_size, num_neg):
        """Return batches of (inputs, labels) for the complEx model.

        Each batch of ``batch_size`` positive triples is extended with
        ``num_neg`` negatives per positive by ``complEx_negative_sampler``.
        """
        return (self.dataset
                .shuffle(batch_size*1000)
                .batch(batch_size)
                .map(complEx_negative_sampler(num_neg), AUTOTUNE))

    def create_transe_dataset(self, batch_size):
        """Return batches of positive/negative triple dicts for the transE model."""
        return (self.dataset
                .shuffle(batch_size*1000)
                .batch(batch_size)
                .map(transE_negative_sampler, AUTOTUNE))
    

def complEx_negative_sampler(num_negs):
    """Edge negative sampling strategy for the complEx model.

    params :
        * num_negs: number of negative samples generated per positive sample

    Returns a function that maps a batched triplet dict
    (keys 'relation', 'subject', 'object') to ``(inputs, labels)``: the
    positive batch followed by ``num_negs`` corrupted copies of it, with
    label 1 for the positives and 0 for the corrupted entries.
    """
    def sampler(triplet):
        pos_rel = triplet['relation']
        pos_sub = triplet['subject']
        pos_obj = triplet['object']

        # Repeat the whole batch num_negs times, then corrupt either the
        # subject or the object of every repeated triple.
        repeats = [num_negs]
        neg_rel = tf.tile(pos_rel, repeats)
        neg_sub, neg_obj = corrupt_head_or_tail(
            tf.tile(pos_sub, repeats), tf.tile(pos_obj, repeats))

        inputs = {
            'relation': tf.concat([pos_rel, neg_rel], axis=-1),
            'subject': tf.concat([pos_sub, neg_sub], axis=-1),
            'object': tf.concat([pos_obj, neg_obj], axis=-1),
        }
        labels = tf.concat(
            [tf.ones_like(pos_rel), tf.zeros_like(neg_rel)], axis=-1)

        return inputs, labels
    return sampler
        
def transE_negative_sampler(triplets):
    """Edge negative sampling strategy for the transE model.

    Args:
        triplets: batched dict with the keys 'subject', 'object' and 'relation'.

    Returns:
        A dict with the positive pair (``pos_subject``, ``pos_object``), a
        corrupted pair of the same shape (``neg_subject``, ``neg_object``)
        and the shared ``relation``.
    """
    p_sub = triplets['subject']
    p_obj = triplets['object']
    rel = triplets['relation']

    # One negative per positive: either the subject or the object is replaced.
    n_sub, n_obj = corrupt_head_or_tail(p_sub, p_obj)
    return {"pos_subject": p_sub, "neg_subject": n_sub,
            "pos_object": p_obj, "neg_object": n_obj,
            "relation": rel}
    
def corrupt_head_or_tail(heads, tails):
    """Corrupt either the head or the tail of every pair, each with 50% probability.

    Replacement values are taken from a random shuffle of the same tensor, so
    every corrupted head/tail is a value that occurs elsewhere in the batch.
    """
    # True  -> keep the head, replace the tail
    # False -> replace the head, keep the tail
    keep_head = tf.random.uniform(tf.shape(heads)) < 0.5

    shuffled_heads = tf.random.shuffle(heads)
    shuffled_tails = tf.random.shuffle(tails)

    neg_heads = tf.where(keep_head, heads, shuffled_heads)
    neg_tails = tf.where(keep_head, shuffled_tails, tails)
    return neg_heads, neg_tails

In [10]:
# Index the nodes/relations of the filtered graph and wrap its triples in a tf.data.Dataset.
builder = GraphDataPipelineBuilder(df)

### Construct Model

#### `Score Function`

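$$
\begin{aligned}
\phi(r,s,o;\Theta) &= \mathrm{Re}(\langle w_r, e_s, \bar e_o \rangle) \\
&= \mathrm{Re}\left(\sum_{k=1}^{K} w_{rk}\, e_{sk}\, \bar e_{ok}\right) \\
&= \langle \mathrm{Re}(w_r), \mathrm{Re}(e_s), \mathrm{Re}(e_o) \rangle \\
&\quad + \langle \mathrm{Re}(w_r), \mathrm{Im}(e_s), \mathrm{Im}(e_o) \rangle \\
&\quad + \langle \mathrm{Im}(w_r), \mathrm{Re}(e_s), \mathrm{Im}(e_o) \rangle \\
&\quad - \langle \mathrm{Im}(w_r), \mathrm{Im}(e_s), \mathrm{Re}(e_o) \rangle
\end{aligned}
$$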

In [11]:
from tensorflow.keras.layers import Layer

class ComplexDotScore(Layer):
    """complEx scoring function.

        - Based on the Hermitian (sesquilinear) dot product
        - score = Re(<relation, subject, conj(object)>)
        - Embedding layout (along the last axis)
           * first half  : real part
           * second half : imaginary part

    Args:
        l3_reg: weight of the optional penalty on the cubed absolute values
            of the three embeddings; 0 disables the penalty.
    """
    def __init__(self, l3_reg=0., **kwargs):
        super().__init__(**kwargs)
        self.l3_reg = l3_reg

    def call(self, inputs):
        """Score a batch of triples.

        Args:
            inputs: ``[relation, subject, object]`` embeddings, each with an
                even-sized last axis (it is split into two equal halves).

        Returns:
            The score of every triple, with the last axis summed away.
        """
        rel, sub, obj = inputs

        if self.l3_reg:
            # Sum of |x|^3 over axis 1, averaged over the batch.
            l3_loss = tf.reduce_mean(tf.reduce_sum(
                tf.abs(rel)**3 + tf.abs(sub)**3 + tf.abs(obj)**3, axis=1))
            self.add_loss(self.l3_reg * l3_loss)

        re_rel, im_rel = tf.split(rel, 2, axis=-1)
        re_sub, im_sub = tf.split(sub, 2, axis=-1)
        re_obj, im_obj = tf.split(obj, 2, axis=-1)
        # Real part of sum_k w_k * s_k * conj(o_k), expanded into real/imag terms.
        return tf.reduce_sum(  re_rel * re_sub * re_obj
                             + re_rel * im_sub * im_obj
                             + im_rel * re_sub * im_obj
                             - im_rel * im_sub * re_obj, axis=-1)

    def get_config(self):
        """Include ``l3_reg`` so the layer can be re-created from its config."""
        config = super().get_config()
        config.update({"l3_reg": self.l3_reg})
        return config

In [12]:
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model

def build_complex_model(num_nodes, num_edges, embed_size=50, l3_reg=1e-3):
    """Build the complEx Keras model.

    Args:
        num_nodes: number of rows of the node (subject/object) embedding table.
        num_edges: number of rows of the relation embedding table.
        embed_size: width of every embedding vector. It must be even because
            the scoring layer splits each vector into a real and an
            imaginary half.
        l3_reg: regularisation weight forwarded to ``ComplexDotScore``.

    Returns:
        A Keras ``Model`` named 'complEx' that takes a dict with the keys
        'subject', 'object' and 'relation' and outputs one raw (unsquashed)
        score per triple.

    Raises:
        ValueError: if ``embed_size`` is odd.
    """
    if embed_size % 2:
        raise ValueError(
            "embed_size must be even (real half + imaginary half), got %r"
            % (embed_size,))

    sub_inputs = Input(shape=(), name='subject')
    obj_inputs = Input(shape=(), name='object')
    rel_inputs = Input(shape=(), name='relation')

    node_embed_layer = Embedding(input_dim=num_nodes,
                                 output_dim=embed_size,
                                 embeddings_initializer=GlorotUniform(),
                                 name='node_embed_layer')
    edge_embed_layer = Embedding(input_dim=num_edges,
                                 output_dim=embed_size,
                                 embeddings_initializer=GlorotUniform(),
                                 name='edge_embed_layer')

    # Subjects and objects share one embedding table.
    sub_embed = node_embed_layer(sub_inputs)
    obj_embed = node_embed_layer(obj_inputs)
    rel_embed = edge_embed_layer(rel_inputs)

    outputs = ComplexDotScore(l3_reg=l3_reg)([rel_embed, sub_embed, obj_embed])

    model = Model({
        "subject":sub_inputs, "object":obj_inputs, "relation":rel_inputs},
        outputs, name='complEx')
    return model
    
# Size the embedding tables from the indexes built by the pipeline builder.
model = build_complex_model(num_nodes=builder.num_nodes,
                            num_edges=builder.num_edges,
                            embed_size=50)

#### 모델 구성

> Models were trained using Stochastic Gradient Descent with mini-batches and AdaGrad for tuning the learning rate, by minimizing the negative log-likelihood of the logistic model with l2 regularization on the parameters theta of the considered model.


하지만 많은 구현체에서는 BinaryCrossEntropy 대신 Softplus를 보다 선호하는 듯

In [13]:
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras import metrics
from tensorflow.keras.optimizers import Adagrad

# Learning rate handed to the Adagrad optimizer.
lr = 5e-1

# from_logits=True: the model outputs the raw complEx score (no sigmoid is applied in the model).
loss = BinaryCrossentropy(from_logits=True,
                          reduction='sum')
# The same loss object is also passed as a metric.
model.compile(optimizer=Adagrad(lr), loss=loss, metrics=[loss])

### train model

In [None]:
from tensorflow.keras.callbacks import TensorBoard

num_epochs = 100
num_negs = 20      # negatives generated per positive triple in every batch
batch_size = 4096  # number of positive triples per batch (before negatives are appended)

# Each batch holds num_negs negatives per positive, so negatives (label 0) are
# weighted by 1/num_negs while positives (label 1) keep weight 1.
model.fit(builder.create_complex_dataset(batch_size, num_negs),
          epochs=num_epochs, class_weight={1:1., 0:1/num_negs})

### Infer Model

In [26]:
# Trained node embedding matrix: one row per node id.
node_embed = model.get_layer('node_embed_layer').get_weights()[0]
# Row-wise L2 norm, kept as a column vector so the division broadcasts per row.
l2_norm = np.linalg.norm(node_embed, ord=2, axis=1)[:, None]
node_normalized = node_embed / l2_norm
# Use the unit-length rows so that dot products between rows are cosine
# similarities (the normalised matrix was previously computed but never used).
node_df = pd.DataFrame(node_normalized)

In [None]:
# Label every embedding row with the (stringified) node it belongs to.
node_df.index = builder.node_index.values[:,0].astype(str)
# Keep rows whose label contains '/' — presumably the repository nodes ("owner/name"); verify against the data.
repository_df = node_df[node_df.index.str.contains('/')]

In [None]:
# Top 20 rows of repository_df by dot product with the 'torvalds/linux' embedding.
(
    repository_df
    .dot(repository_df.loc['torvalds/linux'])
    .sort_values(ascending=False)
    .iloc[:20]
)