In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

### Load Data


이 데이터셋은 `torvalds/linux` 같은 리포지토리와 관련된 활동을 한 유저들을 먼저 뽑은 뒤, 그들의 활동 내역을 Knowledge Graph 형식으로 구성한 것입니다. 이 Knowledge Graph 데이터로 Graph Embedding이 유효하게 동작하는지 확인해 보겠습니다. 

In [3]:
def load_github_kg_dataset(name='linux'):
    """Load one of the prepared GitHub knowledge-graph tables.

    The HDF5 file is fetched through Keras' `get_file` and three tables are
    read from it: the table stored under `name`, the `type` table and the
    `repository` table.

    params
    * name : HDF5 key of the graph to load; choose [linux, tensorflow, vim]

    returns
    * DataFrame : the selected table, with its `type` column mapped through
      the `type` table and merged with the `repository` table.
    """
    from tensorflow.keras.utils import get_file

    h5_path = get_file(
        "github-playground.h5",
        "https://storage.googleapis.com/github-playground/playground.h5")
    events = pd.read_hdf(h5_path, key=name)

    # Replace the stored type values with the values of the `type` table,
    # looked up through that table's index
    type_lookup = pd.read_hdf(h5_path, key='type')['type'].to_dict()
    events['type'] = events['type'].map(type_lookup)

    # Attach repository information; with no explicit key, pandas joins on
    # the columns the two tables have in common
    repositories = pd.read_hdf(h5_path, key='repository')
    return events.merge(repositories)


# 'tensorflow' and 'vim' can be loaded the same way.
df = load_github_kg_dataset(name='linux')

***caution*** : 아래와 같은 에러가 발생하면, 링크를 참고하여 수정해주세요.

* [ValueError: cannot set WRITEABLE flag to True of this array](https://github.com/pandas-dev/pandas/issues/24839)

### Transform Dataset

In [4]:
# Rename the columns to (subject, relation, object) triplet terminology.
# Done in place so the large frame is not copied.
df.rename(columns={
    "actor_id": 'subject',
    "type": 'relation',
    "repo_name": "object"}, inplace=True)

# Keep only the relation types that are used for the embedding
train_df = df[df.relation.isin([
    'WatchEvent', 'IssuesEvent', 'PushEvent'])]

# K-core sampling: repeatedly drop every triplet whose subject or object
# appears in fewer than `k_core` triplets. Removing rows can push other
# subjects/objects below the threshold, so the filter is repeated until a
# pass removes nothing. The loop always terminates because each pass either
# removes at least one row or stops.
k_core = 5
i = 1
while True:
    prev_counts = len(train_df)
    print(f"{i}th : {prev_counts}")

    sub_counts = train_df.subject.value_counts()
    obj_counts = train_df.object.value_counts()
    train_df = train_df[
        train_df.subject.isin(sub_counts[sub_counts >= k_core].index)
        & train_df.object.isin(obj_counts[obj_counts >= k_core].index)]

    if prev_counts == len(train_df):
        # Nothing was removed in this pass: the k-core has converged
        break
    i += 1

1th : 12973855
2th : 12093033
3th : 12092756
4th : 12092685
5th : 12092682


### Construct Data Pipeline

#### Convert from Values to indices

주어진 Node와 Edge들의 값을 Embedding Index값으로 변경합니다.

In [5]:
# Every value that occurs as a subject or as an object is a node
nodes = set(train_df.subject.unique()) | set(train_df.object.unique())

# Number the nodes once, then invert that numbering for the reverse lookup
id2node = dict(enumerate(nodes))
node2id = {node: idx for idx, node in id2node.items()}

# Every distinct relation value is an edge type
edges = set(train_df.relation.unique())

id2edge = dict(enumerate(edges))
edge2id = {edge: idx for idx, edge in id2edge.items()}

# Translate the three triplet columns into embedding indices
subjects = train_df['subject'].map(node2id).to_numpy()
relations = train_df['relation'].map(edge2id).to_numpy()
objects = train_df['object'].map(node2id).to_numpy()

#### Corrupting function

Graph Embedding에서는 Negative Triplet Sampling이 핵심입니다.<br>
TransE에서는 실제로 존재하는 Triplet(positive)과, 그 Triplet의 head 혹은 tail을 무작위로 바꾼 Negative Triplet을 함께 학습시킵니다.

In [6]:
def corrupt_triplet(triplet):
    """Add a negative sample to a batch of triplets.

    For each element of the batch a uniform random number decides, with a
    0.5 threshold, which side is corrupted: either the object is replaced by
    an entry of the shuffled `pos_object` batch, or the subject is replaced
    by an entry of the shuffled `pos_subject` batch. The other side is copied
    from the positive triplet.

    The result is written into the given dict under the keys `neg_subject`
    and `neg_object`, and the same dict is returned.
    """
    heads = triplet['pos_subject']
    tails = triplet['pos_object']

    # True  -> keep the head, corrupt the tail
    # False -> corrupt the head, keep the tail
    keep_head = tf.random.uniform(tf.shape(heads)) < 0.5

    # Replacement candidates come from the same batch, in shuffled order
    shuffled_heads = tf.random.shuffle(heads)
    shuffled_tails = tf.random.shuffle(tails)

    triplet['neg_subject'] = tf.where(keep_head, heads, shuffled_heads)
    triplet['neg_object'] = tf.where(keep_head, shuffled_tails, tails)
    return triplet

def generate_triplet_dataset(subjects, relations, objects, batch_size=10000):
    """Build the tf.data.Dataset that feeds training.

    params
    * subjects, relations, objects : index arrays of equal length; position
      i of the three arrays together describes one positive triplet
    * batch_size : number of triplets per batch

    returns
    * tf.data.Dataset of dicts with the keys `pos_subject`, `pos_object`,
      `relation`, plus `neg_subject` and `neg_object` added by
      `corrupt_triplet`.
    """
    from sklearn.utils import shuffle
    # Shuffle the three arrays together so each triplet stays aligned
    subjects, relations, objects = shuffle(subjects, relations, objects)

    AUTOTUNE = tf.data.experimental.AUTOTUNE
    # Batch first so that corrupt_triplet can shuffle within a batch.
    # prefetch is the last stage so that the mapped (corrupted) batches are
    # prepared in the background while the previous batch is being consumed.
    return (tf.data.Dataset
            .from_tensor_slices({"pos_subject": subjects,
                                 "pos_object": objects,
                                 "relation": relations})
            .batch(batch_size=batch_size)
            .map(corrupt_triplet, AUTOTUNE)
            .prefetch(AUTOTUNE))

### Construct Model

#### Layer 구성

In [7]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import RandomUniform

num_nodes = len(nodes)
num_edges = len(edges)
embed_size = 50

# Initialization follows the scheme given in the (TransE) paper:
# uniform in [-6/sqrt(k), 6/sqrt(k)] with k the embedding size
init_range = 6/np.sqrt(embed_size)

# Each layer gets its own initializer instance. Keras warns against calling
# one unseeded initializer instance more than once, because repeated calls
# can return the same values.
init_op = RandomUniform(-init_range, init_range)
edge_init_op = RandomUniform(-init_range, init_range)

# One embedding row per node (subjects and objects share this table)
node_embed_layer = Embedding(input_dim=num_nodes,
                             output_dim=embed_size,
                             embeddings_initializer=init_op,
                             name='node_embed_layer')
# One embedding row per relation type
edge_embed_layer = Embedding(input_dim=num_edges,
                             output_dim=embed_size,
                             embeddings_initializer=edge_init_op,
                             name='edge_embed_layer')

#### 층 연결

In [8]:
from tensorflow.keras.layers import Input
from tensorflow.keras import backend as K

# Input layers: one scalar index per sample. The dtype is declared as an
# integer type because the values are embedding indices; with the default
# float32 dtype, indices above 2**24 could no longer be represented exactly.
pos_sub_inputs = Input(shape=(), dtype='int64', name='pos_subject')
neg_sub_inputs = Input(shape=(), dtype='int64', name='neg_subject')
pos_obj_inputs = Input(shape=(), dtype='int64', name='pos_object')
neg_obj_inputs = Input(shape=(), dtype='int64', name='neg_object')
rel_inputs = Input(shape=(), dtype='int64', name='relation')

# Look up the embeddings. Node embeddings are L2-normalized along the
# embedding axis; relation embeddings are used as they are.
pos_sub_embed = K.l2_normalize(node_embed_layer(pos_sub_inputs), axis=1)
neg_sub_embed = K.l2_normalize(node_embed_layer(neg_sub_inputs), axis=1)
pos_obj_embed = K.l2_normalize(node_embed_layer(pos_obj_inputs), axis=1)
neg_obj_embed = K.l2_normalize(node_embed_layer(neg_obj_inputs), axis=1)
rel_embed = edge_embed_layer(rel_inputs)

#### 손실함수 구성

In [9]:
def distance(src_embed, dst_embed, norm='l1'):
    """
    Distance between src_embed and dst_embed, reduced over axis 1.

    src_embed : subject + relation
    dst_embed : object
    norm : 'l1' for the sum of absolute differences,
           'l2' for the sum of squared differences (no square root is taken)

    Raises NotImplementedError for any other value of `norm`.
    """
    # Validate first so an unsupported norm fails before any tensor op runs.
    # NotImplementedError is the exception class; the NotImplemented constant
    # is not an exception and cannot be raised.
    if norm not in ('l1', 'l2'):
        raise NotImplementedError(f"unsupported norm: {norm!r}")

    diff = src_embed - dst_embed
    if norm == 'l1':
        return K.sum(K.abs(diff), 1)
    return K.sum(K.square(diff), 1)

# Distance between (subject + relation) and object, for the positive and the
# corrupted (negative) triplet. Both use the same relation embedding.
pos_dist = distance(pos_sub_embed+rel_embed,pos_obj_embed)
neg_dist = distance(neg_sub_embed+rel_embed,neg_obj_embed)

# Margin-based hinge loss: zero once the negative triplet is at least
# `margin` farther apart than the positive one.
margin = 1
loss = K.maximum(margin + pos_dist - neg_dist, 0.)

#### 모델 구성

In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adagrad

# The five index inputs of the model (positive/negative subject and object,
# plus the relation)
inputs = (pos_sub_inputs, neg_sub_inputs, 
          pos_obj_inputs, neg_obj_inputs, rel_inputs)

# The model's output is the loss tensor itself; it is also registered with
# add_loss, so compile() is called without a separate loss function.
model = Model(inputs, loss)
model.add_loss(loss)
model.compile(optimizer=Adagrad(2e-1))

### train model

In [None]:
num_epochs = 100
batch_size = 1024

# Each iteration builds a new dataset (generate_triplet_dataset reshuffles
# the triplets on every call) and runs a single fit() pass over it.
for i in range(num_epochs):
    triplets = generate_triplet_dataset(
        subjects, relations, objects, batch_size)
    model.fit(x=triplets)

### Infer Model

In [12]:
# First weight array of the node embedding layer
node_embed = model.get_layer('node_embed_layer').get_weights()[0]

# Divide every row by its own L2 norm; keepdims keeps the norms as a column
# so they broadcast across the row
l2_norm = np.linalg.norm(node_embed, ord=2, axis=1, keepdims=True)
node_normalized = node_embed / l2_norm

# Label each row with the node value that belongs to its integer id
node_df = pd.DataFrame(
    node_normalized,
    index=pd.RangeIndex(len(node_normalized)).map(id2node))

In [13]:
# Keep only the nodes that occur as an object in the training triplets
repository_df = node_df[node_df.index.isin(train_df.object.unique())]

In [14]:
# Dot product of every repository vector with the 'torvalds/linux' vector,
# then the 20 highest scores in descending order
linux_similarity = repository_df.dot(repository_df.loc['torvalds/linux'])
linux_similarity.sort_values(ascending=False).head(20)

torvalds/linux                                1.000000
git/git                                       0.557233
GNOME/gimp                                    0.488659
aamine/cbc                                    0.473231
robotframework/RIDE                           0.471318
lupoDharkael/flameshot                        0.465037
torproject/tor                                0.464315
ikatyang/emoji-cheat-sheet                    0.450000
mszep/pandoc_resume                           0.447250
Mohist-Community/Mohist                       0.437677
cezanne/usbip-win                             0.437552
morris821028/UVa                              0.437386
pbatard/rufus                                 0.437079
nirewen/discord-netflix                       0.428762
baldengineer/bit-preserve                     0.426989
ellisonleao/magictools                        0.426127
PFCraft/Mohist                                0.426030
python/cpython                                0.422073
DeadManWal