In [1]:
import os


In [2]:
# if you are running a kaggle notebook
DATA_DIR = "/kaggle/input"
OUTPUT_DIR = "/kaggle/output"

In [3]:
# if you are running it on local machine
# download input files
# create directory kaggle/input
os.makedirs("input/kg_embeddings", exist_ok=True)
os.makedirs("output/kg_embeddings", exist_ok=True)
# # download files from kaggle
#! kaggle datasets download -d latebloomer/fb15k-237
#!unzip -d input/kg_embeddings fb15k-237.zip
DATA_DIR = "input/kg_embeddings"
OUTPUT_DIR = "output/kg_embeddings"

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

input/kg_embeddings/valid.txt
input/kg_embeddings/test.txt
input/kg_embeddings/train.txt


In [5]:
# data exploration
import pandas as pd

train_df = pd.read_csv(f"{DATA_DIR}/train.txt", delimiter="\t", names=["head", "relation", "tail"])
test_df = pd.read_csv(f"{DATA_DIR}/test.txt", delimiter="\t", names=["head", "relation", "tail"])
val_df = pd.read_csv(f"{DATA_DIR}/valid.txt", delimiter="\t", names=["head", "relation", "tail"])


In [6]:
# generate vocabulary
entities = pd.concat(
    [train_df["head"], test_df["head"], val_df["head"], train_df["tail"], test_df["tail"], val_df["tail"]], axis=0)
entities = sorted(list(set(entities)))
print(f"len(entities): {len(entities)}")
relations = pd.concat([train_df["relation"], test_df["relation"], val_df["relation"]])
relations = sorted(list(set(relations)))

print(f"len(relation): {len(relations)}")

id2word = entities + relations
word2id = {word: i for i, word in enumerate(id2word)}

len(entities): 14541
len(relation): 237


In [None]:
# initalize embedding layer
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.utils import shuffle
from tqdm.auto import tqdm

def relation_data_loader(df):
    df = shuffle(df)
    for i, row in df.iterrows():
        positive_example = [word2id[item] for item in row]
        # randomly sample negative example by replacing head or tail with random entity
        #   with probability 0.5
        if np.random.rand() > 0.5:
            negative_example = [word2id[item] for item in row]
            negative_example[0] = word2id[np.random.choice(entities)]
        else:
            negative_example = [word2id[item] for item in row]
            negative_example[2] = word2id[np.random.choice(entities)]
        yield positive_example, negative_example


def distance_function(t1, t2):
    # implement L1 norm as distance function
    return torch.sum(torch.abs(t1 - t2), dim=-1)


# validation code
def test_accuracy(val_df):
    total = val_df.shape[0]
    correct = 0
    for i, (positive_sample, *_) in tqdm(enumerate(relation_data_loader(val_df)),total=val_df.shape[0]):
        head, relation, actual_entity = positive_sample
        entity_relation_vector = embed_layer(torch.LongTensor([head, relation]))
        # print(entity_relation_vector.shape)
        pred_vector = entity_relation_vector[0] + entity_relation_vector[1] 
        # print(pred_vector.shape)
        closest_entity = torch.argmin(distance_function(pred_vector, embed_layer.weight.data), dim=-1)
        # print(f"closest_entity: {closest_entity}")
        if closest_entity == actual_entity:
            correct += 1
    return correct/total



# hyper parameters
# k = 50, λ = 0.01, γ = 1, and d = L1 on FB15k;
embedding_dim = 50
lr =  0.01
gamma = 1
epoch = 1000


# define layer
embed_layer = nn.Embedding(len(id2word), embedding_dim)
# input = torch.LongTensor([[0, 2, 0, 5]])
# embed_layer(input)




for e in range(epoch):
    for i, (positive_sample, negative_sample)  in tqdm(enumerate(relation_data_loader(train_df)), total=train_df.shape[0]):
        positive_sample = torch.LongTensor(positive_sample)
        negative_sample = torch.LongTensor(negative_sample)
        positive_input = embed_layer(positive_sample)
        negative_input = embed_layer(negative_sample)
        # calculate distance
        positive_distance = distance_function(positive_input[0] + positive_input[1], positive_input[2])
        negative_distance = distance_function(negative_input[0] + negative_input[1], negative_input[2])
        # calculate loss
        loss = gamma + positive_distance - negative_distance
        # backpropagation
        loss.backward()
        # update weights
        embed_layer.weight.data -= lr * embed_layer.weight.grad.data
    # if e%10==0:
    accuracy = test_accuracy(val_df)
    print(f"accuracy @epoch{e}: {accuracy}")



  0%|          | 0/272115 [00:00<?, ?it/s]

  0%|          | 0/20466 [00:00<?, ?it/s]

Accuracy: 0.02897488517541288


# Embedding models

## TransE

### Implementation

In [None]:
from torch import nn


# sample database
# forward 
# loss 
# backward propagation
# minibatch 
# 



### References
1. [Translating Embeddings for Modeling
Multi-relational Data(Paper)](https://papers.nips.cc/paper/2013/file/1cecc7a77928ca8133fa24680a88d2f9-Paper.pdf)