In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import torch
from torch.utils.data import DataLoader
import os
import tensorflow as tf
import torch.nn as nn
import numpy as np
from dask import delayed
from torch.autograd import Variable
from os.path import join

Создаем класс нашего датасета. Реализуем в нем все необходимые нам методы


In [0]:
class WN18_train:
    def __init__(self, file_path):
        self.file_path = file_path
        self.pos_triplets = None
        self.num_ent = None
        self.num_rel = None
        self.neg_triplets = None

        # These structures match head+rel with tail and tail+rel with head
        self.head_rel_to_tail = {}
        self.tail_rel_to_head = {}
        
    def download_triplets(self):
        """
        This method reads triplets from the file and creates data structures which 
        will contribute to making negative samples 
        """
        t = []
        
        with open(self.file_path, 'r') as f:
            data = f.readlines()
            for el in data:
                t.append(el.split())
                head, rel, tail = [int(el) for el in el.split()]
                
                # Fill the heads, rels and tails into the dictionary of dictionaries head_rel_to_tail
                if head not in self.head_rel_to_tail.keys():
                    self.head_rel_to_tail[head] = {rel: [tail]}
                else:
                    if rel not in self.head_rel_to_tail[head].keys():
                        self.head_rel_to_tail[head][rel] = [tail]
                    else:
                        self.head_rel_to_tail[head][rel].append(tail)
                        
                # Fill the tails, rels and heads into the dictionary of dictionaries tail_rel_to_head
                if tail not in self.tail_rel_to_head.keys():
                    self.tail_rel_to_head[tail] = {rel: [head]}
                else:
                    if rel not in self.tail_rel_to_head[tail]:
                        self.tail_rel_to_head[tail][rel] = [head]
                    else:
                        self.tail_rel_to_head[tail][rel].append(head)
                
                
        self.pos_triplets = np.array(t, dtype=np.int)
        self.num_ent, self.num_rel = self.pos_triplets.max(axis=0)[:2] + 1
        
    def generate_neg_triplets(self):
        """
        This method generates negative triplet for each posotive triplet. 
        In a negative triplet we just replace either tail or head with prob = 0.5 
        """
        n = []

        for i in range(self.pos_triplets.shape[0]):
            head, rel, tail = self.pos_triplets[i, :]

            # Choose what to replace: tail or head 
            if np.random.rand(1) > 0.5:
                # Choose the index of head at random
                neg_head = np.random.uniform(0, self.num_ent)
                # If the random head occurs with the pair tail and rel, 
                # we choose the random index again
                while neg_head in self.tail_rel_to_head[tail][rel] or neg_head == head:
                    neg_head = np.random.uniform(0, self.num_ent)
                head = neg_head
            
            else:
                # Choose the index of tail at random
                neg_tail = np.random.uniform(0, self.num_ent)
                # If the random head occurs with the pair tail and rel, 
                # we choose the random index again
                while neg_tail in self.head_rel_to_tail[head][rel] or neg_tail == tail:
                    neg_tail = np.random.uniform(0, self.num_ent)
                tail = neg_tail
                
            n.append([head, rel, tail])
            
        self.neg_triplets = np.array(n, dtype=np.int)
                    
    def __getitem__(self, index):
        return self.pos_triplets[index], self.neg_triplets[index]
    
    def __len__(self):
        return self.pos_triplets.shape[0]

        

Это наша TransE модель. В ней мы создаем entity и relation ембеддинги и реализуем прямой проход.


In [0]:
# Pytorch 
class TransE(nn.Module):
    def __init__(self, dataset, vector_length=300):
        super(TransE, self).__init__()
        self.dataset = dataset
        self.num_ent = self.dataset.num_ent
        self.num_rel = self.dataset.num_rel
        self.vector_length = vector_length
        self.ent_emb = nn.Embedding(self.num_ent, self.vector_length)
        self.rel_emb = nn.Embedding(self.num_rel, self.vector_length)
        
        
    def forward(self, pos_triplet, neg_triplet):
        # We put triplets on GPU
        pos_triplet = pos_triplet.cuda()
        neg_triplet = neg_triplet.cuda()

        # Take embeddings which correspond to indexes in trilets  
        pos_head_emb = self.ent_emb(pos_triplet[:, 0]).cuda()
        pos_rel_emb = self.rel_emb(pos_triplet[:, 1]).cuda()
        pos_tail_emb = self.ent_emb(pos_triplet[:, 2]).cuda()
        
        neg_head_emb = self.ent_emb(neg_triplet[:, 0]).cuda()
        neg_rel_emb = self.rel_emb(neg_triplet[:, 1]).cuda()
        neg_tail_emb = self.ent_emb(neg_triplet[:, 2]).cuda()
        
        # Calculate the score which is a L-2 norm 
        neg_score = torch.norm((neg_head_emb + neg_rel_emb - neg_tail_emb), 2, 1)
        pos_score = torch.norm((pos_head_emb + pos_rel_emb - pos_tail_emb), 2, 1)

        losses = torch.stack((pos_score, neg_score), dim=1)
        
        return losses   

Это класс для проверки перформанса нашей модели. Как итог, этот класс рассчитывает MRR. 


In [0]:
class validation:
    def __init__(self, file_path, ent_emb, rel_emb):
        self.file_path = file_path 
        self.ent_emb = ent_emb
        self.rel_emb = rel_emb
        self.y_indices = None
        self.prediction = None
        self.similarity = None
        self.y = None
        self.mrr = None

    def generate_prediction(self):
        """ 
        Make the prediction based in validation data and stores the desired 
        prediction. 
        """
        b = []
        with open(self.file_path, 'r') as f:
            data = f.readlines()
            for el in data:
                b.append([int(a) for a in el.split()])

        b = np.array(b)

        # Store the prediction, the embedding of the desired y and index of desired y
        self.prediction = torch.stack([self.ent_emb[b[i, 0]] + self.rel_emb[b[i, 1]] for i in range(b.shape[0])], dim=0)
        self.y = torch.stack([self.ent_emb[b[i, 2]] for i in range(b.shape[0])], dim=0)
        self.y_indices = b[:, 2]

    def generate_similarity(self):
        """
        We create the self.similariry matrix, which contains the cosine 
        similarities. Each row represents the predicted vector, and column
        shows the similariry between this vector and embedding[column]. 
        We need it to calculate MRR. 
        """
        n = self.y.shape[0]
        m = self.ent_emb.shape[0]

        # Calculate the similarities matrix
        dot_products = torch.matmul(self.prediction, self.ent_emb.t())

        lengths_pred = torch.matmul(self.prediction, self.prediction.t())[range(n), range(n)]
        lengths_pred = torch.sqrt(lengths_pred).view(-1, 1)

        lengths_y = torch.matmul(self.ent_emb, self.ent_emb.t())[range(m), range(m)]
        lengths_y = torch.sqrt(lengths_y).view(1, -1)

        self.similarity = dot_products / (lengths_pred * lengths_y)

        # Sort the similariry matrix. We want to see how many vectors are 
        # more similar to a target vector than the supposed one. The fewer the better  
        ordered, indices = self.similarity.sort(dim=1, descending=True)

        l = torch.stack([torch.nonzero(indices[i] == self.y_indices[i]) for i in range(self.y_indices.shape[0])], dim=0).view(-1).float() + 1
        l = 1 / l
        self.mrr = torch.mean(l)


In [0]:
path = '/content/drive/My Drive/model_transE-master/wn18/train.txt'

In [0]:
a = WN18_train(path)
a.download_triplets()
a.generate_neg_triplets()

In [0]:
model = TransE(a).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0)
train_loader = DataLoader(a, batch_size=128)

In [0]:
def train(model, dataset, train_loader, optimizer, num_epochs=40, margin=5):
    loss = nn.MarginRankingLoss(margin).cuda()
    y = Variable(torch.Tensor([-1])).cuda()

    for epoch in range(1, num_epochs+1):
        model.train()
        loss_accum = 0
        
        for i_step, (pos_triplet, neg_triplet) in enumerate(train_loader):
            prediction = model(pos_triplet, neg_triplet)
            loss_value = loss(prediction[:, 0], prediction[:, 1], y).cuda()
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
            
            loss_accum += loss_value.item()
            
    # if epoch % val_every == 0:
    #     print("Performing validation.....")
    #     ent_emb, rel_emb = model.parameters()
    #     ent_emb = ent_emb.to("cpu")
    #     rel_emb = rel_emb.to("cpu")

    #     valid_path = '/content/drive/My Drive/model_transE-master/wn18/valid.txt'

    #     v = validation(valid_path, ent_emb, rel_emb)
    #     v.generate_prediction()
    #     v.generate_similarity()
    #     print(v.mrr)

        average_loss = loss_accum / i_step
        print(f'Epoch: {epoch}, loss: {average_loss}')

In [10]:
train(model, a, train_loader, optimizer, num_epochs=40, margin=15)

Epoch: 1, loss: 14.201415748941413
Epoch: 2, loss: 10.858612067451304
Epoch: 3, loss: 7.897460328184102
Epoch: 4, loss: 5.5077528202695545
Epoch: 5, loss: 3.784234319013708
Epoch: 6, loss: 2.573308495914235
Epoch: 7, loss: 1.7133325825449568
Epoch: 8, loss: 1.1051314752835495
Epoch: 9, loss: 0.6810341488991388
Epoch: 10, loss: 0.40049869904960445
Epoch: 11, loss: 0.22281927262091528
Epoch: 12, loss: 0.1188880741731074
Epoch: 13, loss: 0.06224026698197714
Epoch: 14, loss: 0.03272085758895356
Epoch: 15, loss: 0.018373072208052846
Epoch: 16, loss: 0.010328764808933119
Epoch: 17, loss: 0.007059221306807315
Epoch: 18, loss: 0.0045647915267297045
Epoch: 19, loss: 0.003363843983654523
Epoch: 20, loss: 0.002614208253530356
Epoch: 21, loss: 0.001870625557133515
Epoch: 22, loss: 0.001821123636685885
Epoch: 23, loss: 0.001849787206941061
Epoch: 24, loss: 0.0014867544443898611
Epoch: 25, loss: 0.0014869474784820868
Epoch: 26, loss: 0.0011650566466793217
Epoch: 27, loss: 0.0009984758577195768
Epoch

In [0]:
ent_emb, rel_emb = model.parameters()
ent_emb = ent_emb.to("cpu")
rel_emb = rel_emb.to("cpu")

In [0]:
valid_path = '/content/drive/My Drive/model_transE-master/wn18/valid.txt'

In [13]:
v = validation(valid_path, ent_emb, rel_emb)
v.generate_prediction()
v.generate_similarity()
print(v.mrr)

tensor(0.1885)
