In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that
# later cells can read the dataset file stored there.
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [2]:
import torch
from torch.utils.data import DataLoader
import os
import tensorflow as tf
import torch.nn as nn
import numpy as np
from dask import delayed
from torch.autograd import Variable
from os.path import join
from scipy.spatial.distance import cosine

Создаем класс нашего датасета. Реализуем в нем все необходимые нам методы: чтение триплетов из файла и генерацию негативных примеров.


In [0]:
class WN18_train:
    """Training triplets of WN18 paired with one negative triplet each.

    Every non-blank line of the input file must contain three
    whitespace-separated integer ids in the order ``head relation tail``.

    Typical usage::

        ds = WN18_train(path)
        ds.download_triplets()       # read positives, build lookup tables
        ds.generate_neg_triplets()   # sample one negative per positive

    After both calls, ``ds[i]`` returns ``(positive_triplet, negative_triplet)``.
    """

    def __init__(self, file_path):
        self.file_path = file_path
        # (N, 3) int64 array of [head, rel, tail]; filled by download_triplets()
        self.pos_triplets = None
        # Number of distinct entity / relation ids (max id + 1)
        self.num_ent = None
        self.num_rel = None
        # (N, 3) int64 array of corrupted triplets; filled by generate_neg_triplets()
        self.neg_triplets = None

        # These structures match head+rel with tail and tail+rel with head
        self.head_rel_to_tail = {}
        self.tail_rel_to_head = {}

    def download_triplets(self):
        """
        Read the triplets from ``self.file_path`` and build the lookup tables
        used to reject false negatives when sampling negative triplets.

        Sets ``pos_triplets``, ``num_ent``, ``num_rel``, ``head_rel_to_tail``
        and ``tail_rel_to_head``. Calling it again re-reads the file from
        scratch instead of appending duplicates.

        Raises:
            ValueError: if a non-blank line does not hold exactly three integers.
        """
        triplets = []
        # Start from empty tables so that a repeated call does not duplicate entries
        self.head_rel_to_tail = {}
        self.tail_rel_to_head = {}

        with open(self.file_path, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline at end of file)
                    continue
                head, rel, tail = (int(field) for field in fields)
                triplets.append((head, rel, tail))

                # head -> rel -> [tails] and tail -> rel -> [heads]
                self.head_rel_to_tail.setdefault(head, {}).setdefault(rel, []).append(tail)
                self.tail_rel_to_head.setdefault(tail, {}).setdefault(rel, []).append(head)

        # reshape keeps the (N, 3) layout even when the file has no triplets
        self.pos_triplets = np.array(triplets, dtype=np.int64).reshape(-1, 3)

        if self.pos_triplets.shape[0] == 0:
            self.num_ent, self.num_rel = 0, 0
        else:
            # Entities occur both as heads and as tails, so both columns count
            max_entity = max(self.pos_triplets[:, 0].max(), self.pos_triplets[:, 2].max())
            self.num_ent = int(max_entity) + 1
            self.num_rel = int(self.pos_triplets[:, 1].max()) + 1

    def _sample_replacement(self, true_entity, known_entities):
        """Draw a random entity id that differs from ``true_entity`` and is not in ``known_entities``."""
        excluded = set(known_entities)
        excluded.add(true_entity)
        if len(excluded) >= self.num_ent:
            # Without this guard the rejection loop below would never terminate
            raise ValueError('No entity left to build a negative triplet from')

        candidate = int(np.random.randint(0, self.num_ent))
        while candidate in excluded:
            candidate = int(np.random.randint(0, self.num_ent))
        return candidate

    def generate_neg_triplets(self):
        """
        Generate one negative triplet for each positive triplet.

        In a negative triplet either the head or the tail (chosen with
        probability 0.5) is replaced by a random entity id such that the
        resulting triplet is not a known positive triplet.

        Sets ``neg_triplets``; row ``i`` is the corruption of ``pos_triplets[i]``.

        Raises:
            RuntimeError: if ``download_triplets()`` has not been called yet.
            ValueError: if no valid replacement entity exists for some triplet.
        """
        if self.pos_triplets is None:
            raise RuntimeError('Call download_triplets() before generate_neg_triplets()')

        negatives = []

        for head, rel, tail in self.pos_triplets.tolist():
            # Choose what to replace: tail or head
            if np.random.rand() > 0.5:
                # Reject heads that already occur with this (rel, tail) pair
                head = self._sample_replacement(head, self.tail_rel_to_head[tail][rel])
            else:
                # Reject tails that already occur with this (head, rel) pair
                tail = self._sample_replacement(tail, self.head_rel_to_tail[head][rel])

            negatives.append([head, rel, tail])

        self.neg_triplets = np.array(negatives, dtype=np.int64).reshape(-1, 3)

    def __getitem__(self, index):
        """Return the pair ``(positive_triplet, negative_triplet)`` stored at ``index``."""
        return self.pos_triplets[index], self.neg_triplets[index]

    def __len__(self):
        """Return the number of positive triplets."""
        return self.pos_triplets.shape[0]
        

Это наша модель TransE. В ней мы создаем эмбеддинги сущностей (entity) и отношений (relation) и реализуем прямой проход.


In [0]:
# Pytorch 
class TransE(nn.Module):
    """TransE model: entity and relation embeddings scored by ||h + r - t||_2.

    Args:
        dataset: object exposing ``num_ent`` and ``num_rel`` (the sizes of the
            entity and relation vocabularies).
        vector_length: dimensionality of every embedding vector.
        device: device on which the embeddings are created. ``None`` (default)
            selects CUDA when it is available and falls back to the CPU
            otherwise.
    """

    def __init__(self, dataset, vector_length=150, device=None):
        super(TransE, self).__init__()
        self.dataset = dataset
        # Cast to plain ints: the dataset may expose NumPy integer scalars
        self.num_ent = int(self.dataset.num_ent)
        self.num_rel = int(self.dataset.num_rel)
        self.vector_length = vector_length

        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.ent_emb = nn.Embedding(self.num_ent, self.vector_length).to(device)
        self.rel_emb = nn.Embedding(self.num_rel, self.vector_length).to(device)

    def _score(self, triplets):
        """Return the L2 norm of ``head + rel - tail`` for each row of ``triplets``."""
        head_emb = self.ent_emb(triplets[:, 0])
        rel_emb = self.rel_emb(triplets[:, 1])
        tail_emb = self.ent_emb(triplets[:, 2])
        return torch.norm(head_emb + rel_emb - tail_emb, 2, 1)

    def forward(self, pos_triplet, neg_triplet):
        """Score a batch of positive and negative triplets.

        Args:
            pos_triplet: integer tensor of shape (batch, 3) with the columns
                ``[head, rel, tail]``.
            neg_triplet: integer tensor of the same layout holding the
                corrupted triplets.

        Returns:
            Tensor of shape (batch, 2): column 0 holds the positive scores,
            column 1 the negative scores.
        """
        # Follow the embeddings' device so the model works on GPU and CPU alike
        device = self.ent_emb.weight.device
        pos_triplet = pos_triplet.to(device)
        neg_triplet = neg_triplet.to(device)

        pos_score = self._score(pos_triplet)
        neg_score = self._score(neg_triplet)

        losses = torch.stack((pos_score, neg_score), dim=1)

        return losses
        
        

In [0]:
# Location of the WN18 training triplet file on the mounted Google Drive.
path = '/content/drive/My Drive/model_transE-master/wn18/train.txt'

In [0]:
# Build the dataset: read the positive triplets from the file, then sample
# one negative triplet for each of them.
a = WN18_train(path)
a.download_triplets()
a.generate_neg_triplets()

In [0]:
# Create the model on the GPU, an Adam optimizer over its parameters and a
# loader that yields batches of 128 (positive, negative) triplet pairs.
# NOTE(review): the DataLoader is created without shuffle=True, so batches
# follow the order of the file — confirm this is intended.
model = TransE(a).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
train_loader = DataLoader(a, batch_size=128)

In [0]:
def train(model, dataset, train_loader, optimizer, num_epochs=10, margin=0.1):
    """Train ``model`` with a margin ranking loss and print the mean loss per epoch.

    Args:
        model: module whose ``forward(pos_triplet, neg_triplet)`` returns a
            (batch, 2) tensor with the positive scores in column 0 and the
            negative scores in column 1.
        dataset: unused; kept so that existing calls keep working.
        train_loader: iterable yielding ``(pos_triplet, neg_triplet)`` batches.
        optimizer: optimizer that updates the parameters of ``model``.
        num_epochs: number of passes over ``train_loader``.
        margin: margin of the ranking loss.
    """
    loss = nn.MarginRankingLoss(margin)

    for epoch in range(num_epochs):
        model.train()
        loss_accum = 0.0
        num_batches = 0

        for pos_triplet, neg_triplet in train_loader:
            prediction = model(pos_triplet, neg_triplet)
            pos_score, neg_score = prediction[:, 0], prediction[:, 1]

            # Target -1 asks for the second input (negative score) to be larger
            # than the first one (positive score) by at least the margin.
            # Building it from the prediction keeps it on the model's device.
            y = torch.full_like(pos_score, -1.0)
            loss_value = loss(pos_score, neg_score, y)

            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()

            loss_accum += loss_value.item()
            num_batches += 1

        # Average over the number of batches actually seen; an empty loader
        # yields NaN instead of a division error.
        average_loss = loss_accum / num_batches if num_batches else float('nan')
        print(f'Epoch: {epoch}, loss: {average_loss}')

In [14]:
# Run training with the default number of epochs (10); the average loss of
# every epoch is printed below.
train(model, a, train_loader, optimizer)

Epoch: 0, loss: 0.00023060381698013362
Epoch: 1, loss: 0.0001502393564205128
Epoch: 2, loss: 0.00012811400034207834
Epoch: 3, loss: 0.00011845291407812572
Epoch: 4, loss: 0.00010524566698296846
Epoch: 5, loss: 9.198086905162528e-05
Epoch: 6, loss: 7.882698163036173e-05
Epoch: 7, loss: 6.505557203168109e-05
Epoch: 8, loss: 5.558507224098185e-05
Epoch: 9, loss: 4.043237946725269e-05
