# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import os

# Step 1: Data Preprocessing

In [11]:
# Locations of the three FB15K-237 split files (example paths; update accordingly).
# Each file is read further down with load_triples(), i.e. as headerless,
# tab-separated head / relation / tail lines.
train_path = '/content/data/train.txt'
valid_path = '/content/data/valid.txt'
test_path = '/content/data/test.txt'

# Load triples into DataFrame
def load_triples(file_path):
    """Read a tab-separated triple file into a DataFrame.

    Args:
        file_path: Path of a headerless text file holding one
            ``head<TAB>relation<TAB>tail`` triple per line.

    Returns:
        DataFrame with the string columns ``head``, ``relation`` and ``tail``,
        one row per line of the file.
    """
    import csv

    return pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        names=['head', 'relation', 'tail'],
        # Identifiers are opaque labels: keep them as text so numeric-looking
        # IDs are not coerced (e.g. "007" -> 7) ...
        dtype=str,
        # ... and so labels such as "NA" or "null" are not turned into NaN.
        keep_default_na=False,
        # The files are plain TSV without quoting; treat '"' as an ordinary
        # character instead of the start of a quoted field.
        quoting=csv.QUOTE_NONE,
    )

# Read the three splits; identifiers are still raw labels at this point.
train_triples = load_triples(train_path)
valid_triples = load_triples(valid_path)
test_triples = load_triples(test_path)

# Create entity and relation mappings.
# The vocabulary is collected from the training split only.
all_entities = set(train_triples['head']).union(set(train_triples['tail']))
all_relations = set(train_triples['relation'])

# Assign IDs in sorted order. Enumerating the sets directly would follow their
# iteration order, which for strings changes between interpreter sessions
# (hash randomisation); the IDs - and with them the rows of any saved embedding
# matrix - could then not be matched back to entities after a restart.
# key=str keeps the ordering well defined even if a label is not a string.
entity_to_id = {entity: idx for idx, entity in enumerate(sorted(all_entities, key=str))}
id_to_entity = {idx: entity for entity, idx in entity_to_id.items()}
relation_to_id = {relation: idx for idx, relation in enumerate(sorted(all_relations, key=str))}
id_to_relation = {idx: relation for relation, idx in relation_to_id.items()}

# Map entities and relations to IDs
def map_triples(triples, entity_map=None, relation_map=None):
    """Translate label triples into integer-ID triples.

    The input frame is left untouched. Rows that mention an entity or relation
    missing from the mappings cannot be represented by an embedding index; they
    are dropped (with a warning) instead of being carried along as NaN, which
    would turn the columns into floats and yield invalid indices once the frame
    is converted to an integer tensor.

    Args:
        triples: DataFrame with the columns ``head``, ``relation``, ``tail``.
        entity_map: Mapping from entity label to ID. Defaults to the
            module-level ``entity_to_id``.
        relation_map: Mapping from relation label to ID. Defaults to the
            module-level ``relation_to_id``.

    Returns:
        A new DataFrame with int64 columns ``head``, ``relation``, ``tail``
        containing only the rows that could be mapped completely.
    """
    import warnings

    if entity_map is None:
        entity_map = entity_to_id
    if relation_map is None:
        relation_map = relation_to_id

    mapped = pd.DataFrame({
        'head': triples['head'].map(entity_map),
        'relation': triples['relation'].map(relation_map),
        'tail': triples['tail'].map(entity_map),
    })

    # Series.map leaves NaN wherever a label has no entry in the mapping.
    known = mapped.notna().all(axis=1)
    dropped = int((~known).sum())
    if dropped:
        warnings.warn(
            f"Dropped {dropped} of {len(mapped)} triples that reference "
            "entities or relations without an ID."
        )

    return mapped[known].astype('int64')

# Convert every split from labels to integer IDs, in train / valid / test order.
train_triples, valid_triples, test_triples = [
    map_triples(split)
    for split in (train_triples, valid_triples, test_triples)
]


# Step 2: Dataset Preparation

In [12]:
# KnowledgeGraphDataset class
class KnowledgeGraphDataset(Dataset):
    """Map-style dataset that serves ID triples as rows of one long tensor."""

    def __init__(self, triples):
        # ``triples`` has to expose ``.values`` (as a DataFrame does); the
        # values are copied into a single int64 tensor up front.
        rows = triples.values
        self.triples = torch.tensor(rows, dtype=torch.long)

    def __len__(self):
        # Number of stored triples, i.e. the size of the leading dimension.
        return self.triples.shape[0]

    def __getitem__(self, idx):
        # Plain tensor indexing: returns the stored row(s) selected by ``idx``.
        return self.triples[idx]


# Step 3: Define the TransE Model

In [29]:
# TransE Model
class TransE(nn.Module):
    """Embedding model that scores a triple by the norm of ``head + relation - tail``."""

    def __init__(self, num_entities, num_relations, embedding_dim, margin, distance_metric='L1'):
        super().__init__()
        self.entity_embeddings = nn.Embedding(num_entities, embedding_dim)
        self.relation_embeddings = nn.Embedding(num_relations, embedding_dim)
        self.margin = margin
        self.distance_metric = distance_metric

        # Re-initialise both tables with Xavier-uniform weights
        # (entities first, then relations).
        for table in (self.entity_embeddings, self.relation_embeddings):
            nn.init.xavier_uniform_(table.weight)

    def _distance(self, triplets):
        # Columns 0 / 1 / 2 of ``triplets`` hold head, relation and tail IDs.
        translated = (
            self.entity_embeddings(triplets[:, 0])
            + self.relation_embeddings(triplets[:, 1])
        )
        residual = translated - self.entity_embeddings(triplets[:, 2])
        # 'L1' selects the 1-norm; any other setting falls back to the 2-norm.
        order = 1 if self.distance_metric == 'L1' else 2
        return residual.norm(p=order, dim=1)

    def forward(self, positive_triplets, negative_triplets):
        """Return the per-row distances of the positive and the negative triples."""
        pos_distance = self._distance(positive_triplets)
        neg_distance = self._distance(negative_triplets)
        return pos_distance, neg_distance

# Loss Function
class MarginLoss(nn.Module):
    """Batch mean of ``max(0, pos_distance - neg_distance + margin)``."""

    def __init__(self, margin):
        super().__init__()
        self.margin = margin

    def forward(self, pos_distance, neg_distance):
        # A pair contributes only while the positive triple is not at least
        # ``margin`` closer than its negative counterpart.
        violation = pos_distance - neg_distance + self.margin
        return violation.relu().mean()

# Hyperparameters
embedding_dim = 20
margin = 1.0
learning_rate = 0.001
distance_metric = 'L2'  # Choose either 'L1' or 'L2'

# Vocabulary sizes determine the number of rows in the two embedding tables.
num_entities = len(entity_to_id)
num_relations = len(relation_to_id)

# Loss, model (placed on the GPU when one is available) and optimizer.
criterion = MarginLoss(margin)
model = TransE(
    num_entities=num_entities,
    num_relations=num_relations,
    embedding_dim=embedding_dim,
    margin=margin,
    distance_metric=distance_metric,
)
model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Precompute relation-specific tail entities
def compute_relation_tails(triples):
    """Collect, for every relation, the set of tails it occurs with.

    Args:
        triples: DataFrame with at least the columns ``relation`` and ``tail``.

    Returns:
        Dict mapping each relation value to the set of tail values seen with
        it. Keys and set members are plain Python scalars.
    """
    relation_to_tails = {}
    # Walk the two columns directly: DataFrame.iterrows() builds a Series for
    # every row, which dominates the run time on large splits, and it yields
    # NumPy scalars rather than plain ints.
    pairs = zip(triples['relation'].tolist(), triples['tail'].tolist())
    for relation, tail in pairs:
        relation_to_tails.setdefault(relation, set()).add(tail)
    return relation_to_tails

# Built from the training split; train_model() reads this module-level name
# when it corrupts the tail of a triple.
relation_to_tails = compute_relation_tails(train_triples)

# Step 4: Training the Model

In [30]:
def _corrupt_batch(batch, tail_pools, entity_count):
    """Return a copy of ``batch`` in which every row has its head or tail replaced."""
    # Work on a CPU copy so that the per-row writes below are cheap array
    # assignments rather than individual device-tensor updates.
    corrupted = batch.detach().cpu().numpy().copy()
    # One fair coin per row: True -> corrupt the tail, False -> corrupt the head.
    corrupt_tail = np.random.rand(len(corrupted)) < 0.5
    for row, use_tail in zip(corrupted, corrupt_tail):
        if use_tail:
            pool = tail_pools.get(int(row[1]))
            if pool is None:
                # No known tails for this relation: fall back to any entity.
                row[2] = np.random.randint(entity_count)
            else:
                row[2] = pool[np.random.randint(len(pool))]
        else:
            row[0] = np.random.randint(entity_count)
    return torch.from_numpy(corrupted)


def train_model(model, optimizer, criterion, train_loader, num_epochs,
                relation_tails=None, entity_count=None):
    """Train ``model`` with one freshly sampled negative per positive triple.

    For every row of a batch either the tail is replaced by a tail that occurs
    with the same relation, or the head is replaced by a uniformly drawn entity
    ID (probability 0.5 each). The replacement is not checked against the
    original, so a negative can occasionally coincide with its positive.

    Args:
        model: Callable as ``model(positive, negative)`` returning the two
            distance tensors.
        optimizer: Optimizer over the model parameters.
        criterion: Callable as ``criterion(pos_distance, neg_distance)``
            returning a scalar loss.
        train_loader: Iterable of integer tensors whose columns 0 / 1 / 2 are
            head, relation and tail IDs.
        num_epochs: Number of passes over ``train_loader``.
        relation_tails: Mapping relation ID -> collection of tail IDs used for
            tail corruption. Defaults to the module-level ``relation_to_tails``.
        entity_count: Number of entity IDs to draw heads from. Defaults to the
            module-level ``num_entities``.

    Returns:
        None. The summed batch loss of each epoch is printed.
    """
    if relation_tails is None:
        relation_tails = relation_to_tails
    if entity_count is None:
        entity_count = num_entities

    # Freeze every tail collection into an array once; converting a set to a
    # list for each sampled row is linear in the set size and was the dominant
    # cost of the inner loop.
    tail_pools = {
        int(relation): np.fromiter(tails, dtype=np.int64, count=len(tails))
        for relation, tails in relation_tails.items()
        if len(tails) > 0
    }

    # Feed the batches to wherever the model actually lives.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            # Negatives are sampled on the CPU and moved over in one transfer.
            negative_triplets = _corrupt_batch(batch, tail_pools, entity_count).to(device)
            positive_triplets = batch.to(device)

            pos_distance, neg_distance = model(positive_triplets, negative_triplets)
            loss = criterion(pos_distance, neg_distance)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss:.4f}")


# Step 5: Prediction and Evaluation

In [31]:
def evaluate_model(model, test_loader):
    """Rank the true tail of every test triple among all entities.

    For each triple the distance between ``head + relation`` and every entity
    embedding is computed (1-norm when ``model.distance_metric == 'L1'``,
    2-norm otherwise); the rank of the true tail is its 1-based position in
    the ascending ordering of those distances. No known-true triples are
    filtered out of the candidate list.

    Args:
        model: Module exposing ``entity_embeddings``, ``relation_embeddings``
            and ``distance_metric``.
        test_loader: Iterable of integer tensors whose columns 0 / 1 / 2 are
            head, relation and tail IDs.

    Returns:
        Dict with the keys ``hits_at_10``, ``mean_rank`` and ``mrr``, each
        averaged over the number of triples seen. All three are NaN when the
        loader yields no triples. The same figures are printed.
    """
    model.eval()
    entity_matrix = model.entity_embeddings.weight
    device = entity_matrix.device
    p = 1 if model.distance_metric == 'L1' else 2

    hits_at_10 = 0
    rank_sum = 0.0
    reciprocal_rank_sum = 0.0
    num_samples = 0

    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            if batch.shape[0] == 0:
                continue
            heads, relations, tails = batch[:, 0], batch[:, 1], batch[:, 2]

            # One (batch x entities) distance matrix replaces the per-triple
            # Python loop. The matmul shortcut for Euclidean distance is
            # disabled so the values match a direct norm of the difference.
            queries = model.entity_embeddings(heads) + model.relation_embeddings(relations)
            distances = torch.cdist(
                queries,
                entity_matrix,
                p=float(p),
                compute_mode='donot_use_mm_for_euclid_dist',
            )

            # Position of the true tail in each row's ascending ordering.
            order = torch.argsort(distances, dim=1)
            ranks = (order == tails.unsqueeze(1)).nonzero(as_tuple=True)[1] + 1

            hits_at_10 += int((ranks <= 10).sum().item())
            rank_sum += float(ranks.sum().item())
            reciprocal_rank_sum += float((1.0 / ranks.double()).sum().item())
            num_samples += int(batch.shape[0])

    if num_samples == 0:
        # Nothing to average over; report NaN instead of dividing by zero.
        nan = float('nan')
        metrics = {'hits_at_10': nan, 'mean_rank': nan, 'mrr': nan}
    else:
        metrics = {
            'hits_at_10': hits_at_10 / num_samples,
            'mean_rank': rank_sum / num_samples,
            'mrr': reciprocal_rank_sum / num_samples,
        }

    print(f"Hits@10: {metrics['hits_at_10']:.4f}, Mean Rank: {metrics['mean_rank']:.4f}, MRR: {metrics['mrr']:.4f}")
    return metrics


# Run evaluation on the dataset

The learning rate λ for the stochastic gradient descent was selected among {0.001, 0.01, 0.1}, the margin γ among {1, 2, 10} and the latent dimension
k among {20, 50} on the validation set of each data set. The dissimilarity measure d was set either
to the L1 or L2 distance according to validation performance as well. Optimal configurations were:
k = 20, λ = 0.01, γ = 2, and d = L1 on Wordnet; k = 50, λ = 0.01, γ = 1, and d = L1 on
FB15k; k = 50, λ = 0.01, γ = 1, and d = L2 on FB1M.

In [32]:
# Wrap each split in a Dataset, then in a DataLoader. Only the training loader
# shuffles; all three use batches of 128 triples.
train_data, valid_data, test_data = (
    KnowledgeGraphDataset(split)
    for split in (train_triples, valid_triples, test_triples)
)

train_loader = DataLoader(dataset=train_data, batch_size=128, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=128, shuffle=False)
test_loader = DataLoader(dataset=test_data, batch_size=128, shuffle=False)

# Run ten training epochs, then report the ranking metrics on the test split.
train_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    num_epochs=10,
)
evaluate_model(model=model, test_loader=test_loader)

# Save the Model
def save_model(model, file_path):
    """Write the model's ``state_dict`` to ``file_path``.

    Args:
        model: Object providing ``state_dict()``.
        file_path: Destination passed on to ``torch.save``. When it is a
            filesystem path, missing parent directories are created first so
            that a finished training run is not lost to a missing folder.

    Returns:
        None. A confirmation line is printed after the file has been written.
    """
    # Only real paths have a parent directory; anything else (e.g. an open
    # file object) is handed to torch.save unchanged.
    if isinstance(file_path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(file_path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    torch.save(model.state_dict(), file_path)
    print(f"Model saved to {file_path}")

# Store the trained weights (state_dict only) under /content/data, the same
# directory the split files are read from.
save_model(model, "/content/data/TransE_FB15K237.pth")

Epoch 1/10, Loss: 1502.2268
Epoch 2/10, Loss: 621.8619
Epoch 3/10, Loss: 361.2473
Epoch 4/10, Loss: 287.7936
Epoch 5/10, Loss: 248.1222
Epoch 6/10, Loss: 223.8606
Epoch 7/10, Loss: 203.2400
Epoch 8/10, Loss: 189.5553
Epoch 9/10, Loss: 180.4434
Epoch 10/10, Loss: 171.6860
Hits@10: 0.3488, Mean Rank: 272.4749, MRR: 0.2018
Model saved to /content/data/TransE_FB15K237.pth
