<a href="https://colab.research.google.com/github/Abhijit85/FederatedRAG/blob/main/TransE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [12]:
# Whether a GPU is used to run this Colab.
# NOTE: the training cell later reassigns `use_gpu` from
# torch.cuda.is_available(), so the value set here does not affect training.
use_gpu = True
# Whether a custom GCE env is used to run the Colab (uses different CUDA).
# NOTE(review): not referenced anywhere else in the visible notebook -- confirm
# whether it is still needed.
custom_GCE_env = False

In [13]:
%pip install openai
%pip install python-dotenv
# %pip install torch-geometric
# %pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
# %pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu111.html

from dotenv import load_dotenv
from openai import OpenAI
import os
import re
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import math
# from torch_geometric.data import InMemoryDataset, DataLoader
# import torch_geometric



# Data Preparation and Processing

In [14]:
class CustomDataset:
    """
    Knowledge-graph dataset loaded from tab-separated text files
    (no PyTorch Geometric dependency).

    The dataset directory must contain:
        entities.dict   one ``<id>\\t<entity>`` pair per line
        relations.dict  one ``<id>\\t<relation>`` pair per line
        train.txt, valid.txt, test.txt
                        one ``<head>\\t<relation>\\t<tail>`` triple per line
    """

    def __init__(self, data_path: str):
        """
        Load the dictionaries and the three triple splits from ``data_path``.

        Args:
            data_path (str): Path to the dataset directory.

        Raises:
            OSError: If one of the expected files cannot be opened.
            ValueError: If a non-blank line has the wrong number of
                tab-separated fields, or an id is not an integer.
            KeyError: If a triple mentions an entity or relation that is
                missing from the dictionaries.
        """
        # Paths to files
        self.entity_dict_path = os.path.join(data_path, 'entities.dict')
        self.relation_dict_path = os.path.join(data_path, 'relations.dict')
        self.train_data_path = os.path.join(data_path, 'train.txt')
        self.valid_data_path = os.path.join(data_path, 'valid.txt')
        self.test_data_path = os.path.join(data_path, 'test.txt')

        # name -> integer id; must be loaded before the triples, which are
        # translated through these mappings.
        self.entity_dict = self._read_dict(self.entity_dict_path)
        self.relation_dict = self._read_dict(self.relation_dict_path)

        # Each split is a list of (head_id, relation_id, tail_id) tuples.
        self.train_data = self._read_data(self.train_data_path)
        self.valid_data = self._read_data(self.valid_data_path)
        self.test_data = self._read_data(self.test_data_path)

        self.num_entities = len(self.entity_dict)
        self.num_relations = len(self.relation_dict)

    def _read_dict(self, file_path: str):
        """
        Read an entity / relation dictionary file.

        Each non-blank line has the form ``<id>\\t<name>``.

        Args:
            file_path (str): Path to the dictionary file.

        Returns:
            dict: Mapping ``{name: id}`` (name is a str, id an int).
        """
        element_dict = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank / trailing empty lines.
                    continue
                id_, element = line.split('\t')
                element_dict[element] = int(id_)

        return element_dict

    def _read_data(self, file_path):
        """
        Read a triples file and map every name to its integer id.

        Args:
            file_path (str): Path to a file with one
                ``<head>\\t<relation>\\t<tail>`` triple per non-blank line.

        Returns:
            list: ``(head_id, relation_id, tail_id)`` tuples in file order.
        """
        triples = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank / trailing empty lines.
                    continue
                h, r, t = line.split('\t')
                triples.append(
                    (self.entity_dict[h], self.relation_dict[r], self.entity_dict[t])
                )
        return triples

    def get_edge_indices_and_types(self, data):
        """
        Convert triples into edge-index and edge-type tensors.

        Args:
            data: Sequence of ``(head_id, relation_id, tail_id)`` tuples.

        Returns:
            tuple: ``(edge_index, edge_type)`` where ``edge_index`` is a long
            tensor of shape (2, num_edges) holding heads in row 0 and tails in
            row 1, and ``edge_type`` is a long tensor of shape (num_edges,).
        """
        if not data:
            # zip(*[]) yields nothing to unpack, so build the empty tensors
            # explicitly to keep the documented shapes.
            return (
                torch.empty((2, 0), dtype=torch.long),
                torch.empty((0,), dtype=torch.long),
            )

        heads, relations, tails = zip(*data)
        edge_index = torch.tensor([heads, tails], dtype=torch.long)  # Shape: (2, num_edges)
        edge_type = torch.tensor(relations, dtype=torch.long)  # Shape: (num_edges,)
        return edge_index, edge_type


# Models

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransE(nn.Module):
    """
    TransE knowledge-graph embedding model.

    A triple (head, relation, tail) is scored as ``-||h + r - t||_p``, so a
    higher (less negative) score means a more plausible triple.
    """

    def __init__(self, num_entities, num_relations, embedding_dim, margin=1.0, p_norm=1):
        """
        TransE model constructor.
        Args:
            num_entities (int): Total number of entities.
            num_relations (int): Total number of relations.
            embedding_dim (int): Dimensionality of embeddings.
            margin (float): Margin for the loss function.
            p_norm (int): Norm to use (1 for L1 norm, 2 for L2 norm).
        """
        super().__init__()
        self.num_entities = num_entities
        self.num_relations = num_relations
        self.embedding_dim = embedding_dim
        self.margin = margin
        self.p_norm = p_norm

        # Define entity and relation embeddings
        self.entity_embeddings = nn.Embedding(num_entities, embedding_dim)
        self.relation_embeddings = nn.Embedding(num_relations, embedding_dim)

        # Loss function
        self.margin_ranking_loss = nn.MarginRankingLoss(margin=self.margin)

        # Initialize embeddings
        self._init_embeddings()

    def _init_embeddings(self):
        """Initialize both embedding tables with Xavier uniform initialization."""
        nn.init.xavier_uniform_(self.entity_embeddings.weight.data)
        nn.init.xavier_uniform_(self.relation_embeddings.weight.data)

    def _score(self, residual):
        """Turn a ``h + r - t`` residual into a score: the negated p-norm over the last dim."""
        return -torch.norm(residual, p=self.p_norm, dim=-1)

    def forward(self, head, relation, tail, mode="normal"):
        """
        Compute the TransE score ``-||h + r - t||_p``.
        Args:
            head (torch.Tensor): Head entity indices.
            relation (torch.Tensor): Relation indices.
            tail (torch.Tensor): Tail entity indices.
            mode (str): How the looked-up embeddings are combined:
                'normal'     - ``h + r - t`` directly (shapes must broadcast);
                'head-batch' - the head embeddings are first reshaped to
                               (-1, 1, embedding_dim);
                'tail-batch' - the tail embeddings are first reshaped to
                               (-1, 1, embedding_dim).
                NOTE(review): no caller in this file uses the two batch
                modes, so their intended input shapes should be confirmed.
        Returns:
            torch.Tensor: Computed scores (the embedding dimension is reduced).
        Raises:
            ValueError: If ``mode`` is not one of the supported values.
        """
        h = self.entity_embeddings(head)
        r = self.relation_embeddings(relation)
        t = self.entity_embeddings(tail)

        if mode == "normal":
            residual = h + r - t
        elif mode == "head-batch":
            residual = h.view(-1, 1, self.embedding_dim) + r - t
        elif mode == "tail-batch":
            residual = h + r - t.view(-1, 1, self.embedding_dim)
        else:
            raise ValueError(f"Unsupported mode: {mode}")

        return self._score(residual)

    def loss(self, positive_score, negative_score):
        """
        Compute the margin-based ranking loss.
        Args:
            positive_score (torch.Tensor): Scores for positive triples.
            negative_score (torch.Tensor): Scores for negative triples.
        Returns:
            torch.Tensor: Loss value.
        """
        # target = 1 asks for positive_score to exceed negative_score by the margin.
        target = torch.ones_like(positive_score)
        return self.margin_ranking_loss(positive_score, negative_score, target)

    def evaluate(self, head, relation, tail, all_entities):
        """
        Score every candidate tail entity for each (head, relation) query.
        Args:
            head (torch.Tensor): Head entity indices, one per query.
            relation (torch.Tensor): Relation indices, one per query.
            tail (torch.Tensor): True tail indices. Unused here; kept so the
                signature stays compatible with existing callers.
            all_entities (torch.Tensor): 1-D tensor of candidate entity indices.
        Returns:
            torch.Tensor: Scores of shape (num_queries, num_candidates);
            entry [i, j] is the score of (head[i], relation[i], all_entities[j]).
        """
        # h + r is identical for every candidate of a query, so it is computed
        # once per query and broadcast against the candidate embeddings rather
        # than gathering num_queries * num_candidates duplicated rows.
        translated = (
            self.entity_embeddings(head.view(-1))
            + self.relation_embeddings(relation.view(-1))
        )  # (num_queries, dim)
        candidates = self.entity_embeddings(all_entities.view(-1))  # (num_candidates, dim)

        residual = translated.unsqueeze(1) - candidates.unsqueeze(0)
        return self._score(residual)


# Train Function

## Model: TransE
**Embeddings:**
Each entity and relation is represented as a vector in a high-dimensional space.
The embeddings are initialized randomly and updated during training.
**Distance Metric:**
TransE scores a triple by the distance between (head + relation) and tail, i.e. the norm of head + relation - tail, and training minimizes this distance for valid triples.
A lower distance indicates a more likely relationship.

## Model TransE

In [26]:
def train(model, optimizer, train_loader, device):
    """
    Run one training epoch and return the mean batch loss.

    Args:
        model: Module callable as ``model(head, relation, tail)`` that also
            provides ``loss(positive_score, negative_score)``.
        optimizer: Optimizer over the model's parameters.
        train_loader: Iterable of batches, each a 5-item sequence of tensors
            ``(head, relation, tail, negative_head, negative_tail)``.
        device: Device every batch tensor is moved to.

    Returns:
        float: Loss averaged over the batches processed, or NaN when
        ``train_loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in train_loader:
        batch_h, batch_r, batch_t, batch_neg_h, batch_neg_t = (b.to(device) for b in batch)

        # Score the true triple and its head- / tail-corrupted counterparts.
        positive_score = model(batch_h, batch_r, batch_t)
        negative_score_h = model(batch_neg_h, batch_r, batch_t)
        negative_score_t = model(batch_h, batch_r, batch_neg_t)

        # One ranking loss per corruption type, averaged into the batch loss.
        loss_h = model.loss(positive_score, negative_score_h)
        loss_t = model.loss(positive_score, negative_score_t)
        loss = (loss_h + loss_t) / 2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing was trained on; a mean over zero batches is undefined.
        return float("nan")
    return total_loss / num_batches

# Evaluate Model

## Prediction:
After training, the model can predict missing relationships by ranking possible tail entities for a given (head, relation, ?).
Example Query:
Input: (Steve Jobs, FounderOf, ?)
Output: Apple (highest-ranked entity).

In [36]:
def evaluate(model, eval_loader, all_entities, device, batch_size=64):
    """
    Compute tail-prediction ranking metrics over ``eval_loader``.

    For every (head, relation, tail) triple, all candidates are scored with
    ``model.evaluate`` and the rank of the true tail is its 1-based position
    when the candidate scores are sorted in descending order. No known true
    triples are filtered out of the candidates (raw ranking).

    Args:
        model: Module exposing ``evaluate(head, relation, tail, all_entities)``
            that returns scores of shape (num_queries, num_candidates).
        eval_loader: Iterable of ``(head, relation, tail)`` tensor batches.
        all_entities (torch.Tensor): Candidate entity indices. The rank lookup
            compares candidate column positions with the tail ids, so this
            assumes ``all_entities[j] == j`` (e.g. ``torch.arange``).
        device: Device every batch tensor is moved to.
        batch_size (int): Number of queries scored at once; each loader batch
            is split into chunks of this size to limit memory use.

    Returns:
        dict: ``MRR``, ``MR``, ``Hits@10``, ``Hits@3`` and ``Hits@1`` averaged
        over all evaluated triples; every value is NaN when ``eval_loader``
        yields no triples.
    """
    model.eval()
    totals = {"MRR": 0.0, "MR": 0.0, "Hits@10": 0.0, "Hits@3": 0.0, "Hits@1": 0.0}
    num_samples = 0

    with torch.no_grad():
        for batch in eval_loader:
            batch_h, batch_r, batch_t = (b.to(device) for b in batch)

            # Split the evaluation into smaller chunks to reduce memory usage
            for start in range(0, batch_h.shape[0], batch_size):
                stop = start + batch_size
                chunk_h = batch_h[start:stop]
                chunk_r = batch_r[start:stop]
                chunk_t = batch_t[start:stop]

                scores = model.evaluate(chunk_h, chunk_r, chunk_t, all_entities)

                # Position of the true tail in the best-first ordering, 1-based.
                order = scores.argsort(dim=1, descending=True)
                ranks = order.eq(chunk_t.view(-1, 1)).nonzero(as_tuple=True)[1] + 1
                ranks_f = ranks.float()

                totals["MRR"] += (1.0 / ranks_f).sum().item()
                totals["MR"] += ranks_f.sum().item()
                totals["Hits@10"] += (ranks <= 10).float().sum().item()
                totals["Hits@3"] += (ranks <= 3).float().sum().item()
                totals["Hits@1"] += (ranks == 1).float().sum().item()

                num_samples += chunk_t.size(0)

    if num_samples == 0:
        # Averages over zero triples are undefined; report NaN instead of
        # failing with a ZeroDivisionError.
        return {name: float("nan") for name in totals}
    return {name: value / num_samples for name, value in totals.items()}

# Start Training

## Positive Triplets:
The dataset provides positive examples in the form of valid (head, relation, tail) triplets.
## Negative Sampling:
For each positive triplet, a corrupted version is generated by replacing either the head or tail with a random entity.
## Loss Function:
The model uses a margin-based ranking loss, which
ensures that valid triplets are closer in embedding space than invalid ones by at least a predefined margin.

# Last good run
Validation score: MRR = 0.2749, MR = 164.3813, Hits@10 = 0.4441, Hits@3 = 0.2908, Hits@1 = 0.1938
Test scores from the best model (MRR, MR, Hits@10, Hits@3, Hits@1): (0.26878729462623596, 170.3550767125965, 0.4360891234242158, 0.28647512948304504, 0.1866021694517737)

In [37]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Hyper-parameters. A long schedule is only used when CUDA is available;
# on CPU the run is shortened to a handful of epochs.
lr = 0.001
use_gpu = torch.cuda.is_available()
num_epochs = 190 if use_gpu else 10
valid_freq = 10  # validate every `valid_freq` epochs on either device

device = torch.device('cuda' if use_gpu else 'cpu')

# Read the dictionaries and the train / valid / test triples from disk.
data_path = '/content/sample_data'
dataset = CustomDataset(data_path)

# Turn each split into an (edge_index, edge_type) tensor pair.
(
    (train_edge_index, train_edge_type),
    (valid_edge_index, valid_edge_type),
    (test_edge_index, test_edge_type),
) = (
    dataset.get_edge_indices_and_types(split)
    for split in (dataset.train_data, dataset.valid_data, dataset.test_data)
)

# Negative Sampling Function
def negative_sampling(edge_index, num_entities, generator=None):
    """
    Generate one negative sample per edge by replacing either its head or its
    tail (chosen at random with equal probability) with a random entity.

    The replacement is drawn uniformly from ``[0, num_entities)`` and is not
    checked against the original, so a "negative" can occasionally equal the
    positive edge it was derived from.

    Args:
        edge_index (torch.Tensor): Integer tensor of shape (2, num_edges);
            row 0 holds head indices and row 1 tail indices. Not modified.
        num_entities (int): Total number of entities.
        generator (torch.Generator, optional): Random generator for
            reproducible sampling; it must live on the same device as
            ``edge_index``. Defaults to the global generator.
    Returns:
        torch.Tensor: Negative samples with the same shape, dtype and device
        as ``edge_index``.
    """
    num_edges = edge_index.size(1)
    # Draw on the device of edge_index so the boolean-mask assignments below
    # also work for tensors that are not on the CPU.
    device = edge_index.device

    negative_samples = edge_index.clone()
    random_entities = torch.randint(
        0, num_entities, (num_edges,), generator=generator, device=device
    ).to(edge_index.dtype)
    mask = torch.rand(num_edges, generator=generator, device=device) > 0.5  # True -> corrupt head

    negative_samples[0, mask] = random_entities[mask]  # Replace heads
    negative_samples[1, ~mask] = random_entities[~mask]  # Replace tails
    return negative_samples

# Prepare training data
train_heads, train_tails = train_edge_index[0], train_edge_index[1]
train_relations = train_edge_type

# Generate negative samples for training.
# NOTE(review): negatives are drawn once here and reused for every epoch, and
# each edge has only its head OR its tail corrupted, while train() scores both
# a head-corrupted and a tail-corrupted triple per edge. For the side that was
# not corrupted the "negative" equals the positive, so that loss term is a
# constant. Consider resampling per epoch / corrupting both sides.
negative_samples = negative_sampling(train_edge_index, dataset.num_entities)

# Create TensorDataset and DataLoader for training
train_dataset = TensorDataset(train_heads, train_relations, train_tails,
                              negative_samples[0], negative_samples[1])
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

# Create DataLoader for validation
valid_dataset = TensorDataset(valid_edge_index[0], valid_edge_type, valid_edge_index[1])
valid_loader = DataLoader(valid_dataset, batch_size=256, shuffle=False)

# Create DataLoader for testing
test_dataset = TensorDataset(test_edge_index[0], test_edge_type, test_edge_index[1])
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

# Initialize the TransE model
model = TransE(
    num_entities=dataset.num_entities,
    num_relations=dataset.num_relations,
    embedding_dim=200,  # Dimensionality of embeddings
    margin=9.0          # Margin for ranking loss
).to(device)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Candidate entity ids used for ranking; they never change, so build them once
# instead of at every validation step and again for the test run.
all_entities = torch.arange(dataset.num_entities, device=device)

# Training and Validation Loop
for epoch in range(num_epochs):
    # Training step (returns the mean batch loss of the epoch)
    train_loss = train(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        device=device
    )

    # Validation step
    if (epoch + 1) % valid_freq == 0:
        val_metrics = evaluate(
            model=model,
            eval_loader=valid_loader,
            all_entities=all_entities,
            device=device
        )
        print(f"Epoch {epoch + 1}/{num_epochs} - Train Loss: {train_loss:.4f}")
        print(f"Epoch {epoch + 1}/{num_epochs} - Validation Metrics: {val_metrics}")

# Evaluate the model on the test set
test_metrics = evaluate(
    model=model,
    eval_loader=test_loader,
    all_entities=all_entities,
    device=device
)

# Print test metrics
print("Test Metrics:", test_metrics)


Epoch 10/190 - Validation Metrics: {'MRR': 0.18528308281029668, 'MR': 372.79669232962647, 'Hits@10': 0.32671799258625606, 'Hits@3': 0.19880239520958085, 'Hits@1': 0.11348731109210151}
Epoch 20/190 - Validation Metrics: {'MRR': 0.18744408523455147, 'MR': 378.4536070715711, 'Hits@10': 0.3137154262902766, 'Hits@3': 0.19731964642144284, 'Hits@1': 0.12033076703735386}
Epoch 30/190 - Validation Metrics: {'MRR': 0.189440855342233, 'MR': 381.97502138579983, 'Hits@10': 0.32859994297120043, 'Hits@3': 0.20290846877673224, 'Hits@1': 0.11833475905332193}
Epoch 40/190 - Validation Metrics: {'MRR': 0.17984598356670622, 'MR': 380.99458226404334, 'Hits@10': 0.32848588537211293, 'Hits@3': 0.19749073282007415, 'Hits@1': 0.10287995437696036}
Epoch 50/190 - Validation Metrics: {'MRR': 0.19152619914176425, 'MR': 384.67676076418593, 'Hits@10': 0.3291132021670944, 'Hits@3': 0.20747077274023382, 'Hits@1': 0.11873396065012831}
Epoch 60/190 - Validation Metrics: {'MRR': 0.19622006235484354, 'MR': 383.41345879669

In [None]:
# import torch

# # Check if CUDA is available
# if torch.cuda.is_available():
#     # Get the device properties
#     device_properties = torch.cuda.get_device_properties(0)  # 0 for the first GPU

#     # Get total memory in bytes
#     total_memory = device_properties.total_memory

#     # Get allocated memory in bytes
#     allocated_memory = torch.cuda.memory_allocated(0)

#     # Get reserved memory in bytes
#     reserved_memory = torch.cuda.memory_reserved(0)

#     # Calculate free memory in bytes
#     free_memory = total_memory - allocated_memory - reserved_memory

#     # Print the results in GB
#     print(f"Total CUDA memory: {total_memory / (1024**3):.2f} GB")
#     print(f"Allocated CUDA memory: {allocated_memory / (1024**3):.2f} GB")
#     print(f"Reserved CUDA memory: {reserved_memory / (1024**3):.2f} GB")
#     print(f"Free CUDA memory: {free_memory / (1024**3):.2f} GB")

# else:
#     print("CUDA is not available.")

# Example Workflow:
## Input:

**Dataset**: (Barack Obama, PresidentOf, United States), (Elon Musk, FounderOf, Tesla).
**Embedding Initialization**:

**Entities**: Barack Obama, United States, Elon Musk, Tesla.
**Relations**: PresidentOf, FounderOf.
**Training:**

**Positive Triplets**: (Barack Obama, PresidentOf, United States).
**Negative Sampling**: (Barack Obama, PresidentOf, RandomEntity).
**Evaluation:**

Metrics like **MRR, MR,** and **Hits@10** are computed during validation to measure the model’s performance.
**Prediction:**

**Query**: *(Elon Musk, FounderOf, ?)*
**Prediction**: Tesla (most likely tail).
