# Import Libraries

In [13]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score
from math import sqrt
import os

# Step 1: Data Preprocessing

In [14]:
# Load MovieLens 100K dataset
ratings = pd.read_csv('/Users/abhi/GitHUB/FederatedRAG/DataSets/ml-100k/u.data', sep='\t', names=['user', 'movie', 'rating', 'timestamp'], engine='python')

# Create knowledge graph triples
ratings['relation'] = 'rates'
triples = ratings[['user', 'relation', 'movie']]

# Create entity and relation mappings
all_entities = list(set(ratings['user']).union(set(ratings['movie'])))
entity_to_id = {entity: idx for idx, entity in enumerate(all_entities)}
id_to_entity = {idx: entity for entity, idx in entity_to_id.items()}
relation_to_id = {'rates': 0}
id_to_relation = {0: 'rates'}

# Map entities and relations to IDs
triples['head'] = triples['user'].map(entity_to_id)
triples['tail'] = triples['movie'].map(entity_to_id)
triples['relation'] = triples['relation'].map(relation_to_id)
triples = triples[['head', 'relation', 'tail']]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  triples['head'] = triples['user'].map(entity_to_id)


# Step 2: Dataset Preparation

In [15]:

class KnowledgeGraphDataset(Dataset):
    def __init__(self, triples):
        self.triples = triples.values

    def __len__(self):
        return len(self.triples)

    def __getitem__(self, idx):
        return self.triples[idx]

# Create dataset and dataloader
kg_dataset = KnowledgeGraphDataset(triples)
kg_dataloader = DataLoader(kg_dataset, batch_size=128, shuffle=True)

# Step 3: Define the TransE Model

In [16]:

class TransEModel(nn.Module):
    def __init__(self, num_entities, num_relations, embedding_dim):
        super(TransEModel, self).__init__()
        self.entity_embeddings = nn.Embedding(num_entities, embedding_dim)
        self.relation_embeddings = nn.Embedding(num_relations, embedding_dim)

    def forward(self, head, relation, tail):
        head_embedding = self.entity_embeddings(head)
        relation_embedding = self.relation_embeddings(relation)
        tail_embedding = self.entity_embeddings(tail)
        return head_embedding + relation_embedding - tail_embedding

    def get_embeddings(self):
        return self.entity_embeddings.weight.detach().cpu().numpy()

# Function to initialize model parameters
def initialize_model(embedding_dim=50, learning_rate=0.01, margin=1.0):
    global model, criterion, optimizer
    model = TransEModel(num_entities, num_relations, embedding_dim).to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    criterion = nn.MarginRankingLoss(margin=margin)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)




# Initialize model parameters

In [17]:
num_entities = len(entity_to_id)
num_relations = len(relation_to_id)
initialize_model(embedding_dim=50, learning_rate=0.01, margin=1.0)

# Step 4: Training the Model

In [20]:

def generate_negative_samples(batch):
    batch_size = batch.shape[0]
    negative_samples = batch.clone()
    for i in range(batch_size):
        if np.random.rand() < 0.5:
            negative_samples[i, 2] = np.random.choice(num_entities)  # Replace tail
        else:
            negative_samples[i, 0] = np.random.choice(num_entities)  # Replace head
    return negative_samples

def train_model(num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in kg_dataloader:
            batch = batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

            # Positive samples
            head, relation, tail = batch[:, 0], batch[:, 1], batch[:, 2]
            pos_scores = torch.norm(model(head, relation, tail), p=2, dim=1)

            # Negative samples
            negative_batch = generate_negative_samples(batch)
            neg_head, neg_relation, neg_tail = negative_batch[:, 0], negative_batch[:, 1], negative_batch[:, 2]
            neg_scores = torch.norm(model(neg_head, neg_relation, neg_tail), p=2, dim=1)

            # Calculate loss
            target = torch.ones_like(pos_scores)
            loss = criterion(pos_scores, neg_scores, target)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss}")




# Train the model

In [28]:
train_model(num_epochs=50)

Epoch 1, Loss: 280.82981127500534
Epoch 2, Loss: 280.6624404489994
Epoch 3, Loss: 271.064123660326
Epoch 4, Loss: 269.6384253948927
Epoch 5, Loss: 267.86956648528576
Epoch 6, Loss: 261.50204269587994
Epoch 7, Loss: 262.65105402469635
Epoch 8, Loss: 257.9821886867285
Epoch 9, Loss: 256.6142937541008
Epoch 10, Loss: 255.15003564953804
Epoch 11, Loss: 254.77102488279343
Epoch 12, Loss: 255.4610629826784
Epoch 13, Loss: 253.64148126542568
Epoch 14, Loss: 251.7117628455162
Epoch 15, Loss: 248.0394583940506
Epoch 16, Loss: 248.79728235304356
Epoch 17, Loss: 247.320734500885
Epoch 18, Loss: 248.19750559329987
Epoch 19, Loss: 246.27577474713326
Epoch 20, Loss: 243.11865992844105
Epoch 21, Loss: 243.50109888613224
Epoch 22, Loss: 242.17093713581562
Epoch 23, Loss: 242.40418204665184
Epoch 24, Loss: 242.8535503745079
Epoch 25, Loss: 241.09464220702648
Epoch 26, Loss: 239.61285869777203
Epoch 27, Loss: 237.74580751359463
Epoch 28, Loss: 237.7809741050005
Epoch 29, Loss: 238.19058573246002
Epoch 3

# Step 5: Save the Model

In [29]:
def save_model(model, file_path):
    version = 1
    while os.path.exists(file_path):
        file_path = f"TransE_fine_tuned_model_v{version}.pth"
        version += 1  # If a file with the specified name already exists, the function appends a version number to the file name
    torch.save(model.state_dict(), file_path)
    print(f"Model saved to {file_path}")
# Change the path based on local directory
save_model(model, "/Users/abhi/GitHUB/FederatedRAG/TunedModels/TransE_fine_tuned_model.pth")

Model saved to /Users/abhi/GitHUB/FederatedRAG/TunedModels/TransE_fine_tuned_model.pth


# Step 6: Prediction and Evaluation

In [30]:
def calculate_metrics(triples, model, entity_to_id, device):
    hits_at_10 = 0
    mean_rank = 0
    mean_reciprocal_rank = 0
    total_samples = len(triples)

    for head, relation, tail in triples:
        head, relation, tail = torch.tensor(head).to(device), torch.tensor(relation).to(device), torch.tensor(tail).to(device)

        head_embedding = model.entity_embeddings(head)
        relation_embedding = model.relation_embeddings(relation)
        target_tail_embedding = model.entity_embeddings(tail)

        # Compute scores for all entities as tail
        all_entities = torch.arange(model.entity_embeddings.num_embeddings).to(device)
        all_entity_embeddings = model.entity_embeddings(all_entities)
        scores = torch.norm(head_embedding + relation_embedding - all_entity_embeddings, p=2, dim=1)

        # Rank entities based on the scores
        sorted_indices = torch.argsort(scores)
        rank = (sorted_indices == tail).nonzero(as_tuple=True)[0].item() + 1

        # Update metrics
        mean_rank += rank
        mean_reciprocal_rank += 1 / rank
        if rank <= 10:
            hits_at_10 += 1

    hits_at_10 = hits_at_10 / total_samples
    mean_rank = mean_rank / total_samples
    mean_reciprocal_rank = mean_reciprocal_rank / total_samples

    print(f"Hits@10: {hits_at_10}")
    print(f"Mean Rank: {mean_rank}")
    print(f"Mean Reciprocal Rank (MRR): {mean_reciprocal_rank}")



# Run evaluation on the dataset

In [31]:
# Run evaluation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
evaluate_triples = triples.values.tolist()
calculate_metrics(evaluate_triples, model, entity_to_id, device)

Hits@10: 0.00096
Mean Rank: 1465.12757
Mean Reciprocal Rank (MRR): 0.0016460369044324753
