In [33]:
import json
import torch
import numpy as np
from transformers import BertTokenizer, BertModel

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Load the JSON file with nodes and relationships
with open('final_graph.json', 'r') as f:
    data = json.load(f)

# Function to generate BERT embeddings for a list of texts
def get_bert_embeddings(text_list, max_length=512):
    inputs = tokenizer(text_list, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return embeddings  # Returns a NumPy array

# Prepare lists to store node texts and IDs
node_texts = []
node_ids = []

# Generate texts for nodes
for node in data.get('Nodes', []):
    node_id = node.get('id', '')
    node_type = node.get('type', '')
    node_name = node.get('name', 'id'),
    properties = node.get('properties', {})

    # Start building the text with id, type, and name
    node_text = f"{node_id} {node_type} {node_name}"

    # Concatenate properties into the text
    for key, value in properties.items():
        # If the value is a list, join it into a string
        if isinstance(value, list):
            value = ', '.join(value)
        else:
            value = str(value)
        node_text += f" {key}: {value}"

    # Append to lists
    node_texts.append(node_text)
    node_ids.append(node_id)

# Process node texts in batches
batch_size = 32  # Adjust based on your system's capability
node_embeddings = {}

for i in range(0, len(node_texts), batch_size):
    batch_texts = node_texts[i:i + batch_size]
    batch_ids = node_ids[i:i + batch_size]
    embeddings_batch = get_bert_embeddings(batch_texts)
    for idx, embedding in enumerate(embeddings_batch):
        node_embeddings[batch_ids[idx]] = embedding
        print(f"Node ID: {batch_ids[idx]}, Embedding generated.")

# Save the embeddings and corresponding IDs to files
node_ids_list = list(node_embeddings.keys())
node_embeddings_array = np.array([node_embeddings[node_id] for node_id in node_ids_list])

np.save('Embeddings/node_embeddings.npy', node_embeddings_array)
with open('Embeddings/node_ids.json', 'w') as f:
    json.dump(node_ids_list, f)

print("Embeddings for nodes have been generated and saved.")


Node ID: Standard, Embedding generated.
Node ID: Standard_for_Project_Management_Fourth_Edition, Embedding generated.
Node ID: PMI_Standards_Collection, Embedding generated.
Node ID: Project_Risk_Management, Embedding generated.
Node ID: ProjectRiskManagement, Embedding generated.
Node ID: Risk_Identification, Embedding generated.
Node ID: Risk_Analysis, Embedding generated.
Node ID: Risk_Response_Planning, Embedding generated.
Node ID: Risk_Monitoring_and_Control, Embedding generated.
Node ID: SWOT_Analysis, Embedding generated.
Node ID: Strengths, Embedding generated.
Node ID: Weaknesses, Embedding generated.
Node ID: Opportunities, Embedding generated.
Node ID: Threats, Embedding generated.
Node ID: Diagramming_Techniques, Embedding generated.
Node ID: Data_Gathering_Techniques, Embedding generated.
Node ID: Critical_Path, Embedding generated.
Node ID: Effective_Scheduling, Embedding generated.
Node ID: Critical_Path_Activity, Embedding generated.
Node ID: Work_Breakdown_Structure_C

In [34]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
import json

# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to generate BERT embeddings for a list of texts
def get_bert_embeddings(text_list, max_length=128, batch_size=16):
    all_embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Use the CLS token representation
        all_embeddings.extend(embeddings)
    return np.array(all_embeddings)

# Prepare lists to store relationship texts and keys
relation_texts = []
relation_keys = []

# Generate texts for relationships and store their keys
for relationship in data.get('Relationships', []):
    source = relationship.get('source', '')
    target = relationship.get('target', '')
    rel_type = relationship.get('type', '')
    properties = relationship.get('properties', {})

    # Build the text input for BERT including source, type, and target
    relation_text = f"{source} {rel_type} {target}"

    # Add all properties to the text
    for key, value in properties.items():
        if isinstance(value, list):
            value = ', '.join(value)
        else:
            value = str(value)
        relation_text += f" {key}: {value}"

    # Append the built text and unique key
    relation_texts.append(relation_text)
    relation_keys.append(f"{source}_{rel_type}_{target}")

# Generate BERT embeddings for the complete relationship texts
main_relation_embeddings = get_bert_embeddings(relation_texts)

# --- Generate embeddings for individual attributes ---

# Prepare texts for individual attributes
description_texts = [relationship['properties'].get('description', '') for relationship in data['Relationships']]
strength_texts = [relationship['properties'].get('strength', '') for relationship in data['Relationships']]
reasoning_texts = [relationship['properties'].get('reasoning', '') for relationship in data['Relationships']]
directionality_texts = [relationship['properties'].get('directionality', '') for relationship in data['Relationships']]
action_required_texts = [relationship['properties'].get('action_required', '') for relationship in data['Relationships']]

# Generate embeddings for each attribute using BERT
description_embeddings = get_bert_embeddings(description_texts)
strength_embeddings = get_bert_embeddings(strength_texts)
reasoning_embeddings = get_bert_embeddings(reasoning_texts)
directionality_embeddings = get_bert_embeddings(directionality_texts)
action_required_embeddings = get_bert_embeddings(action_required_texts)

# --- Combine all embeddings (main text + individual attributes) ---

combined_embeddings = np.concatenate([
    main_relation_embeddings, description_embeddings, strength_embeddings, 
    reasoning_embeddings, directionality_embeddings, action_required_embeddings
], axis=1)

# Convert combined embeddings to PyTorch tensor and save to file
np.save('Embeddings/relation_embeddings.npy', combined_embeddings)
with open('Embeddings/relation_keys.json', 'w') as f:
    json.dump(relation_keys, f)

print("All relationship and attribute embeddings have been generated and saved in 'Embeddings/relation_embeddings.npy'.")


All relationship and attribute embeddings have been generated and saved in 'Embeddings/relation_embeddings.npy'.


In [30]:
import json

# Load the graph data from 'final_graph.json'
with open('final_graph.json', 'r') as f:
    data = json.load(f)
with open('Embeddings/node_ids.json', 'r') as file:
    node_ids_list = json.load(file)


In [31]:
import torch

# Assume 'data' is your graph data loaded from the JSON file
# Extract all unique edge types from the data
unique_edge_types = set(relationship['type'] for relationship in data.get('Relationships', []))

# Create a mapping from edge type strings to unique integers
edge_type_mapping = {edge_type: idx for idx, edge_type in enumerate(unique_edge_types)}

# Print the number of unique edge types
print("Number of unique edge types:", len(edge_type_mapping))
edge_type_mapping

Number of unique edge types: 452


{'is responsible for': 0,
 'benefits from': 1,
 'considers': 2,
 'follows': 3,
 'provides insight into': 4,
 'assigned to': 5,
 'monitors_progress_towards': 6,
 'is responsible for selecting': 7,
 'is transformed into': 8,
 'is based on': 9,
 'is developed from': 10,
 'resulted in': 11,
 'manages': 12,
 'enhances understanding of': 13,
 'provides': 14,
 'illustrates': 15,
 'compiles': 16,
 'recommends improvement for': 17,
 'derived_from': 18,
 'contributes to': 19,
 'illustrate': 20,
 'evaluates': 21,
 'determines selection of': 22,
 'may repeat': 23,
 'creates_value_for': 24,
 'is assessed using': 25,
 'collects feedback from': 26,
 'is variant of': 27,
 'performs': 28,
 'contributes_to': 29,
 'requires attention to': 30,
 'focuses on': 31,
 'risks': 32,
 'used in conjunction with': 33,
 'imposes': 34,
 'operates within': 35,
 'enhances': 36,
 'facilitates': 37,
 'tracks': 38,
 'ensures': 39,
 'is derived from': 40,
 'has dependency': 41,
 'may include': 42,
 'has role': 43,
 'result

In [32]:
# Initialize lists to store the source and target node indices and edge types
source_nodes = []
target_nodes = []
edge_types = []

# Example mapping of node IDs to indices (you should have this from your node embeddings)
node_id_to_index = {node_id: index for index, node_id in enumerate(node_ids_list)}

# Populate the source_nodes, target_nodes, and edge_types lists using your relationships data
for relationship in data.get('Relationships', []):
    source = relationship.get('source')
    target = relationship.get('target')
    rel_type = relationship.get('type')

    # Convert node IDs to indices using the mapping
    if source in node_id_to_index and target in node_id_to_index:
        source_index = node_id_to_index[source]
        target_index = node_id_to_index[target]
        source_nodes.append(source_index)
        target_nodes.append(target_index)
        edge_types.append(edge_type_mapping[rel_type])  # Map the relationship type to an integer

# Construct the edge_index and edge_type tensors
edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
edge_type = torch.tensor(edge_types, dtype=torch.long)

print("Edge index tensor created successfully:", edge_index)
print("Edge type tensor created successfully:", edge_type)


Edge index tensor created successfully: tensor([[   0,    0,    1,  ..., 2808, 2809, 2809],
        [ 304,  323,    2,  ..., 2809, 2810, 2811]])
Edge type tensor created successfully: tensor([ 98, 108, 369,  ...,  44, 381, 381])


In [33]:
import torch
from torch_geometric.data import Data
import numpy as np

# --- Load Node and Relationship Embeddings ---

# Load node embeddings (e.g., from your saved .npy file)
node_embeddings_array = np.load('Embeddings/node_embeddings.npy')
node_features = torch.tensor(node_embeddings_array, dtype=torch.float)

# Load relationship embeddings (edge attributes)
combined_embeddings = np.load('Embeddings/relation_embeddings.npy')
edge_features = torch.tensor(combined_embeddings, dtype=torch.float)


# --- Create the Data Object for PyTorch Geometric ---
data_pyg = Data(x=node_features, edge_index=edge_index, edge_attr=edge_features)

print("Data object for PyTorch Geometric created successfully.")


Data object for PyTorch Geometric created successfully.


## Modeling

In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv
import torch_scatter


In [3]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv
from torch_scatter import scatter_mean
from torch.nn import Dropout, LayerNorm, Linear
from torch_geometric.nn import GraphNorm

class SimplifiedRGCNModel(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_relations, edge_feature_dim):
        super(SimplifiedRGCNModel, self).__init__()
        
        # R-GCN layers to process node features
        self.rgcn1 = RGCNConv(in_channels, hidden_channels, num_relations)
        self.rgcn2 = RGCNConv(hidden_channels, out_channels, num_relations)
        
        # Dropout and normalization
        self.dropout = Dropout(0.5)  # Increased dropout rate to combat overfitting
        self.graph_norm = GraphNorm(out_channels)
        
        # Linear layer for edge features
        self.edge_feature_transform = Linear(edge_feature_dim, out_channels)
    
    def forward(self, x, edge_index, edge_type, edge_attr):
        # Transform edge features
        edge_attr_transformed = F.relu(self.edge_feature_transform(edge_attr))
        
        # First R-GCN layer with dropout
        x = self.rgcn1(x, edge_index, edge_type)
        x = F.relu(x)
        x = self.dropout(x)
        
        # Second R-GCN layer with normalization
        x = self.rgcn2(x, edge_index, edge_type)
        x = self.graph_norm(x)
        
        # Aggregate edge information
        edge_aggr = scatter_mean(edge_attr_transformed, edge_index[0], dim=0, dim_size=x.size(0))
        x = x + edge_aggr
        
        return x

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import torch
import torch.nn as nn
from torch.optim import Adam

# Define the device (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model and move it to the device
model = SimplifiedRGCNModel(
    in_channels=node_features.size(1),
    hidden_channels=64,  # You can adjust this value
    out_channels=32,     # You can adjust this value
    num_relations=len(edge_type_mapping),  # Number of unique edge types
    edge_feature_dim=edge_features.size(1)  # Dimension of edge features
).to(device)

# Move data and edge types to the device
data_pyg = data_pyg.to(device)
edge_type = edge_type.to(device)

# Define the optimizer
optimizer = Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Define the loss function (Binary Cross-Entropy Loss with Logits)
loss_fn = nn.BCEWithLogitsLoss()  # Use BCEWithLogitsLoss for binary classification

print("Model, optimizer, and loss function defined successfully.")


Model, optimizer, and loss function defined successfully.


In [13]:
import torch
import random
from torch_geometric.data import Data
from torch.utils.data import random_split

# Function to sample negative edges
def sample_negative_edges(num_nodes, existing_edges, num_samples):
    negative_edges = set()
    existing_edges_set = set(map(tuple, existing_edges.t().tolist()))

    while len(negative_edges) < num_samples:
        source = random.randint(0, num_nodes - 1)
        target = random.randint(0, num_nodes - 1)
        if source != target and (source, target) not in existing_edges_set and (target, source) not in existing_edges_set:
            negative_edges.add((source, target))

    return torch.tensor(list(negative_edges), dtype=torch.long).t().contiguous()

# Split positive edges into train and test sets
positive_samples = data_pyg.edge_index  # Shape: [2, num_edges]
num_positive_edges = positive_samples.size(1)
train_size = int(0.85 * num_positive_edges)  # 80% for training
test_size = num_positive_edges - train_size  # 20% for testing

# Use random_split to split the positive samples
train_indices, test_indices = random_split(torch.arange(num_positive_edges), [train_size, test_size])

# Convert indices to tensors
train_positive_samples = positive_samples[:, train_indices]
test_positive_samples = positive_samples[:, test_indices]

# Sample negative edges for training and testing
train_negative_samples = sample_negative_edges(data_pyg.num_nodes, train_positive_samples, train_size)
test_negative_samples = sample_negative_edges(data_pyg.num_nodes, test_positive_samples, test_size)

# Function to compute accuracy
def compute_accuracy(scores, labels):
    predictions = (scores > 0).float()  # Convert logits to binary predictions
    correct = (predictions == labels).sum().item()
    accuracy = correct / labels.size(0)
    return accuracy




In [58]:
# Parameters for early stopping
best_test_accuracy = 0
patience = 20  # Number of epochs to wait before stopping if no improvement
epochs_without_improvement = 0

# Training loop
num_epochs = 52
model.train()

for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass
    out = model(data_pyg.x, data_pyg.edge_index, edge_type, data_pyg.edge_attr)

    # Training scores and labels
    train_positive_scores = (out[train_positive_samples[0]] * out[train_positive_samples[1]]).sum(dim=1)
    train_negative_scores = (out[train_negative_samples[0]] * out[train_negative_samples[1]]).sum(dim=1)
    train_scores = torch.cat([train_positive_scores, train_negative_scores])
    train_labels = torch.cat([
        torch.ones(train_positive_scores.size(0)),
        torch.zeros(train_negative_scores.size(0))
    ]).to(device)
    
    # Compute training loss
    loss = loss_fn(train_scores, train_labels)
    loss.backward()
    optimizer.step()

    # Compute training accuracy
    train_accuracy = compute_accuracy(train_scores, train_labels)

    # Validation (test) scores and labels
    with torch.no_grad():
        model.eval()
        test_positive_scores = (out[test_positive_samples[0]] * out[test_positive_samples[1]]).sum(dim=1)
        test_negative_scores = (out[test_negative_samples[0]] * out[test_negative_samples[1]]).sum(dim=1)
        test_scores = torch.cat([test_positive_scores, test_negative_scores])
        test_labels = torch.cat([
            torch.ones(test_positive_scores.size(0)),
            torch.zeros(test_negative_scores.size(0))
        ]).to(device)
        
        # Compute test accuracy
        test_accuracy = compute_accuracy(test_scores, test_labels)

    # Print loss and accuracies for each epoch
    print(f"Epoch {epoch}, Loss: {loss.item():.4f}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")

    # Early stopping logic
    if test_accuracy > best_test_accuracy:
        best_test_accuracy = test_accuracy
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print("Early stopping triggered.")
        break

print("Training complete.")


Epoch 0, Loss: 2.8868, Train Accuracy: 0.5133, Test Accuracy: 0.5205
Epoch 1, Loss: 187.2629, Train Accuracy: 0.6196, Test Accuracy: 0.6191
Epoch 2, Loss: 11.6727, Train Accuracy: 0.6625, Test Accuracy: 0.6055
Epoch 3, Loss: 1.4535, Train Accuracy: 0.6151, Test Accuracy: 0.5801
Epoch 4, Loss: 1.0243, Train Accuracy: 0.6481, Test Accuracy: 0.6074
Epoch 5, Loss: 0.8480, Train Accuracy: 0.6689, Test Accuracy: 0.6182
Epoch 6, Loss: 0.7426, Train Accuracy: 0.6842, Test Accuracy: 0.6318
Epoch 7, Loss: 0.6506, Train Accuracy: 0.6982, Test Accuracy: 0.6318
Epoch 8, Loss: 0.5925, Train Accuracy: 0.7146, Test Accuracy: 0.6289
Epoch 9, Loss: 0.5622, Train Accuracy: 0.7279, Test Accuracy: 0.6318
Epoch 10, Loss: 0.5422, Train Accuracy: 0.7408, Test Accuracy: 0.6436
Epoch 11, Loss: 0.5302, Train Accuracy: 0.7411, Test Accuracy: 0.6396
Epoch 12, Loss: 0.5189, Train Accuracy: 0.7454, Test Accuracy: 0.6426
Epoch 13, Loss: 0.5096, Train Accuracy: 0.7544, Test Accuracy: 0.6426
Epoch 14, Loss: 0.5014, Tra

In [63]:
import torch

# Save the model
model_path = "rgcn_model.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved successfully to {model_path}")


Model saved successfully to rgcn_model.pth


In [15]:
import torch

# Initialize the RGCN model architecture
model_rgcn = SimplifiedRGCNModel(
    in_channels=node_features.size(1),
    hidden_channels=64,  # Use the same values as used during training
    out_channels=32,     # Use the same values as used during training
    num_relations=len(edge_type_mapping),  # Number of unique edge types
    edge_feature_dim=edge_features.size(1)  # Dimension of edge features
).to(device)  # Move to the correct device (CPU or GPU)

# Load the model state dictionary
model_path = "rgcn_model.pth"
model_rgcn.load_state_dict(torch.load(model_path, map_location=device))

# Set the model to evaluation mode
model_rgcn.eval()

print(f"Model loaded successfully from {model_path}")


Model loaded successfully from rgcn_model.pth


  model_rgcn.load_state_dict(torch.load(model_path, map_location=device))


In [11]:
# Example of how to create the mapping if you haven't already
node_id_to_index = {node_id: index for index, node_id in enumerate(node_ids_list)}

# Example usage of the mapping
source_node_id = "Schedule_Performance"
target_node_id = "Feature_Completion_Rate"

source_node_index = node_id_to_index[source_node_id]
target_node_index = node_id_to_index[target_node_id]



In [12]:
import torch.nn.functional as F

def predict_link(source_node_id, target_node_id):
    model.eval()  # Set the model to evaluation mode

    # Convert node IDs to indices
    source_node_index = node_id_to_index[source_node_id]
    target_node_index = node_id_to_index[target_node_id]
    
    # Compute the embeddings for the nodes
    source_embedding = model(data_pyg.x, data_pyg.edge_index, edge_type, data_pyg.edge_attr)[source_node_index]
    target_embedding = model(data_pyg.x, data_pyg.edge_index, edge_type, data_pyg.edge_attr)[target_node_index]
    
    # Compute the cosine similarity between the two embeddings
    score = F.cosine_similarity(source_embedding.unsqueeze(0), target_embedding.unsqueeze(0)).item()
    
    # Normalize the score to be between 0 and 1
    normalized_score = (score + 1) / 2  # Transform from [-1, 1] to [0, 1]
    
    return normalized_score



In [13]:
def recommend_links_for_node(node_id, top_k=5):
    model.eval()  # Set the model to evaluation mode

    # Convert the node ID to an index
    node_index = node_id_to_index[node_id]

    scores = []
    # Compute the embedding for the given node
    node_embedding = model(data_pyg.x, data_pyg.edge_index, edge_type, data_pyg.edge_attr)[node_index]
    
    # Compare with all other nodes to compute scores
    all_embeddings = model(data_pyg.x, data_pyg.edge_index, edge_type, data_pyg.edge_attr)
    for other_node_index in range(data_pyg.num_nodes):
        if other_node_index != node_index:  # Skip self-loops
            score = F.cosine_similarity(
                node_embedding.unsqueeze(0),
                all_embeddings[other_node_index].unsqueeze(0)
            ).item()
            normalized_score = (score + 1) / 2  # Transform from [-1, 1] to [0, 1]
            scores.append((other_node_index, normalized_score))
    
    # Sort by score in descending order and get the top_k recommendations
    scores.sort(key=lambda x: x[1], reverse=True)
    recommended_indices = [node for node, score in scores[:top_k]]
    
    # Convert indices back to node IDs
    index_to_node_id = {index: node_id for node_id, index in node_id_to_index.items()}
    recommended_node_ids = [index_to_node_id[idx] for idx in recommended_indices]
    
    return recommended_node_ids



In [14]:
# Example: Predict the link score between two nodes using their IDs
source_node_id = "Agile_Methodology"
target_node_id = "Project_Management_Standard"
link_score = predict_link(source_node_id, target_node_id)
print(f"Link score between '{source_node_id}' and '{target_node_id}': {link_score:.4f}")

# Example: Recommend links for a specific node using its ID
node_id = "Project_Management_Standard"
top_k_recommendations = recommend_links_for_node(node_id, top_k=5)
print(f"Top {len(top_k_recommendations)} recommendations for '{node_id}': {top_k_recommendations}")


Link score between 'Agile_Methodology' and 'Project_Management_Standard': 0.5919
Top 5 recommendations for 'Project_Management_Standard': ['ADKAR_Model', 'Cost_of_Quality_Methodology', 'Software_as_a_Differentiator', 'Theory_X', 'Standard_for_Project_Management_Fourth_Edition']
