In [1]:
import json
import torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, SAGEConv

# Load JSON data
with open('Embadded_graph.json', 'r') as f:
    graph_data = json.load(f)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def make_all_lowercase(data):
    # If the data is a dictionary
    if isinstance(data, dict):
        return {k.lower(): make_all_lowercase(v) for k, v in data.items()}
    # If the data is a list
    elif isinstance(data, list):
        return [make_all_lowercase(item) for item in data]
    # If the data is a string
    elif isinstance(data, str):
        return data.lower()
    # For other data types, return as is
    else:
        return data

In [3]:
graph_data = make_all_lowercase(graph_data)

In [4]:


nodes = graph_data['nodes']
relationships = graph_data['relationships']

In [5]:
# Prepare nodes and edges
node_to_idx = {node['name']: idx for idx, node in enumerate(nodes)}

In [6]:
# Check for missing nodes in relationships
missing_nodes = []
for rel in relationships:
    if rel['source'] not in node_to_idx:
        missing_nodes.append(rel['source'])
    if rel['target'] not in node_to_idx:
        missing_nodes.append(rel['target'])

print("Missing nodes:", missing_nodes)


Missing nodes: []


In [7]:
len(nodes)

2552

In [8]:
# Prepare des edges
edges = [(node_to_idx[rel['source']], node_to_idx[rel['target']]) for rel in relationships]


In [9]:
type(edges)

list

In [10]:
Embeddings = graph_data['x']

In [11]:
node_features = torch.tensor(Embeddings)

In [14]:
edges_list = [[node1_index, node2_index] for node1_index, node2_index in edges]



In [17]:
# Convert edge list to tensor
edge_index = torch.tensor(edges_list, dtype=torch.long).t().contiguous()


In [19]:
#extracting created labes from the LLM

# Define a mapping from criticality levels to numeric labels
label_mapping = {"low": 0, "medium": 1, "high": 2}

# Initialize list for storing labels
node_labels = []

# Iterate over each node to extract the risk criticality level
for node in nodes:
    # Access the node attributes
    attributes = node.get('attributes', {})

    # Get the risk criticality from attributes; default to 'Low' if missing
    risk_criticality = attributes.get('risk_criticality', 'low')

    # Map the risk criticality to a numeric label using the label_mapping dictionary
    label = label_mapping.get(risk_criticality, 0)  # Default to 0 if risk_criticality is missing
    node_labels.append(label)



In [22]:
# Convert the list of labels to a tensor
node_labels = torch.tensor(node_labels)

In [23]:
from torch_geometric.data import Data

# Convert graph data to PyTorch Geometric format
data = Data(
    x=node_features,
    edge_index=edge_index,
    y= node_labels # Labels for node classification
)


  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Save the data object to a file
torch.save(data, 'graph_data_object.pt')
print("Graph data saved successfully to 'graph_data.pt'.")

Graph data saved successfully to 'graph_data.pt'.


In [25]:
# Load the data object from the file
loaded_data = torch.load('graph_data_object.pt')
print("Graph data loaded successfully from 'graph_data.pt'.")

# You can now use `loaded_data` just like the original `data` object
print("Node features (x):", loaded_data.x.shape)

Graph data loaded successfully from 'graph_data.pt'.
Node features (x): torch.Size([2552, 1536])


  loaded_data = torch.load('graph_data_object.pt')


In [26]:
# Final graph data with node features and edges
graph_data = {
    'x': node_features,  # Node features (embeddings)
    'edge_index': edge_index,  # Edges
    'y': node_labels,  # Node labels (risk criticality)
}

In [28]:
import json
import torch
import numpy as np
from torch_geometric.data import HeteroData
from sklearn.model_selection import train_test_split

# Load BERT embeddings for nodes
node_embeddings = np.load('Embeddings/node_embeddings.npy')
with open('Embeddings/node_ids.json', 'r') as f:
    node_ids = json.load(f)

# Convert node embeddings to a tensor
x = torch.tensor(node_embeddings, dtype=torch.float)


In [29]:
# Load the graph data from your JSON file
with open('final_updated_graph.json', 'r') as f:
    graph_data = json.load(f)


In [30]:
from torch_geometric.data import HeteroData

# Initialize the HeteroData object
data = HeteroData()

# Create mappings from node ID to index and from node ID to type
node_id_to_index = {node_id: idx for idx, node_id in enumerate(node_ids)}
node_id_to_type = {node['id']: node.get('type', 'Unknown') for node in graph_data.get('Nodes', [])}


In [31]:
# Collect unique node types
node_types = set(node_id_to_type.values())

# Assign node features for each node type
for node_type in node_types:
    indices = [idx for idx, node_id in enumerate(node_ids) if node_id_to_type[node_id] == node_type]
    if indices:
        data[node_type].x = x[indices]


In [None]:
from collections import defaultdict

# Initialize dictionaries to store edge indices and edge type mappings
edge_index_dict = defaultdict(list)
edge_type_mapping = {}
edge_type_counter = 0

# Iterate over relationships to build edge indices and map edge types
for relationship in graph_data.get('Relationships', []):
    source_id = relationship['source']
    target_id = relationship['target']
    rel_type = relationship.get('type', 'unknown')

    # Check if source and target nodes are valid
    if source_id in node_id_to_index and target_id in node_id_to_index:
        src_index = node_id_to_index[source_id]
        dst_index = node_id_to_index[target_id]
        src_type = node_id_to_type[source_id]
        dst_type = node_id_to_type[target_id]
        edge_key = (src_type, rel_type, dst_type)

        # Assign an integer ID to each unique edge type
        if edge_key not in edge_type_mapping:
            edge_type_mapping[edge_key] = edge_type_counter
            edge_type_counter += 1

        # Store the edge index
        edge_index_dict[edge_key].append([src_index, dst_index])


In [33]:
# Assign edge indices to HeteroData
for edge_key, edge_list in edge_index_dict.items():
    src_type, rel_type, dst_type = edge_key
    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
    data[(src_type, rel_type, dst_type)].edge_index = edge_index


In [34]:
from torch_geometric.utils import negative_sampling

# Choose an edge key to use for training
edge_key = ('Project', 'mitigated by', 'Strategy')
if edge_key in edge_index_dict:
    edge_index = torch.tensor(edge_index_dict[edge_key], dtype=torch.long).t().contiguous()
    num_positive_samples = edge_index.size(1)
else:
    raise ValueError(f"Edge key {edge_key} not found in the graph data.")

# Generate negative samples
neg_edge_index = negative_sampling(
    edge_index=edge_index,
    num_nodes=x.size(0),
    num_neg_samples=num_positive_samples,
    method='sparse'
)


In [35]:
# Create indices for positive and negative samples
pos_indices = torch.arange(num_positive_samples)
neg_indices = torch.arange(num_positive_samples)

# Split indices into training and validation sets
pos_train_indices, pos_val_indices = train_test_split(pos_indices, test_size=0.2, random_state=42)
neg_train_indices, neg_val_indices = train_test_split(neg_indices, test_size=0.2, random_state=42)

# Combine positive and negative samples for training and validation
train_edge_index = torch.cat([edge_index[:, pos_train_indices], neg_edge_index[:, neg_train_indices]], dim=1)
train_edge_label = torch.cat([torch.ones(len(pos_train_indices)), torch.zeros(len(neg_train_indices))])

val_edge_index = torch.cat([edge_index[:, pos_val_indices], neg_edge_index[:, neg_val_indices]], dim=1)
val_edge_label = torch.cat([torch.ones(len(pos_val_indices)), torch.zeros(len(neg_val_indices))])


In [36]:
# Create an edge type tensor for training
edge_type_tensor = torch.tensor([edge_type_mapping[edge_key]] * num_positive_samples, dtype=torch.long)


In [37]:
import torch.nn as nn
from torch_geometric.nn import RGCNConv

# Define the R-GCN model
class RGCNModel(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_relations):
        super(RGCNModel, self).__init__()
        self.conv1 = RGCNConv(in_channels, hidden_channels, num_relations=num_relations)
        self.conv2 = RGCNConv(hidden_channels, out_channels, num_relations=num_relations)

    def forward(self, x, edge_index, edge_type):
        x = self.conv1(x, edge_index, edge_type)
        x = torch.relu(x)
        x = self.conv2(x, edge_index, edge_type)
        return x


In [38]:
# Initialize the R-GCN model
num_relations = len(edge_type_mapping)
model = RGCNModel(in_channels=x.size(1), hidden_channels=64, out_channels=64, num_relations=num_relations)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [39]:
def compute_accuracy(logits, labels):
    preds = torch.sigmoid(logits) > 0.5
    return (preds == labels).float().mean().item()


In [45]:
# Split the indices of the edges into training and validation sets
num_edges = edge_index.size(1)
edge_indices = torch.arange(num_edges)
train_indices, val_indices = train_test_split(edge_indices, test_size=0.2, random_state=42)

# Split edge_index and edge_type_tensor into training and validation sets
train_edge_index = edge_index[:, train_indices]
train_edge_type_tensor = edge_type_tensor[train_indices]  # Split the edge types for training

val_edge_index = edge_index[:, val_indices]
val_edge_type_tensor = edge_type_tensor[val_indices]  # Split the edge types for validation


In [48]:
# Use the total number of edges for creating labels
num_edges = edge_index.size(1)
edge_indices = torch.arange(num_edges)

# Split the edges into training and validation sets
train_indices, val_indices = train_test_split(edge_indices, test_size=0.2, random_state=42)

# Create training and validation edge indices and edge types
train_edge_index = edge_index[:, train_indices]
train_edge_type_tensor = edge_type_tensor[train_indices]

val_edge_index = edge_index[:, val_indices]
val_edge_type_tensor = edge_type_tensor[val_indices]

# Create labels for training and validation
# Note: The number of positive and negative samples must match the edge indices
train_edge_label = torch.cat([
    torch.ones(len(train_indices) // 2),  # Positive labels
    torch.zeros(len(train_indices) // 2)  # Negative labels
])

val_edge_label = torch.cat([
    torch.ones(len(val_indices) // 2),  # Positive labels
    torch.zeros(len(val_indices) // 2)  # Negative labels
])

# Check the shapes
print(f"train_edge_index shape: {train_edge_index.shape}")
print(f"train_edge_label shape: {train_edge_label.shape}")
print(f"val_edge_index shape: {val_edge_index.shape}")
print(f"val_edge_label shape: {val_edge_label.shape}")


train_edge_index shape: torch.Size([2, 2728])
train_edge_label shape: torch.Size([2728])
val_edge_index shape: torch.Size([2, 683])
val_edge_label shape: torch.Size([682])


In [51]:
# Calculate the number of edges
num_train_edges = train_edge_index.size(1)
num_val_edges = val_edge_index.size(1)

# Ensure we have an equal number of positive and negative labels
num_train_pos = num_train_edges // 2
num_train_neg = num_train_edges - num_train_pos  # Adjust to match the total

num_val_pos = num_val_edges // 2
num_val_neg = num_val_edges - num_val_pos  # Adjust to match the total

# Create labels for training and validation
train_edge_label = torch.cat([
    torch.ones(num_train_pos),  # Positive labels
    torch.zeros(num_train_neg)  # Negative labels
])

val_edge_label = torch.cat([
    torch.ones(num_val_pos),  # Positive labels
    torch.zeros(num_val_neg)  # Negative labels
])

# Check the shapes to ensure they match
print(f"train_edge_index shape: {train_edge_index.shape}, train_edge_label shape: {train_edge_label.shape}")
print(f"val_edge_index shape: {val_edge_index.shape}, val_edge_label shape: {val_edge_label.shape}")


train_edge_index shape: torch.Size([2, 2728]), train_edge_label shape: torch.Size([2728])
val_edge_index shape: torch.Size([2, 683]), val_edge_label shape: torch.Size([683])


In [52]:
def train():
    model.train()
    optimizer.zero_grad()
    # Use the training edge index and edge types
    out = model(x, train_edge_index, train_edge_type_tensor)
    src_emb = out[train_edge_index[0]]
    dst_emb = out[train_edge_index[1]]
    logits = (src_emb * dst_emb).sum(dim=1)
    loss = criterion(logits, train_edge_label)
    loss.backward()
    optimizer.step()
    return loss.item(), compute_accuracy(logits, train_edge_label)

def evaluate():
    model.eval()
    with torch.no_grad():
        # Use the validation edge index and edge types
        out = model(x, val_edge_index, val_edge_type_tensor)
        src_emb = out[val_edge_index[0]]
        dst_emb = out[val_edge_index[1]]
        logits = (src_emb * dst_emb).sum(dim=1)
        loss = criterion(logits, val_edge_label)
        return loss.item(), compute_accuracy(logits, val_edge_label)


In [53]:
# Training loop
num_epochs = 50
for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train()
    val_loss, val_acc = evaluate()
    if epoch % 5 == 0:
        print(f"Epoch {epoch}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


Epoch 5/50, Train Loss: 9.5645, Train Acc: 0.6492, Val Loss: 1.9695, Val Acc: 0.4758
Epoch 10/50, Train Loss: 2.7208, Train Acc: 0.6899, Val Loss: 3.2431, Val Acc: 0.5051
Epoch 15/50, Train Loss: 1.6858, Train Acc: 0.7071, Val Loss: 4.4670, Val Acc: 0.4802
Epoch 20/50, Train Loss: 1.2341, Train Acc: 0.7115, Val Loss: 5.4499, Val Acc: 0.4846
Epoch 25/50, Train Loss: 2.1249, Train Acc: 0.7221, Val Loss: 6.1484, Val Acc: 0.4890
Epoch 30/50, Train Loss: 0.6712, Train Acc: 0.7324, Val Loss: 6.8609, Val Acc: 0.4905
Epoch 35/50, Train Loss: 1.0514, Train Acc: 0.7375, Val Loss: 7.4789, Val Acc: 0.4934
Epoch 40/50, Train Loss: 0.6353, Train Acc: 0.7401, Val Loss: 7.9624, Val Acc: 0.4934
Epoch 45/50, Train Loss: 0.4992, Train Acc: 0.7445, Val Loss: 8.2845, Val Acc: 0.4817
Epoch 50/50, Train Loss: 0.5380, Train Acc: 0.7526, Val Loss: 8.4538, Val Acc: 0.4876


In [56]:
def recommend_strategies(project_id, risk_ids, top_k=3):
    model.eval()  # Set the model to evaluation mode

    # Convert project and risk IDs to indices
    project_index = node_id_to_index[project_id]
    risk_indices = [node_id_to_index[risk_id] for risk_id in risk_ids]

    # Run the model to get the node embeddings
    out = model(x, edge_index, edge_type_tensor)  # Pass edge_index and edge_type_tensor to the model
    recommendations = []

    for risk_index in risk_indices:
        src_emb = out[project_index]
        dst_emb = out[risk_index]
        score = torch.sigmoid((src_emb * dst_emb).sum()).item()  # Calculate the similarity score
        recommendations.append((risk_index, score))

    # Sort recommendations by score in descending order
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    top_recommendations = recommendations[:top_k]  # Get the top-k recommendations

    # Map indices back to risk names for clearer output
    result = []
    for risk_index, score in top_recommendations:
        risk_name = node_ids[risk_index]  # Retrieve the risk name using the index
        result.append((risk_name, score))

    return result

# Example usage
project_id = 'project_global_manufacturing_risk_management'
risk_ids = ['risk_prioritization_issues', 'risk_supply_chain_disruption']
top_recommendations = recommend_strategies(project_id, risk_ids)

# Print the recommendations in a clearer format
print("Top Recommendations:")
for risk_name, score in top_recommendations:
    print(f"Risk: {risk_name}, Similarity Score: {score:.4f}")


Top Recommendations:
Risk: risk_supply_chain_disruption, Similarity Score: 0.4988
Risk: risk_prioritization_issues, Similarity Score: 0.4900


In [57]:
def recommend_strategies_for_risks(project_id, risk_ids, top_k=3):
    model.eval()  # Set the model to evaluation mode

    # Convert project and risk IDs to indices
    project_index = node_id_to_index[project_id]
    risk_indices = [node_id_to_index[risk_id] for risk_id in risk_ids]

    # Run the model to get the node embeddings
    out = model(x, edge_index, edge_type_tensor)  # Pass edge_index and edge_type_tensor to the model

    # Collect all strategy node indices
    strategy_indices = [idx for idx, node_id in enumerate(node_ids) if node_id_to_type[node_id] == "Strategy"]

    # Prepare a dictionary to store recommendations for each risk
    risk_to_strategy_recommendations = {}

    # Iterate over each risk and calculate similarity scores with strategies
    for risk_index in risk_indices:
        recommendations = []
        risk_emb = out[risk_index]

        for strategy_index in strategy_indices:
            strategy_emb = out[strategy_index]
            score = torch.sigmoid((risk_emb * strategy_emb).sum()).item()  # Calculate the similarity score
            recommendations.append((strategy_index, score))

        # Sort recommendations by score in descending order and select top-k strategies
        recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_k]

        # Map strategy indices back to strategy names for clearer output
        recommended_strategies = [(node_ids[strategy_index], score) for strategy_index, score in recommendations]
        risk_name = node_ids[risk_index]  # Get the name of the risk
        risk_to_strategy_recommendations[risk_name] = recommended_strategies

    return risk_to_strategy_recommendations

# Example usage
project_id = 'project_global_manufacturing_risk_management'
risk_ids = ['risk_prioritization_issues', 'risk_supply_chain_disruption']
recommendations = recommend_strategies_for_risks(project_id, risk_ids)

# Print the recommendations in a clearer format
print("Strategy Recommendations for Each Risk:")
for risk, strategies in recommendations.items():
    print(f"\nRisk: {risk}")
    for strategy_name, score in strategies:
        print(f"  Strategy: {strategy_name}, Similarity Score: {score:.4f}")


Strategy Recommendations for Each Risk:

Risk: risk_prioritization_issues
  Strategy: Mitigation_Strategies, Similarity Score: 1.0000
  Strategy: Value_Maximization, Similarity Score: 1.0000
  Strategy: strategy_risk_assessment_in_Professional_Services, Similarity Score: 1.0000

Risk: risk_supply_chain_disruption
  Strategy: Value_Maximization, Similarity Score: 1.0000
  Strategy: strategy_risk_assessment_in_Professional_Services, Similarity Score: 1.0000
  Strategy: strategy_hedging, Similarity Score: 1.0000


In [58]:
# Save the model
torch.save(model.state_dict(), 'rgcn_model.pth')
print("Model saved successfully.")


Model saved successfully.


In [None]:
import torch
import torch.nn as nn

# Binary cross-entropy loss for link prediction
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train():
    model.train()  # Set the model to training mode
    optimizer.zero_grad()  # Clear the gradients

    # Forward pass
    out = model(x, edge_index, edge_type=edge_type_tensor)

    # Get embeddings for the source and target nodes of edges
    src_emb = out[train_edge_index[0]]
    dst_emb = out[train_edge_index[1]]

    # Compute the dot product for link prediction
    logits = (src_emb * dst_emb).sum(dim=1)


In [29]:
from torch_geometric.data import HeteroData

# Initialize HeteroData object
data = HeteroData()

# Assign node features per node type
for n_type in node_types:
    node_indices = [idx for idx, node_id in enumerate(node_ids) if node_id_to_type[node_id] == n_type]
    if node_indices:
        data[n_type].x = x[node_indices]
        data[n_type].node_ids = [node_ids[idx] for idx in node_indices]
        # Map original indices to new indices for this node type
        node_id_to_local_index = {node_ids[idx]: i for i, idx in enumerate(node_indices)}
        data[n_type].node_id_to_local_index = node_id_to_local_index

# Create edge indices per edge type
from collections import defaultdict

edge_dict = defaultdict(list)

for i in range(edge_index.size(1)):
    src_idx = edge_index[0, i].item()
    dst_idx = edge_index[1, i].item()
    rel_type = edge_type[i]
    rel_type_str = edge_type[i]

    src_id = node_ids[src_idx]
    dst_id = node_ids[dst_idx]
    src_type = node_id_to_type[src_id]
    dst_type = node_id_to_type[dst_id]

    # Get local indices within the node type
    src_local_idx = data[src_type].node_id_to_local_index[src_id]
    dst_local_idx = data[dst_type].node_id_to_local_index[dst_id]

    edge_key = (src_type, rel_type_str, dst_type)
    edge_dict[edge_key].append([src_local_idx, dst_local_idx])

# Assign edge indices to HeteroData
for edge_key, edge_list in edge_dict.items():
    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
    data[edge_key].edge_index = edge_index


In [30]:
import torch.nn as nn
from torch_geometric.nn import HeteroConv, SAGEConv

class HeteroGNN(nn.Module):
    def __init__(self, metadata, hidden_channels):
        super(HeteroGNN, self).__init__()
        self.convs = nn.ModuleList()
        self.convs.append(HeteroConv({
            edge_type: SAGEConv((-1, -1), hidden_channels)
            for edge_type in metadata[1]
        }, aggr='sum'))

        # Final linear layer per node type
        self.lin_dict = nn.ModuleDict()
        for node_type in metadata[0]:
            self.lin_dict[node_type] = nn.Linear(hidden_channels, hidden_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.convs[0](x_dict, edge_index_dict)
        for node_type in x_dict.keys():
            x_dict[node_type] = self.lin_dict[node_type](x_dict[node_type])
        return x_dict


In [31]:
metadata = data.metadata()  # Contains node types and edge types
model = HeteroGNN(metadata, hidden_channels=64)




In [None]:
edge_type = ('Risk', 'mitigated_by', 'Strategy')
edge_index = data[edge_type].edge_index
num_edges = edge_index.size(1)


In [32]:
from torch_geometric.utils import negative_sampling

# Generate negative samples
neg_edge_index = negative_sampling(
    edge_index=edge_index,
    num_nodes=(data['Risk'].x.size(0), data['Strategy'].x.size(0)),
    num_neg_samples=edge_index.size(1),
    method='sparse'
)


In [33]:
train_edge_index = torch.cat([edge_index, neg_edge_index], dim=1)
train_edge_label = torch.cat([torch.ones(edge_index.size(1)), torch.zeros(neg_edge_index.size(1))])


In [34]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [35]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    # Get embeddings for the source and target nodes
    src_emb = out['Risk'][train_edge_index[0]]
    dst_emb = out['Strategy'][train_edge_index[1]]
    # Compute logits
    logits = (src_emb * dst_emb).sum(dim=1)
    loss = criterion(logits, train_edge_label)
    loss.backward()
    optimizer.step()
    return loss.item()

for epoch in range(1, 51):
    loss = train()
    if epoch % 5 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')


Epoch 5, Loss: 0.0000
Epoch 10, Loss: 0.0000
Epoch 15, Loss: 0.0000
Epoch 20, Loss: 0.0000
Epoch 25, Loss: 0.0000
Epoch 30, Loss: 0.0000
Epoch 35, Loss: 0.0000
Epoch 40, Loss: 0.0000
Epoch 45, Loss: 0.0000
Epoch 50, Loss: 0.0000


In [None]:
def recommend_strategies_for_risk(risk_id, top_k=5):
    model.eval()
    with torch.no_grad():
        # Get risk embedding
        risk_idx = data['Risk'].node_ids.index(risk_id)
        risk_emb = model(data.x_dict, data.edge_index_dict)['Risk'][risk_idx]

        # Get all strategy embeddings
        strategy_embs = model(data.x_dict, data.edge_index_dict)['Strategy']

        # Compute similarity scores
        scores = torch.matmul(strategy_embs, risk_emb.unsqueeze(1)).squeeze(1)

        # Get top K strategies
        top_scores, top_indices = torch.topk(scores, k=top_k)
        recommended_strategies = [data['Strategy'].node_ids[idx] for idx in top_indices.tolist()]
    return recommended_strategies

# Example usage
risk_id = 'Risk_123'  # Replace with an actual risk ID from your data
recommended_strategies = recommend_strategies_for_risk(risk_id)
print(f"Recommended strategies for {risk_id}: {recommended_strategies}")


In [25]:
import torch.nn as nn
from torch_geometric.nn import HeteroConv, TransformerConv

class HeteroGNNWithEdgeFeatures(nn.Module):
    def __init__(self, metadata, hidden_channels):
        super(HeteroGNNWithEdgeFeatures, self).__init__()
        self.convs = nn.ModuleDict()
        self.edge_type_mapping = {}
        
        for edge_type in metadata[1]:
            # Create a unique string key without quotes
            edge_type_str = '__'.join(edge_type)
            self.edge_type_mapping[edge_type_str] = edge_type
            self.convs[edge_type_str] = TransformerConv(
                in_channels=(-1, -1),
                out_channels=hidden_channels,
                edge_dim=relation_embeddings.shape[1],  # Edge feature dimension
                heads=2,
                dropout=0.3
            )
        
        self.lin_dict = nn.ModuleDict()
        for node_type in metadata[0]:
            self.lin_dict[node_type] = nn.Linear(hidden_channels, hidden_channels)
    
    def forward(self, x_dict, edge_index_dict, edge_attr_dict):
        x_dict_updated = {}
        for node_type in x_dict:
            x_dict_updated[node_type] = x_dict[node_type]
        
        for edge_type_str, conv in self.convs.items():
            # Split the edge_type_str back into components
            src_type, rel_type, dst_type = edge_type_str.split('__')
            edge_type = (src_type, rel_type, dst_type)
            edge_index = edge_index_dict[edge_type]
            edge_attr = edge_attr_dict[edge_type]
            x_src = x_dict[src_type]
            x_dst = x_dict[dst_type]
            x = conv((x_src, x_dst), edge_index, edge_attr)
            x = x.relu()
            x_dict_updated[dst_type] = x  # Update the destination node embeddings
        
        for node_type in x_dict_updated:
            x_dict_updated[node_type] = self.lin_dict[node_type](x_dict_updated[node_type])
        
        return x_dict_updated


In [26]:
model = HeteroGNNWithEdgeFeatures(hetero_data.metadata(), hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(50):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass
    out = model(hetero_data.x_dict, hetero_data.edge_index_dict, hetero_data.edge_attr_dict)
    
    total_loss = 0
    for edge_key, edge_index, labels in train_data:
        src_type, _, dst_type = edge_key
        src_emb = out[src_type][edge_index[0]]
        dst_emb = out[dst_type][edge_index[1]]
        logits = (src_emb * dst_emb).sum(dim=1)
        loss = criterion(logits, labels)
        total_loss += loss
    
    total_loss.backward()
    optimizer.step()
    
    # Optionally, evaluate the model on a validation set
    
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}, Loss: {total_loss.item():.4f}")


IndexError: Found indices in 'edge_index' that are larger than 58 (got 304). Please ensure that all indices in 'edge_index' point to valid indices in the interval [0, 59) in your node feature matrix and try again.

In [55]:
# Create a mapping from node IDs to indices
node_id_to_index = {node_id: idx for idx, node_id in enumerate(node_ids)}


In [56]:
# Load the graph data to get node types
with open('final_updated_graph.json', 'r') as f:
    data = json.load(f)

# Create a mapping from node ID to type
node_id_to_type = {}
for node in data.get('Nodes', []):
    node_id = node.get('id', '')
    node_type = node.get('type', '')  # Get the 'type' field
    node_id_to_type[node_id] = node_type

# Create a list to store node types in the same order as node_ids
node_types = [node_id_to_type.get(node_id, 'Unknown') for node_id in node_ids]


In [57]:
# Identify node indices for risks and strategies
risk_node_indices = [idx for idx, n_type in enumerate(node_types) if n_type == 'Risk']
strategy_node_indices = [idx for idx, n_type in enumerate(node_types) if n_type == 'Strategy']

print(f"Number of risk nodes: {len(risk_node_indices)}")
print(f"Number of strategy nodes: {len(strategy_node_indices)}")


Number of risk nodes: 419
Number of strategy nodes: 220


In [58]:
edge_index_list = []

for relationship in data.get('Relationships', []):
    source_id = relationship['source']
    target_id = relationship['target']

    # Get the node indices
    if source_id in node_id_to_index and target_id in node_id_to_index:
        source_idx = node_id_to_index[source_id]
        target_idx = node_id_to_index[target_id]
        edge_index_list.append([source_idx, target_idx])

# Convert to tensor
edge_index = torch.tensor(edge_index_list, dtype=torch.long).t().contiguous()
print(f"Edge index has shape: {edge_index.shape}")


Edge index has shape: torch.Size([2, 3411])


In [59]:
pos_edge_index_list = []

for i in range(edge_index.size(1)):
    src_idx = edge_index[0, i].item()
    tgt_idx = edge_index[1, i].item()

    if src_idx in risk_node_indices and tgt_idx in strategy_node_indices:
        pos_edge_index_list.append([src_idx, tgt_idx])

# Convert to tensor
pos_edge_index = torch.tensor(pos_edge_index_list, dtype=torch.long).t()
pos_labels = torch.ones(pos_edge_index.size(1), dtype=torch.float)
print(f"Number of positive samples: {pos_edge_index.size(1)}")


Number of positive samples: 44


In [60]:
from itertools import product
import random

# Generate all possible risk-strategy pairs
all_possible_pairs = list(product(risk_node_indices, strategy_node_indices))

# Convert positive edges to a set for fast lookup
pos_edge_set = set(map(tuple, pos_edge_index.t().tolist()))

# Exclude existing positive edges
negative_candidates = [pair for pair in all_possible_pairs if tuple(pair) not in pos_edge_set]

# Number of negative samples equal to number of positive samples
num_neg_samples = pos_edge_index.size(1)

# Ensure we have enough negative candidates
if len(negative_candidates) < num_neg_samples:
    print("Not enough negative candidates. Using all available negative pairs.")
    num_neg_samples = len(negative_candidates)

# Randomly sample negative examples
neg_edge_index_list = random.sample(negative_candidates, num_neg_samples)

# Convert to tensor
neg_edge_index = torch.tensor(neg_edge_index_list, dtype=torch.long).t()
neg_labels = torch.zeros(neg_edge_index.size(1), dtype=torch.float)
print(f"Number of negative samples: {neg_edge_index.size(1)}")


Number of negative samples: 44


In [61]:
# Combine edges and labels
all_edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=1)
all_labels = torch.cat([pos_labels, neg_labels])

# Shuffle the data
perm = torch.randperm(all_edge_index.size(1))
all_edge_index = all_edge_index[:, perm]
all_labels = all_labels[perm]


In [62]:
from sklearn.model_selection import train_test_split

# Convert edges to list of tuples
edge_tuples = all_edge_index.t().tolist()
labels = all_labels.tolist()

# Split into training and test sets
train_edges, test_edges, train_labels, test_labels = train_test_split(
    edge_tuples, labels, test_size=0.2, random_state=42
)

# Convert back to tensors
train_edge_index = torch.tensor(train_edges, dtype=torch.long).t()
train_labels = torch.tensor(train_labels, dtype=torch.float)

test_edge_index = torch.tensor(test_edges, dtype=torch.long).t()
test_labels = torch.tensor(test_labels, dtype=torch.float)

print(f"Training edges: {train_edge_index.shape[1]}, Test edges: {test_edge_index.shape[1]}")


Training edges: 70, Test edges: 18


In [63]:
import torch.nn as nn
import torch.nn.functional as F

class LinkPredictor(nn.Module):
    def __init__(self, embedding_dim):
        super(LinkPredictor, self).__init__()
        self.fc1 = nn.Linear(embedding_dim * 2, embedding_dim)
        self.fc2 = nn.Linear(embedding_dim, 1)
        self.dropout = nn.Dropout(p=0.3)
        
    def forward(self, node_embeddings, edge_index):
        src_embeddings = node_embeddings[edge_index[0]]
        tgt_embeddings = node_embeddings[edge_index[1]]
        combined = torch.cat([src_embeddings, tgt_embeddings], dim=1)
        x = F.relu(self.fc1(combined))
        x = self.dropout(x)
        x = self.fc2(x)
        return x.squeeze()

# Initialize the model
embedding_dim = x.size(1)
model = LinkPredictor(embedding_dim)


In [64]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [66]:
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass
    logits = model(x, train_edge_index)
    loss = criterion(logits, train_labels)
    
    # Backward and optimize
    loss.backward()
    optimizer.step()
    
    # Evaluation on test set
    if (epoch + 1) % 5 == 0:
        model.eval()
        with torch.no_grad():
            test_logits = model(x, test_edge_index)
            test_loss = criterion(test_logits, test_labels)
            test_probs = torch.sigmoid(test_logits)
            test_preds = (test_probs > 0.5).float()
            accuracy = (test_preds == test_labels).float().mean()
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item():.4f}")


Epoch [5/100], Loss: 0.0325, Test Loss: 0.6892, Test Accuracy: 0.7778
Epoch [10/100], Loss: 0.0266, Test Loss: 0.7444, Test Accuracy: 0.7222
Epoch [15/100], Loss: 0.0221, Test Loss: 0.7649, Test Accuracy: 0.7222
Epoch [20/100], Loss: 0.0154, Test Loss: 0.7867, Test Accuracy: 0.7222
Epoch [25/100], Loss: 0.0141, Test Loss: 0.7972, Test Accuracy: 0.7222
Epoch [30/100], Loss: 0.0109, Test Loss: 0.8096, Test Accuracy: 0.7222
Epoch [35/100], Loss: 0.0091, Test Loss: 0.8210, Test Accuracy: 0.7222
Epoch [40/100], Loss: 0.0095, Test Loss: 0.8323, Test Accuracy: 0.7222
Epoch [45/100], Loss: 0.0082, Test Loss: 0.8482, Test Accuracy: 0.7222
Epoch [50/100], Loss: 0.0080, Test Loss: 0.8550, Test Accuracy: 0.7222
Epoch [55/100], Loss: 0.0049, Test Loss: 0.8697, Test Accuracy: 0.7222
Epoch [60/100], Loss: 0.0050, Test Loss: 0.8747, Test Accuracy: 0.7222
Epoch [65/100], Loss: 0.0059, Test Loss: 0.8826, Test Accuracy: 0.7222
Epoch [70/100], Loss: 0.0051, Test Loss: 0.8892, Test Accuracy: 0.7222
Epoch [

In [67]:
from sklearn.metrics import roc_auc_score, average_precision_score

model.eval()
with torch.no_grad():
    test_logits = model(x, test_edge_index)
    test_probs = torch.sigmoid(test_logits).cpu().numpy()
    test_labels_np = test_labels.cpu().numpy()

roc_auc = roc_auc_score(test_labels_np, test_probs)
ap_score = average_precision_score(test_labels_np, test_probs)

print(f"ROC AUC: {roc_auc:.4f}, Average Precision: {ap_score:.4f}")


ROC AUC: 0.8765, Average Precision: 0.8734


In [68]:
# All possible risk-strategy pairs
all_possible_pairs = list(product(risk_node_indices, strategy_node_indices))

# Convert all existing edges to set
existing_edge_set = set(map(tuple, edge_index.t().tolist()))

# Exclude pairs that already exist in the graph
candidate_pairs = [pair for pair in all_possible_pairs if pair not in existing_edge_set]

# Convert to tensor
candidate_edge_index = torch.tensor(candidate_pairs, dtype=torch.long).t()


In [69]:
model.eval()
with torch.no_grad():
    candidate_logits = model(x, candidate_edge_index)
    candidate_probs = torch.sigmoid(candidate_logits)


In [70]:
import pandas as pd

# Create a DataFrame with results
results = pd.DataFrame({
    'risk_idx': candidate_edge_index[0].cpu().numpy(),
    'strategy_idx': candidate_edge_index[1].cpu().numpy(),
    'probability': candidate_probs.cpu().numpy()
})

# Map indices back to node IDs
idx_to_node_id = {idx: node_id for idx, node_id in enumerate(node_ids)}
results['risk_id'] = results['risk_idx'].map(idx_to_node_id)
results['strategy_id'] = results['strategy_idx'].map(idx_to_node_id)

# For each risk, get top N recommended strategies
top_N = 3
recommendations = results.groupby('risk_id').apply(lambda x: x.nlargest(top_N, 'probability')).reset_index(drop=True)

print("Top strategy recommendations for each risk:")
for risk_id, group in recommendations.groupby('risk_id'):
    print(f"\nRisk: {risk_id}")
    for _, row in group.iterrows():
        strategy_id = row['strategy_id']
        probability = row['probability']
        print(f"  Strategy: {strategy_id}, Probability: {probability:.4f}")


Top strategy recommendations for each risk:

Risk: Activity_Cost_Estimates
  Strategy: Accept, Probability: 0.9996
  Strategy: Risk_Response_Strategies, Probability: 0.9993
  Strategy: Process-based_Approaches, Probability: 0.9973

Risk: Agile_Methodology
  Strategy: Accept, Probability: 0.9574
  Strategy: Risk_Response_Strategies, Probability: 0.9303
  Strategy: Process-based_Approaches, Probability: 0.7667

Risk: Assumption
  Strategy: Accept, Probability: 0.9997
  Strategy: Risk_Response_Strategies, Probability: 0.9995
  Strategy: Process-based_Approaches, Probability: 0.9980

Risk: Bias
  Strategy: Accept, Probability: 0.9618
  Strategy: Risk_Response_Strategies, Probability: 0.9373
  Strategy: Process-based_Approaches, Probability: 0.7832

Risk: Bias_in_Data_Gathering
  Strategy: Accept, Probability: 0.9642
  Strategy: Risk_Response_Strategies, Probability: 0.9407
  Strategy: Process-based_Approaches, Probability: 0.7965

Risk: Blockers_Impacts
  Strategy: Accept, Probability: 0.9

  recommendations = results.groupby('risk_id').apply(lambda x: x.nlargest(top_N, 'probability')).reset_index(drop=True)


In [71]:
# Create a mapping from node ID to properties
node_id_to_properties = {node['id']: node['properties'] for node in data.get('Nodes', [])}

# Example of displaying detailed recommendations
for risk_id, group in recommendations.groupby('risk_id'):
    risk_props = node_id_to_properties.get(risk_id, {})
    risk_description = risk_props.get('description', 'No description available.')
    print(f"\nRisk: {risk_id}")
    print(f"Description: {risk_description}")
    for _, row in group.iterrows():
        strategy_id = row['strategy_id']
        strategy_props = node_id_to_properties.get(strategy_id, {})
        strategy_description = strategy_props.get('description', 'No description available.')
        probability = row['probability']
        print(f"  Strategy: {strategy_id}, Probability: {probability:.4f}")
        print(f"    Description: {strategy_description}")



Risk: Activity_Cost_Estimates
Description: Estimates covering all components and types of costs for planned activities.
  Strategy: Accept, Probability: 0.9996
    Description: Deciding not to take any action on the opportunity.
  Strategy: Risk_Response_Strategies, Probability: 0.9993
    Description: Strategies chosen by risk owners to manage risks effectively.
  Strategy: Process-based_Approaches, Probability: 0.9973
    Description: Focuses on systematic processes and workflows.

Risk: Agile_Methodology
Description: Agile methodologies focus on continuous improvement and flexibility.
  Strategy: Accept, Probability: 0.9574
    Description: Deciding not to take any action on the opportunity.
  Strategy: Risk_Response_Strategies, Probability: 0.9303
    Description: Strategies chosen by risk owners to manage risks effectively.
  Strategy: Process-based_Approaches, Probability: 0.7667
    Description: Focuses on systematic processes and workflows.

Risk: Assumption
Description: Assum

In [44]:
# Load your graph data
with open('final_updated_graph.json', 'r') as f:
    data = json.load(f)

edge_index_list = []
edge_type_list = []
node_id_to_index = {node_id: idx for idx, node_id in enumerate(node_ids_in_order)}
relation_types = {}
relation_type_counter = 0

for relationship in data.get('Relationships', []):
    source_id = relationship['source']
    target_id = relationship['target']
    rel_type = relationship['type']

    if source_id in node_id_to_index and target_id in node_id_to_index:
        source_idx = node_id_to_index[source_id]
        target_idx = node_id_to_index[target_id]
        edge_index_list.append([source_idx, target_idx])

        # Assign a unique index to each relation type
        if rel_type not in relation_types:
            relation_types[rel_type] = relation_type_counter
            relation_type_counter += 1
        edge_type_list.append(relation_types[rel_type])

# Convert to tensors
edge_index = torch.tensor(edge_index_list, dtype=torch.long).t().contiguous()
edge_type = torch.tensor(edge_type_list, dtype=torch.long)


In [None]:
from itertools import product

# Get all possible risk-strategy pairs
all_possible_pairs = list(product(risk_node_indices, strategy_node_indices))

# Convert positive edge indices to a set of tuples for faster lookup
pos_edge_set = set(map(tuple, pos_edge_index.t().tolist()))

# Exclude existing positive edges
negative_candidates = [pair for pair in all_possible_pairs if pair not in pos_edge_set]


In [45]:
# Identify node indices for risks and strategies
risk_node_indices = [idx for idx, node_id in enumerate(node_ids_in_order) if 'Risk' in node_id]
strategy_node_indices = [idx for idx, node_id in enumerate(node_ids_in_order) if 'Strategy' in node_id]


In [46]:
# Extract edges between risks and strategies
pos_edge_index_list = []
for i in range(edge_index.size(1)):
    src = edge_index[0, i].item()
    dst = edge_index[1, i].item()
    if src in risk_node_indices and dst in strategy_node_indices:
        pos_edge_index_list.append([src, dst])

pos_edge_index = torch.tensor(pos_edge_index_list, dtype=torch.long).t()
pos_labels = torch.ones(pos_edge_index.size(1), dtype=torch.float)


In [48]:
from torch_geometric.utils import negative_sampling

num_nodes = x.size(0)
num_neg_samples = pos_edge_index.size(1)  # Equal number of negative samples

neg_edge_index = negative_sampling(
    edge_index=edge_index,
    num_nodes=num_nodes,
    num_neg_samples=num_neg_samples,
    method='sparse'
)

# Filter negative samples to only include risk-strategy pairs
neg_edge_index_list = []
for i in range(neg_edge_index.size(1)):
    src = neg_edge_index[0, i].item()
    dst = neg_edge_index[1, i].item()
    if src in risk_node_indices and dst in strategy_node_indices:
        neg_edge_index_list.append([src, dst])

neg_edge_index = torch.tensor(neg_edge_index_list, dtype=torch.long).t()
neg_labels = torch.zeros(neg_edge_index.size(1), dtype=torch.float)


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [36]:
# Extract positive training edges for graph structure
train_pos_edge_index = train_edges[:, train_labels == 1]


In [37]:
input_dim = x.size(1)  # Dimension of node features (e.g., 768 from BERT)
hidden_dim = 128       # You can adjust this based on your resources
output_dim = 2         # For binary classification in link prediction
num_heads = 4
ff_hidden_dim = 128


In [38]:
class MixedGATGraphSAGEModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_heads=4, ff_hidden_dim=128):
        super(MixedGATGraphSAGEModel, self).__init__()

        # First layer: GAT for attention-based interactions
        self.gat_layer = GATConv(input_dim, hidden_dim, heads=num_heads, dropout=0.2)

        # Second and third layers: GraphSAGE to capture local information
        self.sage_layer1 = SAGEConv(hidden_dim * num_heads, hidden_dim)
        self.sage_layer2 = SAGEConv(hidden_dim, hidden_dim)

        # Feedforward network for classification
        self.ff_layer1 = nn.Linear(hidden_dim, ff_hidden_dim)
        self.ff_layer2 = nn.Linear(ff_hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, output_dim)

        # Link prediction
        self.link_predictor = nn.Linear(hidden_dim * 2, 1)

        # Residual connections
        self.residual1 = nn.Linear(input_dim, hidden_dim * num_heads)
        self.residual2 = nn.Linear(hidden_dim * num_heads, hidden_dim)

        # Dropout layer
        self.dropout = nn.Dropout(0.4)

    def forward(self, x, edge_index):
        # First layer: GAT with residual connection
        residual = self.residual1(x)
        x = self.gat_layer(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        x += residual

        # Second layer: GraphSAGE with residual connection
        residual = self.residual2(x)
        x = self.sage_layer1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        x += residual

        # Final layer: GraphSAGE without residual connection
        x = self.sage_layer2(x, edge_index)

        # Feedforward network
        x = F.relu(self.ff_layer1(x))
        x = self.dropout(x)
        x = self.ff_layer2(x)

        # Classification prediction
        node_logits = self.classifier(x)
        self.node_embeddings = x

        return node_logits

    def predict_links(self, node_pair_indices):
        node1_embeddings = self.node_embeddings[node_pair_indices[0]]
        node2_embeddings = self.node_embeddings[node_pair_indices[1]]

        concatenated_embeddings = torch.cat([node1_embeddings, node2_embeddings], dim=1)
        link_logits = self.link_predictor(concatenated_embeddings)

        return link_logits  # Return raw logits



In [39]:
model = MixedGATGraphSAGEModel(input_dim, hidden_dim, output_dim, num_heads, ff_hidden_dim)


In [40]:
import torch.nn as nn
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=5e-4)


In [41]:
model.train()
num_epochs = 400

for epoch in range(num_epochs):
    optimizer.zero_grad()
    
    # Forward pass
    model(x, train_pos_edge_index)
    
    # Prepare node pairs for link prediction
    edge_indices = train_edges  # Use both positive and negative samples
    labels = train_labels.unsqueeze(1)
    
    # Predict links
    link_logits = model.predict_links((edge_indices[0], edge_indices[1]))
    
    # Compute loss
    loss = criterion(link_logits, labels)
    
    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    
    # Optionally evaluate on validation set
    if epoch % 10 == 0:
        model.eval()
        with torch.no_grad():
            model(x, train_pos_edge_index)  # Update embeddings
            val_logits = model.predict_links((val_edges[0], val_edges[1]))
            val_loss = criterion(val_logits, val_labels.unsqueeze(1))
            val_preds = torch.sigmoid(val_logits).round()
            val_accuracy = (val_preds.squeeze() == val_labels).float().mean()
        print(f'Epoch {epoch}, Loss: {loss.item()}, Val Loss: {val_loss.item()}, Val Acc: {val_accuracy.item()}')
        model.train()


Epoch 0, Loss: 0.6944808959960938, Val Loss: 0.6993808150291443, Val Acc: 0.5043988227844238
Epoch 10, Loss: 0.574546754360199, Val Loss: 1.311208724975586, Val Acc: 0.4149560034275055
Epoch 20, Loss: 0.5401127934455872, Val Loss: 1.5167396068572998, Val Acc: 0.46041056513786316
Epoch 30, Loss: 0.5230329632759094, Val Loss: 1.9168989658355713, Val Acc: 0.47360703349113464
Epoch 40, Loss: 0.506766676902771, Val Loss: 2.1105918884277344, Val Acc: 0.5
Epoch 50, Loss: 0.48064985871315, Val Loss: 1.9774619340896606, Val Acc: 0.5175952911376953
Epoch 60, Loss: 0.4531894624233246, Val Loss: 1.874300479888916, Val Acc: 0.4985337257385254
Epoch 70, Loss: 0.4217136800289154, Val Loss: 1.940106749534607, Val Acc: 0.5234603881835938
Epoch 80, Loss: 0.40247824788093567, Val Loss: 1.967966079711914, Val Acc: 0.5307917594909668
Epoch 90, Loss: 0.3890913128852844, Val Loss: 1.9332618713378906, Val Acc: 0.5322580933570862
Epoch 100, Loss: 0.3669375479221344, Val Loss: 2.1510236263275146, Val Acc: 0.541

In [21]:
model.eval()
with torch.no_grad():
    model(x, train_pos_edge_index)
    val_logits = model.predict_links((val_edges[0], val_edges[1]))
    val_probs = torch.sigmoid(val_logits)
    val_preds = (val_probs > 0.5).float()
    print("Validation Predictions:", val_preds.squeeze())
    print("Validation Labels:", val_labels)


Validation Predictions: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 

In [16]:
model.eval()
with torch.no_grad():
    model(x, train_pos_edge_index)  # Use full graph or training edges for embeddings
    test_logits = model.predict_links((test_edges[0], test_edges[1]))
    test_loss = criterion(test_logits, test_labels.unsqueeze(1))
    test_preds = torch.sigmoid(test_logits).round()
    test_accuracy = (test_preds.squeeze() == test_labels).float().mean()
    
    from sklearn.metrics import roc_auc_score, average_precision_score

    y_true = test_labels.cpu().numpy()
    y_scores = torch.sigmoid(test_logits).cpu().numpy()
    roc_auc = roc_auc_score(y_true, y_scores)
    ap_score = average_precision_score(y_true, y_scores)
    
print(f'Test Loss: {test_loss.item()}, Test Acc: {test_accuracy.item()}, ROC AUC: {roc_auc}, AP Score: {ap_score}')


Test Loss: 0.7055755853652954, Test Acc: 0.5134575366973877, ROC AUC: 0.5372083047357584, AP Score: 0.5869865976491768


In [17]:
model.eval()
with torch.no_grad():
    model(x, train_pos_edge_index)
    
    # Generate all possible pairs or a subset
    potential_edges = torch.combinations(torch.arange(x.size(0)), r=2).t()
    
    # Exclude existing edges
    existing_edges_set = set([tuple(e) for e in edge_index.t().tolist()])
    potential_edges_list = [tuple(e) for e in potential_edges.t().tolist()]
    new_edges = [e for e in potential_edges_list if e not in existing_edges_set]
    new_edges = torch.tensor(new_edges, dtype=torch.long).t()
    
    # Predict links for new edges
    link_logits = model.predict_links((new_edges[0], new_edges[1]))
    link_probs = torch.sigmoid(link_logits).squeeze()
    
    # Select top recommendations
    top_k = 10  # Number of recommendations
    _, top_indices = torch.topk(link_probs, k=top_k)
    recommended_edges = new_edges[:, top_indices]
    
    # Map indices back to node IDs
    recommended_node_pairs = [(node_ids_in_order[i], node_ids_in_order[j]) for i, j in recommended_edges.t().tolist()]
    
print("Top recommended links:")
for source_id, target_id in recommended_node_pairs:
    print(f"{source_id} -> {target_id}")


Top recommended links:
Bias -> Hybrid_Approach
Bias -> Estimates
Bias -> Organizational_Culture
Bias -> Project_Success
Bias -> RiskResponseStrategies
Bias -> Trust
Qualitative_Risk_Analysis -> Bias
Qualitative_Risk_Analysis -> Hybrid_Approach
Trigger_Conditions -> Bias
Decision_Making -> Bias


In [None]:
# Model parameter initialization
input_dim = data.x.shape[1]    
hidden_dim = 128                
output_dim = 3                  
num_heads = 4                  
ff_hidden_dim = 128             

model = MixedGATGraphSAGEModel(input_dim, hidden_dim, output_dim, num_heads=num_heads, ff_hidden_dim=ff_hidden_dim)

# Training hyperparameters
learning_rate = 0.001           
weight_decay = 5e-5             
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
loss_fn = torch.nn.CrossEntropyLoss()

# Early stopping parameters
patience = 50                   
best_val_loss = float('inf')
patience_counter = 0

# Maximum number of epochs
max_epochs = 30

# Masks for training and validation
train_mask = torch.rand(data.y.size(0)) < 0.8
val_mask = ~train_mask                         

# Model training
model.train()
epoch = 0

# Lists to track performance
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

while epoch < max_epochs:
    optimizer.zero_grad()
    node_logits = model(data.x, data.edge_index)

    # Compute training loss
    train_loss = loss_fn(node_logits[train_mask], data.y[train_mask])
    train_loss.backward()
    optimizer.step()

    # Compute training accuracy
    with torch.no_grad():
        train_pred = node_logits[train_mask].argmax(dim=1)
        train_acc = (train_pred == data.y[train_mask]).float().mean()

        # Compute validation loss and accuracy
        val_loss = loss_fn(node_logits[val_mask], data.y[val_mask])
        val_pred = node_logits[val_mask].argmax(dim=1)
        val_acc = (val_pred == data.y[val_mask]).float().mean()

    # Store metrics
    train_losses.append(train_loss.item())
    train_accuracies.append(train_acc.item())
    val_losses.append(val_loss.item())
    val_accuracies.append(val_acc.item())

    if epoch % 2 == 0:
        print(f"Epoch {epoch}, Train Loss: {train_loss.item():.4f}, "
              f"Train Accuracy: {train_acc:.4f}, "
              f"Val Loss: {val_loss.item():.4f}, Val Accuracy: {val_acc:.4f}")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch} with best validation loss: {best_val_loss:.4f}")
        break

    epoch += 1


Epoch 0, Train Loss: 1.0903, Train Accuracy: 0.4659, Val Loss: 1.0879, Val Accuracy: 0.4862
Epoch 2, Train Loss: 1.0628, Train Accuracy: 0.4659, Val Loss: 1.0550, Val Accuracy: 0.4862
Epoch 4, Train Loss: 1.0509, Train Accuracy: 0.4659, Val Loss: 1.0394, Val Accuracy: 0.4862
Epoch 6, Train Loss: 1.0236, Train Accuracy: 0.4659, Val Loss: 1.0175, Val Accuracy: 0.4862
Epoch 8, Train Loss: 0.9785, Train Accuracy: 0.4659, Val Loss: 0.9735, Val Accuracy: 0.4862
Epoch 10, Train Loss: 0.8886, Train Accuracy: 0.4778, Val Loss: 0.8854, Val Accuracy: 0.4936
Epoch 12, Train Loss: 0.7527, Train Accuracy: 0.5919, Val Loss: 0.7577, Val Accuracy: 0.5963
Epoch 14, Train Loss: 0.5790, Train Accuracy: 0.7424, Val Loss: 0.5930, Val Accuracy: 0.6862
Epoch 16, Train Loss: 0.4039, Train Accuracy: 0.8590, Val Loss: 0.4257, Val Accuracy: 0.8202
Epoch 18, Train Loss: 0.2583, Train Accuracy: 0.9183, Val Loss: 0.2916, Val Accuracy: 0.9028
Epoch 20, Train Loss: 0.1814, Train Accuracy: 0.9372, Val Loss: 0.2172, Val