In [4]:
from itertools import combinations

import h5py
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import OneHotEncoder
from torch_geometric.utils import to_undirected

# Read the incident records from the Excel workbook.
file_path = '../Incidents.xlsx'
data = pd.read_excel(file_path)

# Normalise the substation identifiers: cast to text, trim surrounding
# whitespace and upper-case, so differently formatted spellings of one
# substation collapse to a single node.
data['Job Substation'] = (
    data['Job Substation']
    .astype(str)
    .str.strip()
    .str.upper()
)

# Columns that hold mixed value types; every entry is coerced to text.
columns_to_convert = [
    'CAD_ID',
    'Job City',
    'Device Address',
    'STRCTUR_NO/Job Device ID',
    'Device Type',
    'Dev Subtype',
    'Lead Crew',
    'Lead Crew Phone',
    'AM Notes',
    'Ark Grid Mod or OK Grid Enhancement Circuits',
]
data[columns_to_convert] = data[columns_to_convert].astype(str)

# One network layer is built per column listed here.
layer_columns = [
    'Job Region',
    'Month/Day/Year',
    'Custs Affected',
    'OGE Causes',
    'Major Storm Event  Y (Yes) or N (No)',
    'Distribution, Substation, Transmission',
]

# Graph nodes are the distinct substations, numbered in order of first appearance.
nodes = data['Job Substation'].unique()
node_to_idx = dict(zip(nodes, range(len(nodes))))

# Quick look at the node mapping.
print(f"Number of unique Job Substations (nodes): {len(nodes)}")
print(f"Node to index mapping sample: {list(node_to_idx.items())[:10]}")

# Build one undirected edge index per layer column: two substations are
# connected in a layer when at least one incident row of each shares the same
# value in that layer's column.

# Translate every row's substation into its node index once, up front.
# All values come from `node_to_idx`, so they are valid by construction and
# need no per-pair membership or bounds checks.
row_node_idx = data['Job Substation'].map(node_to_idx)

# One entry per layer, in the order of `layer_columns`.
edge_indices = []

for col in layer_columns:
    edges = set()  # set of (low, high) index pairs -> duplicates collapse automatically

    # A single groupby replaces one full boolean-mask scan per distinct value.
    # Rows whose key is NaN are skipped, which matches an `== value` filter
    # (NaN never compares equal). sort=False avoids ordering mixed-type keys.
    for members in row_node_idx.groupby(data[col], sort=False).unique():
        # Sorting guarantees every emitted pair is (low, high) with low < high,
        # so self-pairs and mirrored duplicates cannot occur.
        edges.update(combinations(sorted(members.tolist()), 2))

    if edges:
        edge_tensor = torch.tensor(sorted(edges), dtype=torch.long).t().contiguous()
        # Add the reverse direction of every pair.
        edge_indices.append(to_undirected(edge_tensor))
    else:
        # Keep the [2, num_edges] layout even when the layer has no edges so
        # downstream code can treat every layer uniformly.
        edge_indices.append(torch.empty((2, 0), dtype=torch.long))

# Sanity check: no layer may reference a node index beyond the node list.
for layer_no, layer_edges in enumerate(edge_indices, start=1):
    # Layers without edges have nothing to validate.
    if layer_edges.numel() == 0:
        continue

    max_index = layer_edges.max().item()
    print(f"Max index in network_layer{layer_no}: {max_index}")

    if not max_index < len(nodes):
        raise ValueError(f"Layer {layer_no}: Adjacency matrix references out-of-bounds index {max_index}. Maximum allowed index is {len(nodes) - 1}.")

# Prepare Node Features
# One row per substation: drop_duplicates keeps the first incident row of each
# substation, so row order follows first appearance, like `nodes`.
filtered_data = data.drop_duplicates(subset=['Job Substation'])

# Every column that is neither a layer definition, the target, nor the node id.
feature_columns = filtered_data.columns.difference(layer_columns + ['Job Area (DISTRICT)', 'Job Substation'])

# Numeric features: missing values become 0. fillna returns a new frame here;
# an in-place fill on a frame derived from a slice can trigger
# SettingWithCopyWarning.
numeric_features = (
    filtered_data[feature_columns]
    .select_dtypes(include=[np.number])
    .fillna(0)
)

# Non-numeric features: missing values become the placeholder 'missing', then
# everything is cast to text so each column has a uniform type for encoding.
non_numeric_features = (
    filtered_data[feature_columns]
    .select_dtypes(exclude=[np.number])
    .fillna('missing')
    .astype(str)
)

# One-Hot Encode non-numeric features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_non_numeric_features = encoder.fit_transform(non_numeric_features)

# Combine numeric and encoded non-numeric features (numeric columns first).
features = np.hstack([numeric_features.values, encoded_non_numeric_features])

# Name every column of `features` in the same order as the hstack above.
# The raw `feature_columns` would not line up with the matrix, because one-hot
# encoding expands each non-numeric column into one column per category.
feature_labels = [str(name) for name in numeric_features.columns]
feature_labels.extend(str(name) for name in encoder.get_feature_names_out())
if len(feature_labels) != features.shape[1]:
    raise ValueError(
        f"Feature name count ({len(feature_labels)}) does not match "
        f"feature matrix width ({features.shape[1]})."
    )

# Stored as UTF-8 byte strings: category values may contain non-ASCII text,
# which a plain dtype='S' conversion cannot encode.
feature_names = np.array([label.encode('utf-8') for label in feature_labels])

# Prepare Target Classes
# Integer class code per substation; pd.Categorical assigns -1 to missing values.
target_classes = pd.Categorical(filtered_data['Job Area (DISTRICT)']).codes

# Prepare Train/Test Masks based on the filtered_data (unique Job Substations)
num_filtered_nodes = len(filtered_data)  # one row per node

# Fixed seed so the saved split can be regenerated exactly.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

# 80/20 split; every node lands in exactly one of the two sets.
train_size = int(0.8 * num_filtered_nodes)
test_size = num_filtered_nodes - train_size

train_indices = rng.choice(num_filtered_nodes, size=train_size, replace=False)
test_indices = np.setdiff1d(np.arange(num_filtered_nodes), train_indices)

# Boolean masks over ALL nodes. They must keep their full length: indexing a
# mask with itself (`mask[mask]`) would collapse it to an all-True vector and
# discard which node each label belongs to.
train_mask = np.zeros(num_filtered_nodes, dtype=bool)
test_mask = np.zeros(num_filtered_nodes, dtype=bool)
train_mask[train_indices] = True
test_mask[test_indices] = True

# Labels of the selected nodes, in node order. Entry k of y_train is the label
# of the k-th True position in train_mask (likewise for y_test / test_mask).
y_train = target_classes[train_mask]
y_test = target_classes[test_mask]

# Explicit check instead of `assert`, which is stripped under `python -O`.
if (train_mask & test_mask).any() or not (train_mask | test_mask).all():
    raise ValueError("Train and test masks must form a partition of the node set.")

# Write everything into a single HDF5 container.
output_path = 'container_with_node_names_R1.h5'
with h5py.File(output_path, 'w') as container:
    container.create_dataset('node_features', data=features)
    container.create_dataset('feature_names', data=feature_names)

    # Substation names, stored as byte strings under 'node_names'.
    container.create_dataset('node_names', data=np.array(nodes, dtype='S'))

    # One dataset per layer, numbered from 1: network_layer1, network_layer2, ...
    for layer_no, edge_index in enumerate(edge_indices, start=1):
        container.create_dataset(f'network_layer{layer_no}', data=edge_index.numpy())

    # Labels and split masks.
    for dataset_name, values in (
        ('y_train', y_train),
        ('y_test', y_test),
        ('train_mask', train_mask),
        ('test_mask', test_mask),
    ):
        container.create_dataset(dataset_name, data=values)

print(f"Container saved to {output_path}")


Number of unique Job Substations (nodes): 380
Node to index mapping sample: [('8170:SANTA FE AVE', 0), ('5606:WALNUT CREEK', 1), ('8617:SUNNYLANE', 2), ('8245:COUNCIL', 3), ('9137:PARK VIEW', 4), ('8522:MIDWAY', 5), ('4518:MENO TAP', 6), ('8662:SE 15TH ST', 7), ('8905:EL RENO', 8), ('8822:WATERLOO', 9)]
Max index in network_layer1: 379
Max index in network_layer2: 379
Max index in network_layer3: 379
Max index in network_layer4: 379
Max index in network_layer5: 379
Max index in network_layer6: 379
Container saved to container_with_node_names_R1.h5


In [7]:
import torch
from torch_geometric.utils import add_self_loops
from torch_geometric.data import Data
import h5py

# Load the edge indices, node features and node names from the container.
LAYER_PREFIX = 'network_layer'

with h5py.File('container_with_node_names_R1.h5', 'r') as f:
    # Discover the layer datasets instead of hard-coding how many exist, so the
    # loader keeps working when layers are added or removed. They are ordered
    # by numeric suffix (network_layer1, network_layer2, ..., network_layer10).
    layer_keys = sorted(
        (
            key for key in f.keys()
            if key.startswith(LAYER_PREFIX) and key[len(LAYER_PREFIX):].isdigit()
        ),
        key=lambda key: int(key[len(LAYER_PREFIX):]),
    )
    if not layer_keys:
        raise KeyError(f"No '{LAYER_PREFIX}<n>' datasets found in the container.")

    adj_tensors = [torch.tensor(f[key][:], dtype=torch.long) for key in layer_keys]
    features = f['node_features'][:]
    node_names = f['node_names'][:]

# Placeholder labels: one zero per node. The stored y_train / y_test are not
# loaded here.
y = torch.zeros(len(node_names))

# Debugging output
print(f"Loaded node names length: {len(node_names)}")
print(f"Loaded features shape: {features.shape}")

# Build one torch_geometric `Data` object per network layer. Every layer shares
# the same node features, placeholder labels and node names; only the edge
# index differs.
num_nodes = len(node_names)

# The feature matrix is identical for every layer, so convert it once instead
# of once per loop iteration. All Data objects reference this same tensor.
features_tensor = torch.tensor(features, dtype=torch.float)
if features_tensor.size(0) != num_nodes:
    raise ValueError(
        f"Feature matrix has {features_tensor.size(0)} rows but there are {num_nodes} node names."
    )

# Container for processed data
data_list = []

# Process each layer's edge index
for i, adj_tensor in enumerate(adj_tensors):
    if adj_tensor.numel() == 0:
        # An edgeless layer may have been stored as a flat empty array;
        # normalise it to the [2, 0] layout and skip max()/min(), which raise
        # on empty tensors.
        adj_tensor = adj_tensor.new_empty((2, 0))
    elif adj_tensor.dim() != 2 or adj_tensor.size(0) != 2:
        raise ValueError(
            f"Layer {i+1}: expected an edge index of shape [2, num_edges], got {tuple(adj_tensor.shape)}."
        )

    has_edges = adj_tensor.numel() > 0
    # -1 stands for "no edges" in the diagnostics below.
    max_adj_index = adj_tensor.max().item() if has_edges else -1
    print(f"Initial Layer {i+1} max_adj_index: {max_adj_index}, adj shape: {adj_tensor.shape}")

    # Validate the index range once. Self-loops added below only use indices
    # 0..num_nodes-1, so they cannot introduce an out-of-range index.
    if max_adj_index >= num_nodes:
        raise ValueError(f"Layer {i+1}: Adjacency matrix has out-of-bounds index {max_adj_index}. Maximum allowed index is {len(node_names) - 1}.")
    if has_edges and adj_tensor.min().item() < 0:
        raise ValueError(f"Layer {i+1}: Adjacency matrix has a negative node index {adj_tensor.min().item()}.")

    print(f"Layer {i+1} max_adj_index: {max_adj_index}, features_tensor size: {features_tensor.size(0)}")

    # The stored array already is an edge index in [2, num_edges] layout.
    edge_index = adj_tensor
    print(f"Layer {i+1} edge_index initial shape: {edge_index.shape}, max: {max_adj_index}")

    # Add one self-loop per node.
    edge_index, _ = add_self_loops(edge_index, num_nodes=num_nodes)
    max_index_with_self_loops = edge_index.max().item() if edge_index.numel() > 0 else -1

    print(f"Processed Edge_index after adding self-loops: \n{edge_index.shape} with max index: {max_index_with_self_loops}")

    # NOTE(review): `data` rebinds the name used for the incidents DataFrame in
    # the earlier cell; it is kept unchanged in case later cells rely on it.
    data = Data(x=features_tensor, edge_index=edge_index, y=y, node_names=node_names)
    data_list.append(data)

# Report the number of processed layers
print(f"Number of processed layers: {len(data_list)}")


Loaded node names length: 380
Loaded features shape: (380, 2467)
Initial Layer 1 max_adj_index: 379, adj shape: torch.Size([2, 18336])
Layer 1 max_adj_index: 379, features_tensor size: 380
Layer 1 edge_index initial shape: torch.Size([2, 18336]), max: 379
Processed Edge_index after adding self-loops: 
torch.Size([2, 18716]) with max index: 379
Initial Layer 2 max_adj_index: 379, adj shape: torch.Size([2, 112758])
Layer 2 max_adj_index: 379, features_tensor size: 380
Layer 2 edge_index initial shape: torch.Size([2, 112758]), max: 379
Processed Edge_index after adding self-loops: 
torch.Size([2, 113138]) with max index: 379
Initial Layer 3 max_adj_index: 379, adj shape: torch.Size([2, 142690])
Layer 3 max_adj_index: 379, features_tensor size: 380
Layer 3 edge_index initial shape: torch.Size([2, 142690]), max: 379
Processed Edge_index after adding self-loops: 
torch.Size([2, 143070]) with max index: 379
Initial Layer 4 max_adj_index: 379, adj shape: torch.Size([2, 141672])
Layer 4 max_adj