In [1]:
import pandas as pd
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
import networkx as nx
from sklearn.preprocessing import LabelEncoder

# Load the dataset and clean column names
file_path = 'Incidents_5000.xlsx'
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()

# Columns used for layers and target
layer_columns = [
    'Job Region', 
    'Month/Day/Year', 
    'Custs Affected', 
    'OGE Causes', 
    'Major Storm Event  Y (Yes) or N (No)', 
    'Distribution, Substation, Transmission'
]
target_column = 'Job Area (DISTRICT)'

# Exclude all layer columns and high-cardinality columns from node features
high_cardinality_columns = [
    'Job Display ID', 'CAD_ID', 'Job OFF Time', 'Job ON Time', 
    'Device Address', 'STRCTUR_NO/Job Device ID', 'Job Substation'
]
excluded_columns = layer_columns + high_cardinality_columns + [target_column]

# Filter out the columns to use as node features
feature_columns = [col for col in df.columns if col not in excluded_columns]

# Label encode remaining high-cardinality columns
for col in high_cardinality_columns:
    if col in df.columns and col not in layer_columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Convert categorical columns to numeric (one-hot encoding for remaining)
df_encoded = pd.get_dummies(df, columns=feature_columns, drop_first=True)

# Reassign 'Job Substation' column and set as index
df_encoded['Job Substation'] = df['Job Substation']
df_encoded.set_index('Job Substation', inplace=True)

# Aggregate features by Job Substation
df_aggregated = df_encoded.groupby('Job Substation').mean()
target_aggregated = df.groupby('Job Substation')[target_column].first()

# Extract unique nodes (Job Substations)
nodes = df_aggregated.index.unique()

# Initialize an empty graph for each layer
graphs = {col: nx.Graph() for col in layer_columns}

# Add nodes to each graph
for col in layer_columns:
    graphs[col].add_nodes_from(nodes)

# Add edges based on shared attributes in each layer
for col in layer_columns:
    for value in df[col].unique():
        nodes_with_value = df[df[col] == value]['Job Substation'].unique()
        for i, node1 in enumerate(nodes_with_value):
            for node2 in nodes_with_value[i+1:]:
                graphs[col].add_edge(node1, node2)

# Combine graphs into a multiplex network
multiplex_graph = nx.Graph()
for col in layer_columns:
    multiplex_graph = nx.compose(multiplex_graph, graphs[col])

# Convert to PyTorch Geometric Data object
data = from_networkx(multiplex_graph)

# Add node features and target labels
data.x = torch.tensor(df_aggregated.values, dtype=torch.float)
data.y = torch.tensor(target_aggregated.factorize()[0], dtype=torch.long)

# Display the processed data object
print(f"Node features shape: {data.x.shape}")
print(f"Target labels shape: {data.y.shape}")
print(f"Edge index shape: {data.edge_index.shape}")


Node features shape: torch.Size([281, 29077])
Target labels shape: torch.Size([281])
Edge index shape: torch.Size([2, 78680])


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv

class SupraGNNLayer(nn.Module):
    def __init__(self, in_channels, out_channels, gnn_intra_layer, gnn_inter_layer, aggregation="sum", mlp_hidden_dim=None):
        super(SupraGNNLayer, self).__init__()
        self.gnn_intra_layer = gnn_intra_layer(in_channels, out_channels)
        self.gnn_inter_layer = gnn_inter_layer(in_channels, out_channels)
        
        # Handle the case where aggregation is an MLP
        if aggregation == "mlp":
            assert mlp_hidden_dim is not None, "mlp_hidden_dim must be specified for MLP aggregation."
            self.aggregation_mlp = nn.Sequential(
                nn.Linear(out_channels * 2, mlp_hidden_dim),
                nn.ReLU(),
                nn.Linear(mlp_hidden_dim, out_channels)
            )
        else:
            self.aggregation_mlp = None
        
        self.aggregation = aggregation

    def forward(self, x, intra_edge_index, inter_edge_index):
        # Apply intra-layer GNN
        intra_out = self.gnn_intra_layer(x, intra_edge_index)
        
        # Apply inter-layer GNN
        inter_out = self.gnn_inter_layer(x, inter_edge_index)
        
        # Aggregate the results from intra-layer and inter-layer GNNs
        if self.aggregation == "sum":
            out = intra_out + inter_out
        elif self.aggregation == "mean":
            out = (intra_out + inter_out) / 2
        elif self.aggregation == "concat":
            out = torch.cat([intra_out, inter_out], dim=1)
        elif self.aggregation == "mlp":
            concat_out = torch.cat([intra_out, inter_out], dim=1)
            out = self.aggregation_mlp(concat_out)
        else:
            raise ValueError(f"Unknown aggregation method: {self.aggregation}")
        
        return out

class mGNN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers, gnn_intra_layer=GCNConv, gnn_inter_layer=GCNConv, aggregation="sum", mlp_hidden_dim=None):
        super(mGNN, self).__init__()
        self.layers = nn.ModuleList()
        
        # Input layer
        self.layers.append(SupraGNNLayer(in_channels, hidden_channels, gnn_intra_layer, gnn_inter_layer, aggregation, mlp_hidden_dim))
        
        # Hidden layers
        for _ in range(num_layers - 2):
            self.layers.append(SupraGNNLayer(hidden_channels, hidden_channels, gnn_intra_layer, gnn_inter_layer, aggregation, mlp_hidden_dim))
        
        # Output layer
        self.layers.append(SupraGNNLayer(hidden_channels, out_channels, gnn_intra_layer, gnn_inter_layer, aggregation, mlp_hidden_dim))

    def forward(self, x, intra_edge_index, inter_edge_index):
        for layer in self.layers:
            x = layer(x, intra_edge_index, inter_edge_index)
            x = F.relu(x)
        return x

# Example instantiation of the model with MLP aggregation:
# model = mGNN(in_channels=16, hidden_channels=32, out_channels=8, num_layers=3, gnn_intra_layer=GATConv, gnn_inter_layer=GATConv, aggregation="mlp", mlp_hidden_dim=64)


In [6]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import torch
import networkx as nx

# Assuming 'graphs' is a dictionary with each layer's graph as in your previous code
# Get the unique nodes (consistent across layers)
nodes = df_aggregated.index.unique()

# Initialize lists to store the edges
intra_edges = []
inter_edges = []

# Collect intra-layer edges
for col in layer_columns:
    intra_edges.extend(list(graphs[col].edges))

# Create inter-layer edges by connecting each node to its counterparts across layers
for i, node in enumerate(nodes):
    for j, other_layer_node in enumerate(nodes):
        if i != j:
            inter_edges.append((i, j))

# Convert intra-layer edges and inter-layer edges to PyTorch tensors
intra_edge_index = torch.tensor(list(zip(*intra_edges)), dtype=torch.long)
inter_edge_index = torch.tensor(list(zip(*inter_edges)), dtype=torch.long)

# If needed, concatenate these with existing edge indices
intra_edge_index = torch.cat([intra_edge_index, intra_edge_index], dim=1)
inter_edge_index = torch.cat([inter_edge_index, inter_edge_index], dim=1)

# You can now use these edge indices in your model

# Split the data into train and test sets
train_mask, test_mask = train_test_split(range(data.num_nodes), test_size=0.2, random_state=42)

# Convert the masks into boolean masks for easy indexing
train_mask = torch.tensor([i in train_mask for i in range(data.num_nodes)], dtype=torch.bool)
test_mask = torch.tensor([i in test_mask for i in range(data.num_nodes)], dtype=torch.bool)

# Define model, optimizer, and loss function
model = mGNN(in_channels=data.x.shape[1], hidden_channels=128, out_channels=len(data.y.unique()), num_layers=4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, min_lr=1e-6)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, intra_edge_index, inter_edge_index)  # Pass both edge indices
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    scheduler.step(loss)
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')

# Evaluation on test set
model.eval()
with torch.no_grad():
    out = model(data.x, intra_edge_index, inter_edge_index)  # Pass both edge indices
    _, pred = out[test_mask].max(dim=1)
    accuracy = accuracy_score(data.y[test_mask].cpu(), pred.cpu())
    print(f'Accuracy: {accuracy:.4f}')


Epoch 1, Loss: 925.4236
Epoch 2, Loss: 6166.6440
Epoch 3, Loss: 6801.6177
Epoch 4, Loss: 4281.5718
Epoch 5, Loss: 4752.5537
Epoch 6, Loss: 3413.7896
Epoch 7, Loss: 2909.3674
Epoch 8, Loss: 3031.1072
Epoch 9, Loss: 2430.6433
Epoch 10, Loss: 1688.8027
Epoch 11, Loss: 1416.1896
Epoch 12, Loss: 1424.9769
Epoch 13, Loss: 1022.5799
Epoch 14, Loss: 757.7928
Epoch 15, Loss: 558.5190
Epoch 16, Loss: 415.1020
Epoch 17, Loss: 297.5292
Epoch 18, Loss: 197.9680
Epoch 19, Loss: 145.2372
Epoch 20, Loss: 155.3915
Epoch 21, Loss: 138.1759
Epoch 22, Loss: 129.4360
Epoch 23, Loss: 90.6215
Epoch 24, Loss: 76.0009
Epoch 25, Loss: 37.5228
Epoch 26, Loss: 16.3842
Epoch 27, Loss: 11.0031
Epoch 28, Loss: 9.8776
Epoch 29, Loss: 13.8182
Epoch 30, Loss: 12.9942
Epoch 31, Loss: 9.5932
Epoch 32, Loss: 6.4778
Epoch 33, Loss: 4.6709
Epoch 34, Loss: 4.2783
Epoch 35, Loss: 4.4356
Epoch 36, Loss: 3.9794
Epoch 37, Loss: 3.2384
Epoch 38, Loss: 3.2998
Epoch 39, Loss: 3.5213
Epoch 40, Loss: 3.7432
Epoch 41, Loss: 3.7578
Epo

In [7]:
# Example with reduced learning rate and class weights
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Calculate class weights for imbalanced data
class_weights = compute_class_weight('balanced', classes=np.unique(data.y), y=data.y.numpy())
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Define model, optimizer, and weighted loss function
model = mGNN(in_channels=data.x.shape[1], hidden_channels=256, out_channels=len(data.y.unique()), num_layers=5)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)  # Reduced learning rate
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

# Training loop
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')

# Evaluate accuracy
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
    _, pred = out[test_mask].max(dim=1)
    accuracy = accuracy_score(data.y[test_mask].cpu(), pred.cpu())
    print(f'Accuracy: {accuracy:.4f}')


TypeError: forward() missing 1 required positional argument: 'inter_edge_index'

In [24]:
# Simple MLP for initial testing
class SimpleMLP(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(SimpleMLP, self).__init__()
        self.fc1 = torch.nn.Linear(in_channels, hidden_channels)
        self.fc2 = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define a simpler model
model = SimpleMLP(in_channels=data.x.shape[1], hidden_channels=128, out_channels=len(data.y.unique()))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# Training loop remains the same
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(data.x)
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')

# Evaluate accuracy
model.eval()
with torch.no_grad():
    out = model(data.x)
    _, pred = out[test_mask].max(dim=1)
    accuracy = accuracy_score(data.y[test_mask].cpu(), pred.cpu())
    print(f'Accuracy: {accuracy:.4f}')


Epoch 1, Loss: 14.2160
Epoch 2, Loss: 98.9591
Epoch 3, Loss: 111.2072
Epoch 4, Loss: 140.3306
Epoch 5, Loss: 177.4357
Epoch 6, Loss: 168.4834
Epoch 7, Loss: 182.5373
Epoch 8, Loss: 191.2162
Epoch 9, Loss: 206.9705
Epoch 10, Loss: 204.0223
Epoch 11, Loss: 197.6575
Epoch 12, Loss: 158.4462
Epoch 13, Loss: 130.7926
Epoch 14, Loss: 100.7701
Epoch 15, Loss: 81.7536
Epoch 16, Loss: 50.5388
Epoch 17, Loss: 29.8745
Epoch 18, Loss: 13.6502
Epoch 19, Loss: 3.6823
Epoch 20, Loss: 3.2232
Epoch 21, Loss: 3.2223
Epoch 22, Loss: 3.2214
Epoch 23, Loss: 3.2205
Epoch 24, Loss: 3.2195
Epoch 25, Loss: 3.2185
Epoch 26, Loss: 3.2174
Epoch 27, Loss: 3.2163
Epoch 28, Loss: 3.2152
Epoch 29, Loss: 3.2140
Epoch 30, Loss: 3.2129
Epoch 31, Loss: 3.2117
Epoch 32, Loss: 3.2105
Epoch 33, Loss: 3.2093
Epoch 34, Loss: 3.2080
Epoch 35, Loss: 3.2068
Epoch 36, Loss: 3.2055
Epoch 37, Loss: 3.2043
Epoch 38, Loss: 3.2030
Epoch 39, Loss: 3.2017
Epoch 40, Loss: 3.2005
Epoch 41, Loss: 3.1992
Epoch 42, Loss: 3.1979
Epoch 43, Los

In [25]:
# Simplify the feature set by selecting only the most relevant features (if possible)
# Example: Select a subset of features
selected_features = df_aggregated.iloc[:, :5000]  # Example to limit the number of features

data.x = torch.tensor(selected_features.values, dtype=torch.float)

# Apply StandardScaler for normalization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data.x = torch.tensor(scaler.fit_transform(data.x), dtype=torch.float)

# Define model with dropout layers for regularization
class SimpleMLP(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5):
        super(SimpleMLP, self).__init__()
        self.fc1 = torch.nn.Linear(in_channels, hidden_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.fc2 = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Train the model with dropout
model = SimpleMLP(in_channels=data.x.shape[1], hidden_channels=128, out_channels=len(data.y.unique()), dropout=0.5)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(data.x)
    loss = criterion(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')

# Evaluate accuracy
model.eval()
with torch.no_grad():
    out = model(data.x)
    _, pred = out[test_mask].max(dim=1)
    accuracy = accuracy_score(data.y[test_mask].cpu(), pred.cpu())
    print(f'Accuracy: {accuracy:.4f}')


Epoch 1, Loss: 3.2624
Epoch 2, Loss: 2.9423
Epoch 3, Loss: 2.5290
Epoch 4, Loss: 2.1474
Epoch 5, Loss: 1.8328
Epoch 6, Loss: 1.5807
Epoch 7, Loss: 1.3748
Epoch 8, Loss: 1.1746
Epoch 9, Loss: 1.0139
Epoch 10, Loss: 0.8321
Epoch 11, Loss: 0.7088
Epoch 12, Loss: 0.6697
Epoch 13, Loss: 0.5392
Epoch 14, Loss: 0.4599
Epoch 15, Loss: 0.3918
Epoch 16, Loss: 0.3437
Epoch 17, Loss: 0.2663
Epoch 18, Loss: 0.2702
Epoch 19, Loss: 0.2216
Epoch 20, Loss: 0.1866
Epoch 21, Loss: 0.1559
Epoch 22, Loss: 0.1326
Epoch 23, Loss: 0.1338
Epoch 24, Loss: 0.1332
Epoch 25, Loss: 0.0943
Epoch 26, Loss: 0.0755
Epoch 27, Loss: 0.0738
Epoch 28, Loss: 0.0632
Epoch 29, Loss: 0.0586
Epoch 30, Loss: 0.0459
Epoch 31, Loss: 0.0364
Epoch 32, Loss: 0.0456
Epoch 33, Loss: 0.0423
Epoch 34, Loss: 0.0239
Epoch 35, Loss: 0.0245
Epoch 36, Loss: 0.0276
Epoch 37, Loss: 0.0224
Epoch 38, Loss: 0.0272
Epoch 39, Loss: 0.0265
Epoch 40, Loss: 0.0164
Epoch 41, Loss: 0.0174
Epoch 42, Loss: 0.0242
Epoch 43, Loss: 0.0177
Epoch 44, Loss: 0.01

In [27]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from torch_geometric.utils import to_undirected  # Import the missing function

# Load the dataset
file_path = 'Incidents.xlsx'
data = pd.read_excel(file_path)

# Preprocess data as before
data['Job Substation'] = data['Job Substation'].astype(str).str.strip().str.upper()
layer_columns = ['Job Region', 'Month/Day/Year', 'Custs Affected', 
                 'OGE Causes', 'Major Storm Event  Y (Yes) or N (No)', 
                 'Distribution, Substation, Transmission']

nodes = data['Job Substation'].unique()
node_to_idx = {node: idx for idx, node in enumerate(nodes)}
edge_indices = []

for col in layer_columns:
    edges = []
    for value in data[col].unique():
        same_layer_nodes = data[data[col] == value]['Job Substation'].unique()
        for i in range(len(same_layer_nodes)):
            for j in range(i + 1, len(same_layer_nodes)):
                node1 = same_layer_nodes[i]
                node2 = same_layer_nodes[j]
                if node1 in node_to_idx and node2 in node_to_idx:
                    node1_idx = node_to_idx[node1]
                    node2_idx = node_to_idx[node2]
                    if node1_idx != node2_idx:
                        edges.append((node1_idx, node2_idx))
    if len(edges) > 0:
        edge_indices.append(to_undirected(torch.tensor(edges, dtype=torch.long).t().contiguous()))
    else:
        edge_indices.append(torch.tensor([], dtype=torch.long))

filtered_data = data.drop_duplicates(subset=['Job Substation'])
feature_columns = filtered_data.columns.difference(layer_columns + ['Job Area (DISTRICT)', 'Job Substation'])
numeric_features = filtered_data[feature_columns].select_dtypes(include=[np.number])
numeric_features = numeric_features.fillna(0)
node_features = numeric_features.values
scaler = StandardScaler()
node_features = scaler.fit_transform(node_features)
node_features = torch.tensor(node_features, dtype=torch.float)

class MultilayerData(Data):
    def __init__(self, edge_indices=None, **kwargs):
        super(MultilayerData, self).__init__(**kwargs)
        self.edge_indices = edge_indices if edge_indices is not None else []
    
    def __inc__(self, key, value, *args, **kwargs):
        if key == 'edge_indices':
            return [self.num_nodes] * len(self.edge_indices)
        else:
            return super().__inc__(key, value, *args, **kwargs)

multilayer_data = MultilayerData(x=node_features, edge_indices=edge_indices)

class SimplemGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(SimplemGNN, self).__init__()
        self.conv_intra = GCNConv(in_channels, hidden_channels)
        self.conv_inter = GCNConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_indices):
        out = 0
        for edge_index in edge_indices:
            intra_out = F.relu(self.conv_intra(x, edge_index))
            inter_out = F.relu(self.conv_inter(intra_out, edge_index))
            out += inter_out
        out = self.lin(out)
        return F.log_softmax(out, dim=1)

target_column = 'Job Area (DISTRICT)'
num_classes = filtered_data[target_column].nunique()
y = torch.tensor(filtered_data[target_column].astype('category').cat.codes.values, dtype=torch.long)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []

for fold, (train_idx, test_idx) in enumerate(kf.split(node_features)):
    print(f"FOLD {fold + 1}")

    # Define model, optimizer, and loss function
    model = SimplemGNN(in_channels=node_features.shape[1], hidden_channels=64, out_channels=num_classes)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    # Train/validation split
    train_mask = torch.tensor([i in train_idx for i in range(len(node_features))], dtype=torch.bool)
    test_mask = torch.tensor([i in test_idx for i in range(len(node_features))], dtype=torch.bool)

    # Training loop
    model.train()
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        output = model(multilayer_data.x, multilayer_data.edge_indices)
        loss = criterion(output[train_mask], y[train_mask])
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        output = model(multilayer_data.x, multilayer_data.edge_indices)
        _, predicted = torch.max(output[test_mask], 1)
        correct = (predicted == y[test_mask]).sum().item()
        accuracy = correct / test_mask.sum().item()
        accuracies.append(accuracy)
        print(f"Fold {fold + 1}, Accuracy: {accuracy * 100:.2f}%")

# Average accuracy across all folds
avg_accuracy = np.mean(accuracies)
print(f"Average 5-Fold Accuracy: {avg_accuracy * 100:.2f}%")


FOLD 1


In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.utils import to_undirected
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Load the dataset
file_path = 'Incidents.xlsx'
data = pd.read_excel(file_path)

# Convert the Job Substation column to strings and clean the data
data['Job Substation'] = data['Job Substation'].astype(str).str.strip().str.upper()

# Define the columns used for constructing the layers of the multilayer network
layer_columns = ['Job Region', 'Month/Day/Year', 'Custs Affected', 
                 'OGE Causes', 'Major Storm Event  Y (Yes) or N (No)', 
                 'Distribution, Substation, Transmission']

# Initialize the graph by extracting unique Job Substations
nodes = data['Job Substation'].unique()
node_to_idx = {node: idx for idx, node in enumerate(nodes)}

# Create a list to hold edge indices for each layer
edge_indices = []

for col in layer_columns:
    edges = []
    for value in data[col].unique():
        same_layer_nodes = data[data[col] == value]['Job Substation'].unique()
        for i in range(len(same_layer_nodes)):
            for j in range(i + 1, len(same_layer_nodes)):
                node1 = same_layer_nodes[i]
                node2 = same_layer_nodes[j]
                if node1 in node_to_idx and node2 in node_to_idx:
                    node1_idx = node_to_idx[node1]
                    node2_idx = node_to_idx[node2]
                    if node1_idx != node2_idx:
                        edges.append((node1_idx, node2_idx))
    
    # Convert to PyTorch tensor and append to edge_indices list
    if len(edges) > 0:
        edge_indices.append(to_undirected(torch.tensor(edges, dtype=torch.long).t().contiguous()))
    else:
        edge_indices.append(torch.tensor([], dtype=torch.long))

# Filter and ensure each Job Substation appears only once
filtered_data = data.drop_duplicates(subset=['Job Substation'])

# Ensure all node features are numeric before scaling
feature_columns = filtered_data.columns.difference(layer_columns + ['Job Area (DISTRICT)', 'Job Substation'])
numeric_features = filtered_data[feature_columns].select_dtypes(include=[np.number])

# Handle NaN values by filling them with a constant (e.g., 0 or the mean)
numeric_features = numeric_features.fillna(0)

# Convert node features to a numpy array and normalize
node_features = numeric_features.values
scaler = StandardScaler()
node_features = scaler.fit_transform(node_features)

# Convert node features to tensor
node_features = torch.tensor(node_features, dtype=torch.float)

# Prepare intra-layer and inter-layer edges
intra_edges = []
inter_edges = []

for col in layer_columns:
    intra_edges.extend(list(edge_indices[col].numpy().T))  # Add intra-layer edges

# Create inter-layer edges by connecting each node to its counterparts across layers (clique)
for i in range(len(nodes)):
    for j in range(i + 1, len(nodes)):
        inter_edges.append((i, j))

# Convert intra-layer edges and inter-layer edges to PyTorch tensors
intra_edge_index = torch.tensor(list(zip(*intra_edges)), dtype=torch.long)
inter_edge_index = torch.tensor(list(zip(*inter_edges)), dtype=torch.long)

# Define the model with appropriate output channels
target_column = 'Job Area (DISTRICT)'
num_classes = filtered_data[target_column].nunique()

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv

# Define a custom mGNN model using GATConv for intra-layer propagation and GCNConv for inter-layer propagation
class CustommGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads=1):
        super(CustommGNN, self).__init__()
        # GATConv for intra-layer propagation
        self.conv_intra = GATConv(in_channels, hidden_channels, heads=heads, concat=True)
        # GCNConv for inter-layer propagation
        self.conv_inter = GCNConv(hidden_channels * heads, hidden_channels)
        self.lin = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x, intra_edge_index, inter_edge_index):
        # Intra-layer propagation
        intra_out = F.relu(self.conv_intra(x, intra_edge_index))
        # Inter-layer propagation
        inter_out = F.relu(self.conv_inter(intra_out, inter_edge_index))
        out = self.lin(inter_out)
        return F.log_softmax(out, dim=1)

# Define the model with the appropriate number of input, hidden, and output channels
model = CustommGNN(in_channels=node_features.shape[1], hidden_channels=64, out_channels=num_classes, heads=4)

# Convert the target labels to a tensor
y = torch.tensor(filtered_data[target_column].astype('category').cat.codes.values, dtype=torch.long)

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, min_lr=1e-6)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
model.train()
for epoch in range(500):
    optimizer.zero_grad()
    output = model(node_features, [intra_edge_index, inter_edge_index])  # Pass both edge indices
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    scheduler.step(loss)

    # Calculate accuracy
    _, predicted = torch.max(output, 1)
    correct = (predicted == y).sum().item()
    accuracy = correct / y.size(0)

    print(f'Epoch {epoch + 1}, Loss: {loss.item():.4f}, Accuracy: {accuracy * 100:.2f}%')


TypeError: list indices must be integers or slices, not str

In [4]:
import pandas as pd
import numpy as np
import h5py
import torch
from sklearn.preprocessing import OneHotEncoder
from torch_geometric.utils import to_undirected

# Load the dataset
file_path = 'Incidents.xlsx'
data = pd.read_excel(file_path, engine='openpyxl')

# Convert the Job Substation column to strings and clean the data
data['Job Substation'] = data['Job Substation'].astype(str).str.strip().str.upper()

# Columns with mixed types - convert these to strings
columns_to_convert = [
    'CAD_ID', 'Job City', 'Device Address', 'STRCTUR_NO/Job Device ID',
    'Device Type', 'Dev Subtype', 'Lead Crew', 'Lead Crew Phone', 
    'AM Notes', 'Ark Grid Mod or OK Grid Enhancement Circuits'
]
data[columns_to_convert] = data[columns_to_convert].astype(str)

# Define the columns used for constructing the layers of the multilayer network
layer_columns = ['Job Region', 'Month/Day/Year', 'Custs Affected', 
                 'OGE Causes', 'Major Storm Event  Y (Yes) or N (No)', 
                 'Distribution, Substation, Transmission']

# Initialize the graph by extracting unique Job Substations
nodes = data['Job Substation'].unique()
node_to_idx = {node: idx for idx, node in enumerate(nodes)}

# Create a list to hold edge indices for each layer
edge_indices = []

for col in layer_columns:
    edges = []
    for value in data[col].unique():
        same_layer_nodes = data[data[col] == value]['Job Substation'].unique()
        for i in range(len(same_layer_nodes)):
            for j in range(i + 1, len(same_layer_nodes)):
                node1 = same_layer_nodes[i]
                node2 = same_layer_nodes[j]
                if node1 in node_to_idx and node2 in node_to_idx:
                    node1_idx = node_to_idx[node1]
                    node2_idx = node_to_idx[node2]
                    if node1_idx != node_to_idx:
                        edges.append((node1_idx, node2_idx))
    
    # Convert to PyTorch tensor and append to edge_indices list
    if len(edges) > 0:
        edge_indices.append(to_undirected(torch.tensor(edges, dtype=torch.long).t().contiguous()))
    else:
        edge_indices.append(torch.tensor([], dtype=torch.long))

# Prepare Node Features
filtered_data = data.drop_duplicates(subset=['Job Substation'])
feature_columns = filtered_data.columns.difference(layer_columns + ['Job Area (DISTRICT)', 'Job Substation'])

# Extract feature names
feature_names = np.array(feature_columns, dtype='S')

# Separate numeric and non-numeric features
numeric_features = filtered_data[feature_columns].select_dtypes(include=[np.number])
non_numeric_features = filtered_data[feature_columns].select_dtypes(exclude=[np.number])

# Ensure uniform data type in non-numeric features by converting everything to string
non_numeric_features = non_numeric_features.astype(str)

# One-Hot Encode non-numeric features
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_non_numeric_features = encoder.fit_transform(non_numeric_features)

# Combine numeric and encoded non-numeric features
features = np.hstack([numeric_features.values, encoded_non_numeric_features])

# Prepare Target Classes
target_classes = pd.Categorical(filtered_data['Job Area (DISTRICT)']).codes

# Prepare Train/Test Masks (example: randomly split)
num_nodes = len(nodes)
train_mask = np.random.rand(num_nodes) < 0.8
test_mask = ~train_mask

# Create the HDF5 container
output_path = 'emgnn_data_with_features_fixed.h5'
with h5py.File(output_path, 'w') as f:
    # Store each layer's adjacency matrix
    for i, edges in enumerate(edge_indices):
        f.create_dataset(f'network_layer{i+1}', data=edges.numpy())

    # Store node features
    f.create_dataset('features', data=features)
    
    # Store gene (substation) names
    f.create_dataset('gene_names', data=np.array(nodes, dtype='S'))
    
    # Store labels and masks
    f.create_dataset('y_train', data=target_classes[train_mask])
    f.create_dataset('y_test', data=target_classes[test_mask])
    f.create_dataset('train_mask', data=train_mask)
    f.create_dataset('test_mask', data=test_mask)
    
    # Store feature names
    f.create_dataset('feature_names', data=feature_names)

output_path


'emgnn_data_with_features_fixed.h5'

In [12]:
import pandas as pd

# Load the Excel file
file_path = 'Incidents.xlsx'
data = pd.read_excel(file_path, engine='openpyxl')

# Extract unique values from the 'Job Substation' column
unique_job_substations = data['Job Area (DISTRICT)'].unique()

# Print the unique values
print(unique_job_substations)


['SOUTH CENTRAL' 'SOUTH' 'EAST' 'WEST' 'FORT SMITH' 'ALVA' 'EL RENO'
 'GUTHRIE' 'SHAWNEE' 'ADA' 'WEWOKA' 'NORTH' 'MUSKOGEE' 'DRUMRIGHT'
 'ARDMORE' 'POTEAU' 'HEALDTON' 'ENID' 'SEMINOLE' 'SAPULPA' 'PAULS VALLEY'
 'MADILL' 'DURANT' 'WOODWARD' 'SULPHUR' 'CHANDLER' 'OZARK' 'BRISTOW']


In [13]:
print(f"Number of unique Job Substations: {len(unique_job_substations)}")

Number of unique Job Substations: 28
