In [1]:
# Cell 01 : Loading 
import re
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from torch_geometric.nn import HeteroConv, RGCNConv, GATConv, Linear
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

# Configuration
DATA_PATH = "Incidents_imputed.xlsx"
GRAPH_PATH = "Hetro_Final_NW_graph_1.pt"

# Load incident dataset
incident_df = pd.read_excel(DATA_PATH, parse_dates=['Job OFF Time', 'Job ON Time'], engine='openpyxl')

# Load heterogeneous graph
hetero_graph = torch.load(GRAPH_PATH)

print("Data and Graph Loaded Successfully!")


Data and Graph Loaded Successfully!


  hetero_graph = torch.load(GRAPH_PATH)


In [2]:
# Cell X: Data Exploration and Validation

# 1. Inspect the raw "Equip Desc" values
unique_equip_desc = incident_df['Equip Desc'].unique()
print(f"Number of unique 'Equip Desc' entries: {len(unique_equip_desc)}")
print("Sample 'Equip Desc' values:")
print(unique_equip_desc[:20])  # print first 20 for brevity

# 2. Frequency distribution of 'Equip Desc'
equip_desc_counts = incident_df['Equip Desc'].value_counts(dropna=False)
print("\nEquipment Description Counts:")
print(equip_desc_counts.head(30))  # top 30 for brevity

# 3. Verify the categorize_equip function
#    (assuming you've defined categorize_equip already)
def categorize_equip(desc):
    desc = str(desc).upper()
    EQUIP_GROUP_MAPPING = {
        r'\bFUSE\b': 'Fuse',
        r'\bCUTOUT\b': 'Fuse',
        r'\bCONDUCTOR\b': 'Conductor',
        r'\b(RECLOSER|ARRESTOR|SUBSTATION CIRCUIT|RELAY|CIRCUIT BREAKER|GROUNDING)\b': 'Protection_Device',
        r'\b(TRANSFORMER|XFMR)\b': 'Transformer',
        r'\b(CONNECTOR|JUMPER|SPLICE)\b': 'Infrastructure',
        r'\bCUSTOMER\b': 'Customer_Equipment',
        r'\b(POLE|CROSSARM|PIN|TOWER|ANCHOR)\b': 'Infrastructure',
        r'\b(REGULATOR|CAPACITOR)\b': 'Power_Management'
    }
    for pattern, group in EQUIP_GROUP_MAPPING.items():
        if re.search(pattern, desc, flags=re.IGNORECASE):
            return group
    return 'Customer_Equipment'

incident_df['EDA_Equipment_Group'] = incident_df['Equip Desc'].apply(categorize_equip)

# 4. Distribution of categories BEFORE any merges (Transformer -> Infrastructure, etc.)
raw_group_counts = incident_df['EDA_Equipment_Group'].value_counts(dropna=False)
print("\nRaw Category Distribution (before merges):")
print(raw_group_counts)

# 5. Check final category distribution AFTER merging 'Power_Management'/'Transformer' => 'Infrastructure'
#    (mirroring your logic)
def merge_categories(category):
    if category == 'Power_Management':
        return 'Infrastructure'
    elif category == 'Transformer':
        return 'Infrastructure'
    else:
        return category

incident_df['EDA_Equipment_Group_Merged'] = incident_df['EDA_Equipment_Group'].apply(merge_categories)
merged_group_counts = incident_df['EDA_Equipment_Group_Merged'].value_counts(dropna=False)
print("\nFinal Category Distribution (after merges):")
print(merged_group_counts)

# 6. Substation-level analysis
#    Let's see how many incidents per substation, and the distribution of final categories.
grouped_subs = incident_df.groupby('Job Substation')['EDA_Equipment_Group_Merged'].value_counts(normalize=True)
print("\nDistribution of categories per substation (top 10 substations):")
print(grouped_subs.head(10))  # print first 10 substation-group combos

# 7. Identify majority categories per substation
#    We'll see which category is the top one per substation, ignoring any priority threshold.
def majority_category(series):
    return series.idxmax()

substation_majorities = incident_df.groupby('Job Substation')['EDA_Equipment_Group_Merged'].agg(
    majority_category=lambda x: x.value_counts().idxmax(),
    top_share=lambda x: x.value_counts(normalize=True).max()
).reset_index()

print("\nSample of majority categories per substation:")
print(substation_majorities.head(10))

# 8. Compare with your priority-based logic
#    If you have code for priority-based logic, replicate it here to see if there's a mismatch.
#    For example (pseudo-code):
"""
priority_classes = ['Protection_Device','Infrastructure','Conductor']
threshold = 0.2

def priority_label(dist_series):
    # dist_series is the value_counts(normalize=True) for a substation
    for cls in priority_classes:
        if dist_series.get(cls, 0) > threshold:
            return cls
    return dist_series.idxmax()
"""

# 9. Summarize how many times each category ends up as the majority vs. your priority-based approach.
#    This will reveal if the threshold logic is overshadowing certain categories.
#    (Optional, depending on how your code is structured.)


Number of unique 'Equip Desc' entries: 112
Sample 'Equip Desc' values:
['CONDUCTOR SECONDARY - OH' 'CONDUCTOR SERVICE - OH'
 'CONDUCTOR PRIMARY - UG' 'FUSE - TRANSFORMER' 'CONNECTOR'
 'CUSTOMER EQUIPMENT' 'FUSE - PRIMARY' 'CONDUCTOR PRIMARY - OH' 'CUTOUT'
 'CONDUCTOR SECONDARY - UG' 'GROUNDING' 'FUSE BARREL' 'TRANSFORMER OH'
 'INSULATOR' 'POLE' 'ARRESTOR OH' 'CROSSARM' 'JUMPER' 'SUBSTATION CIRCUIT'
 'CAPACITOR']

Equipment Description Counts:
Equip Desc
FUSE - TRANSFORMER          59287
CANCELLED                   33170
FUSE - PRIMARY              30081
CUSTOMER EQUIPMENT          22818
ON UPON ARRIVAL             17059
CONDUCTOR SECONDARY - OH    14304
CONDUCTOR PRIMARY - OH      11989
SUBSTATION CIRCUIT          11790
OTHER                       11364
POLE                        11071
TRANSFORMER OH               9213
CONDUCTOR SERVICE - OH       8038
CONDUCTOR PRIMARY - UG       6496
CONNECTOR                    6482
JUMPER                       5907
CUTOUT                       422

"\npriority_classes = ['Protection_Device','Infrastructure','Conductor']\nthreshold = 0.2\n\ndef priority_label(dist_series):\n    # dist_series is the value_counts(normalize=True) for a substation\n    for cls in priority_classes:\n        if dist_series.get(cls, 0) > threshold:\n            return cls\n    return dist_series.idxmax()\n"

In [3]:
# Cell 2: Multi Class Final

import re
import torch
import pandas as pd
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# Define mapping of equipment descriptions to categories
EQUIP_GROUP_MAPPING = {
    r'\bFUSE\b': 'Fuse',
    r'\bCUTOUT\b': 'Fuse',
    r'\bCONDUCTOR\b': 'Conductor',
    r'\b(RECLOSER|ARRESTOR|SUBSTATION CIRCUIT|RELAY|CIRCUIT BREAKER|GROUNDING)\b': 'Protection_Device',
    r'\b(TRANSFORMER|XFMR)\b': 'Transformer',
    r'\b(CONNECTOR|JUMPER|SPLICE)\b': 'Infrastructure',
    r'\bCUSTOMER\b': 'Customer_Equipment',
    r'\b(POLE|CROSSARM|PIN|TOWER|ANCHOR)\b': 'Infrastructure',
    r'\b(REGULATOR|CAPACITOR)\b': 'Power_Management'
}

def categorize_equip(desc):
    """Categorize equipment based on description using regex matching."""
    desc = str(desc).upper()
    for pattern, group in EQUIP_GROUP_MAPPING.items():
        if re.search(pattern, desc, flags=re.IGNORECASE):
            return group
    return 'Customer_Equipment'  # Default category

# Apply categorization to dataset
incident_df['Equipment_Group'] = incident_df['Equip Desc'].apply(categorize_equip)

# Replace categories as per earlier merging logic
incident_df['Equipment_Group'] = incident_df['Equipment_Group'].replace({
    'Power_Management': 'Infrastructure',
    'Transformer': 'Infrastructure'
})

# Print class distribution
print("\nEquipment Grouping Recreated!")
print(incident_df['Equipment_Group'].value_counts(normalize=True))

# Clean & Standardize Job Substation names (ensuring consistency with graph)
incident_df['Job Substation'] = incident_df['Job Substation'].astype(str).str.strip().str.upper()

def clean_and_standardize_substations(substation_name):
    """Cleans and standardizes substation names to '<feeder_id>:<substation_name>' format."""
    match = re.match(r"(\d+)\s*:\s*(.+)", substation_name.strip().upper())
    if match:
        feeder_id, name = match.groups()
        return f"{feeder_id}:{name.strip()}"
    return None  # Mark invalid format clearly

# Apply cleaning function
incident_df['Job Substation'] = incident_df['Job Substation'].apply(clean_and_standardize_substations)
valid_subs = incident_df['Job Substation'].dropna().unique()

# Class labels with weighting
class_labels = {
    'Fuse': 0, 
    'Conductor': 1, 
    'Infrastructure': 2,
    'Customer_Equipment': 3, 
    'Protection_Device': 4
}

# Class Weights with Smoothing (avoid division by zero)
class_counts = incident_df['Equipment_Group'].value_counts().reindex(class_labels.keys(), fill_value=1)
class_weights = 1.0 / (class_counts + 1e-6)
class_weights = class_weights / class_weights.sum()  # Normalize
print("\nInitial Class Weights (from incident data):")
print(class_weights)

# Set a threshold for Infrastructure proportion.
infrastructure_threshold = 0.15

def assign_label_with_infra_override(group_series, infra_thresh=infrastructure_threshold):
    """
    Given a series of equipment categories for a substation (from EDA_Equipment_Group_Merged),
    compute the normalized frequency (proportions). If 'Infrastructure' proportion meets or exceeds
    the threshold, assign 'Infrastructure' as the label; otherwise, assign the most frequent category.
    """
    counts = group_series.value_counts(normalize=True)
    # Check if Infrastructure meets the threshold.
    if counts.get('Infrastructure', 0) >= infra_thresh:
        return 'Infrastructure'
    else:
        return counts.idxmax()

# Apply the new label assignment for each substation.
substation_new_labels = incident_df.groupby('Job Substation')['EDA_Equipment_Group_Merged'] \
                                   .agg(assign_label_with_infra_override)

print("Alternative Majority Vote with Infrastructure Override (sample):")
print(substation_new_labels.head(10))

# Check the distribution of the new textual labels.
new_label_distribution = substation_new_labels.value_counts()
print("\nNew Distribution of Majority Labels (textual):")
print(new_label_distribution)

# Map the textual labels to numeric labels using class_labels.
substation_new_numeric = substation_new_labels.map(class_labels)

# Map Labels to Graph Nodes (fill missing nodes with 'Fuse' by default)
substation_labels = pd.Series(substation_labels)
missing_subs = set(hetero_graph['substation'].node_ids) - set(substation_labels.index)
print(f"\nNumber of graph nodes missing in incident data: {len(missing_subs)}")
substation_labels = substation_labels.reindex(hetero_graph['substation'].node_ids, fill_value=class_labels['Fuse'])

# Convert to Tensor and assign to graph
hetero_graph['substation'].y = torch.tensor(
    substation_labels.values.astype(np.int64),
    dtype=torch.long,
    device=device
)

# Process and update class weights:
# Clip and normalize weights to avoid extreme values.
cw_values = class_weights.values  # convert to numpy array
max_weight = np.percentile(cw_values, 90)  # cap extreme weights
cw_values = np.clip(cw_values, 0.1, max_weight)
cw_values = cw_values / cw_values.sum()  # re-normalize

# Assign class weights to the graph
hetero_graph['substation'].class_weights = torch.tensor(cw_values, dtype=torch.float32, device=device)

print(f"\nFinal Class Weights assigned to graph nodes: {hetero_graph['substation'].class_weights.cpu().numpy()}")


Using device: cuda

Equipment Grouping Recreated!
Equipment_Group
Fuse                  0.336005
Customer_Equipment    0.320310
Conductor             0.163706
Infrastructure        0.129765
Protection_Device     0.050214
Name: proportion, dtype: float64

Initial Class Weights (from incident data):
Equipment_Group
Fuse                  0.074725
Conductor             0.153373
Infrastructure        0.193489
Customer_Equipment    0.078387
Protection_Device     0.500026
Name: count, dtype: float64
Alternative Majority Vote with Infrastructure Override (sample):
Job Substation
3109:HONOR HEIGHTS        Infrastructure
3110:RIVERSIDE                      Fuse
3111:FIVE TRIBES                    Fuse
3114:TENNYSON                       Fuse
3116:CALLERY                        Fuse
3117:THREE RIVERS     Customer_Equipment
3128:HANCOCK                        Fuse
3131:MUSKOGEE WW      Customer_Equipment
3132:MUSKOGEE PORT                  Fuse
3136:EUCLID                         Fuse
Name: EDA_Eq

NameError: name 'substation_labels' is not defined

In [4]:
# Cell 2: Multi Class Final (Optimal Labeling Based on EDA )

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# 1. Define mapping of equipment descriptions to categories.
EQUIP_GROUP_MAPPING = {
    r'\bFUSE\b': 'Fuse',
    r'\bCUTOUT\b': 'Fuse',
    r'\bCONDUCTOR\b': 'Conductor',
    r'\b(RECLOSER|ARRESTOR|SUBSTATION CIRCUIT|RELAY|CIRCUIT BREAKER|GROUNDING)\b': 'Protection_Device',
    r'\b(TRANSFORMER|XFMR)\b': 'Transformer',
    r'\b(CONNECTOR|JUMPER|SPLICE)\b': 'Infrastructure',
    r'\bCUSTOMER\b': 'Customer_Equipment',
    r'\b(POLE|CROSSARM|PIN|TOWER|ANCHOR)\b': 'Infrastructure',
    r'\b(REGULATOR|CAPACITOR)\b': 'Power_Management'
}

def categorize_equip(desc):
    """Categorize equipment based on description using regex matching."""
    desc = str(desc).upper()
    for pattern, group in EQUIP_GROUP_MAPPING.items():
        if re.search(pattern, desc, flags=re.IGNORECASE):
            return group
    return 'Customer_Equipment'

# 2. Apply categorization and merge categories.
incident_df['Equipment_Group'] = incident_df['Equip Desc'].apply(categorize_equip)
incident_df['Equipment_Group'] = incident_df['Equipment_Group'].replace({
    'Power_Management': 'Infrastructure',
    'Transformer': 'Infrastructure'
})

print("\nEquipment Grouping Recreated!")
print(incident_df['Equipment_Group'].value_counts(normalize=True))

# 3. Clean & Standardize Job Substation names.
incident_df['Job Substation'] = incident_df['Job Substation'].astype(str).str.strip().str.upper()

def clean_and_standardize_substations(substation_name):
    """Cleans and standardizes substation names to '<feeder_id>:<substation_name>' format."""
    match = re.match(r"(\d+)\s*:\s*(.+)", substation_name.strip().upper())
    if match:
        feeder_id, name = match.groups()
        return f"{feeder_id}:{name.strip()}"
    return None

incident_df['Job Substation'] = incident_df['Job Substation'].apply(clean_and_standardize_substations)
valid_subs = incident_df['Job Substation'].dropna().unique()

# 4. Define class labels.
class_labels = {
    'Fuse': 0, 
    'Conductor': 1, 
    'Infrastructure': 2,
    'Customer_Equipment': 3, 
    'Protection_Device': 4
}

# 5. Compute class weights (with smoothing).
class_counts = incident_df['Equipment_Group'].value_counts().reindex(class_labels.keys(), fill_value=1)
class_weights = 1.0 / (class_counts + 1e-6)
class_weights = class_weights / class_weights.sum()  # Normalize
print("\nInitial Class Weights (from incident data):")
print(class_weights)

# 6. Set a threshold for the Infrastructure override.
infrastructure_threshold = 0.15

def assign_label_with_infra_override(group_series, infra_thresh=infrastructure_threshold):
    """
    For a given series of equipment categories for a substation,
    if the proportion of 'Infrastructure' is >= infra_thresh, return 'Infrastructure';
    otherwise, return the most frequent category.
    """
    counts = group_series.value_counts(normalize=True)
    if counts.get('Infrastructure', 0) >= infra_thresh:
        return 'Infrastructure'
    else:
        return counts.idxmax()

# 7. Assign substation labels using alternative majority vote with Infrastructure override.
substation_new_labels = incident_df.groupby('Job Substation')['Equipment_Group'] \
                                   .agg(assign_label_with_infra_override)

print("\nAlternative Majority Vote with Infrastructure Override (sample):")
print(substation_new_labels.head(10))

print("\nNew Distribution of Majority Labels (textual):")
print(substation_new_labels.value_counts())

# 8. Map textual labels to numeric labels.
substation_new_numeric = substation_new_labels.map(class_labels)

# 9. Ensure every node in the graph has a label (fill missing with 'Fuse').
substation_new_numeric = substation_new_numeric.reindex(hetero_graph['substation'].node_ids, fill_value=class_labels['Fuse'])

# 10. Update the graph with these labels.
hetero_graph['substation'].y = torch.tensor(substation_new_numeric.values.astype(np.int64),
                                             dtype=torch.long,
                                             device=device)

# 11. Process and update class weights: clip and re-normalize.
cw_values = class_weights.values  # convert to numpy array
max_weight = np.percentile(cw_values, 90)  # cap extreme weights
cw_values = np.clip(cw_values, 0.1, max_weight)
cw_values = cw_values / cw_values.sum()  # re-normalize

hetero_graph['substation'].class_weights = torch.tensor(cw_values, dtype=torch.float32, device=device)

print(f"\nFinal Class Weights assigned to graph nodes: {hetero_graph['substation'].class_weights.cpu().numpy()}")

print("\nFinal Label Distribution (Numeric):")
final_distribution = pd.Series(substation_new_numeric).value_counts().sort_index()
print(final_distribution)


Using device: cuda

Equipment Grouping Recreated!
Equipment_Group
Fuse                  0.336005
Customer_Equipment    0.320310
Conductor             0.163706
Infrastructure        0.129765
Protection_Device     0.050214
Name: proportion, dtype: float64

Initial Class Weights (from incident data):
Equipment_Group
Fuse                  0.074725
Conductor             0.153373
Infrastructure        0.193489
Customer_Equipment    0.078387
Protection_Device     0.500026
Name: count, dtype: float64

Alternative Majority Vote with Infrastructure Override (sample):
Job Substation
3109:HONOR HEIGHTS        Infrastructure
3110:RIVERSIDE                      Fuse
3111:FIVE TRIBES                    Fuse
3114:TENNYSON                       Fuse
3116:CALLERY                        Fuse
3117:THREE RIVERS     Customer_Equipment
3128:HANCOCK                        Fuse
3131:MUSKOGEE WW      Customer_Equipment
3132:MUSKOGEE PORT                  Fuse
3136:EUCLID                         Fuse
Name: Equip

In [40]:
# Cell Test: Multi Class Final (No Class Weights)

import re
import torch
import pandas as pd
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# Define mapping of equipment descriptions to categories
EQUIP_GROUP_MAPPING = {
    r'\bFUSE\b': 'Fuse',
    r'\bCUTOUT\b': 'Fuse',
    r'\bCONDUCTOR\b': 'Conductor',
    r'\b(RECLOSER|ARRESTOR|SUBSTATION CIRCUIT|RELAY|CIRCUIT BREAKER|GROUNDING)\b': 'Protection_Device',
    r'\b(TRANSFORMER|XFMR)\b': 'Transformer',
    r'\b(CONNECTOR|JUMPER|SPLICE)\b': 'Infrastructure',
    r'\bCUSTOMER\b': 'Customer_Equipment',
    r'\b(POLE|CROSSARM|PIN|TOWER|ANCHOR)\b': 'Infrastructure',
    r'\b(REGULATOR|CAPACITOR)\b': 'Power_Management'
}

def categorize_equip(desc):
    """Categorize equipment based on description using regex matching."""
    desc = str(desc).upper()
    for pattern, group in EQUIP_GROUP_MAPPING.items():
        if re.search(pattern, desc, flags=re.IGNORECASE):
            return group
    return 'Customer_Equipment'  # Default category

# Apply categorization to dataset
incident_df['Equipment_Group'] = incident_df['Equip Desc'].apply(categorize_equip)

# Merge categories (Transformer, Power_Management => Infrastructure)
incident_df['Equipment_Group'] = incident_df['Equipment_Group'].replace({
    'Power_Management': 'Infrastructure',
    'Transformer': 'Infrastructure'
})

# Print class distribution
print("\nEquipment Grouping Recreated!")
print(incident_df['Equipment_Group'].value_counts(normalize=True))

# Clean & Standardize Job Substation names (ensuring consistency with graph)
incident_df['Job Substation'] = incident_df['Job Substation'].astype(str).str.strip().str.upper()

def clean_and_standardize_substations(substation_name):
    """Cleans and standardizes substation names to '<feeder_id>:<substation_name>' format."""
    match = re.match(r"(\d+)\s*:\s*(.+)", substation_name.strip().upper())
    if match:
        feeder_id, name = match.groups()
        return f"{feeder_id}:{name.strip()}"
    return None  # Mark invalid format clearly

incident_df['Job Substation'] = incident_df['Job Substation'].apply(clean_and_standardize_substations)
valid_subs = incident_df['Job Substation'].dropna().unique()

# Define the class labels (without weighting)
class_labels = {
    'Fuse': 0, 
    'Conductor': 1, 
    'Infrastructure': 2,
    'Customer_Equipment': 3, 
    'Protection_Device': 4
}

# ---------------------------------------------------------
# REMOVE CLASS WEIGHT CALCULATION ENTIRELY
# (No class_weights assigned to the graph)
# ---------------------------------------------------------

# We'll do a simple majority vote approach. If you already have
# an alternative method, replace the logic below with it.

substation_labels = {}
for substation, group in incident_df.groupby('Job Substation'):
    # If substation has no incidents, default to Fuse
    if len(group) == 0:
        substation_labels[substation] = class_labels['Fuse']
        continue
    
    # Simple majority vote
    class_dist = group['Equipment_Group'].value_counts()
    substation_labels[substation] = class_labels[class_dist.idxmax()]

substation_labels = pd.Series(substation_labels)
missing_subs = set(hetero_graph['substation'].node_ids) - set(substation_labels.index)
print(f"\nNumber of graph nodes missing in incident data: {len(missing_subs)}")

# Fill missing nodes with Fuse by default
substation_labels = substation_labels.reindex(hetero_graph['substation'].node_ids, fill_value=class_labels['Fuse'])

# Convert to Tensor and assign to graph
hetero_graph['substation'].y = torch.tensor(
    substation_labels.values.astype(np.int64),
    dtype=torch.long,
    device=device
)

# Print final label distribution
final_distribution = pd.Series(substation_labels).value_counts().sort_index()
print("\nFinal Label Distribution (Numeric):")
print(final_distribution)


Using device: cuda

Equipment Grouping Recreated!
Equipment_Group
Fuse                  0.336005
Customer_Equipment    0.320310
Conductor             0.163706
Infrastructure        0.129765
Protection_Device     0.050214
Name: proportion, dtype: float64

Number of graph nodes missing in incident data: 0

Final Label Distribution (Numeric):
0    202
1      4
3    123
4     18
Name: count, dtype: int64


In [None]:
# Cell 3: Stratified Splitting of the Dataset (Enhanced with Fallback)

from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import torch
import pandas as pd

# Assume final_labels contains the final numeric substation labels from Cell 2.
# We'll use substation_new_numeric as our final labels.
final_labels = substation_new_numeric.copy()  # Series indexed by substation names, values: numeric labels

print("Overall Label Distribution for Stratified Split:")
print(final_labels.value_counts())

# Convert index and labels to numpy arrays.
substations = np.array(final_labels.index)
labels = final_labels.values

# First, split into train (70%) and temporary (30%)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.30, random_state=42)
for train_index, temp_index in sss.split(substations, labels):
    train_substations = substations[train_index]
    temp_substations = substations[temp_index]

# Check the distribution in the temporary set.
temp_labels = final_labels.loc[temp_substations].values
unique, counts = np.unique(temp_labels, return_counts=True)
print("Temporary set label counts:", dict(zip(unique, counts)))

# If any class in the temporary set has fewer than 2 samples, use a random split.
if np.min(np.bincount(temp_labels)) < 2:
    print("Warning: Some classes in the temporary set have fewer than 2 members. Using random split for validation/test.")
    indices = np.arange(len(temp_substations))
    np.random.shuffle(indices)
    split_idx = int(len(temp_substations) * 0.5)
    val_indices = indices[:split_idx]
    test_indices = indices[split_idx:]
    val_substations = temp_substations[val_indices]
    test_substations = temp_substations[test_indices]
else:
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
    for val_index, test_index in sss2.split(temp_substations, temp_labels):
        val_substations = temp_substations[val_index]
        test_substations = temp_substations[test_index]

# Display counts in each split.
print("\nSubstation counts in each split:")
print(f"Train: {len(train_substations)}, Validation: {len(val_substations)}, Test: {len(test_substations)}")

# Now, create boolean masks for each node in the heterogeneous graph.
node_ids = hetero_graph['substation'].node_ids  # List of substation names from the graph.
train_mask = torch.tensor([sub in set(train_substations) for sub in node_ids], dtype=torch.bool, device=device)
val_mask = torch.tensor([sub in set(val_substations) for sub in node_ids], dtype=torch.bool, device=device)
test_mask = torch.tensor([sub in set(test_substations) for sub in node_ids], dtype=torch.bool, device=device)

# Assign these masks to the graph.
hetero_graph['substation'].train_mask = train_mask
hetero_graph['substation'].val_mask = val_mask
hetero_graph['substation'].test_mask = test_mask

# Print final counts and overall label distribution.
print("\nFinal Train/Val/Test Split:")
print(f"Train: {train_mask.sum().item()} nodes, Validation: {val_mask.sum().item()} nodes, Test: {test_mask.sum().item()} nodes")
print("\nOverall Final Label Distribution:")
print(pd.Series(final_labels).value_counts().to_dict())


In [41]:
# Cell 3: Splitting the dataset 

# Get unique substations sorted by earliest incident
substation_earliest_time = incident_df.groupby('Job Substation')['Job OFF Time'].min()

sorted_substations = substation_earliest_time.sort_values().index

# Define split percentages
train_ratio = 0.70  
val_ratio = 0.15 
test_ratio = 0.15 

# Compute split indices
train_cutoff = int(len(sorted_substations) * train_ratio)
val_cutoff = int(len(sorted_substations) * (train_ratio + val_ratio))

# Assign substations to splits
train_substations = set(sorted_substations[:train_cutoff])
val_substations = set(sorted_substations[train_cutoff:val_cutoff])
test_substations = set(sorted_substations[val_cutoff:])

# Initialize masks as False
train_mask = torch.zeros(len(hetero_graph['substation'].node_ids), dtype=torch.bool, device=device)
val_mask = torch.zeros(len(hetero_graph['substation'].node_ids), dtype=torch.bool, device=device)
test_mask = torch.zeros(len(hetero_graph['substation'].node_ids), dtype=torch.bool, device=device)

# Assign substations to the correct mask
for i, sub in enumerate(hetero_graph['substation'].node_ids):
    if sub in train_substations:
        train_mask[i] = True
    elif sub in val_substations:
        val_mask[i] = True
    elif sub in test_substations:
        test_mask[i] = True

# Assign to the graph
hetero_graph['substation'].train_mask = train_mask
hetero_graph['substation'].val_mask = val_mask
hetero_graph['substation'].test_mask = test_mask

# Step 6: Print final counts
assert hetero_graph['substation'].y.dtype == torch.long, "Labels must be long dtype"
assert hetero_graph['substation'].class_weights.shape[0] == len(class_labels), "Class weight mismatch"
print(f"\nFinal class distribution: {pd.Series(substation_labels).value_counts().to_dict()}")
print("Edge normalization complete")
print(f"Final Train/Val/Test split: {train_mask.sum().item()}/{val_mask.sum().item()}/{test_mask.sum().item()}")



Final class distribution: {0: 202, 3: 123, 4: 18, 1: 4}
Edge normalization complete
Final Train/Val/Test split: 251/52/44


In [42]:
# Cell 4: Model Definition and Training for Multiclass Classification

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch_geometric.nn import GCNConv
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score

# -------------------------
# Weighted GCN Convolution
# -------------------------
class WeightedGCNConv(nn.Module):
    def __init__(self, in_channels, out_channels, edge_attr_dim):
        super().__init__()
        self.gcn = GCNConv(in_channels, out_channels, add_self_loops=False)
        self.edge_mlp = nn.Sequential(
            nn.Linear(edge_attr_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
        self.batch_norm = nn.BatchNorm1d(out_channels)
        self.residual_proj = nn.Linear(in_channels, out_channels) if in_channels != out_channels else nn.Identity()
        self._init_weights()
    
    def _init_weights(self):
        for layer in self.edge_mlp:
            if isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(layer.bias, 0.1)
    
    def forward(self, x, edge_index, edge_attr):
        if edge_index.size(1) == 0:
            return self.residual_proj(x)  # Handle empty edge case
        # Adjust edge MLP if needed
        edge_dim = edge_attr.shape[1] if edge_attr.dim() > 1 else 1
        if edge_dim != self.edge_mlp[0].in_features:
            print(f"⚠ Adjusting edge MLP: Expected {self.edge_mlp[0].in_features}, got {edge_dim}")
            self.edge_mlp[0] = nn.Linear(edge_dim, 64).to(edge_attr.device)
        # Normalize edge attributes
        edge_attr = (edge_attr - edge_attr.mean(dim=0)) / (edge_attr.std(dim=0) + 1e-8)
        edge_attr = edge_attr.unsqueeze(-1) if edge_attr.dim() == 1 else edge_attr
        weights = self.edge_mlp(edge_attr).squeeze()
        out = self.gcn(x, edge_index, edge_weight=weights + 1e-8)
        out = self.batch_norm(out)
        return out + self.residual_proj(x)


In [43]:
# ---------------------------------------
# Multiclass Heterogeneous GNN Model
# ---------------------------------------
class PowerGridGNN_Multiclass(nn.Module):
    def __init__(self, in_channels, hidden_dim, edge_dims, num_classes, num_layers=2):
        super().__init__()
        self.edge_types = list(edge_dims.keys())
        self.layers = nn.ModuleList()
        current_dim = in_channels
        
        # Build GCN layers for each edge type
        for _ in range(num_layers):
            layer_dict = nn.ModuleDict({
                et: WeightedGCNConv(current_dim, hidden_dim, edge_dims[et])
                for et in self.edge_types
            })
            self.layers.append(layer_dict)
            current_dim = hidden_dim
        
        # Multiclass prediction head (raw logits; CrossEntropyLoss applies softmax)
        self.head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim // 2, num_classes)
        )
        
        # Learnable aggregation weights for combining messages from different edge types
        self.aggregation_weights = nn.Parameter(torch.ones(len(self.edge_types)))
        self._init_weights()
    
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
    
    def _register_edge_stats(self, data):
        """Compute and store edge attribute statistics for normalization."""
        for et in self.edge_types:
            attr = data['substation', et, 'substation'].edge_attr
            self.register_buffer(f'{et}_mean', attr.mean(dim=0))
            self.register_buffer(f'{et}_std', attr.std(dim=0) + 1e-8)
    
    def _get_edge_masks(self, data, mode='train'):
        """Generate edge masks so that only edges connecting nodes within the current split are used."""
        masks = {}
        split_mask = getattr(data['substation'], f"{mode}_mask").bool().to(data['substation'].x.device)
        for et in self.edge_types:
            edge_info = data['substation', et, 'substation']
            edge_index = edge_info.edge_index
            source_mask = split_mask[edge_index[0]]
            target_mask = split_mask[edge_index[1]]
            masks[et] = source_mask & target_mask
        return masks
    
    def forward(self, data, mode='train'):
        x = data['substation'].x
        edge_masks = self._get_edge_masks(data, mode)
        
        # Message passing through each layer
        for layer in self.layers:
            messages = []
            for et in self.edge_types:
                edge_data = data['substation', et, 'substation']
                idx, attr = edge_data.edge_index, edge_data.edge_attr
                mask = edge_masks[et]
                idx, attr = idx[:, mask], attr[mask]
                if idx.shape[1] == 0:
                    messages.append(x.new_zeros(x.size(0), layer[et].gcn.out_channels))
                else:
                    # Normalize edge attributes using stored statistics
                    attr = (attr - getattr(self, f'{et}_mean')) / getattr(self, f'{et}_std')
                    messages.append(layer[et](x, idx, attr))
            # Aggregate messages across edge types with learnable weights
            x = torch.stack(messages, dim=0)  # Shape: (num_edge_types, num_nodes, hidden_dim)
            x = torch.sum(x * F.softmax(self.aggregation_weights, dim=0)[:, None, None], dim=0)
            x = F.relu(x)
            x = F.dropout(x, p=0.3, training=self.training)
        
        return self.head(x)

In [46]:
# -------------------------
# Hyperparameters and Setup
# -------------------------
in_channels = hetero_graph['substation'].x.size(1)
hidden_dim = 128
num_classes = 5  # According to class_labels mapping
edge_attr_dims = {'spatial': 8, 'temporal': 2, 'causal': 13}
num_epochs = 200
patience = 20

model = PowerGridGNN_Multiclass(in_channels, hidden_dim, edge_attr_dims, num_classes, num_layers=2).to(device)
model._register_edge_stats(hetero_graph)

optimizer = AdamW(model.parameters(), lr=0.005, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss(weight=hetero_graph['substation'].class_weights)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)


In [47]:
# -------------------------
# Training Loop
# -------------------------

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)
hetero_graph = hetero_graph.to(device)  

best_val_loss = float('inf')
no_improve = 0

for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()
    
    logits = model(hetero_graph, mode='train')
    loss = criterion(logits[hetero_graph['substation'].train_mask],
                     hetero_graph['substation'].y[hetero_graph['substation'].train_mask])
    
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    
    model.eval()
    with torch.no_grad():
        val_logits = model(hetero_graph, mode='val')
        val_loss = criterion(val_logits[hetero_graph['substation'].val_mask],
                             hetero_graph['substation'].y[hetero_graph['substation'].val_mask])
    
    scheduler.step(val_loss)
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improve = 0
        torch.save(model.state_dict(), 'best_multiclass_model.pt')
    else:
        no_improve += 1
    
    if epoch % 10 == 0 or epoch == 1:
        train_preds = torch.argmax(logits[hetero_graph['substation'].train_mask], dim=1)
        train_labels = hetero_graph['substation'].y[hetero_graph['substation'].train_mask]
        train_acc = (train_preds == train_labels).float().mean().item()
        print(f"Epoch {epoch} | Train Loss: {loss.item():.4f} | Val Loss: {val_loss.item():.4f} | Train Acc: {train_acc:.4f}")
    
    if no_improve >= patience:
        print("Early stopping!")
        break

# Test Evaluation

model.load_state_dict(torch.load('best_multiclass_model.pt'))
model.eval()
with torch.no_grad():
    test_logits = model(hetero_graph, mode='test')
    test_preds = torch.argmax(test_logits[hetero_graph['substation'].test_mask], dim=1)
    test_labels = hetero_graph['substation'].y[hetero_graph['substation'].test_mask]
    test_acc = (test_preds == test_labels).float().mean().item()
    test_f1 = f1_score(test_labels.cpu().numpy(), test_preds.cpu().numpy(), average='macro')
    
print("\nTest Metrics:")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Macro F1: {test_f1:.4f}")

Epoch 1 | Train Loss: 2.0289 | Val Loss: 2.4637 | Train Acc: 0.2948
Epoch 10 | Train Loss: 1.0227 | Val Loss: 1.4798 | Train Acc: 0.5578
Epoch 20 | Train Loss: 0.8359 | Val Loss: 1.5541 | Train Acc: 0.6494
Early stopping!

Test Metrics:
Test Accuracy: 0.5455
Test Macro F1: 0.1791


  model.load_state_dict(torch.load('best_multiclass_model.pt'))


In [None]:
# Cell 5: Diagnostic Analysis of Test Predictions

import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Ensure model is in evaluation mode and load best model
model.load_state_dict(torch.load('best_multiclass_model.pt'))
model.eval()

with torch.no_grad():
    test_logits = model(hetero_graph, mode='test')
    test_preds = torch.argmax(test_logits[hetero_graph['substation'].test_mask], dim=1)
    test_labels = hetero_graph['substation'].y[hetero_graph['substation'].test_mask]

# Print prediction distribution and true labels distribution
unique_preds, counts_preds = torch.unique(test_preds, return_counts=True)
unique_labels, counts_labels = torch.unique(test_labels, return_counts=True)
print("Test Predictions Distribution:")
for cls, count in zip(unique_preds.cpu().numpy(), counts_preds.cpu().numpy()):
    print(f"Class {cls}: {count}")

print("\nTest Ground Truth Distribution:")
for cls, count in zip(unique_labels.cpu().numpy(), counts_labels.cpu().numpy()):
    print(f"Class {cls}: {count}")

# Compute and display confusion matrix
cm = confusion_matrix(test_labels.cpu().numpy(), test_preds.cpu().numpy(), labels=list(class_labels.values()))
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=[k for k in class_labels.keys()],
            yticklabels=[k for k in class_labels.keys()])
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.title('Confusion Matrix on Test Set')
plt.show()


In [None]:
train_labels = hetero_graph['substation'].y[hetero_graph['substation'].train_mask].cpu().numpy()
val_labels = hetero_graph['substation'].y[hetero_graph['substation'].val_mask].cpu().numpy()
test_labels = hetero_graph['substation'].y[hetero_graph['substation'].test_mask].cpu().numpy()

print("Train Label Distribution:", pd.Series(train_labels).value_counts())
print("Val Label Distribution:", pd.Series(val_labels).value_counts())
print("Test Label Distribution:", pd.Series(test_labels).value_counts())


In [None]:
# Example: Random Substation Split

import random

substations = hetero_graph['substation'].node_ids
random.shuffle(substations)  # Shuffle in place

num_nodes = len(substations)
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15
train_cutoff = int(num_nodes * train_ratio)
val_cutoff = int(num_nodes * (train_ratio + val_ratio))

train_substations = set(substations[:train_cutoff])
val_substations = set(substations[train_cutoff:val_cutoff])
test_substations = set(substations[val_cutoff:])

train_mask = torch.zeros(num_nodes, dtype=torch.bool, device=device)
val_mask = torch.zeros(num_nodes, dtype=torch.bool, device=device)
test_mask = torch.zeros(num_nodes, dtype=torch.bool, device=device)

sub_to_idx = {sub: idx for idx, sub in enumerate(hetero_graph['substation'].node_ids)}

for i, sub in enumerate(hetero_graph['substation'].node_ids):
    if sub in train_substations:
        train_mask[i] = True
    elif sub in val_substations:
        val_mask[i] = True
    elif sub in test_substations:
        test_mask[i] = True

hetero_graph['substation'].train_mask = train_mask
hetero_graph['substation'].val_mask = val_mask
hetero_graph['substation'].test_mask = test_mask

print(f"Train: {train_mask.sum().item()}, Val: {val_mask.sum().item()}, Test: {test_mask.sum().item()}")
