### Multiplex Network Construction Documentation

In this document, we describe the construction of a multiplex network based on the incident data from the Oklahoma Gas and Electric company. The multiplex network consists of multiple layers, each representing different types of connections between the substations.

#### Layers in the Multiplex Network

1. **Job Region**
   - **Description**: This layer represents the geographical regions where the substations are located. Nodes (substations) are connected if they belong to the same job region.
   
2. **Job Area (DISTRICT)**
   - **Description**: This layer represents the specific districts within the regions. Nodes are connected if they belong to the same job area or district.
   
3. **Month/Day/Year**
   - **Description**: This temporal layer represents the date on which incidents occurred. Nodes are connected if incidents at these substations occurred on the same day.
   
4. **Custs Affected Interval**
   - **Description**: This layer categorizes incidents based on the number of customers affected. Nodes are connected if the number of affected customers falls within the same interval (Very Low, Low, Medium, High).
   
5. **OGE Causes**
   - **Description**: This layer categorizes incidents based on their causes as defined by the Oklahoma Gas and Electric company. Nodes are connected if incidents share the same cause.
   
6. **Major Storm Event (Yes or No)**
   - **Description**: This layer represents whether an incident occurred during a major storm event. Nodes are connected if incidents at these substations were affected by the same storm event (Yes or No).
   
7. **Distribution, Substation, Transmission Type**
   - **Description**: This layer represents the type of infrastructure associated with the incidents. Nodes are connected if they belong to the same type, such as distribution, substation, or transmission.

These layers collectively provide a comprehensive view of the different relationships and interactions between the substations based on various criteria, enabling a detailed analysis of the incident data.


In [None]:
# Multiplex Network Creation

import pandas as pd
import networkx as nx
import itertools
import numpy as np

# Load the dataset
file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(file_path)

# Preprocess data: replace spaces in 'Job Substation' names with underscores
data['Job Substation'] = data['Job Substation'].str.replace(' ', '_')

# Define intervals for 'Custs Affected Interval'
custs_intervals = {
    'Very Low': (0, 50),
    'Low': (51, 100),
    'Medium': (101, 500),
    'High': (501, float('inf'))
}

def categorize_custs_affected(affected):
    for category, (low, high) in custs_intervals.items():
        if low <= affected <= high:
            return category
    return 'Unknown'

# Add a column for categorized customer affected intervals
data['Custs Affected Interval'] = data['Custs Affected'].apply(categorize_custs_affected)

# Initialize a dictionary to hold each layer's graph
layers = {
    'Job Region': nx.Graph(),
    'Job Area (DISTRICT)': nx.Graph(),
    'Time': nx.Graph(),
    'Custs Affected Interval': nx.Graph(),
    'OGE Causes': nx.Graph(),
    'Major Storm Event': nx.Graph(),
    'Distribution, Substation, Transmission': nx.Graph()
}

# Add nodes with replaced spaces
for layer in layers:
    nodes = [substation.replace(' ', '_') for substation in data['Job Substation'].unique()]
    layers[layer].add_nodes_from(nodes)

# Define functions to add edges to each layer based on criteria
def add_edges_by_column(layer_name, column):
    layer = layers[layer_name]
    for _, group in data.groupby(column):
        nodes = [substation.replace(' ', '_') for substation in group['Job Substation']]
        for node1, node2 in itertools.combinations(nodes, 2):
            layer.add_edge(node1, node2)

def add_edges_by_date(layer_name):
    layer = layers[layer_name]
    for _, group in data.groupby('Month/Day/Year'):
        nodes = [substation.replace(' ', '_') for substation in group['Job Substation']]
        for node1, node2 in itertools.combinations(nodes, 2):
            layer.add_edge(node1, node2)

# Add edges for each layer
add_edges_by_column('Job Region', 'Job Region')
add_edges_by_column('Job Area (DISTRICT)', 'Job Area (DISTRICT)')
add_edges_by_date('Time')
add_edges_by_column('Custs Affected Interval', 'Custs Affected Interval')
add_edges_by_column('OGE Causes', 'OGE Causes')
add_edges_by_column('Major Storm Event', 'Major Storm Event  Y (Yes) or N (No)')
add_edges_by_column('Distribution, Substation, Transmission', 'Distribution, Substation, Transmission')

# Custom class to represent a multiplex network
class MultiplexNetwork:
    def __init__(self):
        self.layers = {}
        self.node_set = set()
        
    def add_layer(self, layer_name, graph):
        self.layers[layer_name] = graph
        self.node_set.update(graph.nodes)
        
    def get_layer(self, layer_name):
        return self.layers.get(layer_name, None)
    
    def nodes(self):
        return self.node_set
    
    def edges(self, layer_name=None):
        if layer_name:
            return self.layers[layer_name].edges
        else:
            all_edges = {}
            for layer, graph in self.layers.items():
                all_edges[layer] = list(graph.edges)
            return all_edges

# Create the multiplex network
multiplex_network = MultiplexNetwork()
for layer_name, graph in layers.items():
    multiplex_network.add_layer(layer_name, graph)

# Interact with the multiplex network
#print(f"All nodes in multiplex network: {multiplex_network.nodes()}")
#for layer_name in layers:
    #print(f"Edges in {layer_name} layer: {multiplex_network.edges(layer_name)}")

# print the number of nodes and edges in each layer
for layer_name in layers:
    print(f"Number of nodes in {layer_name} layer: {len(layers[layer_name].nodes)}")
    print(f"Number of edges in {layer_name} layer: {len(layers[layer_name].edges)}")    


In [None]:
# Save adjacency matrices of each layer as CSV files

import os
import numpy as np
import pandas as pd

# Define the directory to save the adjacency matrices
output_dir = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices'
os.makedirs(output_dir, exist_ok=True)

# Function to save adjacency matrix of each layer
def save_adjacency_matrices(multiplex_network, output_dir):
    for layer_name, graph in multiplex_network.layers.items():
        # Create adjacency matrix
        adjacency_matrix = nx.to_numpy_array(graph)
        
        # Convert adjacency matrix to DataFrame for CSV export
        adjacency_df = pd.DataFrame(adjacency_matrix, index=graph.nodes, columns=graph.nodes)
        
        # Define file path
        file_path = os.path.join(output_dir, f"{layer_name.replace(' ', '_')}_adjacency_matrix.csv")
        
        # Save adjacency matrix as .csv file
        adjacency_df.to_csv(file_path)
        print(f"Adjacency matrix for layer '{layer_name}' saved at: {file_path}")

# Assuming `multiplex_network` is your existing multiplex network object
save_adjacency_matrices(multiplex_network, output_dir)


In [None]:
# Preparing the datasets for Machine Learning

import pandas as pd

# Load the datasets
incidents_file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'  # Update with your file path
embeddings_file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/embeddings_new/r0.25/mltn2v_results.csv'  # Update with your file path

# Read the files
incidents_data = pd.read_excel(incidents_file_path)
embeddings_data = pd.read_csv(embeddings_file_path)

# Ensure 'Job Substation' in incidents_data matches the embedding keys
incidents_data['Job Substation'] = incidents_data['Job Substation'].str.replace(' ', '_')

# Merge the embeddings data with only the 'Extent' column from the incidents data
reduced_incidents_data = incidents_data[['Job Substation', 'Extent']]

# Merge the embeddings data with the reduced incidents data
merged_data_corrected = pd.merge(reduced_incidents_data, embeddings_data, left_on='Job Substation', right_on=embeddings_data.columns[0], how='inner')

# Drop the substation identifier column from embeddings data
merged_data_corrected = merged_data_corrected.drop(columns=[embeddings_data.columns[0]])

# Save the merged data to a CSV file
output_file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/merged_data_n.csv'  # Update with your desired output path
merged_data_corrected.to_csv(output_file_path, index=False)

print(f"Merged data saved to {output_file_path}")


In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Read the merged data from the specified file path
merged_file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/merged_data_n.csv'
merged_data = pd.read_csv(merged_file_path)

# Display the first few rows of the merged data to ensure it loaded correctly
#print(merged_data.head())

# Define the target variable
target_column = 'Extent'

# Extract embedding columns (all columns except the target)
embedding_columns = merged_data.columns[1:]

# Select features (embeddings) and target
X = merged_data[embedding_columns]
y = merged_data[target_column]

# Initialize label encoder
le = LabelEncoder()

# Encode the target variable
y_encoded = le.fit_transform(y)

# Ensure all feature columns are numeric
X = X.apply(pd.to_numeric, errors='coerce')

# Handle missing values (fill with median values)
X = X.fillna(X.median())

# Verify there are no more missing values
missing_values_after = X.isnull().sum().sum()
print(f"Total missing values after handling: {missing_values_after}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Total missing values after handling: 4993
Accuracy: 0.5435435435435435
Classification Report:
              precision    recall  f1-score   support

    Customer       0.59      0.78      0.67       488
      Feeder       0.33      0.13      0.19        62
        Load       0.48      0.50      0.49       305
    Mainline       0.08      0.01      0.02        76
   Secondary       0.00      0.00      0.00        37
  Substation       0.00      0.00      0.00         2
         Tap       0.33      0.03      0.06        29

    accuracy                           0.54       999
   macro avg       0.26      0.21      0.20       999
weighted avg       0.47      0.54      0.49       999



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# Install the imblearn library
!pip install imbalanced-learn

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the merged data
merged_file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/merged_data_n.csv'
merged_data = pd.read_csv(merged_file_path)

# Define the target variable
target_column = 'Extent'

# Extract embedding columns (all columns except the target)
embedding_columns = merged_data.columns[2:]  # Excluding 'Job Substation' and 'Extent'

# Select features (embeddings) and target
X = merged_data[embedding_columns]
y = merged_data[target_column]

# Initialize label encoder
le = LabelEncoder()

# Encode the target variable
y_encoded = le.fit_transform(y)

# Ensure all feature columns are numeric
X = X.apply(pd.to_numeric, errors='coerce')

# Handle missing values (fill with median values)
X = X.fillna(X.median())

# Drop rows with missing values if any
if X.isnull().sum().sum() > 0:
    X.dropna(inplace=True)
    y_encoded = y_encoded[X.index]

# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best estimator
best_rf = grid_search.best_estimator_

# Make predictions with the best estimator
y_pred = best_rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Defaulting to user installation because normal site-packages is not writeable
Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.0s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.1s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.2s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   9.8s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  10.0s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  10.0s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.9s
[CV] END bootstrap=True, max_depth=10, m

In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the incidents data
incidents_file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
incidents_data = pd.read_excel(incidents_file_path)

# Define the target variable
target_column = 'Extent'

# Select features (all columns except the target and any non-numeric columns)
features = incidents_data.select_dtypes(include=[np.number]).columns.tolist()

# Ensure the target column is in the data
assert target_column in incidents_data.columns, f"Target column '{target_column}' not found in the dataset"

# Select features and target
X = incidents_data[features]
y = incidents_data[target_column]

# Initialize label encoder
le = LabelEncoder()

# Encode the target variable
y_encoded = le.fit_transform(y)

# Handle missing values (fill with median values)
X = X.fillna(X.median())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_, labels=le.transform(le.classes_))

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Accuracy: 0.8141858141858141
Classification Report:
              precision    recall  f1-score   support

    Customer       0.85      0.96      0.90       505
      Feeder       0.95      0.93      0.94        57
        Load       0.75      0.70      0.72       309
    Mainline       0.71      0.56      0.62        81
   Secondary       0.77      0.63      0.69        27
  Substation       0.00      0.00      0.00         0
         Tap       0.00      0.00      0.00        22

    accuracy                           0.81      1001
   macro avg       0.58      0.54      0.55      1001
weighted avg       0.79      0.81      0.80      1001



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Check the data types of the columns
data_types = merged_data.dtypes
print(data_types)


In [None]:
# Identify non-numeric columns in the feature matrix
non_numeric_columns = X.select_dtypes(exclude=[np.number]).columns
print(f"Non-numeric columns: {non_numeric_columns}")

# Display unique values of non-numeric columns to understand the issue
for col in non_numeric_columns:
    unique_values = X[col].unique()
    print(f"Unique values in {col}: {unique_values}")


In [None]:
import pandas as pd
import networkx as nx
import itertools
import os

# Load the dataset
file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(file_path)

# Preprocess data: replace spaces in 'Job Substation' names with underscores
data['Job Substation'] = data['Job Substation'].str.replace(' ', '_')

# Define intervals for 'Custs Affected Interval'
custs_intervals = {
    'Very Low': (0, 50),
    'Low': (51, 100),
    'Medium': (101, 500),
    'High': (501, float('inf'))
}

def categorize_custs_affected(affected):
    for category, (low, high) in custs_intervals.items():
        if low <= affected <= high:
            return category
    return 'Unknown'

# Add a column for categorized customer affected intervals
data['Custs Affected Interval'] = data['Custs Affected'].apply(categorize_custs_affected)

# Initialize a dictionary to hold each layer's graph
layers = {
    'Job Region': nx.Graph(),
    'Job Area (DISTRICT)': nx.Graph(),
    'Month/Day/Year': nx.Graph(),
    'Custs Affected Interval': nx.Graph(),
    'OGE Causes': nx.Graph(),
    'Major Storm Event': nx.Graph(),
    'Distribution, Substation, Transmission Type': nx.Graph()
}

# Add nodes
for layer in layers:
    layers[layer].add_nodes_from(data['Job Substation'].unique())

# Define functions to add edges to each layer based on criteria
def add_edges_by_column(layer_name, column):
    layer = layers[layer_name]
    for _, group in data.groupby(column):
        for node1, node2 in itertools.combinations(group['Job Substation'], 2):
            layer.add_edge(node1, node2)

def add_edges_by_date(layer_name):
    layer = layers[layer_name]
    for _, group in data.groupby('Month/Day/Year'):
        for node1, node2 in itertools.combinations(group['Job Substation'], 2):
            layer.add_edge(node1, node2)

# Add edges for each layer
add_edges_by_column('Job Region', 'Job Region')
add_edges_by_column('Job Area (DISTRICT)', 'Job Area (DISTRICT)')
add_edges_by_date('Month/Day/Year')
add_edges_by_column('Custs Affected Interval', 'Custs Affected Interval')
add_edges_by_column('OGE Causes', 'OGE Causes')
add_edges_by_column('Major Storm Event', 'Major Storm Event  Y (Yes) or N (No)')
add_edges_by_column('Distribution, Substation, Transmission Type', 'Distribution, Substation, Transmission')
# Output the constructed graphs for verification
for layer_name, graph in layers.items():
    print(f"Layer: {layer_name}")
    print(f"Number of nodes: {graph.number_of_nodes()}")
    print(f"Number of edges: {graph.number_of_edges()}")
    print()
    
# Define the path where the adjacency matrices will be saved
output_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices'

# Create the output directory if it does not exist
os.makedirs(output_path, exist_ok=True)

# Function to save adjacency matrix with headers
def save_adjacency_matrix_with_headers(graph, filename):
    adj_matrix = nx.to_numpy_array(graph)
    nodes = list(graph.nodes)
    adj_df = pd.DataFrame(adj_matrix, index=nodes, columns=nodes)
    adj_df.to_csv(filename)

# Save adjacency matrices for each layer with correct headers
for layer_name, graph in layers.items():
    safe_layer_name = layer_name.replace(" ", "_").replace("/", "_")
    filename = os.path.join(output_path, f'{safe_layer_name}_adjacency_matrix.csv')
    save_adjacency_matrix_with_headers(graph, filename)

# List the saved files for confirmation
saved_files = os.listdir(output_path)
saved_files


In [None]:
# Load the embeddings file to check the structure
embeddings_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/embeddings_new/r0.25/mltn2v_results.csv'
embeddings = pd.read_csv(embeddings_path, header=None)

# Display the first few rows to understand the structure
embeddings.head()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the embeddings file
embeddings_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/embeddings_new/r0.25/mltn2v_results.csv'
embeddings = pd.read_csv(embeddings_path, header=None)

# Load the original dataset
original_data_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(original_data_path)

# Assume the first column in embeddings is the node identifier
node_ids = embeddings[0]
embeddings = embeddings.drop(0, axis=1)

# Add column names to embeddings
embedding_columns = [f'embedding_{i}' for i in range(embeddings.shape[1])]
embeddings.columns = embedding_columns

# Combine node_ids with embeddings
embeddings['Job Substation'] = node_ids

# Preprocess the original dataset
data['Job Substation'] = data['Job Substation'].str.replace(' ', '_')

# Merge embeddings with the original dataset on Job Substation
merged_data = pd.merge(data, embeddings, on='Job Substation')

# Select the embeddings and the target column
ml_data = merged_data[embedding_columns + ['Extent']]

# Handle missing values if necessary
ml_data = ml_data.dropna()

# Split the data into features and target
X = ml_data[embedding_columns]
y = ml_data['Extent']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)

print(report)

# Save the prepared data for further analysis
prepared_data_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/prepared_ml_data.csv'
ml_data.to_csv(prepared_data_path, index=False)

prepared_data_path


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Load the embeddings file
embeddings_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/embeddings_new/r0.25/mltn2v_results.csv'
embeddings = pd.read_csv(embeddings_path, header=None)

# Load the original dataset
original_data_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(original_data_path)

# Assume the first column in embeddings is the node identifier
node_ids = embeddings[0]
embeddings = embeddings.drop(0, axis=1)

# Add column names to embeddings
embedding_columns = [f'embedding_{i}' for i in range(embeddings.shape[1])]
embeddings.columns = embedding_columns

# Combine node_ids with embeddings
embeddings['Job Substation'] = node_ids

# Preprocess the original dataset
data['Job Substation'] = data['Job Substation'].str.replace(' ', '_')

# Merge embeddings with the original dataset on Job Substation
merged_data = pd.merge(data, embeddings, on='Job Substation')

# Select the embeddings and the target column
ml_data = merged_data[embedding_columns + ['Extent']]

# Handle missing values if necessary
ml_data = ml_data.dropna()

# Split the data into features and target
X = ml_data[embedding_columns]
y = ml_data['Extent']

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)

print(report)

# Save the prepared data for further analysis
prepared_data_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/prepared_ml_data.csv'
ml_data.to_csv(prepared_data_path, index=False)

prepared_data_path


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Load the embeddings file
embeddings_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/embeddings_new/r0.25/mltn2v_results.csv'
embeddings = pd.read_csv(embeddings_path, header=None)

# Load the original dataset
original_data_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(original_data_path)

# Assume the first column in embeddings is the node identifier
node_ids = embeddings[0]
embeddings = embeddings.drop(0, axis=1)

# Add column names to embeddings
embedding_columns = [f'embedding_{i}' for i in range(embeddings.shape[1])]
embeddings.columns = embedding_columns

# Combine node_ids with embeddings
embeddings['Job Substation'] = node_ids

# Preprocess the original dataset
data['Job Substation'] = data['Job Substation'].str.replace(' ', '_')

# Merge embeddings with the original dataset on Job Substation
merged_data = pd.merge(data, embeddings, on='Job Substation')

# Select the embeddings and the target column
ml_data = merged_data[embedding_columns + ['Extent']]

# Handle missing values if necessary
ml_data = ml_data.dropna()

# Split the data into features and target
X = ml_data[embedding_columns]
y = ml_data['Extent']

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best estimator
best_rf = grid_search.best_estimator_

# Predict and evaluate the model
y_pred = best_rf.predict(X_test)
report = classification_report(y_test, y_pred)

print(report)

# Save the prepared data for further analysis
prepared_data_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/prepared_ml_data.csv'
ml_data.to_csv(prepared_data_path, index=False)

prepared_data_path


In [None]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Load the embeddings file
embeddings_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/embeddings_new/r0.25/mltn2v_results.csv'
embeddings = pd.read_csv(embeddings_path, header=None)

# Load the original dataset
original_data_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(original_data_path)

# Assume the first column in embeddings is the node identifier
node_ids = embeddings[0]
embeddings = embeddings.drop(0, axis=1)

# Add column names to embeddings
embedding_columns = [f'embedding_{i}' for i in range(embeddings.shape[1])]
embeddings.columns = embedding_columns

# Combine node_ids with embeddings
embeddings['Job Substation'] = node_ids

# Preprocess the original dataset
data['Job Substation'] = data['Job Substation'].str.replace(' ', '_')

# Merge embeddings with the original dataset on Job Substation
merged_data = pd.merge(data, embeddings, on='Job Substation')

# Select the embeddings and the target column
ml_data = merged_data[embedding_columns + ['Extent']]

# Handle missing values if necessary
ml_data = ml_data.dropna()

# Encode the target labels
label_encoder = LabelEncoder()
ml_data['Extent'] = label_encoder.fit_transform(ml_data['Extent'])

# Split the data into features and target
X = ml_data[embedding_columns]
y = ml_data['Extent']

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train an XGBoostClassifier
model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(report)


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Load the embeddings file
embeddings_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/embeddings_new/r0.25/mltn2v_results.csv'
embeddings = pd.read_csv(embeddings_path, header=None)

# Load the original dataset
original_data_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(original_data_path)

# Assume the first column in embeddings is the node identifier
node_ids = embeddings[0]
embeddings = embeddings.drop(0, axis=1)

# Add column names to embeddings
embedding_columns = [f'embedding_{i}' for i in range(embeddings.shape[1])]
embeddings.columns = embedding_columns

# Combine node_ids with embeddings
embeddings['Job Substation'] = node_ids

# Preprocess the original dataset
data['Job Substation'] = data['Job Substation'].str.replace(' ', '_')

# Merge embeddings with the original dataset on Job Substation
merged_data = pd.merge(data, embeddings, on='Job Substation')

# Select the embeddings and the target column
ml_data = merged_data[embedding_columns + ['Extent']]

# Handle missing values if necessary
ml_data = ml_data.dropna()

# Encode the target labels
label_encoder = LabelEncoder()
ml_data['Extent'] = label_encoder.fit_transform(ml_data['Extent'])

# Split the data into features and target
X = ml_data[embedding_columns]
y = ml_data['Extent']

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Initialize the XGBoostClassifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best estimator
best_xgb = grid_search.best_estimator_

# Predict and evaluate the model
y_pred = best_xgb.predict(X_test)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(report)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Load datasets
incident_df = pd.read_excel('/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx')
embeddings_df = pd.read_csv('/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/embeddings/r0.25/mltn2v_results.csv')

# Rename the first column of embeddings_df to 'Job Substation'
embeddings_df.rename(columns={embeddings_df.columns[0]: 'Job Substation'}, inplace=True)

# Ensure 'Job Substation' column is standardized in both datasets
incident_df['Job Substation'] = incident_df['Job Substation'].str.replace(' ', '').str.replace('_', '').str.upper()
embeddings_df['Job Substation'] = embeddings_df['Job Substation'].str.replace(' ', '').str.replace('_', '').str.upper()

# Filter the original incident data to match the embeddings_df
filtered_incident_df = incident_df[incident_df['Job Substation'] != '3109:HONORHEIGHTS']

# Merge the filtered_incident_df with embeddings_df while preserving the 'Extent' column
aligned_df = pd.merge(filtered_incident_df, embeddings_df, on='Job Substation', how='inner')

# Check if 'Extent' column is preserved
print("'Extent' column in aligned_df:", 'Extent' in aligned_df.columns)

# Define features and target
drop_columns = [
    'Extent', 'Job Display ID', 'CAD_ID', 'Job Region', 'Job Area (DISTRICT)', 
    'Job Substation', 'Job Feeder', 'Feeder ID', 'Job OFF Time', 'Job ON Time', 
    'Job Duration Mins', 'OGE Causes', 'Major Storm Event  Y (Yes) or N (No)', 
    'Distribution, Substation, Transmission', 
    'Transmission Voltage (69kV, 138kV, 161kv) feeding distribution substation', 
    'Month/Day/Year', 'Year', 
    'Equipment Desc that should be excluded from reported indices', 
    'Ark Grid Mod or OK Grid Enhancement Circuits'
]

# Ensure correct columns are dropped for classical_features and network_features
classical_features = aligned_df.drop(columns=drop_columns, errors='ignore')
network_features = aligned_df.drop(columns=drop_columns + ['Extent'], errors='ignore')

# Ensure only numerical columns are kept in network_features
network_features = network_features.select_dtypes(include=['float64', 'int64'])

target = aligned_df['Extent']

# Identify columns with mixed types
def identify_mixed_types(df):
    mixed_type_columns = []
    for col in df.columns:
        unique_types = set(df[col].apply(type))
        if len(unique_types) > 1:
            mixed_type_columns.append(col)
    return mixed_type_columns

mixed_type_columns = identify_mixed_types(classical_features)

# Convert columns with mixed types to strings
for col in mixed_type_columns:
    classical_features[col] = classical_features[col].astype(str)

# Drop columns with all missing values
classical_features = classical_features.dropna(axis=1, how='all')
network_features = network_features.dropna(axis=1, how='all')

# Ensure DataFrames are not empty
print("Classical features shape:", classical_features.shape)
print("Network features shape:", network_features.shape)

# Split the data
X_train_classical, X_test_classical, y_train_classical, y_test_classical = train_test_split(classical_features, target, test_size=0.2, random_state=42)
X_train_network, X_test_network, y_train_network, y_test_network = train_test_split(network_features, target, test_size=0.2, random_state=42)

# Define preprocessing for classical features (example with numerical and categorical)
numerical_features = classical_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = classical_features.select_dtypes(include=['object']).columns.tolist()

preprocessor_classical = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ]
)

# Define preprocessing for network features (assuming all embeddings are numerical)
numerical_features_network = network_features.columns.tolist()

preprocessor_network = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features_network)
    ]
)

# Create pipelines
pipeline_classical = Pipeline(steps=[
    ('preprocessor', preprocessor_classical),
    ('classifier', RandomForestClassifier(random_state=42))
])

pipeline_network = Pipeline(steps=[
    ('preprocessor', preprocessor_network),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train and evaluate classical model
pipeline_classical.fit(X_train_classical, y_train_classical)
y_pred_classical = pipeline_classical.predict(X_test_classical)
accuracy_classical = accuracy_score(y_test_classical, y_pred_classical)

# Train and evaluate network model
pipeline_network.fit(X_train_network, y_train_network)
y_pred_network = pipeline_network.predict(X_test_network)
accuracy_network = accuracy_score(y_test_network, y_pred_network)

print("Classical Machine Learning Accuracy:", accuracy_classical)
print("Network Embeddings Machine Learning Accuracy:", accuracy_network)
