In [1]:
# Loading the Libraries and Raw Dataset
import pandas as pd
import networkx as nx
import itertools
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder



In [2]:
############# Final Feature Importance for the Target Columns #############
# Loading the Raw Dataset
file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(file_path)

# Function to compute feature importances with stability
def compute_feature_importances(data, target_column, n_runs=10):
    all_importances = []
    feature_names = None
    
    for _ in range(n_runs):
        # Split the data into features and target
        X = data.drop(columns=[target_column])
        y = data[target_column]

        # Handle categorical variables using one-hot encoding
        X = pd.get_dummies(X, drop_first=True)

        # Capture feature names before transforming X
        feature_names = X.columns

        # Ensure no NaN or infinite values in the dataset
        X = np.nan_to_num(X)
        y = np.nan_to_num(y)

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train a RandomForestClassifier
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)

        # Capture feature importances
        importances = model.feature_importances_
        all_importances.append(importances)
    
    # Average the importances across all runs
    avg_importances = np.mean(all_importances, axis=0)
    
    # Create a DataFrame for feature importances
    importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': avg_importances}).sort_values(by='Importance', ascending=False)
    
    return importances_df

# Convert all object columns to strings to ensure uniformity
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].astype(str)

# Convert datetime columns to numeric timestamps
datetime_columns = ['Job OFF Time', 'Job ON Time', 'Month/Day/Year']
for column in datetime_columns:
    data[column] = pd.to_datetime(data[column], errors='coerce')
    data[column] = data[column].astype('int64') // 10**9  # Convert to seconds since epoch

# Encode categorical columns using LabelEncoder
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Identify and remove columns with all NaN values
nan_columns = data.columns[data.isna().all()].tolist()
data.drop(columns=nan_columns, inplace=True)

# Fill missing values with a placeholder first
data = data.fillna(-9999)

# Impute missing values by replacing the placeholder with the median value of the column
imputer = SimpleImputer(strategy='median', missing_values=-9999)
data_imputed = imputer.fit_transform(data)

# Convert back to DataFrame
data = pd.DataFrame(data_imputed, columns=data.columns)

# Compute feature importances for 'Job Area (DISTRICT)'
important_features_job_area = compute_feature_importances(data, 'Job Area (DISTRICT)', n_runs=10)
print("Top 10 important features for predicting 'Job Area (DISTRICT)':")
print(important_features_job_area.head(10))

# Compute feature importances for 'Extent'
important_features_extent = compute_feature_importances(data, 'Extent', n_runs=10)
print("Top 10 important features for predicting 'Extent':")
print(important_features_extent.head(10))

# Display full importance scores for both targets
#important_features_job_area, important_features_extent


Top 10 important features for predicting 'Job Area (DISTRICT)':
                                         Feature  Importance
24                           Total Area Premises    0.196319
23                         Total Region Premises    0.135345
3                                 Job Substation    0.132535
5                                      Feeder ID    0.120624
2                                     Job Region    0.106200
16                                      Job City    0.050573
25                          Total Subst Premises    0.049651
4                                     Job Feeder    0.038935
45  Ark Grid Mod or OK Grid Enhancement Circuits    0.031521
26                         Total Feeder Premises    0.022500
Top 10 important features for predicting 'Extent':
                     Feature  Importance
35               Dev Subtype    0.083898
29                 Job SAIFI    0.082758
10            Custs Affected    0.078632
19              CMI Category    0.060542
15       

In [3]:
############ Constructing Multiplex Networks with Top Features for Job Area (District) ############
file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(file_path)

#nodes 
unique_substations = data['Job Substation'].unique()

# Function to add edges to a graph based on a column
def add_edges_by_column(graph, column):
    for _, group in data.groupby(column):
        nodes = [str(substation).replace(' ', '_') for substation in group['Job Substation']]
        for node1, node2 in itertools.combinations(nodes, 2):
            graph.add_edge(node1, node2)

# Function to construct adjacency matrix for each layer in the multiplex network
def get_adjacency_matrix(graph):
    return nx.adjacency_matrix(graph, nodelist=unique_substations).todense()

# Define the directory to save adjacency matrices
output_dir = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices'
os.makedirs(output_dir, exist_ok=True)

# Feature importances (mock data)
#top_features = ['Total Area Premises', 'Total Region Premises', 'Job Substation', 'Feeder ID', 'Job Region', 'Job City', 'Total Subst Premises']
top_features = important_features_job_area.head(7)['Feature'].tolist()


# Construct multiplex networks and save individual adjacency matrices for each layer
for i in range(2, 8):
    features_subset = top_features[:i]
    network_folder = os.path.join(output_dir, f'Network_{i}_Features')
    os.makedirs(network_folder, exist_ok=True)
    
    for feature in features_subset:
        graph = nx.Graph()
        graph.add_nodes_from(unique_substations)  # Ensure all nodes are added first
        add_edges_by_column(graph, feature)
        
        adj_matrix = get_adjacency_matrix(graph)
        
        # Convert to DataFrame and include node names
        adj_df = pd.DataFrame(adj_matrix, index=unique_substations, columns=unique_substations)
        
        # Save adjacency matrix to CSV file
        file_path = os.path.join(network_folder, f'Adjacency_Matrix_{feature}.csv')
        adj_df.to_csv(file_path)

        # Print confirmation
        print(f"Saved adjacency matrix for {feature} in network with top {i} features to {file_path}")

print("Finished saving all adjacency matrices.")

Saved adjacency matrix for Total Area Premises in network with top 2 features to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/Adjacency_Matrix_Total Area Premises.csv
Saved adjacency matrix for Total Region Premises in network with top 2 features to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/Adjacency_Matrix_Total Region Premises.csv
Saved adjacency matrix for Total Area Premises in network with top 3 features to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_3_Features/Adjacency_Matrix_Total Area Premises.csv
Saved adjacency matrix for Total Region Premises in network with top 3 features to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_3_Features/Adjacency_Matrix_Total Region Premises.csv
Saved adjacency matrix for Job Substation in network with top 3 features to 

In [8]:
######### Prepare the datasets for prediction of Job Area (District) #########

# Preparing the datasets for Machine Learning
file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(file_path)

# Ensure 'Job Substation' in data matches the embedding keys
data['Job Substation'] = data['Job Substation'].astype(str).str.replace(' ', '_')

# Merge the embeddings data with the 'Job Area (DISTRICT)' column from the incidents data
reduced_incidents_data = data[['Job Substation', 'Job Area (DISTRICT)']]

# List of embedding paths
embedding_paths = [
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/r0.25',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_3_Features/r0.25',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_4_Features/r0.25',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_5_Features/r0.25',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_6_Features/r0.25',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_7_Features/r0.25'
]

# Iterate over each embedding path
for embedding_path in embedding_paths:
    nw_feature = os.path.basename(os.path.dirname(embedding_path))
    embeddings_file_path = os.path.join(embedding_path, 'mltn2v_results.csv')

    # Read the embeddings data
    embeddings_data = pd.read_csv(embeddings_file_path)

    # Merge the embeddings data with the reduced incidents data
    merged_data_corrected = pd.merge(reduced_incidents_data, embeddings_data, left_on='Job Substation', right_on=embeddings_data.columns[0], how='inner')

    # Drop the substation identifier column from embeddings data
    merged_data_corrected = merged_data_corrected.drop(columns=[embeddings_data.columns[0]])

    # Rename columns for embeddings
    embedding_columns = [f'Embedding {i+1}' for i in range(merged_data_corrected.shape[1] - 2)]
    merged_data_corrected.columns = ['Job Substation', 'Job Area (DISTRICT)'] + embedding_columns

    # Save the merged data to a CSV file
    output_file_path = os.path.join(embedding_path, 'merged_data_with_target.csv')
    merged_data_corrected.to_csv(output_file_path, index=False)

    print(f"Merged data saved to {output_file_path}")

    # Display the first few rows of the merged data
    print(merged_data_corrected.head())


Merged data saved to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/r0.25/merged_data_with_target.csv
    Job Substation Job Area (DISTRICT)  Embedding 1  Embedding 2  Embedding 3  \
0   8617:SUNNYLANE                EAST    -0.166978     0.134816     0.114100   
1  5712:JOLLYVILLE             SULPHUR     0.079902     0.078907    -0.314676   
2         9410:IGO               OZARK     0.254093    -0.223003    -0.204571   
3      7306:FIXICO            SEMINOLE    -0.301220    -0.005030    -0.732102   
4     4106:HEMLOCK                ENID    -0.179222     0.041837    -0.324901   

   Embedding 4  Embedding 5  Embedding 6  Embedding 7  Embedding 8  ...  \
0    -0.124874    -0.046537     0.165154     0.299822     0.347808  ...   
1    -0.091225     0.031908     0.336273     0.206357     0.173952  ...   
2     0.233370     0.198864     0.527391     0.548082     0.334865  ...   
3     0.312610     0.392512     0.535990     0.8863

In [9]:
############## Testing on 2 Features Network Embeddings ##############

# Load the merged data
merged_data_file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/r0.25/merged_data_with_target.csv' 
merged_data = pd.read_csv(merged_data_file_path)

# Separate features and target
X = merged_data.drop(columns=['Job Area (DISTRICT)', 'Job Substation'])
y = merged_data['Job Area (DISTRICT)']

# Initialize the Random Forest classifier
model = RandomForestClassifier(random_state=42)

# Perform 10-fold cross-validation
scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')

# Print the accuracy for each fold and the mean accuracy
print(f"Accuracy for each fold: {scores}")
print(f"Mean accuracy: {scores.mean()}")




Accuracy for each fold: [1.         1.         0.99328859 1.         1.         1.
 0.99326599 1.         1.         1.        ]
Mean accuracy: 0.9986554583870021


In [10]:

####### Feature Importance for 2 Features Network Embeddings ########

# Train the model on the full dataset to get feature importances
model.fit(X, y)

# Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame for better visualization
feature_importances_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Print the top 10 important features
print("Top 10 important features:")
print(feature_importances_df.head(10))


Top 10 important features:
         Feature  Importance
57  Embedding 58    0.027060
49  Embedding 50    0.026653
27  Embedding 28    0.026600
50  Embedding 51    0.025443
29  Embedding 30    0.023977
46  Embedding 47    0.022677
59  Embedding 60    0.021852
19  Embedding 20    0.019102
41  Embedding 42    0.017377
51  Embedding 52    0.016373


In [11]:
############# Final Prediction of Job Area (District) ############

from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd

# Paths to the datasets
embedding_paths = [
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/r0.25/merged_data_with_target.csv',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/r0.25/merged_data_with_target.csv',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/r0.25/merged_data_with_target.csv',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/r0.25/merged_data_with_target.csv',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/r0.25/merged_data_with_target.csv',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/adjacency_matrices/Network_2_Features/r0.25/merged_data_with_target.csv'
]

# Load embedding datasets
embedding_data = [pd.read_csv(path) for path in embedding_paths]

# Prepare data for embedding datasets
def prepare_embedding_data(data):
    X = data.iloc[:, 2:].values  # All columns except the first two
    y = data['Job Area (DISTRICT)'].values
    return X, y

# Encode labels
def encode_labels(y):
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    return y_encoded, le

# Perform 10-fold cross-validation
def perform_cv(X, y, cv=10):
    clf = RandomForestClassifier()
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, y, cv=kf, scoring='accuracy')
    return scores.mean(), scores.std()

# Results dictionary
results = {}

# Process embedding datasets
for i, data in enumerate(embedding_data, start=2):
    X, y = prepare_embedding_data(data)
    y_encoded, _ = encode_labels(y)
    mean_acc, std_acc = perform_cv(X, y_encoded, cv=10)
    results[f'Merged_{i}F_NW_data.csv'] = (mean_acc, std_acc)

# Prepare raw dataset
categorical_cols = data.select_dtypes(include=['object', 'category', 'datetime']).columns.tolist()
numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
categorical_cols.remove('Job Area (DISTRICT)')

X_raw = data.drop(columns=['Job Area (DISTRICT)'])
y_raw = data['Job Area (DISTRICT)']

# Convert all categorical columns to string
X_raw[categorical_cols] = X_raw[categorical_cols].astype(str)

# Define preprocessing for all columns
numerical_transformer_all = SimpleImputer(strategy='mean')
categorical_transformer_all = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for all numerical and categorical data
preprocessor_all = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_all, numerical_cols),
        ('cat', categorical_transformer_all, categorical_cols)
    ])

# Create the pipeline with all features
clf_pipeline_all = Pipeline(steps=[('preprocessor', preprocessor_all),
                                   ('model', RandomForestClassifier())])

# Encode labels for the raw dataset
y_raw_encoded, _ = encode_labels(y_raw)

# Perform cross-validation on the raw dataset
scores_raw = cross_val_score(clf_pipeline_all, X_raw, y_raw_encoded, cv=10, scoring='accuracy')
results['Incidents_5000.xlsx'] = (scores_raw.mean(), scores_raw.std())

# Print the results
for dataset, (mean_acc, std_acc) in results.items():
    print(f'{dataset}: Mean Accuracy = {mean_acc:.4f}, Standard Deviation = {std_acc:.4f}')




Merged_2F_NW_data.csv: Mean Accuracy = 0.9993, Standard Deviation = 0.0013
Merged_3F_NW_data.csv: Mean Accuracy = 0.9993, Standard Deviation = 0.0013
Merged_4F_NW_data.csv: Mean Accuracy = 0.9993, Standard Deviation = 0.0013
Merged_5F_NW_data.csv: Mean Accuracy = 0.9993, Standard Deviation = 0.0013
Merged_6F_NW_data.csv: Mean Accuracy = 0.9993, Standard Deviation = 0.0013
Merged_7F_NW_data.csv: Mean Accuracy = 0.9993, Standard Deviation = 0.0013
Incidents_5000.xlsx: Mean Accuracy = 0.9987, Standard Deviation = 0.0027


In [12]:
# print the extent top feature 

print(important_features_extent.head(10))

                     Feature  Importance
35               Dev Subtype    0.083898
29                 Job SAIFI    0.082758
10            Custs Affected    0.078632
19              CMI Category    0.060542
15                  Equip ID    0.058453
34               Device Type    0.041994
33  STRCTUR_NO/Job Device ID    0.041483
16                Equip Desc    0.039945
28                 Job SAIDI    0.039600
12                       CMI    0.037961


In [17]:
############### Constructing Multiplex Networks for Top Features of Predicting Extent ###############

# Define the MultiplexNetwork class
class MultiplexNetwork:
    def __init__(self):
        self.layers = {}
        self.node_set = set()
        
    def add_layer(self, layer_name, graph):
        self.layers[layer_name] = graph
        self.node_set.update(graph.nodes)
        
    def get_layer(self, layer_name):
        return self.layers.get(layer_name, None)
    
    def nodes(self):
        return self.node_set
    
    def edges(self, layer_name=None):
        if layer_name:
            return self.layers[layer_name].edges
        else:
            all_edges = {}
            for layer, graph in self.layers.items():
                all_edges[layer] = list(graph.edges)
            return all_edges

# Ensure we have the correct raw dataset
file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'
data = pd.read_excel(file_path)

# Print the first few rows to confirm data is loaded correctly
print(data.head())

# The top 7 features for predicting Extent
top_7_features = important_features_extent.head(7)['Feature'].tolist()

# Verify if the features exist in the dataset
missing_features = [feature for feature in top_7_features if feature not in data.columns]
if missing_features:
    raise KeyError(f"The following important features are missing from the dataset: {missing_features}")

# Get the unique 'Job Substation' values to be used as nodes in all layers
unique_substations = [str(substation).replace(' ', '_') for substation in data['Job Substation'].unique()]

# Function to add edges to a graph based on a column
def add_edges_by_column(graph, column):
    for _, group in data.groupby(column):
        nodes = [str(substation).replace(' ', '_') for substation in group['Job Substation']]
        for node1, node2 in itertools.combinations(nodes, 2):
            graph.add_edge(node1, node2)

# Function to construct adjacency matrix for each layer in the multiplex network
def get_adjacency_matrix(graph):
    return nx.adjacency_matrix(graph, nodelist=unique_substations).todense()

# Directory to save adjacency matrices
output_base_dir = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices'
os.makedirs(output_base_dir, exist_ok=True)

# Construct multiplex networks and save adjacency matrices
for i in range(2, 8):
    features_subset = top_7_features[:i]
    
    multiplex_network = MultiplexNetwork()
    
    # Create a directory for this network
    network_dir = os.path.join(output_base_dir, f'Network_{i}_Features')
    os.makedirs(network_dir, exist_ok=True)
    
    for feature in features_subset:
        graph = nx.Graph()
        graph.add_nodes_from(unique_substations)  # Ensure all nodes are added first
        add_edges_by_column(graph, feature)
        
        multiplex_network.add_layer(feature, graph)
        
        adj_matrix = get_adjacency_matrix(graph)
        
        # Save adjacency matrix to CSV file
        safe_feature_name = feature.replace(' ', '_').replace('/', '_')
        file_path = os.path.join(network_dir, f'Adjacency_Matrix_Layer_{safe_feature_name}.csv')
        adj_df = pd.DataFrame(adj_matrix, index=unique_substations, columns=unique_substations)
        adj_df.to_csv(file_path)

        # Print confirmation
        print(f"Saved adjacency matrix for layer {feature} of top {i} features to {file_path}")

print("Combined Adjacency Matrices have been saved.")


  Job Display ID             CAD_ID  Job Region Job Area (DISTRICT)  \
0   J2001.000006  PD-01012020-00063  METRO EAST                EAST   
1   J2001.000021  PD-01012020-00363   NORTHWEST            WOODWARD   
2   J2001.000041                NaN  METRO WEST             EL RENO   
3   J2001.000045  PD-01012020-00859    SOUTHERN             SULPHUR   
4   J2001.000046  PD-01012020-00858   NORTHWEST            WOODWARD   

    Job Substation       Job Feeder  Feeder ID        Job OFF Time  \
0   8617:SUNNYLANE   SUNNYLANE_1722     861722 2020-01-01 00:21:50   
1   4606:CEDAR AVE    CEDAR_AVE_631     460631 2020-01-01 03:00:30   
2     8905:EL RENO      EL_RENO_522     890522 2020-01-01 08:39:50   
3  5712:JOLLYVILLE  JOLLYVILLE_1264     571264 2020-01-01 08:48:18   
4   4606:CEDAR AVE    CEDAR_AVE_622     460622 2020-01-01 08:50:25   

          Job ON Time  Job Duration Mins  ...  Feeder SAIDI  AM Notes  \
0 2020-01-01 09:22:20             540.50  ...           NaN       NaN   
1 2020

In [15]:
############ Preparing the NW Embeddings for Machine Learning to Predict Extent ############

# File paths
incidents_file_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'  # Update with your file path

# Embeddings paths
embeddings_paths = {
    2: '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_2_Features/r0.25/mltn2v_results.csv',
    3: '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_3_Features/r0.25/mltn2v_results.csv',
    4: '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_4_Features/r0.25/mltn2v_results.csv',
    5: '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_5_Features/r0.25/mltn2v_results.csv',
    6: '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_6_Features/r0.25/mltn2v_results.csv',
    7: '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_7_Features/r0.25/mltn2v_results.csv',
}

# Load the incidents data
incidents_data = pd.read_excel(incidents_file_path, engine='openpyxl')

# Ensure 'Job Substation' in incidents_data matches the embedding keys
incidents_data['Job Substation'] = incidents_data['Job Substation'].str.replace(' ', '_')

# Function to merge embeddings with incidents data
def merge_embeddings_with_incidents(incidents_data, embeddings_path, num_features):
    # Read the embeddings data
    embeddings_data = pd.read_csv(embeddings_path)

    # Merge the embeddings data with the 'Job Substation' and 'Extent' columns from the incidents data
    reduced_incidents_data = incidents_data[['Job Substation', 'Extent']]

    # Merge the embeddings data with the reduced incidents data
    merged_data = pd.merge(reduced_incidents_data, embeddings_data, left_on='Job Substation', right_on=embeddings_data.columns[0], how='inner')

    # Drop the substation identifier column from embeddings data
    merged_data = merged_data.drop(columns=[embeddings_data.columns[0]])

    # Rename columns for embeddings
    embedding_columns = [f'Embedding {i+1}' for i in range(merged_data.shape[1] - 2)]
    merged_data.columns = ['Job Substation', 'Extent'] + embedding_columns

    # Save the merged data to a CSV file in the same folder as the embeddings
    output_dir = os.path.dirname(embeddings_path)
    output_file_path = os.path.join(output_dir, f'Merged_{num_features}F_NW_data.csv')
    merged_data.to_csv(output_file_path, index=False)

    print(f"Merged data saved to {output_file_path}")

    return merged_data

# Loop through each embeddings path and merge with incidents data
for num_features, embeddings_path in embeddings_paths.items():
    merge_embeddings_with_incidents(incidents_data, embeddings_path, num_features)

print("All merged datasets have been saved.")


Merged data saved to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_2_Features/r0.25/Merged_2F_NW_data.csv
Merged data saved to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_3_Features/r0.25/Merged_3F_NW_data.csv
Merged data saved to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_4_Features/r0.25/Merged_4F_NW_data.csv
Merged data saved to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_5_Features/r0.25/Merged_5F_NW_data.csv
Merged data saved to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_6_Features/r0.25/Merged_6F_NW_data.csv
Merged data saved to /Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data

In [16]:
# Machine Learning Model for NW Embeddings 
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Paths to the datasets
embedding_paths = [
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_2_Features/r0.25/Merged_2F_NW_data.csv',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_3_Features/r0.25/Merged_3F_NW_data.csv',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_4_Features/r0.25/Merged_4F_NW_data.csv',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_5_Features/r0.25/Merged_5F_NW_data.csv',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_6_Features/r0.25/Merged_6F_NW_data.csv',
    '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Embeddings_Extent/Networks_Adjacency_Matrices/Network_7_Features/r0.25/Merged_7F_NW_data.csv',
]

raw_data_path = '/Volumes/Data/NDSU/PhD Work/Research/IME Research/AI-Energy/Data/SPP/Incidents_5000.xlsx'  # Update with your file path

# Load datasets
embedding_data = [pd.read_csv(path) for path in embedding_paths]
raw_data = pd.read_excel(raw_data_path, engine='openpyxl')

# Prepare data for embedding datasets
def prepare_embedding_data(data):
    X = data.iloc[:, 2:].values  # All columns except the first two
    y = data['Extent'].values
    return X, y

# Encode labels
def encode_labels(y):
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    return y_encoded, le

# Perform 10-fold cross-validation
def perform_cv(X, y, cv=10):
    clf = RandomForestClassifier()
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, y, cv=kf, scoring='accuracy')
    return scores.mean(), scores.std()

# Results dictionary
results = {}

# Process embedding datasets
for i, data in enumerate(embedding_data, start=2):
    X, y = prepare_embedding_data(data)
    y_encoded, _ = encode_labels(y)
    mean_acc, std_acc = perform_cv(X, y_encoded, cv=10)
    results[f'Merged_{i}F_NW_data.csv'] = (mean_acc, std_acc)

# Prepare raw dataset
categorical_cols = raw_data.select_dtypes(include=['object', 'category', 'datetime']).columns.tolist()
numerical_cols = raw_data.select_dtypes(include=['number']).columns.tolist()
categorical_cols.remove('Extent')

X_raw = raw_data.drop(columns=['Extent'])
y_raw = raw_data['Extent']

# Convert all categorical columns to string
X_raw[categorical_cols] = X_raw[categorical_cols].astype(str)

# Define preprocessing for all columns
numerical_transformer_all = SimpleImputer(strategy='mean')
categorical_transformer_all = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for all numerical and categorical data
preprocessor_all = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_all, numerical_cols),
        ('cat', categorical_transformer_all, categorical_cols)
    ])

# Create the pipeline with all features
clf_pipeline_all = Pipeline(steps=[('preprocessor', preprocessor_all),
                                   ('model', RandomForestClassifier())])

# Encode labels for the raw dataset
y_raw_encoded, _ = encode_labels(y_raw)

# Perform cross-validation on the raw dataset
scores_raw = cross_val_score(clf_pipeline_all, X_raw, y_raw_encoded, cv=10, scoring='accuracy')
results['Incidents_5000.xlsx'] = (scores_raw.mean(), scores_raw.std())

# Print the results
for dataset, (mean_acc, std_acc) in results.items():
    print(f'{dataset}: Mean Accuracy = {mean_acc:.4f}, Standard Deviation = {std_acc:.4f}')


 'Equipment Desc that should be excluded from reported indices']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Equipment Desc that should be excluded from reported indices']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Equipment Desc that should be excluded from reported indices']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Equipment Desc that should be excluded from reported indices']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Equipment Desc that should be excluded from reported indices']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Equipment Desc that should be excluded from reported indices']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Equipment Desc that should be excluded from reported indices']. At least one non-missing value is needed for imputation with strategy=

Merged_2F_NW_data.csv: Mean Accuracy = 0.5480, Standard Deviation = 0.0155
Merged_3F_NW_data.csv: Mean Accuracy = 0.5464, Standard Deviation = 0.0146
Merged_4F_NW_data.csv: Mean Accuracy = 0.5466, Standard Deviation = 0.0169
Merged_5F_NW_data.csv: Mean Accuracy = 0.5462, Standard Deviation = 0.0154
Merged_6F_NW_data.csv: Mean Accuracy = 0.5478, Standard Deviation = 0.0157
Merged_7F_NW_data.csv: Mean Accuracy = 0.5478, Standard Deviation = 0.0166
Incidents_5000.xlsx: Mean Accuracy = 0.8226, Standard Deviation = 0.0329


 'Equipment Desc that should be excluded from reported indices']. At least one non-missing value is needed for imputation with strategy='mean'.
