In [1]:
import pandas as pd
import os, re

In [18]:
def load_msss_data(msss_tables_dir : str, pattern = r"AD_(.*).xls") -> dict:
    """
    Load every MSSS lookup table found in a folder.

    Each entry of the folder whose name matches ``pattern`` (matched from the
    start of the file name) is read as an Excel sheet. The second row of the
    sheet is used as the header and the "Code" column becomes the index.

    Args:
        msss_tables_dir (str): Folder that is scanned for MSSS table files.
        pattern (regexp, optional): Regular expression selecting the table files;
            its first capture group is used as the label name.
            Defaults to r"AD_(.*).xls".

    Returns:
        dict: Maps each captured label name to a DataFrame holding the
            "Description", "Aux1", "Aux2" and "Filter_Key" columns of that table.
    """
    kept_columns = ["Description", "Aux1", "Aux2", "Filter_Key"]
    tables = {}

    for entry in os.listdir(msss_tables_dir):
        hit = re.match(pattern, entry)
        if hit is None:
            # Not an MSSS table file
            continue

        # The label is whatever the first capture group picked up
        label = hit.group(1)

        table = pd.read_excel(
            os.path.join(msss_tables_dir, entry),
            header=1,
            index_col="Code",
        )
        tables[label] = table[kept_columns]

    return tables

In [19]:
# Label name -> MSSS code table, read from the Ergon FFA MSSS tables folder
msss_codes = load_msss_data(msss_tables_dir="data/eq/datasets/ergon-ffa-msss-tables")

In [22]:
# Folder holding the raw failure workbooks
EQ_DATA_PATH = "data/eq/datasets/2024-tim-elkins-failures-data/"

# One provider name per workbook, listed in the same order as the files below
EQ_DATA_PROVIDERS = ["Ergon Energy", "Energex"]

# Full path of each workbook
EQ_DATA_FILES = [
    os.path.join(EQ_DATA_PATH, file_name)
    for file_name in (
        "EE Failures FY23-24 (Updated).xlsx",
        "EGX Failures FY23-24 (updated).xlsx",
    )
]

In [None]:
# Read every workbook into its own DataFrame, keeping the order of EQ_DATA_FILES
data = list(map(pd.read_excel, EQ_DATA_FILES))

In [46]:
# Data backup
# Executing this cell will restore the dataset to the original state
#
# First execution: `data2` does not exist yet, so the current `data` list is
# snapshotted into `data2`.
# Later executions: `data` is rebound to the saved snapshot, then a fresh
# snapshot is taken so that assigning into `data[i]` afterwards does not
# touch the saved list.
#
# NOTE: copy() is shallow - it duplicates the list but not the objects stored
# in it. The snapshot therefore survives `data[i] = ...` reassignments, but not
# in-place modification of the objects the list holds.

from copy import copy
if "data2" in locals():
    data = data2
data2 = copy(data)

In [22]:
# Column that identifies a sample
index = "OUTAGE_ID"

# Columns used as model inputs
input_features = [
    "WEATHER_CONDITION",
    "OUTAGE_CAUSE",
    "FAULT_LONG_DESCRIPTION",
    "SHORT_DESC_2",
    "WORK_ORDER_COMPONENT_CODE_DESCRIPTION",
    "OUTAGE_CAUSE_GROUP",
    "OUTAGE_STANDARD_REASON_DESCRIPTION",
    "REASON_FOR_INTERRUPTION",
    # Not present in the raw files: this feature is added to the datasets
    # during pre-processing
    "PROVIDER",
]

# Columns holding the labels
output_labels = ["MSSS_OBJECT", "MSSS_DAMAGE", "MSSS_CAUSE"]

In [48]:
def _normalise_column_names(dataset):
    """Return `dataset` with every column name upper-cased and spaces replaced by underscores."""
    return dataset.rename(columns=lambda column: column.replace(" ", "_").upper())


def _clean_dataset(dataset, provider):
    """
    Reduce one provider's dataset to de-duplicated input/output columns.

    Args:
        dataset (pd.DataFrame): Dataset whose column names are already normalised.
        provider (str): Value written to the PROVIDER column of every row.

    Returns:
        pd.DataFrame: New frame indexed by `index`, holding only the known
            input/output columns that exist in `dataset`, plus PROVIDER.
    """
    # Allow the cell to be executed again on its own output: if a previous run
    # already moved the identifier into the index, turn it back into a column
    # instead of failing with a KeyError below.
    if index not in dataset.columns and dataset.index.name == index:
        dataset = dataset.reset_index()

    # Drop all rows with duplicate identifiers, then use the identifier as the index
    dataset = dataset.drop_duplicates(subset=[index]).set_index(index)

    # Keep only the input/output columns that this dataset actually has
    kept = [f for f in input_features + output_labels if f in dataset.columns]
    kept_inputs = [f for f in kept if f in input_features]

    # Drop rows whose input values duplicate an earlier row, and rows whose
    # input values are all null
    dataset = (
        dataset[kept]
        .drop_duplicates(subset=kept_inputs)
        .dropna(subset=kept_inputs, how="all")
    )

    # Add a feature to tell which dataset the row came from (EE/EGX).
    # assign() builds a new frame, so no value is written into a filtered
    # slice (which is what triggers pandas' SettingWithCopyWarning).
    return dataset.assign(PROVIDER=provider)


# Providers are matched to datasets by position
for i, dataset in enumerate(data):
    data[i] = _clean_dataset(_normalise_column_names(dataset), EQ_DATA_PROVIDERS[i])

In [49]:
# Move each dataset's index back into an ordinary column
# (drop=False keeps the index values instead of discarding them)
data[:] = [frame.reset_index(drop=False) for frame in data]

In [76]:
# Concatenate all datasets into one.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# each frame keeps its own index labels, so labels repeat across providers.
dataset = pd.concat(data, ignore_index=True)

# Preview only: set_index returns a new frame and its result is not stored,
# so `dataset` itself keeps the identifier as a regular column.
dataset.set_index(index).head()

In [None]:
# Write the combined dataset next to the raw-data folder (one level above it)
new_path = os.path.join(EQ_DATA_PATH, "../preprocessed.csv")
dataset.to_csv(path_or_buf=new_path)

In [81]:
# # Concatenate each sample's input features into strings

# # Get all input features present in the dataset
# inputs = [f for f in input_features if f in dataset.columns]

# # Replace all missing input feature text with "Unknown"
# dataset[inputs] = dataset[inputs].fillna("Unknown")

# # Concatenate each sample's input features into strings
# input_texts = []
# for _, sample in dataset[inputs].iterrows():

#     text = [f"{key}: {value}" for key, value in sample.items()]
#     text = ";\n".join(text)
#     input_texts.append(text)

# # Add all concatenated inputs as a new feature
# dataset["TEXT"] = input_texts

# # Delete individual input features
# dataset = dataset.drop(columns=inputs)

In [103]:
# new_path = os.path.join(EQ_DATA_PATH, "../preprocessed.csv")

# dataset.to_csv(new_path)

In [93]:
import pandas as pd

# Read the pre-processed dataset back from the location the export step writes
# to (relative to EQ_DATA_PATH), rather than from a separately hard-coded
# "../data/..." path that resolves against a different working directory.
# The first CSV column is used as the index.
d = pd.read_csv(os.path.join(EQ_DATA_PATH, "../preprocessed.csv"), index_col=0)