In [1]:
import pandas as pd
import numpy as np
import os, re
# Restrict CUDA-aware libraries to the GPU with index 1.
# NOTE(review): none of the cells visible here import a GPU library — confirm this is still needed.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
def load_msss_tables(msss_tables_dir : str) -> list[pd.DataFrame]:
    """Load the MSSS taxonomy spreadsheets and normalise their columns.

    Each spreadsheet is read with its second row as the header and the "Code"
    column as the index, then reduced to three columns:

    - ``Name``: the original "Description" column.
    - ``ID``: the MSSS code ("Aux2" when the sheet has a "Filter_Key" column,
      otherwise "Aux1"). When the column holds a single distinct non-null
      value it is converted to a boolean "equals that value" flag.
    - ``PrevID``: the original "Filter_Key" column, or all-NaN when the sheet
      has none.

    Args:
        msss_tables_dir (str): Directory containing the MSSS ``.xls`` files.

    Returns:
        list[pd.DataFrame]: One table per taxonomy level, in the order
        Outage Type, Component, Object, Damage, Cause.
    """
    MSSS_TABLES = [
        "AD_Outage_Type.xls",
        "AD_Componant.xls",
        "AD_Object.xls",
        "AD_Damage.xls",
        "AD_Cause.xls"
    ]

    msss_tables = []

    for table in MSSS_TABLES:
        path = os.path.join(msss_tables_dir, table)
        data = pd.read_excel(path, header=1, index_col="Code")
        # Drop columns that are entirely null (this also removes an empty "Filter_Key")
        data = data.dropna(axis=1, how='all')

        has_previous_level = "Filter_Key" in data

        # Get the column which contains the MSSS Code and rename it to "ID"
        msss_code_column = "Aux2" if has_previous_level else "Aux1"
        data = data.rename(columns={"Description": "Name", msss_code_column: "ID"})

        # If "ID" holds a single distinct non-null value, turn it into a boolean flag.
        # The reference value is taken after dropping nulls: unique() keeps NaN, so
        # unique()[0] could be NaN and every comparison would then be False.
        if data["ID"].nunique() == 1:
            value = data["ID"].dropna().unique()[0]
            data["ID"] = data["ID"] == value

        # Get the column which contains the previous level's MSSS code
        if has_previous_level:
            data = data.rename(columns={"Filter_Key": "PrevID"})
        else:
            # If none exists, create an empty one
            data.insert(data.columns.size, "PrevID", np.nan)

        msss_tables.append(data[["Name", "ID", "PrevID"]])

    return msss_tables

In [3]:
def _get_codes(table : pd.DataFrame, prev_codes : list | None = None, prev_table : pd.DataFrame | None = None) -> list:
    """This function generates a list of MSSS codes given a MSSS table.
    It can be used recursively to generate every unique combination of MSSS codes in the MSSS taxonomy.

    Args:
        table (DataFrame): A MSSS table from load_msss_tables().
        prev_codes (list, optional): List of the MSSS codes from the previous table.
        prev_table (pd.DataFrame, optional): The previous MSSS table. Used to translate MSSS codes into indices.

    Returns:
        codes (list): A list of the table's MSSS codes.
    """
    
    if not prev_codes:
        # If the table does not have a previous MSSS Code column,
        # we can simply use the unique MSSS Codes from the table
        if "PrevID" not in table.dropna(axis=1,how='any'):
            codes = table["ID"].unique().tolist()
            return codes
        else:
            raise TypeError("Getting MSSS codes for this table requires the codes from the previous table!")
    
    codes = prev_codes

    # Use the MSSS Code column as the category
    if not pd.api.types.is_bool_dtype(table["ID"]):
        category_column = "ID"

    # Unless the column is a boolean, in which case use the numerical code
    else:
        table = table.reset_index()
        category_column = "Code"

    # For each category in the previous MSSS table
    for i, code in enumerate(prev_codes):

        last_code = code[-1] if type(code) is list else code

        # Get all sub-categories in this MSSS table
        subcodes = table[table["PrevID"] == last_code][category_column].unique().tolist()
        
        # Convert the last code entry from MSSS Code -> numerical code
        if prev_table is not None:
            last_code_index = prev_table[prev_table["ID"] == last_code].index[0]

            if type(code) is list:
                code[-1] = last_code_index
            else: code = last_code_index

        if type(code) is list:
            subcodes = [[*code, subcode] for subcode in subcodes]
        else:
            subcodes = [[code, subcode] for subcode in subcodes]
        
        codes[i] = subcodes

    # Flatten the list
    codes = [i for j in codes for i in j]

    return codes

In [4]:
def create_msss_lookup_table(tables : list[pd.DataFrame]) -> pd.DataFrame:
    """
    Build a lookup table holding one row per unique MSSS classification.

    For each of the five MSSS features the result has three columns: the raw
    code produced by _get_codes() (named after the feature), plus
    "<feature> Description" and "<feature> Code", looked up from the
    corresponding table's "Name" and "ID" columns.

    Args:
        tables (list[DataFrame]): List of pre-processed MSSS tables obtained with load_msss_tables().

    Returns:
        DataFrame: The MSSS lookup table.
    """

    # Walk the taxonomy level by level, pairing every table with its predecessor
    # (None for the first level) so that codes can be chained together
    codes = None
    for previous_table, current_table in zip([None, *tables], tables):
        codes = _get_codes(current_table, codes, previous_table)

    # One column per taxonomy level, one row per classification
    msss_features = ["Outage Type", "Component", "Object", "Damage", "Cause"]
    lookup = pd.DataFrame(codes, columns=msss_features)

    # Attach the human-readable description and the MSSS code of every level
    for feature, feature_table in zip(msss_features, tables):
        keys = lookup[feature]
        lookup[f"{feature} Description"] = keys.map(lambda key: feature_table["Name"][key])
        lookup[f"{feature} Code"] = keys.map(lambda key: feature_table["ID"][key])

    return lookup

In [5]:
# Directory holding the MSSS taxonomy spreadsheets (relative to the working directory)
msss_tables_path = "data/eq/datasets/ergon-ffa-msss-tables"
# Load one pre-processed table per taxonomy level
tables = load_msss_tables(msss_tables_path)

In [6]:
# Combine the per-level tables into a single lookup table of MSSS classifications
msss_table = create_msss_lookup_table(tables)

In [7]:
# Save the lookup table to CSV, leaving out the DataFrame's row index
msss_table.to_csv("data/eq/datasets/msss_table.csv", index=False)

In [8]:
def get_classif(msss_object_description : str, msss_damage_description : str, msss_cause_description : str, lookup_table : pd.DataFrame) -> pd.DataFrame:
    """Select the lookup-table rows matching an Object / Damage / Cause description triple.

    Args:
        msss_object_description (str): Value to match in "Object Description".
        msss_damage_description (str): Value to match in "Damage Description".
        msss_cause_description (str): Value to match in "Cause Description".
        lookup_table (pd.DataFrame): Table containing the three description columns.

    Returns:
        pd.DataFrame: The rows where all three descriptions match exactly (possibly empty).
    """
    same_object = lookup_table["Object Description"] == msss_object_description
    same_damage = lookup_table["Damage Description"] == msss_damage_description
    same_cause = lookup_table["Cause Description"] == msss_cause_description

    matches_all = same_object & same_damage & same_cause
    return lookup_table.loc[matches_all]

In [9]:
# Directory containing the raw failure spreadsheets
EQ_DATA_PATH = "data/eq/datasets/2024-tim-elkins-failures-data/"

# Provider name for each spreadsheet, in the same order as EQ_DATA_FILES
EQ_DATA_PROVIDERS = ["Ergon Energy", "Energex"]

# Full path of every raw spreadsheet
EQ_DATA_FILES = [
    os.path.join(EQ_DATA_PATH, file)
    for file in (
        "EE Failures FY23-24 (Updated).xlsx",
        "EGX Failures FY23-24 (updated).xlsx",
    )
]

In [10]:
# Load all the datasets into a list of DataFrames.
# The list follows the order of EQ_DATA_FILES, so position i also corresponds to EQ_DATA_PROVIDERS[i].
data = [pd.read_excel(dataset) for dataset in EQ_DATA_FILES]

In [192]:
# Column identifying an outage; several rows may share the same value
index = "OUTAGE_ID"

# Columns treated as input features
input_features = [
    "WEATHER_CONDITION",
    "OUTAGE_CAUSE",
    "FAULT_LONG_DESCRIPTION",
    "SHORT_DESC_2",
    "WORK_ORDER_COMPONENT_CODE_DESCRIPTION",
    "OUTAGE_CAUSE_GROUP",
    "OUTAGE_STANDARD_REASON_DESCRIPTION",
    "REASON_FOR_INTERRUPTION",
    "PROVIDER",  # We add this feature to the datasets during pre-processing
]

# MSSS description columns treated as output labels
output_labels = [
    f"MSSS_{feature}_DESCRIPTION"
    for feature in ("OBJECT", "DAMAGE", "CAUSE")
]

In [364]:
# Data backup
# Executing this cell will restore the dataset to the original state
#
# First run: `data2` does not exist yet, so it becomes a snapshot of the freshly loaded `data`.
# Later runs: `data` is reset to that snapshot, and a new snapshot is taken so the restored
# list is not shared with the backup.
# The copy is shallow (a new list holding the same DataFrames). That suffices for the cells
# visible here, which replace entries via `data[i] = ...` instead of editing the loaded frames.

from copy import copy
# At cell level locals() is the notebook namespace, so this checks whether a backup was already taken
if "data2" in locals():
    data = data2
data2 = copy(data)

In [365]:
# Normalise every raw dataset: tidy the column names, keep only the relevant columns,
# drop rows without any input feature and tag each row with its provider.
# The cell can be re-run on already processed data: "PROVIDER" is re-assigned rather than
# inserted, because insert() raises ValueError when the column already exists.
for i, dataset in enumerate(data):
    # Capitalize all column names and replace spaces with underscores
    dataset = dataset.rename(columns=lambda column: column.replace(" ", "_").upper())

    # Drop all non input/output columns.
    # "PROVIDER" is left out here because it is (re)created at the end of the loop.
    wanted = input_features + output_labels + [index]
    present = [f for f in wanted if f in dataset.columns and f != "PROVIDER"]
    dataset = dataset[present]

    # Drop all rows with entirely null input features
    present_inputs = [f for f in present if f in input_features]
    dataset = dataset.dropna(subset=present_inputs, how='all')

    # Add a feature to tell which dataset we're using (EE/EGX).
    # assign() returns a new frame with the column appended as the last column.
    data[i] = dataset.assign(PROVIDER=EQ_DATA_PROVIDERS[i])

In [366]:
# Placeholder strings that stand for "no value" are turned into real nulls in every dataset
for i, dataset in enumerate(data):
    data[i] = dataset.replace(["<null>", "#REF!"], np.nan)

In [367]:
def combine(rows : pd.Series) -> object:
    """
    Combine a series of rows from a given column into one item.

    Null entries are ignored. The remaining values are combined as follows:
    - no values left: returns ``nan``.
    - exactly one value left: returns that value unchanged.
    - all values are ``str``: returns the unique values joined with ``", "``,
      in order of first appearance.
    - anything else (e.g. ``int``): returns the most frequent value.

    Args:
        rows (pd.Series): The series of rows.

    Returns:
        object: The combination of the rows.
    """
    # Nulls carry no information; removing them first keeps ", ".join() from ever
    # seeing NaN and makes the string check independent of the pandas version
    values = rows.dropna()

    # If the series is entirely empty, return NaN
    if values.empty:
        return np.nan

    if len(values) == 1:
        return values.iloc[0]

    unique_values = values.unique()

    # If the series is made of strings, concatenate all unique strings
    if all(isinstance(value, str) for value in unique_values):
        return ", ".join(unique_values)

    # Otherwise, return the mode (value_counts() sorts by descending frequency)
    return values.value_counts().index[0]

In [368]:
# Rows are grouped by Outage ID together with all output labels:
# - rows sharing an Outage ID *and* identical labels collapse into one row, whose
#   remaining columns are merged with combine();
# - rows sharing an Outage ID but carrying different labels stay separate cases.
# NOTE: groupby() discards rows whose grouping key is null by default (dropna=True),
# so rows lacking an Outage ID or any output label do not survive this step.

for i, dataset in enumerate(data):
    data[i] = dataset.groupby([index] + list(output_labels)).aggregate(combine).reset_index()

In [369]:
# Concatenate all datasets into one.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every dataset keeps
# its own 0..k index, so the combined frame (and the exported CSV) has duplicate row labels.
dataset = pd.concat(data, ignore_index=True)

In [370]:
# Make spacing / capitalisation of MSSS feature labels consistent.
# Non-string cells (e.g. missing labels) are passed through unchanged instead of raising
# AttributeError on .strip().
dataset[output_labels] = dataset[output_labels].map(
    lambda label: label.strip().title() if isinstance(label, str) else label
)

In [371]:
# Correct inconsistent capitalisation within MSSS feature labels where possible

def map_item_to_msss_feature_label(item : str, label_names : list[str]) -> str | None:
    """
    Given a MSSS label (e.g., " animal"), try to map
    it to an MSSS label from a list (e.g., "Animal")
    to correct issues in capitalisation / spacing.

    Args:
        item (str): The raw label name.
        label_names (list[str]): List of legal label names.

    Returns:
        label (str | None): The correct label name, if found, otherwise the raw label name.
    """
    if pd.isna(item): return None

    if item in label_names: return item

    original_item = item
    item = item.strip().lower()

    labels_easy = [i.strip().lower() for i in label_names]
    if item in labels_easy:
        index = labels_easy.index(item)
        return label_names[ index ]

    word_matches = [i for i in labels_easy if item in i]
    
    if word_matches:
        if len(word_matches) == 1:
            item = word_matches[0]
            index = labels_easy.index(item)
            return label_names[ index ]
    
    return original_item

def rectify_feature(dataset : pd.DataFrame, msss_feature : str, msss_table : pd.DataFrame) -> pd.DataFrame:
    """Clean up one MSSS label column of a dataset using the legal labels of the lookup table.

    Every value of the dataset's ``MSSS_<FEATURE>_DESCRIPTION`` column is passed
    through map_item_to_msss_feature_label(), with the unique values of the
    lookup table's ``<feature> Description`` column as the legal labels.

    Note that the column is overwritten on the dataset that was passed in; the
    same object is returned.

    Args:
        dataset (pd.DataFrame): The original dataset
        msss_feature (str): MSSS feature name, e.g., "Cause".
        msss_table (pd.DataFrame): MSSS lookup table.

    Returns:
        pd.DataFrame: The dataset with msss_feature rectified.
    """
    legal_labels = msss_table[msss_feature + " Description"].unique().tolist()
    data_column = "MSSS_" + msss_feature.upper() + "_DESCRIPTION"

    def _rectify(raw_label):
        # Map a single raw label onto its legal spelling where possible
        return map_item_to_msss_feature_label(raw_label, legal_labels)

    dataset[data_column] = dataset[data_column].map(_rectify)
    return dataset

# Rectify the three MSSS label columns of the combined dataset, one feature at a time
for feature in ["Object", "Cause", "Damage"]:
    dataset = rectify_feature(dataset, feature, msss_table)

In [372]:
def get_unknowns(dataset, msss_feature, msss_lookup_table):
    """List the labels of one MSSS feature that the lookup table does not contain.

    Args:
        dataset (pd.DataFrame): Dataset with a ``MSSS_<FEATURE>_DESCRIPTION`` column.
        msss_feature (str): MSSS feature name, e.g., "Object".
        msss_lookup_table (pd.DataFrame): Table with a ``<feature> Description`` column.

    Returns:
        list: The distinct non-null dataset labels absent from the lookup table,
        in order of first appearance.
    """
    known_labels = msss_lookup_table[f"{msss_feature} Description"].unique().tolist()
    observed_labels = dataset[f"MSSS_{msss_feature.upper()}_DESCRIPTION"].dropna().unique().tolist()

    unknown_items = []
    for label in observed_labels:
        if label in known_labels:
            continue
        unknown_items.append(label)

    return unknown_items

In [None]:
# Show the Object labels of the combined dataset that do not appear in the MSSS lookup table
get_unknowns(dataset, "Object", msss_table)

In [266]:
# Export the full pre-processed dataset one directory above the raw data folder
new_path = os.path.join(EQ_DATA_PATH, "../preprocessed.csv")

dataset.to_csv(new_path)

# Export a second copy restricted to rows in which every output label is present
new_path = os.path.join(EQ_DATA_PATH, "../preprocessed-supervised.csv")

dataset.dropna(subset=output_labels).to_csv(new_path)

In [17]:
def map_item_to_msss_feature_label(item, label_names):
    """Map a raw label onto one of the legal label names, or return None.

    A label is accepted when it is already legal, when it equals a legal label
    ignoring case and surrounding whitespace, or when it is a substring of
    exactly one legal label (again ignoring case and surrounding whitespace).

    Args:
        item: The raw label; null values yield None.
        label_names (list): The legal label names.

    Returns:
        The matching entry of ``label_names``, or None when there is no
        unambiguous match.
    """
    if pd.isna(item):
        return None

    if item in label_names:
        return item

    needle = item.strip().lower()
    normalised = [name.strip().lower() for name in label_names]

    # Exact match once case and outer whitespace are ignored
    if needle in normalised:
        return label_names[normalised.index(needle)]

    # Substring match: only trusted when a single label contains the item
    containing = [position for position, name in enumerate(normalised) if needle in name]
    if len(containing) == 1:
        return label_names[containing[0]]

    return None

def is_unknown(item, label_names):
    """Tell whether a non-null label cannot be mapped onto any legal label name.

    Null items are never reported as unknown.
    """
    if pd.isna(item):
        return False

    return map_item_to_msss_feature_label(item, label_names) is None
    
def get_unknowns(dataset, msss_feature, msss_lookup_table=None):
    """List the labels of one MSSS feature that cannot be mapped to a legal label.

    A label counts as unknown when is_unknown() says so, i.e. it is non-null
    and map_item_to_msss_feature_label() finds no match for it.

    Running this cell replaces the three-argument get_unknowns() defined
    earlier in the notebook, so the lookup table is accepted as an optional
    third argument to keep such calls working.

    Args:
        dataset (pd.DataFrame): Dataset with a ``MSSS_<FEATURE>_DESCRIPTION`` column.
        msss_feature (str): MSSS feature name, e.g., "Object".
        msss_lookup_table (pd.DataFrame, optional): Table with a
            ``<feature> Description`` column. Defaults to the notebook-level
            ``msss_table``.

    Returns:
        list: The distinct unknown labels, in order of first appearance.
    """
    if msss_lookup_table is None:
        msss_lookup_table = msss_table

    table_column = f"{msss_feature} Description"
    data_column = f"MSSS_{msss_feature.upper()}_DESCRIPTION"

    label_names = msss_lookup_table[table_column].unique().tolist()

    items = dataset[data_column]
    # Select the labels that are NOT recognised; the cast keeps the mask boolean
    # even when the column is empty
    unknown_mask = items.map(lambda x: is_unknown(x, label_names)).astype(bool)

    return items[unknown_mask].unique().tolist()

In [None]:
# dupes = data[1][ data[1].duplicated(keep=False) ]

# # Obtain a 2D list containing all rows in the dataset with duplicate indices grouped by index.
# # Source: https://stackoverflow.com/a/46629549
# duplicate_indices = dupes.groupby(index).apply(lambda x : list(x.index), include_groups=False).tolist()

# for indices in duplicate_indices:

#     # For all rows which have the same Outage ID:
#     rows = data[1].iloc[indices]

#     # Combine all columns in the rows into one.
#     for column in data[1].columns:
        
#         values = rows[column]

#         combined = values.dropna().mode()[0]

#         # Replace the first index with the combined rows
#         first_index = indices[0]
#         #data[1].loc[index, column] = combined
#         data[1][column][first_index] = combined

# data[1] = data[1].drop_duplicates(subset=index,keep='first')
# #data[1] = data[1].drop(rows_to_remove)

In [12]:
# # Drop the index for each dataset
# for i, dataset in enumerate(data):
#     dataset = dataset.reset_index(drop=False)
#     data[i] = dataset

In [81]:
# # Concatenate each sample's input features into strings

# # Get all input features present in the dataset
# inputs = [f for f in input_features if f in dataset.columns]

# # Replace all missing input feature text with "Unknown"
# dataset[inputs] = dataset[inputs].fillna("Unknown")

# # Concatenate each sample's input features into strings
# input_texts = []
# for _, sample in dataset[inputs].iterrows():

#     text = [f"{key}: {value}" for key, value in sample.items()]
#     text = ";\n".join(text)
#     input_texts.append(text)

# # Add all concatenated inputs as a new feature
# dataset["TEXT"] = input_texts

# # Delete individual input features
# dataset = dataset.drop(columns=inputs)

In [103]:
# new_path = os.path.join(EQ_DATA_PATH, "../preprocessed.csv")

# dataset.to_csv(new_path)

In [93]:
import pandas as pd
# Read the exported dataset back, using the CSV's first column as the index.
# NOTE(review): this path starts with "../data/", while the export cell writes below
# "data/eq/datasets/"; both only resolve to the same file from different working
# directories — confirm which one is intended.
d = pd.read_csv("../data/eq/datasets/preprocessed.csv", index_col=0)