In [1]:
from glob import glob
import numpy as np
import os
import pandas as pd
from scipy.io import loadmat
import torch

def set_seed_and_deterministic(seed):
    """
    Seed the NumPy and PyTorch random number generators and put cuDNN into
    its deterministic configuration.

    Parameters:
    seed (int): Value passed to every seeding call.
    """
    # cuDNN: deterministic kernels on, auto-tuner off.
    # This may cost speed and is not guaranteed to cover every operation.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # CPU-side generators.
    torch.manual_seed(seed)
    np.random.seed(seed)

    # The CUDA generators are only touched when a GPU is actually available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # every device in a multi-GPU setup


# Seed once for the whole notebook; swap 42 for any other seed value.
set_seed_and_deterministic(42)


# Data Folder Metadata

In [2]:

# Folder metadata, one table per bearing group. Every table has two columns:
# 'Folder_Name' (name of the data folder) and 'Damage' (label of that folder).

# Healthy bearings: all six folders carry the label 'Healthy'.
healthy_dict = {
    "Folder_Name": [f"K00{i}" for i in range(1, 7)],
    "Damage": ["Healthy"] * 6,
}
healthy_df = pd.DataFrame(healthy_dict)

# Only a section banner is printed; the table itself is not shown.
print("HEALTHY BEARINGS")

# Artificially damaged bearings: six 'Outer' folders followed by five 'Inner' folders.
artificial_dict = {
    "Folder_Name": ["KA01", "KA03", "KA05", "KA06", "KA07", "KA09"]
    + ["KI01", "KI03", "KI05", "KI07", "KI08"],
    "Damage": ["Outer"] * 6 + ["Inner"] * 5,
}
artificial_df = pd.DataFrame(artificial_dict)

print("ARTIFICIAL DAMAGES")

# Bearings with real damage: five 'Outer' folders followed by six 'Inner' folders.
real_dict = {
    "Folder_Name": ["KA04", "KA15", "KA16", "KA22", "KA30"]
    + ["KI04", "KI14", "KI16", "KI17", "KI18", "KI21"],
    "Damage": ["Outer"] * 5 + ["Inner"] * 6,
}
real_df = pd.DataFrame(real_dict)

print("REAL DAMAGES")


HEALTHY BEARINGS
ARTIFICIAL DAMAGES
REAL DAMAGES


# Constants

In [12]:
# Root folder of the raw recordings, resolved against the current working directory.
data_dir = os.path.join(os.getcwd(), "./Paderborn Dataset")

# Points kept per recording: 64k sampling rate for 3.9 seconds (64_000 * 3.9).
N_POINTS = 249_600

# Bearing folders used for each experiment; the position in the list is the class label.
artificial_dataset = [
    "K001",
    "KA01", "KA03", "KA05", "KA07",
    "KI01", "KI03", "KI07",
]
real_dataset = [
    "K001",
    "KA04", "KB23", "KB27",
    "KI04",
]

# Window length and hop between windows; equal values give non-overlapping windows.
SAMPLE_LEN = 1024
STRIDE = SAMPLE_LEN

In [32]:
def load_data(dataset_list, measurements_per_condition=20):
    """
    Loads and labels the recordings of the given bearing folders, grouped by
    operating condition.

    Each folder is expected to contain up to `measurements_per_condition`
    .mat files for each of the 4 conditions. The label of a recording is the
    position of its folder in `dataset_list`.

    Parameters:
    dataset_list (list of str): Folder names below `data_dir`.
    measurements_per_condition (int): Rows reserved per folder and condition.

    Returns:
    dict: {"P1".."P4": {"x": float64 tensor (n_folders * measurements_per_condition, N_POINTS),
                         "y": float64 tensor (n_folders * measurements_per_condition,)}}.
          Rows for which no file was found stay all-zero with label 0.

    Raises:
    ValueError: If a folder holds more files for one condition than rows were reserved.
    """
    total_rows = len(dataset_list) * measurements_per_condition
    condition_files = {"N15_M07_F10": "P1", "N09_M07_F10": "P2", "N15_M01_F10": "P3", "N15_M07_F04": "P4"}
    conditions = {
        cond: {"x": np.zeros((total_rows, N_POINTS)), "y": np.zeros(total_rows)}
        for cond in ["P1", "P2", "P3", "P4"]
    }

    for label, foldername in enumerate(dataset_list):
        folder_path = os.path.join(data_dir, foldername)
        # glob() returns files in arbitrary order; sort so the row layout is reproducible.
        file_path_list = sorted(glob(os.path.join(folder_path, "*.mat")))

        # Next free row (relative to this folder's block) for every condition.
        # A per-condition counter replaces `file_index % files_per_condition`, which
        # only worked when glob happened to return the files grouped by condition
        # and otherwise let recordings overwrite each other.
        next_slot = dict.fromkeys(condition_files.values(), 0)

        for file_path in file_path_list:
            # Match against the file name only, so directory names cannot trigger a match.
            file_name = os.path.basename(file_path)
            for file_cond, cond in condition_files.items():
                if file_cond not in file_name:
                    continue
                slot = next_slot[cond]
                if slot >= measurements_per_condition:
                    raise ValueError(
                        f"Folder '{foldername}' has more than {measurements_per_condition} "
                        f"files for condition '{file_cond}'"
                    )
                sample_id = label * measurements_per_condition + slot
                conditions[cond]["x"][sample_id] = load_mat(file_path)
                conditions[cond]["y"][sample_id] = label
                next_slot[cond] = slot + 1
                break

    # Convert numpy arrays to torch tensors (shares memory with the arrays).
    for cond in conditions.values():
        cond["x"], cond["y"] = torch.from_numpy(cond["x"]), torch.from_numpy(cond["y"])

    return conditions

def load_mat(mat_file):
    """
    Reads one .mat recording and returns its vibration trace as a NumPy array.

    The top-level variable inside the file is looked up under the file's base
    name (extension removed). Only the first N_POINTS columns are kept.
    """
    record_key, _extension = os.path.splitext(os.path.basename(mat_file))
    contents = loadmat(mat_file)
    channels = contents[record_key]["Y"][0][0][0]
    # NOTE(review): entry 6, field 2 is taken to be the vibration signal - confirm
    # against the data set's file layout.
    signal = channels[6][2]
    return np.array(signal[:, :N_POINTS])

def sample(x, y):
    """
    Cuts every row of `x` into windows of SAMPLE_LEN points, taken every STRIDE
    points along dimension 1, and repeats each label once per window.

    Returns:
    tuple: (windows of shape (-1, SAMPLE_LEN), labels aligned with the windows).
    """
    windows = x.unfold(1, SAMPLE_LEN, STRIDE).contiguous().view(-1, SAMPLE_LEN)
    # Number of windows each original row contributed.
    repeats = windows.size(0) // y.size(0)
    return windows, y.repeat_interleave(repeats)

def sample_data(input_data):
    """
    Windows the "x"/"y" pair of every condition with `sample`.

    The dictionary is updated in place (each condition's entry is replaced by a
    new {"x", "y"} dict) and the same dictionary object is returned.
    """
    for condition in list(input_data):
        entry = input_data[condition]
        windows, labels = sample(entry["x"], entry["y"])
        input_data[condition] = {"x": windows, "y": labels}

    return input_data


def split_data(input_data):
    """
    Splits the "x"/"y" pair of every condition into train, val and test parts.

    Returns:
    dict: {condition: {"train": {"x", "y"}, "val": {"x", "y"}, "test": {"x", "y"}}}.
    """
    result = {}
    for condition, datasets in input_data.items():
        x_tr, y_tr, x_va, y_va, x_te, y_te = train_val_test_split(datasets["x"], datasets["y"])
        result[condition] = dict(
            train=dict(x=x_tr, y=y_tr),
            val=dict(x=x_va, y=y_va),
            test=dict(x=x_te, y=y_te),
        )
    return result

def train_val_test_split(x, y, train_frac=0.6, val_frac=0.2):
    """
    Randomly splits paired samples and labels into train, validation and test parts.

    The rows are shuffled with `torch.randperm`, so the result depends on the
    current state of the PyTorch random number generator. Whatever is left
    after the train and validation parts goes to the test part.

    Parameters:
    x (torch.Tensor): Samples, indexed along the first dimension.
    y (torch.Tensor): Labels, one per row of `x`.
    train_frac (float): Share of the rows used for training (default 0.6).
    val_frac (float): Share of the rows used for validation (default 0.2).

    Returns:
    tuple: (x_train, y_train, x_val, y_val, x_test, y_test).

    Raises:
    ValueError: If `x` and `y` differ in length or the fractions are invalid.
    """
    total_size = len(x)
    if len(y) != total_size:
        raise ValueError(f"x has {total_size} rows but y has {len(y)} labels")
    if not (0 <= train_frac <= 1 and 0 <= val_frac <= 1):
        raise ValueError("train_frac and val_frac must lie in [0, 1]")

    train_size = int(train_frac * total_size)
    val_size = int(val_frac * total_size)
    val_end = train_size + val_size
    if val_end > total_size:
        raise ValueError("train_frac + val_frac must not exceed 1")

    # One shared permutation keeps every sample paired with its label.
    indices = torch.randperm(total_size)
    train_indices = indices[:train_size]
    val_indices = indices[train_size:val_end]
    test_indices = indices[val_end:]

    return (
        x[train_indices], y[train_indices],
        x[val_indices], y[val_indices],
        x[test_indices], y[test_indices],
    )



def save_condition_data(data, save_path):
    """
    Saves the train, val, and test sets of each condition in separate .pt files.

    Files are named "<split>_<index>.pt", where <index> is the position of the
    condition in `data` (0 for the first one). Every file holds a dict with the
    keys 'samples' (the split's "x") and 'labels' (the split's "y"). Splits
    missing from a condition are skipped.

    Parameters:
    data (dict): Nested dictionaries containing the data for each condition.
    save_path (str): Folder to save the .pt files in; created if missing.
    """
    # torch.save does not create missing folders, so make sure the target exists.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    for index, datasets in enumerate(data.values()):
        for split in ('train', 'val', 'test'):
            if split not in datasets:
                continue
            modified_data = {
                'samples': datasets[split]['x'],
                'labels': datasets[split]['y']
            }
            file_name = f"{save_path}/{split}_{index}.pt"
            torch.save(modified_data, file_name)
            print(f"Saved {file_name}")

def process_data(fault_data, save_path= None):
    """
    Runs the whole pipeline on a list of bearing folders: load the recordings,
    cut them into windows, then split them into train/val/test.

    Parameters:
    fault_data (list of str): Folder names handed to `load_data`.
    save_path (str or None): When given, the splits are also written there
        with `save_condition_data`.

    Returns:
    dict: The split data, as produced by `split_data`.
    """
    splits = split_data(sample_data(load_data(fault_data)))
    if save_path is None:
        return splits
    save_condition_data(splits, save_path)
    return splits


# Functions

In [36]:
# Bearing folders per experiment; the position in each list becomes the class label.
artificial_dataset = [
    "K001",
    "KA01", "KA03", "KA05", "KA07",
    "KI01", "KI03", "KI07",
]
real_dataset = [
    "K001",
    "KA04", "KB23", "KB27",
    "KI04",
]

In [37]:
# Function-based pipeline on the real-damage folders; the splits are saved under "PU_Real_Val".
proc_real_data = process_data(real_dataset, save_path="PU_Real_Val")



Saved PU_Real_Val/train_0.pt
Saved PU_Real_Val/val_0.pt
Saved PU_Real_Val/test_0.pt
Saved PU_Real_Val/train_1.pt
Saved PU_Real_Val/val_1.pt
Saved PU_Real_Val/test_1.pt
Saved PU_Real_Val/train_2.pt
Saved PU_Real_Val/val_2.pt
Saved PU_Real_Val/test_2.pt
Saved PU_Real_Val/train_3.pt
Saved PU_Real_Val/val_3.pt
Saved PU_Real_Val/test_3.pt


In [35]:
# Function-based pipeline on the artificial-damage folders; the splits are saved
# under "PU_Art_Val". The result is bound to two names because later cells read
# it as `proc_art_data`, which no other cell assigns.
proc_artificial_dataset_data = process_data(artificial_dataset, "PU_Art_Val")
proc_art_data = proc_artificial_dataset_data
                
                

Saved PU_Real_Val/train_0.pt
Saved PU_Real_Val/val_0.pt
Saved PU_Real_Val/test_0.pt
Saved PU_Real_Val/train_1.pt
Saved PU_Real_Val/val_1.pt
Saved PU_Real_Val/test_1.pt
Saved PU_Real_Val/train_2.pt
Saved PU_Real_Val/val_2.pt
Saved PU_Real_Val/test_2.pt
Saved PU_Real_Val/train_3.pt
Saved PU_Real_Val/val_3.pt
Saved PU_Real_Val/test_3.pt


In [18]:
class VibrationDataProcessor:
    """
    Loads bearing vibration recordings, cuts them into fixed-length windows and
    splits the windows into train, validation and test sets.

    Attributes:
        dataset_list: Folder names below `data_dir`; the position is the label.
        data_dir: Root folder of the recordings.
        N_POINTS: Number of points kept from each recording.
        SAMPLE_LEN: Length of one window.
        STRIDE: Hop between the starts of consecutive windows.
        MEASUREMENTS_PER_CONDITION: Rows reserved per folder and condition.
        raw_data: Per-condition recordings exactly as loaded from disk.
        data: Per-condition tensors; the raw recordings after construction,
            the windowed data after `process_data()`.
        splitted_data: Result of the last `process_data()` call.
    """

    def __init__(self, dataset_list, data_dir, n_points, sample_len, stride,
                 measurements_per_condition=20):
        self.dataset_list = dataset_list
        self.data_dir = data_dir
        self.N_POINTS = n_points
        self.SAMPLE_LEN = sample_len
        self.STRIDE = stride
        self.MEASUREMENTS_PER_CONDITION = measurements_per_condition
        # Read the files exactly once; process_data() reuses this copy instead
        # of going back to disk.
        self.raw_data = self.load_data()
        self.data = dict(self.raw_data)

    def load_data(self):
        """
        Reads every .mat file of every folder and groups the recordings by
        operating condition ("P1".."P4").

        Returns:
            dict: {condition: {"x": float64 tensor (rows, N_POINTS), "y": float64 tensor (rows,)}}
            with rows = len(dataset_list) * MEASUREMENTS_PER_CONDITION. Rows for
            which no file was found stay all-zero with label 0.

        Raises:
            ValueError: If a folder holds more files for one condition than
                rows were reserved.
        """
        rows_per_folder = self.MEASUREMENTS_PER_CONDITION
        total_rows = len(self.dataset_list) * rows_per_folder
        condition_files = {"N15_M07_F10": "P1", "N09_M07_F10": "P2", "N15_M01_F10": "P3", "N15_M07_F04": "P4"}
        conditions = {
            cond: {"x": np.zeros((total_rows, self.N_POINTS)), "y": np.zeros(total_rows)}
            for cond in ["P1", "P2", "P3", "P4"]
        }

        for label, foldername in enumerate(self.dataset_list):
            folder_path = os.path.join(self.data_dir, foldername)
            # glob() returns files in arbitrary order; sort for a reproducible row layout.
            file_path_list = sorted(glob(os.path.join(folder_path, "*.mat")))

            # Next free row (relative to this folder's block) for every condition.
            # This replaces `file_index % files_per_condition`, which let files of
            # the same condition overwrite each other unless glob returned them
            # grouped by condition.
            next_slot = dict.fromkeys(condition_files.values(), 0)

            for file_path in file_path_list:
                # Match on the file name only, never on directory names.
                file_name = os.path.basename(file_path)
                for file_cond, cond in condition_files.items():
                    if file_cond not in file_name:
                        continue
                    slot = next_slot[cond]
                    if slot >= rows_per_folder:
                        raise ValueError(
                            f"Folder '{foldername}' has more than {rows_per_folder} "
                            f"files for condition '{file_cond}'"
                        )
                    sample_id = label * rows_per_folder + slot
                    conditions[cond]["x"][sample_id] = self.load_mat(file_path)
                    conditions[cond]["y"][sample_id] = label
                    next_slot[cond] = slot + 1
                    break

        for cond in conditions.values():
            cond["x"], cond["y"] = torch.from_numpy(cond["x"]), torch.from_numpy(cond["y"])

        return conditions

    def load_mat(self, mat_file):
        """
        Reads one .mat recording and returns its first N_POINTS columns.

        The top-level variable inside the file is looked up under the file's
        base name (extension removed).
        """
        file_name = os.path.splitext(os.path.basename(mat_file))[0]
        mat_file_array = loadmat(mat_file)
        # NOTE(review): entry 6, field 2 is taken to be the vibration signal - confirm
        # against the data set's file layout.
        vibration_data = mat_file_array[file_name]["Y"][0][0][0][6][2][:, :self.N_POINTS]
        return np.array(vibration_data)

    def sample(self, x, y):
        """
        Cuts every row of `x` into windows of SAMPLE_LEN points taken every
        STRIDE points and repeats each label once per window.
        """
        output_x = x.unfold(1, self.SAMPLE_LEN, self.STRIDE)
        output_x = output_x.contiguous().view(-1, self.SAMPLE_LEN)
        windows_per_sample = output_x.size(0) // y.size(0)
        output_y = y.repeat_interleave(windows_per_sample)
        return output_x, output_y

    def sample_data(self):
        """Replaces every condition in `self.data` by its windowed version."""
        for condition, datasets in list(self.data.items()):
            sampled_x, sampled_y = self.sample(datasets["x"], datasets["y"])
            self.data[condition] = {"x": sampled_x, "y": sampled_y}

    def train_val_test_split(self, x, y, train_frac=0.6, val_frac=0.2):
        """
        Randomly splits paired samples and labels; the rows left after the
        train and validation parts form the test part.

        Returns:
            tuple: (x_train, y_train, x_val, y_val, x_test, y_test).

        Raises:
            ValueError: If `x` and `y` differ in length or the fractions do not fit.
        """
        total_size = len(x)
        if len(y) != total_size:
            raise ValueError(f"x has {total_size} rows but y has {len(y)} labels")
        if not (0 <= train_frac <= 1 and 0 <= val_frac <= 1):
            raise ValueError("train_frac and val_frac must lie in [0, 1]")

        train_size = int(train_frac * total_size)
        val_size = int(val_frac * total_size)
        val_end = train_size + val_size
        if val_end > total_size:
            raise ValueError("train_frac + val_frac must not exceed 1")

        # One shared permutation keeps every sample paired with its label.
        indices = torch.randperm(total_size)
        train_indices = indices[:train_size]
        val_indices = indices[train_size:val_end]
        test_indices = indices[val_end:]

        return (
            x[train_indices], y[train_indices],
            x[val_indices], y[val_indices],
            x[test_indices], y[test_indices],
        )

    def split_data(self):
        """Splits every condition in `self.data` into train, val and test parts."""
        splits = {}
        for condition, datasets in self.data.items():
            x_train, y_train, x_val, y_val, x_test, y_test = self.train_val_test_split(datasets["x"], datasets["y"])
            splits[condition] = {"train": {"x": x_train, "y": y_train}, "val": {"x": x_val, "y": y_val}, "test": {"x": x_test, "y": y_test}}
        return splits

    def process_data(self):
        """
        Processes the data by sampling and splitting the recordings loaded at
        construction time. This method orchestrates the entire workflow.

        The recordings are not read from disk again: every call starts from
        `self.raw_data`, which is left untouched, so the method can be called
        repeatedly.
        """
        if getattr(self, "raw_data", None) is None:
            # Instance was built without __init__ having loaded anything.
            self.raw_data = self.load_data()
        # Shallow copy: sample_data() rebinds the per-condition entries of
        # self.data, which must not leak into raw_data.
        self.data = dict(self.raw_data)
        self.sample_data()  # Apply sampling
        self.splitted_data = self.split_data()  # Perform train-val-test split

        return self.splitted_data


In [19]:
# Example usage of the class-based pipeline on the artificial-damage folders.

# Root folder of the raw recordings, resolved against the current working directory.
data_dir = os.path.join(os.getcwd(), "./Paderborn Dataset")
N_POINTS = 249_600  # 64k sampling rate for 3.9 seconds
artificial_dataset = [
    "K001",
    "KA01", "KA03", "KA05", "KA07",
    "KI01", "KI03", "KI07",
]
real_dataset = [
    "K001",
    "KA04", "KB23", "KB27",
    "KI04",
]
SAMPLE_LEN = 1024
STRIDE = SAMPLE_LEN

# The constructor already reads the recordings; process_data() windows and splits them.
processor = VibrationDataProcessor(
    dataset_list=artificial_dataset,
    data_dir=data_dir,
    n_points=N_POINTS,
    sample_len=SAMPLE_LEN,
    stride=STRIDE,
)
processed_art_data = processor.process_data()


In [20]:
# Show the P1 training windows returned by the class-based pipeline.
processed_art_data["P1"]["train"]["x"]

tensor([[ 0.0885,  0.1038,  0.0885,  ...,  0.0488, -0.0427, -0.0305],
        [ 2.5665, -0.1892,  0.9277,  ..., -0.1038, -0.0977, -0.0854],
        [-0.1129, -0.1770, -0.2106,  ...,  0.2106,  0.4578,  0.5737],
        ...,
        [-0.0427,  0.0793,  0.1190,  ...,  0.0336, -0.0153, -0.0610],
        [ 0.0397, -0.0427, -0.1190,  ..., -0.1373, -0.0366,  0.1221],
        [-0.1068, -0.0671,  0.2686,  ..., -0.6836, -0.4608, -0.5859]],
       dtype=torch.float64)

In [22]:
# Show the P1 training windows held in `proc_art_data`.
# NOTE(review): `proc_art_data` is presumably the function-based pipeline's result for
# the artificial-damage folders - confirm it is assigned before this cell on a fresh kernel.
proc_art_data["P1"]["train"]["x"]

tensor([[-0.1343,  0.0427,  0.2197,  ..., -0.1587,  0.1129,  0.1984],
        [-0.0153,  0.0092, -0.0183,  ...,  0.1465,  0.1556,  0.0397],
        [ 0.0336, -0.5157,  0.0641,  ..., -0.0793, -0.0671,  0.0519],
        ...,
        [-0.0549, -0.0244,  0.0427,  ...,  0.0854,  0.0092,  0.2289],
        [-0.1801,  0.0610, -0.0275,  ..., -0.2747, -0.2106, -0.1648],
        [-0.2319, -0.4089, -0.1709,  ..., -0.5554, -0.5432, -0.4303]],
       dtype=torch.float64)

In [23]:
# Element-wise comparison of the P1 training windows from the two pipelines.
# NOTE(review): each pipeline shuffles with its own torch.randperm draw, so the two
# tensors presumably hold different rows in a different order; a False result here
# does not by itself show that the pipelines disagree. Confirm by re-seeding before
# each run or by comparing the data before the split.
are_close = torch.allclose(processed_art_data["P1"]["train"]["x"], proc_art_data["P1"]["train"]["x"], atol=1e-04)  # atol is the absolute tolerance
print(are_close)  # True only if every pair of elements agrees within the tolerance


False


In [25]:
# Inspect the nesting of the result: condition -> split -> "x"/"y".
proc_art_data.keys(), proc_art_data["P1"].keys(), proc_art_data["P1"]["train"].keys()

(dict_keys(['P1', 'P2', 'P3', 'P4']),
 dict_keys(['train', 'val', 'test']),
 dict_keys(['x', 'y']))

# Sampling

In [91]:
### CONSTANTS ###
# Shares of the data reserved for the test and validation sets.
TEST_SIZE = VAL_SIZE = 0.2

# Window length, and hop between windows (equal to the length: NO overlap).
SAMPLE_LEN = 1024
STRIDE = SAMPLE_LEN
# N_FOLDS = 5 #! Validation set not created


# Train-test split

In [17]:
# Bundle each train/test split into a {"samples", "labels"} dict, one pair of dicts
# per domain (Real, Artificial) and per condition (Normal, Rotate, Load, Radial).
# NOTE(review): none of the *_x_train / *_y_train / *_x_test / *_y_test names used
# below is assigned in the cells visible here - confirm where they come from; on a
# fresh kernel this cell presumably raises NameError.
# NOTE(review): Normal/Rotate/Load/Radial presumably correspond to the P1..P4
# conditions of the pipeline above - confirm the mapping.

# Real domain
Real_Normal_train = {"samples": Real_Normal_x_train, "labels": Real_Normal_y_train}
Real_Normal_test = {"samples": Real_Normal_x_test, "labels": Real_Normal_y_test}

Real_Rotate_train = {"samples": Real_Rotate_x_train, "labels": Real_Rotate_y_train}
Real_Rotate_test = {"samples": Real_Rotate_x_test, "labels": Real_Rotate_y_test}

Real_Load_train = {"samples": Real_Load_x_train, "labels": Real_Load_y_train}
Real_Load_test = {"samples": Real_Load_x_test, "labels": Real_Load_y_test}

Real_Radial_train = {"samples": Real_Radial_x_train, "labels": Real_Radial_y_train}
Real_Radial_test = {"samples": Real_Radial_x_test, "labels": Real_Radial_y_test}

# Artificial domain
Artificial_Normal_train = {"samples": Artificial_Normal_x_train, "labels": Artificial_Normal_y_train}
Artificial_Normal_test = {"samples": Artificial_Normal_x_test, "labels": Artificial_Normal_y_test}

Artificial_Rotate_train = {"samples": Artificial_Rotate_x_train, "labels": Artificial_Rotate_y_train}
Artificial_Rotate_test = {"samples": Artificial_Rotate_x_test, "labels": Artificial_Rotate_y_test}

Artificial_Load_train = {"samples": Artificial_Load_x_train, "labels": Artificial_Load_y_train}
Artificial_Load_test = {"samples": Artificial_Load_x_test, "labels": Artificial_Load_y_test}

Artificial_Radial_train = {"samples": Artificial_Radial_x_train, "labels": Artificial_Radial_y_train}
Artificial_Radial_test = {"samples": Artificial_Radial_x_test, "labels": Artificial_Radial_y_test}

# Save files

In [19]:
# Output folders: <parent of the working directory>/dataset/PU_Real and .../PU_Artificial.
parent_dir = os.path.dirname(os.getcwd())
Real_dir = os.path.join(parent_dir, "dataset", "PU_Real")
Artificial_dir = os.path.join(parent_dir, "dataset", "PU_Artificial")

# exist_ok=True replaces the separate exists() check, so there is no window
# between checking and creating the folder.
for _out_dir in (Real_dir, Artificial_dir):
    os.makedirs(_out_dir, exist_ok=True)

# (output folder, [(condition name, train set, test set), ...]) - one entry per domain.
_domain_sets = [
    (Real_dir, [
        ("Normal", Real_Normal_train, Real_Normal_test),
        ("Rotate", Real_Rotate_train, Real_Rotate_test),
        ("Load", Real_Load_train, Real_Load_test),
        ("Radial", Real_Radial_train, Real_Radial_test),
    ]),
    (Artificial_dir, [
        ("Normal", Artificial_Normal_train, Artificial_Normal_test),
        ("Rotate", Artificial_Rotate_train, Artificial_Rotate_test),
        ("Load", Artificial_Load_train, Artificial_Load_test),
        ("Radial", Artificial_Radial_train, Artificial_Radial_test),
    ]),
]

# Writes train_<condition>.pt and test_<condition>.pt into each domain folder,
# in the same order as the former one-call-per-file listing.
for _out_dir, _condition_sets in _domain_sets:
    for _name, _train_set, _test_set in _condition_sets:
        torch.save(_train_set, os.path.join(_out_dir, f"train_{_name}.pt"))
        torch.save(_test_set, os.path.join(_out_dir, f"test_{_name}.pt"))