In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import py7zr
import random
import rarfile
from scipy.io import loadmat
import torch
import urllib.request
import zipfile

# Set seed

In [2]:
def set_seed(seed):
    """Seed every random number generator this notebook imports.

    Args:
        seed: Value handed unchanged to the PyTorch (CPU and CUDA), NumPy
            and built-in ``random`` seeding functions, in that order.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seeder in seeders:
        seeder(seed)


# Seed once, before any cell that draws random numbers
set_seed(42)

# Download and extract

In [3]:
# IMS bearings dataset: archive name -> download URL
IMS_links = dict(
    IMS='https://phm-datasets.s3.amazonaws.com/NASA/4.+Bearings.zip',
)

def download_and_extract(file_name, url, folder_path, dtype, extract_function, max_attempts=3):
    """Download an archive into ``folder_path`` and unpack it.

    The archive is saved as ``<folder_path>/<file_name><dtype>`` and then
    ``extract_function(folder_path, file_name)`` is called. If either step
    raises, the whole download + extract sequence is retried, up to
    ``max_attempts`` times in total.

    Args:
        file_name: Archive name without extension.
        url: Location to download from.
        folder_path: Existing directory that receives the archive.
        dtype: Archive extension including the leading dot (e.g. '.zip').
        extract_function: Callable taking ``(folder_path, file_name)``.
        max_attempts: Upper bound on attempts. Previously the loop retried
            forever, which hung the notebook when offline or when the
            archive could never be extracted.

    Raises:
        ValueError: if ``max_attempts`` is smaller than 1.
        RuntimeError: if every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be at least 1, got {max_attempts}")

    archive_path = os.path.join(folder_path, f'{file_name}{dtype}')
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            print(f"Downloading {url}")
            urllib.request.urlretrieve(url, archive_path)
            print(f'Extracting {file_name}{dtype}')
            extract_function(folder_path, file_name)
            return  # Both steps succeeded
        except Exception as e:
            # The failure may come from the download or from the extraction,
            # so the message does not claim which one it was.
            print(f"Attempt {attempt}/{max_attempts} failed for {url}: {e}")
            last_error = e

    raise RuntimeError(
        f"Could not download and extract {url} after {max_attempts} attempt(s)"
    ) from last_error

def extract_rar(folder, file_name):
    """Print a reminder to unpack ``<folder>/<file_name>.rar`` by hand.

    Nothing is extracted here: the function only prints the reminder and
    returns ``None``. (An automatic variant would open the archive with
    ``rarfile.RarFile`` and call ``extractall`` into ``<folder>/<file_name>``.)
    """
    archive_stem = os.path.join(folder, file_name)
    print(f'Extract {archive_stem}.rar manually')
        
def extract_zip(folder, file_name):
    """Unpack ``<folder>/<file_name>.zip`` into the directory ``<folder>/<file_name>``."""
    archive_path = os.path.join(folder, f'{file_name}.zip')
    target_dir = os.path.join(folder, file_name)
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(target_dir)
        
def extract_7zip(folder, file_name):
    """Unpack ``<folder>/<file_name>.7z`` into the directory ``<folder>/<file_name>``."""
    archive_path = os.path.join(folder, f'{file_name}.7z')
    target_dir = os.path.join(folder, file_name)
    with py7zr.SevenZipFile(archive_path, mode='r') as archive:
        archive.extractall(target_dir)
        
# Download & extract the IMS archive into ./IMS; an archive whose extracted
# folder already exists is not downloaded again.
folder_path = os.path.join(os.getcwd(), 'IMS')
os.makedirs(folder_path, exist_ok=True)
for file_name, url_link in IMS_links.items():
    already_extracted = os.path.exists(os.path.join(folder_path, file_name))
    if already_extracted:
        continue
    download_and_extract(file_name, url_link, folder_path, '.zip', extract_zip)

# The zip contains a 7zip archive, which in turn contains one .rar per test run.
# The 7zip archive is unpacked here; extract_rar only prints a reminder per run.
bearings_dir = os.path.join(folder_path, 'IMS', '4. Bearings')
inner_ims_dir = os.path.join(bearings_dir, 'IMS')
if not os.path.exists(inner_ims_dir):
    extract_7zip(bearings_dir, 'IMS')
    for run_name in ('1st_test', '2nd_test', '3rd_test'):
        extract_rar(inner_ims_dir, run_name)

Downloading https://phm-datasets.s3.amazonaws.com/NASA/4.+Bearings.zip
Extracting IMS.zip
Extract C:\Work\ASTAR\codes\CDA\CDA\data_preproc\IMS\IMS\4. Bearings\IMS\1st_test.rar manually
Extract C:\Work\ASTAR\codes\CDA\CDA\data_preproc\IMS\IMS\4. Bearings\IMS\2nd_test.rar manually
Extract C:\Work\ASTAR\codes\CDA\CDA\data_preproc\IMS\IMS\4. Bearings\IMS\3rd_test.rar manually


# Process IMS dataset

In [4]:
# Convert file names from YYYY.MM.DD.hh.mm.ss format to YYYYMMDDhhmmss.txt to make them easier to sort
def convert_filename_to_integer(filename):
    """Return the integer formed by removing every '.' from ``filename``.

    Example: '2003.10.22.12.06.24' -> 20031022120624.

    Raises:
        ValueError: if the text left after removing the dots is not an
            integer literal (for instance a name that ends in '.txt').
    """
    digits_only = filename.replace('.', '')
    return int(digits_only)

def rename_files_in_directory(directory):
    """Rename every timestamp-named file in ``directory`` to ``<digits>.txt``.

    Each entry name is passed to ``convert_filename_to_integer``. Names it
    accepts are renamed to the resulting integer plus a '.txt' suffix. Names
    it rejects with ``ValueError`` (already converted files, unrelated files)
    are left untouched.

    Unlike the earlier version, a rejected name no longer aborts the whole
    pass. ``os.listdir`` order is arbitrary, so stopping at the first rejected
    name could leave an arbitrary subset of files unconverted after an
    interrupted run or when a stray file is present.

    Args:
        directory: Path of the folder whose entries are renamed in place.
    """
    renamed_count = 0
    skipped_count = 0
    # os.listdir returns a list built up front, so renaming inside the loop
    # does not disturb the iteration.
    for filename in os.listdir(directory):
        try:
            new_filename = str(convert_filename_to_integer(filename))
        except ValueError:
            skipped_count += 1
            continue
        os.rename(os.path.join(directory, filename), os.path.join(directory, f'{new_filename}.txt'))
        renamed_count += 1

    if skipped_count and not renamed_count:
        # Same message as before for the common "nothing left to do" case
        print("Already converted filenames")
    elif skipped_count:
        print(f"Renamed {renamed_count} file(s); left {skipped_count} non-timestamp name(s) untouched")
 
# Folders of the first two test runs, resolved against the current working directory
test_1_path, test_2_path = (
    os.path.join(os.getcwd(), relative_dir)
    for relative_dir in ('IMS/IMS/4. Bearings/IMS/1st_test', 'IMS/IMS/4. Bearings/IMS/2nd_test')
)
# Give the snapshot files in both runs sortable YYYYMMDDhhmmss.txt names
for run_directory in (test_1_path, test_2_path):
    rename_files_in_directory(run_directory)

Already converted filenames
Already converted filenames


Following the work of https://hal.science/hal-01715193/document, the following intervals are labelled as faulty:

        Inner:   2003/11/22/00:06:56 - 2003/11/25/23:39:56 (30 days into the run of test 1, channel 5)
        Roller:  2003/11/14/11:02:17 - 2003/11/25/23:39:56 (23 days into the run of test 1, channel 7)
        Outer:   2004/02/14/22:22:39 - 2004/02/19/06:22:39 (2.5 days into the run of test 2, channel 1)
        
For the healthy state, only the first half of the snapshots recorded before each bearing was considered faulty is used. Additionally, roughly the first 10% of those pre-fault snapshots is treated as early operation and excluded, similarly to this work: https://papers.phmsociety.org/index.php/phme/article/download/2947/1761

        Inner:   2003/10/29/17:29:46 - 2003/11/10/12:05:58 (test 1, channel 5)
        Roller:  2003/10/22/23:04:13 - 2003/11/01/19:21:44 (test 1, channel 7)
        Outer:   2004/02/12/16:22:39 - 2004/02/13/16:32:39 (test 2, channel 1)

In [11]:
# os.listdir returns entries in arbitrary order, but every index slice below
# assumes chronological order. After renaming, the files are called
# YYYYMMDDhhmmss.txt, so a plain lexicographic sort is chronological.
file_list_1 = sorted(os.listdir(test_1_path))
file_list_2 = sorted(os.listdir(test_2_path))

# Faulty snapshots: from the fault-onset index to the end of the run
Inner_files = file_list_1[1735:]
Roller_files = file_list_1[894:]
Outer_files = file_list_2[363:]

# Healthy snapshots: an index window taken from before each fault onset
Healthy_bearing_3 = file_list_1[173:867]
Healthy_bearing_4 = file_list_1[88:485]
Healthy_bearing_1 = file_list_2[35:181]

In [28]:
def sliding_window_subsample(tensor, window_size=1024, step=1024):
    """Cut the last axis of a 2-D tensor into windows.

    For an input of shape ``(channels, length)`` the result has shape
    ``(num_windows, channels, window_size)``. Consecutive windows start
    ``step`` elements apart; trailing values that do not fill a whole window
    are dropped.

    Args:
        tensor: 2-D tensor whose last axis is windowed.
        window_size: Number of elements per window.
        step: Offset between the starts of consecutive windows.
    """
    with_extra_axis = tensor.unsqueeze(1)                   # (channels, 1, length)
    windows = with_extra_axis.unfold(2, window_size, step)  # (channels, 1, num_windows, window_size)
    windows = windows.transpose(0, 1)                       # (1, channels, num_windows, window_size)
    windows = windows.transpose(1, 2)                       # (1, num_windows, channels, window_size)
    return windows.squeeze(0)

def read_files(folder, file_list, channels, label, window_size=1024, step=1024):
    """Load snapshot files, window the selected column(s) and label every window.

    Each file is read as a header-less, tab-separated table. The column(s)
    selected by ``channels`` are converted to a float tensor, cut into
    windows with ``sliding_window_subsample`` and paired with a label tensor
    filled with ``label``.

    Args:
        folder: Directory that contains the files.
        file_list: Names of the files to read, in the order they should
            appear in the output.
        channels: Positional (0-based) column selector passed to ``iloc``.
        label: Fill value for the per-window label tensor.
        window_size: Window length forwarded to ``sliding_window_subsample``.
            This used to be hard-coded to 1024.
        step: Offset between window starts, forwarded likewise. This also
            used to be hard-coded to 1024.

    Returns:
        ``(x_tensor, y_tensor)``: the windows of all files concatenated
        along dim 0, and the matching labels. With an integer ``channels``,
        ``x_tensor`` has shape ``(total_windows, 1, window_size)``.
    """
    x, y = [], []
    for file_name in file_list:
        data = pd.read_csv(os.path.join(folder, file_name), sep='\t', header=None)
        # unsqueeze(0) adds the leading axis expected by the windowing helper
        tensor_data = torch.tensor(data.iloc[:, channels].values, dtype=torch.float).unsqueeze(0)

        subsampled_data = sliding_window_subsample(tensor_data, window_size=window_size, step=step)
        # One label per window produced from this file
        labels = torch.full((subsampled_data.shape[0],), label)

        x.append(subsampled_data)
        y.append(labels)

    x_tensor = torch.cat(x, dim=0)
    y_tensor = torch.cat(y, dim=0)

    return x_tensor, y_tensor

# Dataset 1 has 2 channels for each bearing but both channels were noted to be similar enough,
# so a single column per bearing is read.
# NOTE(review): `channels` is a 0-based column index here; confirm it lines up with the
# channel numbers quoted in the markdown above.
# Every fault type shares label 1 and every healthy window gets label 0.
Inner_x, Inner_y = read_files(folder=test_1_path, file_list=Inner_files, channels=5, label=1)
Roller_x, Roller_y = read_files(folder=test_1_path, file_list=Roller_files, channels=7, label=1)
Outer_x, Outer_y = read_files(folder=test_2_path, file_list=Outer_files, channels=0, label=1)

Healthy_bearing_3_x, Healthy_bearing_3_y = read_files(
    folder=test_1_path, file_list=Healthy_bearing_3, channels=5, label=0
)
Healthy_bearing_4_x, Healthy_bearing_4_y = read_files(
    folder=test_1_path, file_list=Healthy_bearing_4, channels=7, label=0
)
Healthy_bearing_1_x, Healthy_bearing_1_y = read_files(
    folder=test_2_path, file_list=Healthy_bearing_1, channels=0, label=0
)

# Combine all classes into one tensor (samples and labels are stacked in the same order)
combined_x = torch.cat(
    [Inner_x, Roller_x, Outer_x, Healthy_bearing_3_x, Healthy_bearing_4_x, Healthy_bearing_1_x],
    dim=0,
)
combined_y = torch.cat(
    [Inner_y, Roller_y, Outer_y, Healthy_bearing_3_y, Healthy_bearing_4_y, Healthy_bearing_1_y],
    dim=0,
)

print(combined_x.shape, combined_y.shape)

torch.Size([70820, 1, 1024]) torch.Size([70820])


In [None]:
# Pair every window with its label so that both are split in the same way.
# NOTE(review): this never-executed cell repeats the first statement of the following cell;
# it looks like a leftover that could be deleted.
dataset = torch.utils.data.TensorDataset(
    combined_x,
    combined_y,
)



In [31]:
# Split the tensor into training, validation and testing
dataset = torch.utils.data.TensorDataset(combined_x, combined_y)  # Combine x and y to ensure both are split in the same way

# 60 % / 20 % / remainder, so the three sizes always add up to total_size
total_size = len(dataset)
train_size = int(0.6 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

# Split the dataset with a dedicated, explicitly seeded generator. Without it
# random_split draws from the global RNG, so re-running this cell (or running
# cells in a different order) would produce a different split.
# NOTE(review): windows cut from the same snapshot file can land in different
# splits; confirm that is acceptable for the intended evaluation.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

# Split x and y
def split_xy(dataset):
    """Turn an iterable of ``(x, y)`` tensor pairs into two stacked tensors.

    Args:
        dataset: Iterable yielding ``(x_tensor, y_tensor)`` pairs.

    Returns:
        ``(x, y)``, where each is ``torch.stack`` applied to the respective
        elements in iteration order.
    """
    pairs = list(dataset)
    x = torch.stack([sample for sample, _ in pairs])
    y = torch.stack([target for _, target in pairs])
    return x, y

# Materialise each split as a (samples, labels) tensor pair
train_x, train_y = split_xy(train_dataset)
val_x, val_y = split_xy(val_dataset)
test_x, test_y = split_xy(test_dataset)

# Check the shapes of the splits: training, validation, test (in that order)
for split_samples, split_labels in ((train_x, train_y), (val_x, val_y), (test_x, test_y)):
    print(split_samples.shape, split_labels.shape)

torch.Size([42492, 1, 1024]) torch.Size([42492])
torch.Size([14164, 1, 1024]) torch.Size([14164])
torch.Size([14164, 1, 1024]) torch.Size([14164])


In [32]:
# Save the datasets: one dict per split with "samples" and "labels" entries
training = dict(samples=train_x, labels=train_y)
validation = dict(samples=val_x, labels=val_y)
testing = dict(samples=test_x, labels=test_y)

# Each split goes to its own .pt file inside ./IMS
for split_file, split_payload in (('train.pt', training), ('val.pt', validation), ('test.pt', testing)):
    torch.save(split_payload, os.path.join(os.getcwd(), 'IMS', split_file))