In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:

# --- Notebook configuration -------------------------------------------------
# Base folder for the three datasets (UORED, CWRU, HUST).
directory = 'C:\\Users\\Efe\\Desktop\\uOttawaEMRefinedCSV'

# Number of rows pandas reads per chunk when streaming a large CSV.
chunk_size = 100_000

# Accumulators for the per-dataset training and testing frames.
train_data_list, test_data_list = [], []


def _prefix_rule(healthy_prefix):
    """Return a labeler: 'healthy' if the file name starts with the prefix, else 'faulty'."""
    def _label(filename):
        if filename.startswith(healthy_prefix):
            return 'healthy'
        return 'faulty'
    return _label


# File-name prefix that marks a healthy recording in each dataset.
_healthy_prefixes = (
    ('UORED', 'H'),
    ('CWRU', '97_Normal'),
    ('HUST', 'N'),
)

# Dataset name -> callable mapping a CSV file name to 'healthy' / 'faulty'.
label_rules = {name: _prefix_rule(prefix) for name, prefix in _healthy_prefixes}

In [3]:
# Dataset name -> folder that holds that dataset's CSV files.
# The values below are absolute paths on the author's machine; adjust them
# to wherever the UORED / CWRU / HUST CSV folders live locally.
datasets = dict(
    UORED='C:\\Users\\Efe\\Desktop\\UORED Zero csv',
    CWRU='C:\\Users\\Efe\\Desktop\\CWRU Zero csv',
    HUST='C:\\Users\\Efe\\Desktop\\HUST Zero csv',
)

In [4]:
# Function to read all CSV files from a directory tree and return one labeled DataFrame
def load_and_label_csv_from_directory(main_file_path, label_func, chunksize=None):
    """Load every ``.csv`` file under ``main_file_path`` into one labeled DataFrame.

    The directory tree is walked recursively. Each CSV is read in chunks, a
    ``'Label'`` column holding ``label_func(filename)`` is added to every
    chunk, and all chunks are concatenated once at the end.

    Parameters
    ----------
    main_file_path : str
        Root directory to search for CSV files.
    label_func : callable
        Called with the bare file name (not the full path); its return value
        is written to the ``'Label'`` column for every row of that file.
    chunksize : int, optional
        Rows per chunk passed to ``pd.read_csv``. When omitted, the
        module-level ``chunk_size`` setting is used.

    Returns
    -------
    pandas.DataFrame
        All rows from all CSV files with a fresh 0..n-1 index, or an empty
        DataFrame when no CSV file is found.
    """
    rows_per_chunk = chunk_size if chunksize is None else chunksize

    # Collect chunks and concatenate once: calling pd.concat inside the loop
    # re-copies everything accumulated so far on every chunk (quadratic cost).
    labeled_chunks = []

    for root, dirs, files in os.walk(main_file_path):
        # os.walk yields entries in arbitrary filesystem order; sorting makes
        # the row order (and any seeded split done on it) reproducible.
        dirs.sort()  # in place, so os.walk descends in this order
        for filename in sorted(files):
            if not filename.endswith('.csv'):
                continue

            file_path = os.path.join(root, filename)
            print(f"Processing CSV file: {file_path}")

            # Apply the dataset-specific labeling rule once per file
            label = label_func(filename)

            for chunk in pd.read_csv(file_path, chunksize=rows_per_chunk):
                chunk['Label'] = label
                labeled_chunks.append(chunk)

    # pd.concat raises on an empty list, so keep the empty-frame result
    if not labeled_chunks:
        return pd.DataFrame()
    return pd.concat(labeled_chunks, ignore_index=True)

In [5]:
# Build the combined train/test CSV files from the three datasets.
#
# The accumulators are re-created here so that re-running this cell does not
# append a second copy of every dataset onto lists left over from an earlier
# run (which would silently duplicate rows in the saved CSV files).
train_data_list = []
test_data_list = []

# Process the first two datasets (UORED and CWRU): 80% train / 20% test each
for dataset_name in ['UORED', 'CWRU']:
    # NOTE: os.path.join drops `directory` whenever the second argument is an
    # absolute path, so with absolute entries in `datasets` the files are read
    # from the `datasets` path itself; `directory` only matters for relative entries.
    dataset_path = os.path.join(directory, datasets[dataset_name])

    # Load and label all CSV files within this dataset's directory tree
    file_data = load_and_label_csv_from_directory(dataset_path, label_rules[dataset_name])

    # Seeded split so the same rows land in train/test on every run.
    # NOTE(review): the split is row-wise over the concatenated files, so rows
    # from one CSV can end up in both train and test — confirm this is intended.
    train_data, test_data = train_test_split(file_data, test_size=0.2, random_state=42)

    train_data_list.append(train_data)
    test_data_list.append(test_data)

# Process the third dataset (HUST): every row is used as a testing sample
hust_path = os.path.join(directory, datasets['HUST'])
hust_data = load_and_label_csv_from_directory(hust_path, label_rules['HUST'])
test_data_list.append(hust_data)

# Concatenate all training and testing data into final DataFrames
train_data_final = pd.concat(train_data_list, ignore_index=True)
test_data_final = pd.concat(test_data_list, ignore_index=True)

# Save the training and testing data to CSV files in the working directory
train_data_final.to_csv('train_data.csv', index=False)
test_data_final.to_csv('test_data.csv', index=False)

print("Training and testing data split, labeled, and saved successfully.")

Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\B_11_2.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\B_12_2.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\B_13_2.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\B_14_2.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\B_15_2.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\C_16_2.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\C_17_2.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\C_18_2.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\C_19_2.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\C_20_2.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\H_10_0.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\H_11_0.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\H_12_0.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED Zero csv\H_13_0.csv
Processing CSV file: C:\Users\Efe\Desktop\UORED 

In [6]:
# Report how many 'healthy' / 'faulty' rows ended up in each split
def _show_label_counts(heading, frame):
    """Print a heading followed by the value counts of the frame's 'Label' column."""
    print(heading)
    label_counts = frame['Label'].value_counts()
    print(label_counts)


_show_label_counts("\nTraining Data Label Distribution:", train_data_final)
_show_label_counts("\nTesting Data Label Distribution:", test_data_final)


Training Data Label Distribution:
Label
faulty     10780294
healthy     7111912
Name: count, dtype: int64

Testing Data Label Distribution:
Label
faulty     16862655
healthy     4335964
Name: count, dtype: int64
