In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import h5py


In [2]:
# Define the directory where your CSV files are stored
directory = 'C:\\Users\\Efe\\Desktop\\uOttawaEMRefinedCSV'  # Adjust this to your files' directory path

label_map = {}    # condition string -> integer class id, assigned in order of first appearance
label_index = 0   # next unused class id
frames = []       # one labeled DataFrame per CSV; concatenated once after the loop

# Load data and create labels based on filenames, using only the first column of data.
# os.listdir() returns entries in arbitrary order, so iterate in sorted order:
# otherwise the condition -> label mapping could differ between machines or runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv'):
        continue

    filepath = os.path.join(directory, filename)
    data = pd.read_csv(filepath, usecols=[0])  # Only load the first column

    # The condition is the first two '_'-separated parts of the filename;
    # everything after them (the speed, per the naming scheme) is ignored.
    condition = '_'.join(filename.split('_')[:2])

    if condition not in label_map:
        label_map[condition] = label_index
        label_index += 1

    # Every row of this file gets the same class id
    data['Label'] = label_map[condition]
    frames.append(data)

# Concatenate once: calling pd.concat inside the loop re-copies the accumulated
# frame on every iteration, which is quadratic in the total number of rows.
# Fall back to an empty frame when the directory holds no CSV files.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print("Loaded and labeled data from all files.")
print("Label mapping:", label_map)


Loaded and labeled data from all files.
Label mapping: {'B_R': 0, 'F_B': 1, 'H_H': 2, 'K_A': 3, 'R_M': 4, 'R_U': 5, 'S_W': 6, 'V_U': 7}


In [3]:
# Windowing function to create 28x28 images from the data
def create_windows(data, window_size, stride):
    """Cut a labeled DataFrame into fixed-length, possibly overlapping windows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Label' column; every other column is treated as a feature.
    window_size : int
        Number of consecutive rows per window (must be > 0).
    stride : int
        Number of rows between the starts of consecutive windows (must be > 0).

    Returns
    -------
    X : numpy.ndarray
        Shape (n_windows, window_size, n_features); the feature rows of each window.
    y : numpy.ndarray
        Shape (n_windows,); the 'Label' value of the LAST row of each window.

    Raises
    ------
    ValueError
        If window_size or stride is not positive.

    Notes
    -----
    Windows are cut by row position only, so a window may span rows with
    different labels; it is then labeled by its last row.
    """
    if window_size <= 0 or stride <= 0:
        raise ValueError("window_size and stride must be positive integers")

    # Convert to arrays once instead of slicing/dropping the DataFrame per window
    features = data.drop('Label', axis=1).values
    labels = data['Label'].values

    # The last valid start is len(data) - window_size, so the range end needs + 1;
    # without it the final full window is dropped whenever it ends exactly on the last row.
    starts = range(0, len(data) - window_size + 1, stride)

    X = [features[start:start + window_size] for start in starts]
    y = [labels[start + window_size - 1] for start in starts]

    if not X:
        # Not enough rows for a single window: return empty arrays with a well-defined shape
        return (
            np.empty((0, window_size, features.shape[1]), dtype=features.dtype),
            np.empty((0,), dtype=labels.dtype),
        )
    return np.array(X), np.array(y)


In [4]:
# Each sample is a run of 784 consecutive readings, laid out as one 28x28 single-channel image
window_size = 28 * 28
stride = window_size // 2  # 392 rows: consecutive windows share half of their readings

# Slice the labeled signal into windows
X, y = create_windows(all_data, window_size=window_size, stride=stride)

# Fold each flat window into a 28x28 grid with a trailing channel axis
X = np.reshape(X, (len(X), 28, 28, 1))


In [5]:
# Encode labels as one-hot encoding
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)   # window labels -> contiguous integer class ids
Y_one_hot = to_categorical(y_encoded)  # class ids -> one-hot rows

# Split data into training and test sets.
# Stratify on the integer class ids so both splits keep the same class
# proportions; an unstratified random split can skew them.
# NOTE(review): if the windows were cut with stride < window_size they overlap, so a
# random split puts near-duplicate samples in both train and test. Consider splitting
# by source file (or before windowing) if test accuracy must reflect unseen recordings.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y_one_hot,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded,
)


In [6]:
# Persist the four split arrays into one .npz archive, keyed by their variable names
np.savez(
    'C:\\Users\\Efe\\Desktop\\EM28_28dataset.npz',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
print("Dataset saved successfully.")


Dataset saved successfully.


In [7]:
# Recover the integer class id of every sample from its one-hot row (argmax along
# the class axis), then reduce to the distinct ids present in each split
unique_train_labels = np.unique(y_train.argmax(axis=1))
unique_test_labels = np.unique(y_test.argmax(axis=1))

# Split sizes
print(f"Number of training images: {X_train.shape[0]}")
print(f"Number of test images: {X_test.shape[0]}")

# How many distinct classes each split contains
print(f"Number of unique labels in the training set: {len(unique_train_labels)}")
print(f"Number of unique labels in the test set: {len(unique_test_labels)}")

# Which classes those are
print(f"Unique labels in the training set: {unique_train_labels}")
print(f"Unique labels in the test set: {unique_test_labels}")


Number of training images: 23998
Number of test images: 10286
Number of unique labels in the training set: 8
Number of unique labels in the test set: 8
Unique labels in the training set: [0 1 2 3 4 5 6 7]
Unique labels in the test set: [0 1 2 3 4 5 6 7]
