In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

In [2]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load labels from CSV
reference_path = 'data/processed/training2017/label/REFERENCE-original.csv'
labels_df = pd.read_csv(reference_path ,header=None, names=['sampleName', 'label'])

# Convert string labels to numerical format
label_encoder = LabelEncoder()
labels_df['label'] = label_encoder.fit_transform(labels_df['label'])

# Create a mapping of filename to label
label_mapping = dict(zip(labels_df['sampleName'], labels_df['label']))

In [None]:
labels_df['label'].unique()

In [None]:
a = np.load("data/processed/training2017/sample/A00001.npy")
print(type(a[1]))

In [5]:
from tensorflow.keras.utils import to_categorical, pad_sequences

def data_generator(file_paths, label_mapping, batch_size):
    while True:
        batch_paths = np.random.choice(file_paths, size=batch_size, replace=False)
        batch_data = []
        batch_labels = []

        for file_name in batch_paths:
            file_path = os.path.join("data/processed/training2017/sample", file_name)
            time_series = np.load(file_path)
            label = label_mapping[file_name.split('.')[0]]

            # Normalize data
            time_series = StandardScaler().fit_transform(time_series.reshape(-1, 1)).flatten()

            # Pad or truncate sequences to max_seq_length
            time_series = pad_sequences([time_series], maxlen=max_seq_length, padding='post', truncating='post')[0]

            batch_data.append(time_series)
            batch_labels.append(label)

        batch_labels = to_categorical(batch_labels, num_classes=len(label_encoder.classes_))
        # print(batch_labels)

        yield np.array(batch_data), np.array(batch_labels)

In [15]:
from sklearn.model_selection import train_test_split

# Get the list of file paths
file_paths = [file_name for file_name in os.listdir("data/processed/training2017/sample") if file_name.endswith(".npy")]

max_seq_length = max(len(np.load(os.path.join("data/processed/training2017/sample", file_name))) for file_name in file_paths)

# Split into training, validation, and test sets
train_paths, test_paths = train_test_split(file_paths, test_size=0.2, random_state=42)
train_paths, val_paths = train_test_split(train_paths, test_size=0.1, random_state=42)

# Create data generators for training, validation, and test sets
batch_size = 32
train_generator = data_generator(train_paths, label_mapping, batch_size)
val_generator = data_generator(val_paths, label_mapping, batch_size)
test_generator = data_generator(test_paths, label_mapping, batch_size)

# Get the shape of one sample to set input shape
sample_data, label_data = next(train_generator)
input_shape = sample_data.shape

In [7]:
print(max_seq_length)

18286


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, SimpleRNN, Dropout, BatchNormalization

learning_rate = 0.1  # Adjust this value as needed

# Build the RNN model
model = Sequential()
model.add(SimpleRNN(units=50, activation='relu', input_shape=(max_seq_length, 1)))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(units=64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(units=len(label_encoder.classes_), activation='softmax'))  # Output layer with softmax activation


In [None]:
model.summary()

In [17]:
from tensorflow.keras.optimizers import SGD

# Use mean squared error as the loss function and set the learning rate
learning_rate = 0.01  # Adjust this value as needed
model.compile(optimizer=SGD(learning_rate=learning_rate), loss='mean_squared_error', metrics=['accuracy'])

# Calculate the number of steps per epoch for training and validation
train_steps_per_epoch = len(train_paths) // batch_size
val_steps_per_epoch = len(val_paths) // batch_size

# Train the model
model.fit(train_generator, steps_per_epoch=train_steps_per_epoch, epochs=10,
          validation_data=val_generator, validation_steps=val_steps_per_epoch)

# Evaluate the model on the test set
test_steps_per_epoch = len(test_paths) // batch_size
test_loss, test_accuracy = model.evaluate(test_generator, steps=test_steps_per_epoch)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.1307, Test Accuracy: 0.6450
