In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import matplotlib.pyplot as plt
import random
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.model_selection import train_test_split
from google.colab import drive
import os


In [None]:
# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount('/content/drive')
# Placeholder path: replace with the location of the AI_4_ATD folder. Later cells
# use paths relative to it ('Datasets/...', 'Training_Runs/...').
%cd /write/your/directory/to/AI_4_ATD

In [None]:
# Lower bound for the control-line standard deviation in CNS: the std is clipped
# to at least this value before it is used as a divisor.
epsilon = 0.001

# Define Processing Functions

In [None]:
class Format():
  """Record of which preprocessing steps have been (or should be) applied.

  Attributes:
    scale: 1 once SCALE has been applied, otherwise 0.
    cns:   1 once CNS has been applied, otherwise 0.
    ia:    1 once IA has been applied, otherwise 0.
    imp:   imputation code set by imputate: 0 = none, 1 = mean, 2 = median, 3 = iqm.

  All flags default to 0, so ``Format()`` still describes untouched data.
  """

  def __init__(self, scale=0, cns=0, ia=0, imp=0):
    self.scale = scale
    self.cns = cns
    self.ia = ia
    self.imp = imp

  def __repr__(self):
    # Readable form for logs and assertion failures.
    return f'Format(scale={self.scale}, cns={self.cns}, ia={self.ia}, imp={self.imp})'

In [None]:
def SCALE(dataset, labels, format):
  """Scale every sample in place so its largest absolute observed value is 1.

  A sample is one entry along the first axis: a single series for a 2-D
  dataset, or all lines of a sample together for a 3-D dataset (one common
  divisor per sample). Entries equal to -1 are treated as missing: they are
  neither rescaled nor allowed to influence the divisor.

  Args:
    dataset: 2-D (batch, n_points) or 3-D (batch, n_lines, n_points) ndarray;
      modified in place.
    labels: passed through unchanged.
    format: object whose ``scale`` attribute is set to 1.

  Returns:
    (dataset, labels, format)

  Raises:
    ValueError: if ``dataset`` is neither 2-D nor 3-D.
  """
  if dataset.ndim not in (2, 3):
    raise ValueError(f"SCALE expects a 2-D or 3-D dataset, got shape {dataset.shape}")

  # Iterating an ndarray yields views, so writes below land in `dataset`.
  for sample in dataset:
    observed = sample != -1
    if not observed.any():
      continue  # nothing but missing markers: leave untouched
    max_val = np.max(np.absolute(sample[observed]))
    if max_val == 0:
      continue  # all-zero sample: dividing would turn it into NaN
    sample[observed] = sample[observed] / max_val

  format.scale = 1
  return dataset, labels, format

In [None]:
def CNS(dataset, labels, format):
  """Normalise each sample by the statistics of its first line, then drop that line.

  For every sample, the mean and standard deviation are taken over the
  non-missing (!= -1) entries of line 0. The std is clipped from below at the
  module-level ``epsilon``. Every non-missing entry of the sample (line 0
  included) is then replaced by ``(value - mean) / std``, in place. Finally
  line 0 is removed from the returned array.

  If line 0 of a sample has no observed entries, the mean/std are NaN and that
  NaN propagates to the sample's observed entries.

  Args:
    dataset: 3-D ndarray (batch, n_lines, n_points); modified in place.
    labels: passed through unchanged.
    format: object whose ``cns`` attribute is set to 1.

  Returns:
    (dataset without line 0, labels, format)
  """
  n_samples, _, _ = dataset.shape  # also enforces a 3-D input
  for sample_idx in range(n_samples):
    sample = dataset[sample_idx]  # view into `dataset`
    control_line = sample[0]
    control_observed = control_line[control_line != -1]
    c_mean = np.mean(control_observed)
    c_std = np.std(control_observed).clip(epsilon, None)

    present = sample != -1
    sample[present] = (sample[present] - c_mean) / c_std

  without_control = np.delete(dataset, 0, axis=1)
  format.cns = 1
  return without_control, labels, format

In [None]:
def IA(dataset, labels, format):
  """Split every sample into one independent example per line.

  If CNS has not been applied (``format.cns == 0``), line 0 of each sample is
  kept as a control: it is removed from the lines and paired with each
  remaining line, giving an output of shape (batch * n_lines, 2, n_points)
  with the control at index 0 of axis 1. Otherwise the dataset is simply
  flattened to (batch * n_lines, n_points).

  The control is repeated once per remaining line, so any number of lines is
  supported (the previous hard-coded repeat of 3 only worked for 3 lines).

  Args:
    dataset: 3-D ndarray (batch, lines, n_points); not modified.
    labels: ndarray with batch * n_lines elements; flattened to 1-D.
    format: object whose ``cns`` is read and whose ``ia`` is set to 1.

  Returns:
    (new dataset array, flattened labels, format)
  """
  if format.cns == 0:
    control = dataset[:, 0, :]
    lines = dataset[:, 1:, :]
  else:
    control = None
    lines = dataset

  batch, n_lines, n_points = lines.shape
  flat = lines.reshape(batch * n_lines, n_points)

  if control is not None:
    # One copy of the sample's control for each of its lines, in sample order.
    paired_control = np.repeat(control, repeats=n_lines, axis=0)
    flat = np.stack((paired_control, flat), axis=1)
  else:
    # Never hand back a view of the caller's array.
    flat = flat.copy()

  labels = labels.reshape(batch * n_lines)

  format.ia = 1
  return flat, labels, format

In [None]:
def iqm(series, q):
  """Mean of the values lying between the ``1 - q`` and ``q`` quantiles (inclusive).

  With ``q = 0.75`` this is the interquartile mean. The two quantiles are
  ordered before filtering, so ``q`` and ``1 - q`` give the same result
  (previously ``q < 0.5`` produced an empty selection and NaN).

  Args:
    series: non-empty sequence or 1-D array of numbers.
    q: quantile level in [0, 1].

  Returns:
    The mean of the retained values.
  """
  values = np.asarray(series)
  bound_a = np.quantile(values, 1 - q)
  bound_b = np.quantile(values, q)
  low, high = min(bound_a, bound_b), max(bound_a, bound_b)
  return np.mean(values[(values >= low) & (values <= high)])

def imputate(dataset, labels, format, imp_type):
  """Replace missing entries (-1) of every series with a statistic of that series.

  A series is one row along the last axis. Its fill value is computed from the
  observed (!= -1) entries only. A series with no observed entries is set to
  all zeros.

  Args:
    dataset: 2-D (batch, n_points) or 3-D (batch, n_series, n_points) ndarray;
      modified in place.
    labels: passed through unchanged.
    format: object whose ``imp`` attribute is set to 1 (mean), 2 (median) or
      3 (iqm).
    imp_type: 'mean', 'median' or 'iqm' (interquartile mean via ``iqm(., 0.75)``).

  Returns:
    (dataset, labels, format)

  Raises:
    AssertionError: for an unknown ``imp_type``. The exception type is kept for
      backward compatibility, but it is now raised explicitly (so it survives
      ``python -O``) and before any data is touched.
    ValueError: if ``dataset`` is neither 2-D nor 3-D.
  """
  # imp_type -> (code stored in format.imp, estimator of the fill value).
  # The lambda defers the lookup of `iqm` until that option is actually used.
  fill_rules = {
      'mean': (1, np.mean),
      'median': (2, np.median),
      'iqm': (3, lambda observed: iqm(observed, 0.75)),
  }
  if imp_type not in fill_rules:
    raise AssertionError(f"imp_type={imp_type}")
  imp_code, estimate = fill_rules[imp_type]

  if dataset.ndim == 3:
    all_series = (line for sample in dataset for line in sample)
  elif dataset.ndim == 2:
    all_series = iter(dataset)
  else:
    raise ValueError(f"imputate expects a 2-D or 3-D dataset, got shape {dataset.shape}")

  # Each `series` is a view, so assignments write straight into `dataset`.
  for series in all_series:
    missing = series == -1
    if missing.all():
      series[...] = 0
    elif missing.any():
      series[missing] = estimate(series[~missing])

  format.imp = imp_code
  return dataset, labels, format

In [None]:
def process(dataset, labels, goal_format):
  """Apply the preprocessing steps requested by ``goal_format``.

  Steps run in a fixed order: CNS, IA, imputation, SCALE. Each one is applied
  only if the matching flag of ``goal_format`` asks for it. A fresh Format
  tracks what was actually applied and is checked against the goal at the end.

  Args:
    dataset: ndarray handed to the processing functions (some of them modify
      it in place).
    labels: label array matching ``dataset``.
    goal_format: Format-like object with ``scale``, ``cns``, ``ia``, ``imp``.

  Returns:
    (processed dataset, processed labels, Format describing the applied steps)
  """
  format = Format()
  imputation_by_code = {1: 'mean', 2: 'median', 3: 'iqm'}

  if goal_format.cns == 1:
    dataset, labels, format = CNS(dataset, labels, format)

  if goal_format.ia == 1:
    dataset, labels, format = IA(dataset, labels, format)

  imp_type = imputation_by_code.get(goal_format.imp)
  if imp_type is not None:
    dataset, labels, format = imputate(dataset, labels, format, imp_type)

  if goal_format.scale == 1:
    dataset, labels, format = SCALE(dataset, labels, format)

  # The applied format must match the request flag for flag.
  for flag in ('cns', 'scale', 'ia', 'imp'):
    assert getattr(format, flag) == getattr(goal_format, flag)

  return dataset, labels, format

# Run through combinations

In [None]:
# Index of the training run. It selects the random seed (42 + 7 * run_num) and
# the output directory Training_Runs/SYS_Models_Run{run_num} used further down.
run_num = 0

In [None]:
def build_model(format, n_lines=3, n_points=10):
  """Build the dense classifier matching a preprocessing format.

  Input shape, derived from the ``ia`` and ``cns`` flags:
    ia == 0, cns == 0: (n_lines + 1, n_points)   control line + lines
    ia == 0, cns == 1: (n_lines, n_points)       control removed by CNS
    ia != 0, cns == 0: (2, n_points)             control + one line
    ia != 0, cns == 1: (n_points,)               one line

  Output: class probabilities over 2 classes, shaped (n_lines, 2) when
  ``ia == 0`` and (2,) otherwise.

  The last Dense layer emits raw logits and a single Softmax turns them into
  probabilities. Previously the Dense layer had its own softmax activation in
  front of the Softmax layer; a softmax over values already confined to [0, 1]
  can only produce probabilities between roughly 0.27 and 0.73 for two classes.

  Args:
    format: object with ``ia`` and ``cns`` flags.
    n_lines: number of non-control lines per sample (default 3).
    n_points: number of points per line (default 10).

  Returns:
    An uncompiled ``tf.keras.Sequential`` model that outputs probabilities.
  """
  joint_prediction = format.ia == 0
  if joint_prediction:
    out_size = n_lines
    input_shape = (n_lines + 1, n_points) if format.cns == 0 else (n_lines, n_points)
  else:
    out_size = 1
    input_shape = (2, n_points) if format.cns == 0 else (n_points,)

  model = tf.keras.Sequential()
  # Explicit Input object instead of the `input_shape=` layer argument.
  model.add(tf.keras.Input(shape=input_shape))
  model.add(layers.Flatten())
  # Two small hidden layers, then dropout.
  model.add(layers.Dense(16, activation='relu'))
  model.add(layers.Dense(16, activation='relu'))
  model.add(layers.Dropout(0.2))
  # Two logits per predicted line; no activation here (softmax is applied once, below).
  model.add(layers.Dense(2 * out_size))
  if joint_prediction:
    # Group the logits per line so the softmax normalises each line separately.
    model.add(layers.Reshape((out_size, 2)))
  model.add(layers.Softmax(axis=-1))

  return model

In [None]:
# Raw training data, relative to the project directory selected above.
# Presumably `dataset` is (n_samples, 4, 10) with line 0 as the control, and
# `labels` is (n_samples, 3), judging by build_model's input shapes; TODO confirm.
dataset = np.load('Datasets/train_dataset.npy')
labels = np.load('Datasets/train_labels.npy')

In [None]:
seed = 42 + 7 * run_num
# Seed every RNG in play (python, numpy, tensorflow) for reproducibility
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Everything produced by this run lives in one directory; create it on first use
run_dir = f'Training_Runs/SYS_Models_Run{run_num}'
os.makedirs(run_dir, exist_ok=True)

# Path of the PDF collecting the training-history plots
history_file = f'{run_dir}/history.pdf'

# Models already saved in this run directory are not retrained. File names look
# like 'FAM-SYS-<scale><cns><ia><imp>-ep=...': the 4-character format code
# directly follows the 8-character prefix.
model_files = os.listdir(run_dir)
trained_models = [f[8:12] for f in model_files if f.startswith('FAM-SYS-')]

# Configure EarlyStopping callback to prevent overfitting during training
early_stopping = EarlyStopping(
    monitor='val_accuracy',  # Monitor validation accuracy
    patience=15,          # Stop training if no improvement after 15 epochs
    restore_best_weights=True  # Restore the best model weights found during training
)

# All combinations of processing flags, in the order scale, cns, ia, imp
# (imp varies fastest)
format_combinations = [(scale, cns, ia, imp)
                       for scale in range(2)
                       for cns in range(2)
                       for ia in range(2)
                       for imp in range(4)]

# The context manager finalises the PDF even if training raises part-way through.
# NOTE: an existing history.pdf is overwritten, so a resumed run only contains
# the plots of the models trained in this session.
with PdfPages(history_file) as p:
  for scale, cns, ia, imp in format_combinations:
    # Skip training if a model with the current format combination already exists
    if f'{scale}{cns}{ia}{imp}' in trained_models:
      continue

    # Print the current format combination being processed
    print(scale, cns, ia, imp)
    # Describe the desired processing steps
    goal_format = Format()
    goal_format.scale = scale
    goal_format.cns = cns
    goal_format.ia = ia
    goal_format.imp = imp

    # Process a copy so the raw dataset stays intact for the next combination
    dataset_it, labels_it, format = process(dataset.copy(), labels, goal_format)

    # Split the processed dataset into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(dataset_it, labels_it, test_size=0.2, random_state=42)

    # Build the Keras model based on the applied data format
    model = build_model(format)

    # build_model ends with a Softmax layer, i.e. the model outputs
    # probabilities, so the loss must not treat them as logits.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                  metrics=['accuracy'])

    # Train the model with early stopping
    history = model.fit(X_train, y_train,
                        epochs=500,
                        validation_data=(X_val, y_val,),
                        verbose=0,
                        callbacks=[early_stopping])

    # Construct a descriptive suffix for the model name based on applied formats
    format_string = ''
    if format.scale == 1:
      format_string += '_scale'
    if format.cns == 1:
      format_string += '_cns'
    if format.ia == 1:
      format_string += '_ia'
    if format.imp >= 1:
      format_string += '_imp'
      if format.imp == 1:
        format_string += '-mean'
      elif format.imp == 2:
        format_string += '-median'
      elif format.imp == 3:
        format_string += '-iqm'
    # Full model name: format code, number of epochs run, descriptive suffix
    model_name = f'FAM-SYS-{format.scale}{format.cns}{format.ia}{format.imp}'+f'-ep={len(history.epoch)}'+format_string+'.keras'
    # Print the name of the saved model
    print(model_name)
    # Save the trained model to the run directory
    model.save(f'{run_dir}/'+model_name)

    # One PDF page per metric, titled with the model name so pages can be told apart
    for metric, ylabel, ylim in (('accuracy', 'Accuracy', [0.5, 1]),
                                 ('loss', 'Loss', [0, 1])):
      fig, ax = plt.subplots()
      ax.plot(history.history[metric], label=metric)
      ax.plot(history.history[f'val_{metric}'], label=f'val_{metric}')
      ax.set_title(model_name)
      ax.set_xlabel('Epoch')
      ax.set_ylabel(ylabel)
      ax.set_ylim(ylim)
      ax.legend(loc='lower right')
      p.savefig(fig)  # Append the figure to the PDF
      plt.close(fig)  # Close the figure to free up memory