In [1]:
%run "./2_Methods.ipynb"

In [2]:
class Autoencoder:
    """Symmetric dense autoencoder that is built, compiled, trained and evaluated in one go.

    The encoder halves the layer width with every layer and the decoder mirrors
    the widths back up to ``feature_count``. The training set is ``input_data``
    plus samples returned by ``augment``; a small share of samples returned by
    ``synthesize_dissimilar`` is mixed into both the training and validation
    split. After training, the mean model outputs for the raw input, the
    augmented data and the dissimilar data are compared via L2 norm and the
    results are collected in ``self.output``.

    NOTE(review): ``augment``, ``synthesize_dissimilar``, ``reset_random_state``,
    ``train_test_split``, ``create_dataframe``, ``plot_training_loss``, the
    ``plot_*_data`` helpers and the ``*_COL_PATTERN`` / ``DATA_COLS`` constants
    are not defined in this cell; presumably they come from the ``%run`` of
    ``2_Methods.ipynb`` -- confirm.
    """

    DEFAULT_EPOCHS = 100

    # Share of the synthesized dissimilar samples that is drawn (independently)
    # for the training split and for the validation split.
    NEGATIVE_SAMPLE_FRACTION = 0.05

    def __init__(self, input_data, password, optimizer_name, learning_rate, loss_function_name,
                 metrics=None, layer_count=2, layer_activation='relu',
                 random_state=None, epochs=DEFAULT_EPOCHS, early_stopping_patience=None,
                 augmentation_factor=10, data_cols=DATA_COLS, verbose=None, init=True):
        """Store the configuration and, unless ``init`` is False, train immediately.

        Args:
            input_data: 2-D array; ``input_data.shape[1]`` is used as feature count.
            password: String whose characters are written as ``key<i>`` columns
                (ASCII codes) when plotting.
            optimizer_name: Name of a callable found in ``globals()`` that is
                invoked with the learning rate to create the optimizer.
            learning_rate: Passed to the optimizer factory.
            loss_function_name: Resolved through ``tf.keras.losses.get``.
            metrics: Metrics forwarded to ``model.compile``; ``None`` means no metrics.
            layer_count: Number of encoder layers (and decoder layers); at least 1.
            layer_activation: Activation of every layer except the last decoder layer.
            random_state: Seed; a random one below 1e9 is drawn when omitted.
            epochs: Upper bound of training epochs.
            early_stopping_patience: Patience for early stopping on ``val_loss``;
                ``None`` disables early stopping.
            augmentation_factor: Multiplier for the number of augmented samples.
            data_cols: Column names used when building dataframes for plotting.
            verbose: Forwarded to ``model.fit``.
            init: When True, ``initialize()`` is called from the constructor.

        Raises:
            ValueError: If ``layer_count`` is smaller than 1.
        """
        if layer_count < 1:
            # The original message formatted an undefined name, which turned
            # this ValueError into a NameError.
            raise ValueError(f'The minimum number of layers must be 1! (Specified: {layer_count})')

        if random_state is None:
            random_state = random.randrange(1000000000)

        self.verbose = verbose

        self.random_state = random_state

        self.input_data = input_data
        self.password = password
        self.data_cols = data_cols
        self.feature_count = input_data.shape[1]

        self.optimizer_name = optimizer_name
        self.learning_rate = learning_rate
        self.loss_function_name = loss_function_name
        # A fresh list per instance avoids sharing one mutable default object.
        self.metrics = [] if metrics is None else metrics

        self.layer_count = layer_count
        self.layer_activation = layer_activation
        self.max_epochs = epochs
        self.augmentation_factor = augmentation_factor
        self.early_stopping_patience = early_stopping_patience

        # Everything below is populated by initialize(); defining the attributes
        # here keeps an untrained instance inspectable without AttributeError.
        self.autoencoder_data = None
        self.random_dissimilar_data = None
        self.model = None
        self.optimizer = None
        self.loss_function = None
        self.epochs = None

        self.fit_history = None
        self.trained_class_output = None
        self.positive_class_output = None
        self.negative_class_output = None

        self.euclidean_dist_positive_class = None
        self.euclidean_dist_negative_class = None
        self.euclidean_dist_delta = None
        self.euclidean_dist_ratio = None
        self.output = None

        self.initialized = False

        if init:
            self.initialize()

    def __reset_state(self, seed):
        """Delegate to ``reset_random_state`` with the given seed."""
        reset_random_state(seed)

    def __generate_training_data(self):
        """Create the augmented positive data set and the dissimilar data set."""
        sample_count = len(self.input_data)
        augmented = augment(self.input_data, sample_count * self.augmentation_factor)
        # Raw samples first, augmented samples appended row-wise.
        self.autoencoder_data = np.concatenate((self.input_data, augmented), axis=0)
        # Requested size equals raw + augmented sample count.
        self.random_dissimilar_data = synthesize_dissimilar(
            self.input_data, sample_count * (self.augmentation_factor + 1))

    def __create_model(self):
        """Build the encoder/decoder stack and store it in ``self.model``."""
        depths = range(self.layer_count)
        # Widths are floored at 1 so that a deep stack on few features never
        # requests a zero-unit Dense layer.
        encoder_units = [max(1, self.feature_count // 2 ** (depth + 1)) for depth in depths]
        decoder_units = [max(1, self.feature_count // 2 ** depth) for depth in reversed(depths)]

        last_layer = self.layer_count - 1
        encoder_layers = [
            tf.keras.layers.Dense(units, activation=self.layer_activation)
            for units in encoder_units
        ]
        decoder_layers = [
            # The final decoder layer is linear (no activation).
            tf.keras.layers.Dense(
                units, activation=(None if position == last_layer else self.layer_activation))
            for position, units in enumerate(decoder_units)
        ]

        self.model = tf.keras.Sequential([
            tf.keras.Sequential(encoder_layers),
            tf.keras.Sequential(decoder_layers)
        ])

    def __train(self, X, epochs=DEFAULT_EPOCHS):
        """Fit the model on ``X`` (input == target) and evaluate it afterwards."""
        X_neg = self.random_dissimilar_data

        X_train, X_val, _, _ = train_test_split(X, X, shuffle=True, random_state=self.random_state)

        # Mix a small random share of dissimilar samples into each split. The two
        # draws are independent and use numpy's global random state.
        negative_sample_size = int(len(X_neg) * self.NEGATIVE_SAMPLE_FRACTION)
        X_train = np.concatenate(
            (X_train, X_neg[np.random.choice(len(X_neg), negative_sample_size, replace=False)]), axis=0)
        X_val = np.concatenate(
            (X_val, X_neg[np.random.choice(len(X_neg), negative_sample_size, replace=False)]), axis=0)

        callbacks = []
        if self.early_stopping_patience is not None:
            callbacks.append(tf.keras.callbacks.EarlyStopping(
                monitor='val_loss', patience=self.early_stopping_patience, restore_best_weights=True))

        self.fit_history = self.model.fit(X_train, X_train, epochs=epochs,
                                          validation_data=(X_val, X_val),
                                          callbacks=callbacks,
                                          verbose=self.verbose)

        # Number of epochs actually run (may be below the maximum with early stopping).
        self.epochs = len(self.fit_history.epoch)

        self.__evaluate()

    def __evaluate(self):
        """Run the trained model on all data sets and assemble ``self.output``."""
        self.trained_class_output = self.predict(self.input_data)
        self.positive_class_output = self.predict(self.autoencoder_data)
        self.negative_class_output = self.predict(self.random_dissimilar_data)

        # Per-feature mean of the reconstruction of the raw input is the reference point.
        means = np.mean(self.trained_class_output, axis=0)

        self.euclidean_dist_positive_class = np.linalg.norm(means - np.mean(self.positive_class_output, axis=0))
        self.euclidean_dist_negative_class = np.linalg.norm(means - np.mean(self.negative_class_output, axis=0))

        self.euclidean_dist_delta = abs(self.euclidean_dist_negative_class - self.euclidean_dist_positive_class)
        # -1 is the sentinel for "ratio undefined" when the positive distance is 0.
        if self.euclidean_dist_positive_class > 0:
            self.euclidean_dist_ratio = self.euclidean_dist_delta / self.euclidean_dist_positive_class
        else:
            self.euclidean_dist_ratio = -1

        self.output = {
            'meta-info': {
                'optimizer': self.optimizer_name,
                'learning rate': self.learning_rate,
                'loss function': self.loss_function_name,
                'random seed': self.random_state,
                'epochs (max)': self.max_epochs,
                'early stopping': self.early_stopping_patience,
                'epochs trained': self.epochs,
            },
            'output': {
                'positive class': self.positive_class_output,
                'negative class': self.negative_class_output,
                'raw input': self.trained_class_output
            },
            'euclidean distance evaluation': {
                'l2 positive class': self.euclidean_dist_positive_class,
                'l2 negative class': self.euclidean_dist_negative_class,
                'l2 delta': self.euclidean_dist_delta,
                'l2 ratio': self.euclidean_dist_ratio
            },
            'history': self.fit_history
        }

    def initialize(self):
        """Seed, generate data, build, compile and train the model exactly once.

        Raises:
            ValueError: If the instance has already been initialized.
        """
        if self.initialized:
            raise ValueError('Cannot initialize already initialized autoencoder!')

        self.__reset_state(self.random_state)
        self.__generate_training_data()

        # The optimizer class is looked up by name in the notebook's global
        # namespace, so it has to be imported there under exactly this name.
        self.optimizer = globals()[self.optimizer_name](self.learning_rate)
        self.loss_function = tf.keras.losses.get(self.loss_function_name)

        self.__create_model()
        self.model.compile(
            optimizer=self.optimizer,
            loss=self.loss_function,
            metrics=self.metrics
        )

        self.__train(self.autoencoder_data, epochs=self.max_epochs)
        self.initialized = True

    def predict(self, X, verbose=None):
        """Return ``self.model.predict(X)``; ``verbose`` is passed through."""
        return self.model.predict(X, verbose=verbose)

    def generate_positive(self, count):
        """Return model outputs for ``count`` freshly augmented samples."""
        return self.predict(augment(self.input_data, count))

    def generate_negative(self, count):
        """Return model outputs for ``count`` freshly synthesized dissimilar samples."""
        return self.predict(synthesize_dissimilar(self.input_data, count))

    def print_evaluation(self, show_history=False, verbose=False):
        """Print the configuration and L2 evaluation; optionally plot the loss history.

        Args:
            show_history: When True, ``plot_training_loss`` is called with the fit history.
            verbose: When True, architecture and data set details are printed as well.
        """
        print('Autoencoder Model:')
        if verbose:
            print(f'  layers: {self.layer_count}')
            print(f'  activation: {self.layer_activation}')
            print(f'  feature count: {self.feature_count}')
            print(f'  data size: {len(self.input_data)}')
            print(f'  augmentation factor: {self.augmentation_factor}')
            print(f'  early stopping: {self.early_stopping_patience}')
            print(f'  epochs (max): {self.max_epochs}')
        print(f'  optimizer: {self.optimizer_name}')
        print(f'  learning rate: {self.learning_rate}')
        print(f'  loss function: {self.loss_function_name}')
        print(f'  random seed: {self.random_state}')
        print(f'  epochs trained: {self.epochs}')
        print('L2 norms:')
        print(f'  positive: {self.euclidean_dist_positive_class}')
        print(f'  negative: {self.euclidean_dist_negative_class}')
        print(f'  delta: {self.euclidean_dist_delta}')
        print(f'  ratio: {self.euclidean_dist_ratio}')
        print(' ')

        if show_history:
            plot_training_loss(self.fit_history)

    def plot_output_data(self, plot_type='line'):
        """Plot the data sets per column family found in ``self.data_cols``.

        ``self.data_cols`` must correspond to the data passed to the autoencoder.

        Args:
            plot_type: One of 'datapoints', 'violin', 'overlap', 'line'.
        """
        # NOTE(review): the frames labelled "(output)" are built from
        # autoencoder_data / random_dissimilar_data, i.e. the model *inputs*,
        # not from positive_class_output / negative_class_output. Confirm
        # whether the model outputs were meant to be plotted here.
        positive_class_input_dataframe = create_dataframe(self.input_data, 'positive class (input)', self.data_cols)
        positive_class_output_dataframe = create_dataframe(self.autoencoder_data, 'positive class (output)', self.data_cols)
        negative_class_output_dataframe = create_dataframe(self.random_dissimilar_data, 'negative class (output)', self.data_cols)

        combined_dataframe = pd.concat(
            [positive_class_input_dataframe, positive_class_output_dataframe, negative_class_output_dataframe],
            ignore_index=True)

        # One constant column per password character holding its code point.
        for idx, character in enumerate(self.password):
            combined_dataframe['key' + str(idx)] = ord(character)

        # Column-name pattern -> plotting function for that column family.
        plotters = (
            (DURATION_COL_PATTERN, plot_d_data),
            (PP_COL_PATTERN, plot_pp_data),
            (PR_COL_PATTERN, plot_pr_data),
            (RP_COL_PATTERN, plot_rp_data),
            (RR_COL_PATTERN, plot_rr_data),
        )

        for pattern, plotter in plotters:
            column_regex = re.compile(pattern)
            # Only plot a family when at least one configured column belongs to it.
            if any(column_regex.match(column) for column in self.data_cols):
                plotter(combined_dataframe, self.password, target_category=None, plot_type=plot_type,
                        trim_outliers=True, display=True, save=False)
            

In [7]:
#@background # concurrency breaks reproducibility!
def train_autoencoder(autoencoder, index):
    """Initialize (i.e. train) one autoencoder and return its flattened evaluation.

    Args:
        autoencoder: Object exposing ``initialize()`` and, afterwards, an
            ``output`` dict with the keys 'meta-info' and
            'euclidean distance evaluation'.
        index: Running number used only in the progress messages.

    Returns:
        A single dict containing the entries of both sub-dicts; on duplicate
        keys the 'euclidean distance evaluation' entry wins.
    """
    started_at = time.time()
    print(f'Starting initialization on autoencoder #{index}.')
    autoencoder.initialize()
    elapsed_seconds = time.time() - started_at
    print(f'Finished training autoencoder #{index} after {elapsed_seconds:.2f} seconds.')

    summary = dict(autoencoder.output['meta-info'])
    summary.update(autoencoder.output['euclidean distance evaluation'])
    return summary

def cycle_eval_autoencoders(X, password, optimizers, loss_functions, learning_rates, random_states_count=20, augmentation_factor=10, output=None):
    """Train one autoencoder per hyperparameter combination and random seed.

    Iterates over every (optimizer, loss function, learning rate) combination
    and trains ``random_states_count`` autoencoders with freshly drawn seeds
    for each of them.

    Args:
        X: Input data handed to every ``Autoencoder``.
        password: Password handed to every ``Autoencoder``.
        optimizers: Sequence of optimizer names.
        loss_functions: Sequence of loss function names.
        learning_rates: Sequence of learning rates.
        random_states_count: Number of differently seeded runs per combination.
        augmentation_factor: Forwarded to every ``Autoencoder``.
        output: ``None`` to collect everything in memory, or a
            ``(filename, path)`` pair; in that case every evaluation row is
            appended to the CSV file right away and the model is discarded.

    Returns:
        ``(evaluations, autoencoders)`` when ``output`` is ``None``,
        otherwise ``(None, None)``.
    """
    autoencoders = []
    autoencoder_results = []

    total_training_cycles = len(optimizers) * len(loss_functions) * len(learning_rates) * random_states_count

    print(f'Starting training & evaluation for {total_training_cycles} autoencoders:')

    current_cycle = 0
    for optimizer in optimizers:
        for loss_function in loss_functions:
            for learning_rate in learning_rates:
                for _ in range(random_states_count):
                    random_seed = random.randrange(1000000000)
                    current_cycle += 1

                    autoencoder = Autoencoder(X, password,
                                              optimizer,
                                              learning_rate,
                                              loss_function,
                                              epochs=Autoencoder.DEFAULT_EPOCHS,
                                              early_stopping_patience=20,
                                              random_state=random_seed,
                                              # Previously accepted by this function but never passed on.
                                              augmentation_factor=augmentation_factor,
                                              init=False)

                    evaluation = train_autoencoder(autoencoder, current_cycle)

                    if output is not None:
                        autoencoder_evaluation_row_to_csv(evaluation, output[0], output[1])
                        # Drop the model immediately to keep memory flat over long sweeps.
                        del autoencoder
                        gc.collect()
                    else:
                        autoencoder_results.append(evaluation)
                        autoencoders.append(autoencoder)

    if output is None:
        return autoencoder_results, autoencoders
    return None, None

def autoencoder_evaluation_row_to_csv(evaluation, filename, path):
    """Append one evaluation dict as a row to ``<path>/<filename>.csv``.

    The file is created with a header line (the dict's keys) when it does not
    exist yet. Fields are separated by ';'.

    Args:
        evaluation: Mapping of column name to value for a single row.
        filename: File name without the '.csv' extension.
        path: Directory the file lives in; it must already exist.
    """
    target = f'{path}/{filename}.csv'
    needs_header = not os.path.exists(target)

    with open(target, 'a', newline='') as handle:
        writer = csv.DictWriter(handle, delimiter=";", fieldnames=evaluation.keys())
        if needs_header:
            writer.writeheader()
        writer.writerow(evaluation)
        
def autoencoder_evaluation_to_csv(evaluations, filename, path):
    """Write all evaluations to a new, timestamped CSV file.

    The file is named ``<path>/<filename>__<YYYY-MM-DD_HH-MM-SS>.csv`` and uses
    ';' as field separator and ',' as decimal mark; the index is not written.

    Args:
        evaluations: Data accepted by ``pd.DataFrame.from_dict``.
        filename: File name prefix (without timestamp and extension).
        path: Directory the file is written to.
    """
    frame = pd.DataFrame.from_dict(evaluations)
    # Second resolution, with no spaces or colons in the file name.
    stamp = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
    target = f'{path}/{filename}__{stamp}.csv'

    frame.to_csv(target, sep=';', decimal=',', index=False)
    

In [5]:
# reduced optimizers and loss functions through previous cyclic evaluations
def find_autoencoder_hyperparameters(data, password, direct_to_file=False):
    """Run the cyclic autoencoder evaluation with the current shortlist of settings.

    Makes sure the 'evaluations' directory exists and then delegates to
    ``cycle_eval_autoencoders`` with 50 random seeds per combination.

    Args:
        data: Input data forwarded to ``cycle_eval_autoencoders``.
        password: Password forwarded to ``cycle_eval_autoencoders``.
        direct_to_file: When True, results are streamed to
            'evaluations/autoencoder_evaluations.csv' instead of being returned.

    Returns:
        The ``(evaluation_results, autoencoders)`` pair produced by
        ``cycle_eval_autoencoders``.
    """
    # Shortlist after previous cyclic evaluations. Candidates that were dropped:
    # 'Adam', 'Lion', 'AdamW', 'Adadelta', 'Adamax', 'Ftrl', 'Nadam', 'SGD'.
    optimizers = ['Adagrad', 'RMSprop']

    # 'Hinge' was the best performing loss for autoencoders through trial and
    # error. Candidates that were dropped: 'MeanSquaredError',
    # 'MeanSquaredLogarithmicError', 'MeanAbsoluteError',
    # 'MeanAbsolutePercentageError', 'CosineSimilarity', 'Huber', 'Poisson'.
    loss_functions = ['Hinge']

    # 0.1 and 0.005 were not suited well in previous attempts; out of
    # [0.05, 0.03, 0.01], 0.03 seems to perform the best for Adagrad with Hinge loss.
    learning_rates = [0.03]

    EVALUATIONS_OUTPUT_PATH = "evaluations"
    output_dir_missing = not os.path.exists(EVALUATIONS_OUTPUT_PATH)
    if output_dir_missing:
        os.makedirs(EVALUATIONS_OUTPUT_PATH)

    filename = 'autoencoder_evaluations'
    direct_output = None
    if direct_to_file:
        direct_output = (filename, EVALUATIONS_OUTPUT_PATH)

    evaluation_results, autoencoders = cycle_eval_autoencoders(
        data, password,
        optimizers, loss_functions, learning_rates,
        random_states_count=50,
        output=direct_output,
    )

    return evaluation_results, autoencoders

In [6]:
# evals, autoencoders = find_autoencoder_hyperparameters(positive_class_data[DATA].values, DATA_PASSWORD)