# IDP MD Simulation Analysis Protocol:
------
Developed for analysis of wildtype simulation 

## Notes:
#### Preprocessing
- Must specify protein length and window sizes
- Data is normalized to 0-1
---
#### Feature Enrichment
- No enrichment or filtering
---
#### Autoencoder Tuning
- No Tuning - Predetermined structure
---
#### Autoencoder Training
- Default Structure, Default Data
- 4, 5, 7, 8 Layer Models
- Local Compaction & Local Angle Data
- Unshuffled Data
---
#### Clustering
- Agglomerative Co-Clustering

In [1]:
import os
import re
import sys
import glob
import time
import json
import subprocess
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from scipy.stats import mode
from kneed import KneeLocator
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from IPython.display import Image, display

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Activation, Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.callbacks import Callback

2024-12-01 18:14:05.409908: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 18:14:05.501794: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-01 18:14:05.535464: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-01 18:14:05.546514: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 18:14:05.617518: I tensorflow/core/platform/cpu_feature_guar

# Data Import

In [2]:
import glob
import pandas as pd
import os
import re

# Define the range of window sizes
window_range = range(2, 52)

def import_lcc_data(lccdata_folder):
    """
    Imports Local Compaction data files (.csv) and assigns them to a dictionary.

    Parameters:
    - lccdata_folder: The folder where Local Compaction data files are stored.

    Returns:
    - A dictionary with window sizes as keys and pandas DataFrames as values.
    """
    data_dict = {}
    
    for window_size in window_range:
        file_path = os.path.join(lccdata_folder, f"WT_Simulation_WS_{window_size}.csv")
        if os.path.exists(file_path):
            df = pd.read_csv(file_path, index_col=0)
            df.index.name = None 
            data_dict[window_size] = df
        else:
            print(f"Warning: File not found for Window Size {window_size}: {file_path}")
    
    return data_dict


# Folder containing the Local Compaction data
lccdata_folder = 'Local_Compaction/Local_Compaction_Data'

# Import LCC data for the wild-type protein
wt_dict = import_lcc_data(lccdata_folder)

# Data Preparation for Dimension Reduction

### Data Preparation:
- Unfiltered, Shuffled Distance Measurement Data
- Unfiltered, Shuffled Angle Measurement Data
- Unfiltered, Shuffled RMSD Measurement Data
- Unfiltered, UnShuffled Distance Measurement Data
- Filtered, Shuffled Distance Measurement Data

## Unfiltered, Shuffled Distance Measurement Data, Normalized 0-1

In [3]:
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder):
    """
    Save the datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    os.makedirs(folder_name, exist_ok=True)
    
    X_train.to_csv(os.path.join(folder_name, 'X_train_f.csv'))
    y_train.to_csv(os.path.join(folder_name, 'y_train_f.csv'))
    X_valid.to_csv(os.path.join(folder_name, 'X_valid_f.csv'))
    y_valid.to_csv(os.path.join(folder_name, 'y_valid_f.csv'))

def load_datasets(fold, base_folder):
    """
    Load datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    
    X_train = pd.read_csv(os.path.join(folder_name, 'X_train_f.csv'), index_col=0)
    y_train = pd.read_csv(os.path.join(folder_name, 'y_train_f.csv'), index_col=0)
    X_valid = pd.read_csv(os.path.join(folder_name, 'X_valid_f.csv'), index_col=0)
    y_valid = pd.read_csv(os.path.join(folder_name, 'y_valid_f.csv'), index_col=0)
    
    return X_train, y_train, X_valid, y_valid

def normalize_to_range_0_1(data):
    """
    Normalize data to range [0, 1] and round to 6 decimal places.
    """
    normalized = (data - data.min()) / (data.max() - data.min())
    return normalized.round(6)

def preprocessing_kfold(wt_data, n_splits=5, base_folder="AE_Data/Unfiltered_Shuffled_Compaction_Data"):
    """
    Preprocess data using KFold cross-validation, normalize it, and save it.
    """
    # Ensure all data is numeric
    wt_data = wt_data.apply(pd.to_numeric, errors='coerce')

    # Shuffle the DataFrame rows without resetting the index
    wt_data = wt_data.sample(frac=1, random_state=42)
    
    os.makedirs(base_folder, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold = 1
    for train_index, valid_index in kf.split(wt_data):
        folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
        if not os.path.exists(os.path.join(folder_name, 'X_train_f.csv')):
            X_train, X_valid = wt_data.iloc[train_index], wt_data.iloc[valid_index]
            y_train = pd.DataFrame({'class': [0] * len(X_train)}, index=X_train.index)
            y_valid = pd.DataFrame({'class': [0] * len(X_valid)}, index=X_valid.index)

            # Normalize training and validation data to [0, 1] and round to 6 decimal places
            X_train = normalize_to_range_0_1(X_train)
            X_valid = normalize_to_range_0_1(X_valid)

            print(f"Fold {fold}:")
            print(f"Training set shape: {X_train.shape}")
            print(f"Validation set shape: {X_valid.shape}")
            
            save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder)
        else:
            print(f"Data for fold {fold} already exists, skipping generation.")
        fold += 1

In [4]:
# Window sizes to be used
window_sizes = list(range(2, 52))

# Concatenate all DataFrames in wt_dict into a single DataFrame
wt_combined = pd.concat(wt_dict.values(), axis=1)

# Preprocess and save data
preprocessing_kfold(wt_combined, n_splits=5)

# Example of loading a specific fold (e.g., fold 1)
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

Data for fold 1 already exists, skipping generation.
Data for fold 2 already exists, skipping generation.
Data for fold 3 already exists, skipping generation.
Data for fold 4 already exists, skipping generation.
Data for fold 5 already exists, skipping generation.


## Unfiltered, UnShuffled Distance Measurement Data, Normalized 0-1

In [2]:
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder):
    """
    Save the datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    os.makedirs(folder_name, exist_ok=True)
    
    X_train.to_csv(os.path.join(folder_name, 'X_train_f.csv'))
    y_train.to_csv(os.path.join(folder_name, 'y_train_f.csv'))
    X_valid.to_csv(os.path.join(folder_name, 'X_valid_f.csv'))
    y_valid.to_csv(os.path.join(folder_name, 'y_valid_f.csv'))

def load_datasets(fold, base_folder):
    """
    Load datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    
    X_train = pd.read_csv(os.path.join(folder_name, 'X_train_f.csv'), index_col=0)
    y_train = pd.read_csv(os.path.join(folder_name, 'y_train_f.csv'), index_col=0)
    X_valid = pd.read_csv(os.path.join(folder_name, 'X_valid_f.csv'), index_col=0)
    y_valid = pd.read_csv(os.path.join(folder_name, 'y_valid_f.csv'), index_col=0)
    
    return X_train, y_train, X_valid, y_valid

def normalize_to_range_0_1(data):
    """
    Normalize data to range [0, 1] and round to 6 decimal places.
    """
    normalized = (data - data.min()) / (data.max() - data.min())
    return normalized.round(6)

def preprocessing_kfold(wt_data, n_splits=5, base_folder="AE_Data/Unfiltered_UnShuffled_Compaction_Data"):
    """
    Preprocess data using KFold cross-validation, normalize it, and save it without shuffling.
    """
    # Ensure all data is numeric
    wt_data = wt_data.apply(pd.to_numeric, errors='coerce')
    
    os.makedirs(base_folder, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=False)  # Do not shuffle
    fold = 1
    for train_index, valid_index in kf.split(wt_data):
        folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
        if not os.path.exists(os.path.join(folder_name, 'X_train_f.csv')):
            X_train, X_valid = wt_data.iloc[train_index], wt_data.iloc[valid_index]
            y_train = pd.DataFrame({'class': [0] * len(X_train)}, index=X_train.index)
            y_valid = pd.DataFrame({'class': [0] * len(X_valid)}, index=X_valid.index)

            # Normalize training and validation data to [0, 1] and round to 6 decimal places
            X_train = normalize_to_range_0_1(X_train)
            X_valid = normalize_to_range_0_1(X_valid)

            print(f"Fold {fold}:")
            print(f"Training set shape: {X_train.shape}")
            print(f"Validation set shape: {X_valid.shape}")
            
            save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder)
        else:
            print(f"Data for fold {fold} already exists, skipping generation.")
        fold += 1


In [20]:
# Window sizes to be used
window_sizes = list(range(2, 52))

# Concatenate all DataFrames in wt_dict into a single DataFrame
wt_combined = pd.concat(wt_dict.values(), axis=1)

# Preprocess and save data
preprocessing_kfold(wt_combined, n_splits=5)

# Example of loading a specific fold (e.g., fold 1)
base_folder = "AE_Data/Unfiltered_UnShuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

Fold 1:
Training set shape: (32000, 2175)
Validation set shape: (8000, 2175)
Fold 2:
Training set shape: (32000, 2175)
Validation set shape: (8000, 2175)
Fold 3:
Training set shape: (32000, 2175)
Validation set shape: (8000, 2175)
Fold 4:
Training set shape: (32000, 2175)
Validation set shape: (8000, 2175)
Fold 5:
Training set shape: (32000, 2175)
Validation set shape: (8000, 2175)


# Filtered, Shuffled Measurement Data, Normalized 0-1

In [1]:
import glob
import pandas as pd
import os
import re

# Define the range of window sizes
window_range = range(2, 49)

def import_lcc_data(lccdata_folder):
    """
    Imports Local Compaction data files (.csv) and assigns them to a dictionary.

    Parameters:
    - lccdata_folder: The folder where Local Compaction data files are stored.

    Returns:
    - A dictionary with window sizes as keys and pandas DataFrames as values.
    """
    data_dict = {}
    
    for window_size in window_range:
        file_path = os.path.join(lccdata_folder, f"WS_{window_size}_f.csv")
        if os.path.exists(file_path):
            df = pd.read_csv(file_path, index_col=0)
            df.index.name = None 
            data_dict[window_size] = df
        else:
            print(f"Warning: File not found for Window Size {window_size}: {file_path}")
    
    return data_dict


# Folder containing the Local Compaction data
lccdata_folder = 'XGB_High_vs_Low_Energy/Filtered_Local_Compaction_Data'

# Import LCC data for the wild-type protein
wt_dict = import_lcc_data(lccdata_folder)

In [2]:
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder):
    """
    Save the datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    os.makedirs(folder_name, exist_ok=True)
    
    X_train.to_csv(os.path.join(folder_name, 'X_train_f.csv'))
    y_train.to_csv(os.path.join(folder_name, 'y_train_f.csv'))
    X_valid.to_csv(os.path.join(folder_name, 'X_valid_f.csv'))
    y_valid.to_csv(os.path.join(folder_name, 'y_valid_f.csv'))

def load_datasets(fold, base_folder):
    """
    Load datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    
    X_train = pd.read_csv(os.path.join(folder_name, 'X_train_f.csv'), index_col=0)
    y_train = pd.read_csv(os.path.join(folder_name, 'y_train_f.csv'), index_col=0)
    X_valid = pd.read_csv(os.path.join(folder_name, 'X_valid_f.csv'), index_col=0)
    y_valid = pd.read_csv(os.path.join(folder_name, 'y_valid_f.csv'), index_col=0)
    
    return X_train, y_train, X_valid, y_valid

def normalize_to_range_0_1(data):
    """
    Normalize data to range [0, 1] and round to 6 decimal places.
    """
    normalized = (data - data.min()) / (data.max() - data.min())
    return normalized.round(6)

def preprocessing_kfold(wt_data, n_splits=5, base_folder="AE_Data/Filtered_Shuffled_Compaction_Data"):
    """
    Preprocess data using KFold cross-validation, normalize it, and save it.
    """
    # Ensure all data is numeric
    wt_data = wt_data.apply(pd.to_numeric, errors='coerce')

    # Shuffle the DataFrame rows without resetting the index
    wt_data = wt_data.sample(frac=1, random_state=42)
    
    os.makedirs(base_folder, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold = 1
    for train_index, valid_index in kf.split(wt_data):
        folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
        if not os.path.exists(os.path.join(folder_name, 'X_train_f.csv')):
            X_train, X_valid = wt_data.iloc[train_index], wt_data.iloc[valid_index]
            y_train = pd.DataFrame({'class': [0] * len(X_train)}, index=X_train.index)
            y_valid = pd.DataFrame({'class': [0] * len(X_valid)}, index=X_valid.index)

            # Normalize training and validation data to [0, 1] and round to 6 decimal places
            X_train = normalize_to_range_0_1(X_train)
            X_valid = normalize_to_range_0_1(X_valid)

            print(f"Fold {fold}:")
            print(f"Training set shape: {X_train.shape}")
            print(f"Validation set shape: {X_valid.shape}")
            
            save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder)
        else:
            print(f"Data for fold {fold} already exists, skipping generation.")
        fold += 1

In [3]:
# Window sizes to be used
window_sizes = list(range(2, 52))

# Concatenate all DataFrames in wt_dict into a single DataFrame
wt_combined = pd.concat(wt_dict.values(), axis=1)

# Preprocess and save data
preprocessing_kfold(wt_combined, n_splits=5)

# Example of loading a specific fold (e.g., fold 1)
base_folder = "AE_Data/Filtered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

Data for fold 1 already exists, skipping generation.
Data for fold 2 already exists, skipping generation.
Data for fold 3 already exists, skipping generation.
Data for fold 4 already exists, skipping generation.
Data for fold 5 already exists, skipping generation.


# Unfiltered, Shuffled Angle Measurement Data, Normalized 0-1

In [None]:
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder):
    """
    Save the datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    os.makedirs(folder_name, exist_ok=True)
    
    X_train.to_csv(os.path.join(folder_name, 'X_train_f.csv'))
    y_train.to_csv(os.path.join(folder_name, 'y_train_f.csv'))
    X_valid.to_csv(os.path.join(folder_name, 'X_valid_f.csv'))
    y_valid.to_csv(os.path.join(folder_name, 'y_valid_f.csv'))

def load_datasets(fold, base_folder):
    """
    Load datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    
    X_train = pd.read_csv(os.path.join(folder_name, 'X_train_f.csv'), index_col=0)
    y_train = pd.read_csv(os.path.join(folder_name, 'y_train_f.csv'), index_col=0)
    X_valid = pd.read_csv(os.path.join(folder_name, 'X_valid_f.csv'), index_col=0)
    y_valid = pd.read_csv(os.path.join(folder_name, 'y_valid_f.csv'), index_col=0)
    
    return X_train, y_train, X_valid, y_valid

def normalize_to_range_0_1(data):
    """
    Normalize data to range [0, 1] and round to 6 decimal places.
    """
    normalized = (data - data.min()) / (data.max() - data.min())
    return normalized.round(6)

def load_angle_data(angle_folder, angle_window_sizes):
    """
    Load angle data for all window sizes and combine into a single DataFrame.
    """
    angle_data = {}
    for window_size in angle_window_sizes:
        file_path = os.path.join(angle_folder, f'Angles_WS_{window_size}.csv')
        if os.path.exists(file_path):
            angle_data[window_size] = pd.read_csv(file_path, index_col=0)
        else:
            print(f"Warning: Angle data file not found for window size {window_size}.")
    return angle_data

def combine_angle_data_only(angle_data, angle_window_sizes):
    """
    Combine all angle data into a single DataFrame, sorted by window size.
    """
    # Start with the index of one DataFrame (assuming all indices are consistent)
    first_window_size = list(angle_data.keys())[0]
    combined_data = pd.DataFrame(index=angle_data[first_window_size].index)

    for window_size in angle_window_sizes:
        if window_size in angle_data:
            combined_data = pd.concat([combined_data, angle_data[window_size]], axis=1)
        else:
            print(f"Warning: Missing angle data for window size {window_size}. Skipping.")
    
    return combined_data

def preprocessing_kfold(angle_data_only, n_splits=5, base_folder="AE_Data/Unfiltered_Shuffled_Angle_Data"):
    """
    Preprocess angle data using KFold cross-validation, normalize it, and save it.
    """
    # Ensure all data is numeric
    angle_data_only = angle_data_only.apply(pd.to_numeric, errors='coerce')

    # Shuffle the DataFrame rows without resetting the index
    angle_data_only = angle_data_only.sample(frac=1, random_state=42)
    
    os.makedirs(base_folder, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold = 1
    for train_index, valid_index in kf.split(angle_data_only):
        folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
        if not os.path.exists(os.path.join(folder_name, 'X_train_f.csv')):
            X_train, X_valid = angle_data_only.iloc[train_index], angle_data_only.iloc[valid_index]
            y_train = pd.DataFrame({'class': [0] * len(X_train)}, index=X_train.index)
            y_valid = pd.DataFrame({'class': [0] * len(X_valid)}, index=X_valid.index)

            # Normalize training and validation data to [0, 1] and round to 6 decimal places
            X_train = normalize_to_range_0_1(X_train)
            X_valid = normalize_to_range_0_1(X_valid)

            print(f"Fold {fold}:")
            print(f"Training set shape: {X_train.shape}")
            print(f"Validation set shape: {X_valid.shape}")
            
            save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder)
        else:
            print(f"Data for fold {fold} already exists, skipping generation.")
        fold += 1

In [None]:
# Paths and window sizes
angle_window_sizes = list(range(1, 51))
angle_folder = "Local_Angles/Angle_Data"

# Load and combine angle data only
angle_data = load_angle_data(angle_folder, angle_window_sizes)
combined_angle_data = combine_angle_data_only(angle_data, angle_window_sizes)

# Preprocess and save data
preprocessing_kfold(combined_angle_data, n_splits=5, base_folder="AE_Data/Unfiltered_Shuffled_Angle_Data")

# Example of loading a specific fold (e.g., fold 1)
base_folder = "AE_Data/Unfiltered_Shuffled_Angle_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

# Unfiltered, Shuffled RMSD Data, Normalized 0-1

# Unfiltered, Shuffled Rg Data, Normalized 0-1

## Unfiltered, Shuffled Distance and Angle Measurement Data, Normalized 0-1

In [26]:
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder):
    """
    Save the datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    os.makedirs(folder_name, exist_ok=True)
    
    X_train.to_csv(os.path.join(folder_name, 'X_train_f.csv'))
    y_train.to_csv(os.path.join(folder_name, 'y_train_f.csv'))
    X_valid.to_csv(os.path.join(folder_name, 'X_valid_f.csv'))
    y_valid.to_csv(os.path.join(folder_name, 'y_valid_f.csv'))

def load_datasets(fold, base_folder):
    """
    Load datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    
    X_train = pd.read_csv(os.path.join(folder_name, 'X_train_f.csv'), index_col=0)
    y_train = pd.read_csv(os.path.join(folder_name, 'y_train_f.csv'), index_col=0)
    X_valid = pd.read_csv(os.path.join(folder_name, 'X_valid_f.csv'), index_col=0)
    y_valid = pd.read_csv(os.path.join(folder_name, 'y_valid_f.csv'), index_col=0)
    
    return X_train, y_train, X_valid, y_valid

def normalize_to_range_0_1(data):
    """
    Normalize data to range [0, 1] and round to 6 decimal places.
    """
    normalized = (data - data.min()) / (data.max() - data.min())
    return normalized.round(6)

def load_angle_data(angle_folder, window_sizes):
    """
    Load angle data for all window sizes and combine into a single DataFrame.
    """
    angle_data = {}
    for window_size in window_sizes:
        file_path = os.path.join(angle_folder, f'Angles_WS_{window_size}.csv')
        if os.path.exists(file_path):
            angle_data[window_size] = pd.read_csv(file_path, index_col=0)
        else:
            print(f"Warning: Angle data file not found for window size {window_size}.")
    return angle_data

def combine_distance_and_angle(distance_data, angle_data, distance_window_sizes, angle_window_sizes):
    """
    Combine distance and angle data alternately by window size, starting with distance (window size 2).
    """
    # Start with the index of one DataFrame (assuming all indices are consistent)
    first_window_size = list(distance_data.keys())[0]
    combined_data = pd.DataFrame(index=distance_data[first_window_size].index)

    # Interleave distance and angle window sizes
    max_window_size = max(max(distance_window_sizes), max(angle_window_sizes))
    for i in range(1, max_window_size + 1):
        if i in distance_window_sizes and i in distance_data:
            combined_data = pd.concat([combined_data, distance_data[i]], axis=1)
        if i in angle_window_sizes and i in angle_data:
            combined_data = pd.concat([combined_data, angle_data[i]], axis=1)
    
    return combined_data



def preprocessing_kfold(combined_data, n_splits=5, base_folder="AE_Data/Unfiltered_Shuffled_Compaction_Angle_Data"):
    """
    Preprocess data using KFold cross-validation, normalize it, and save it.
    """
    # Ensure all data is numeric
    combined_data = combined_data.apply(pd.to_numeric, errors='coerce')

    # Shuffle the DataFrame rows without resetting the index
    combined_data = combined_data.sample(frac=1, random_state=42)
    
    os.makedirs(base_folder, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold = 1
    for train_index, valid_index in kf.split(combined_data):
        folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
        if not os.path.exists(os.path.join(folder_name, 'X_train_f.csv')):
            X_train, X_valid = combined_data.iloc[train_index], combined_data.iloc[valid_index]
            y_train = pd.DataFrame({'class': [0] * len(X_train)}, index=X_train.index)
            y_valid = pd.DataFrame({'class': [0] * len(X_valid)}, index=X_valid.index)

            # Normalize training and validation data to [0, 1] and round to 6 decimal places
            X_train = normalize_to_range_0_1(X_train)
            X_valid = normalize_to_range_0_1(X_valid)

            print(f"Fold {fold}:")
            print(f"Training set shape: {X_train.shape}")
            print(f"Validation set shape: {X_valid.shape}")
            
            save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder)
        else:
            print(f"Data for fold {fold} already exists, skipping generation.")
        fold += 1

In [28]:
# Paths and window sizes
distance_window_sizes = list(range(2, 52))
angle_window_sizes = list(range(1, 51))

distance_data = {ws: pd.read_csv(f"Local_Compaction/Local_Compaction_Data/WT_Simulation_WS_{ws}.csv", index_col=0) for ws in distance_window_sizes}
angle_folder = "Local_Angles/Angle_Data"
angle_data = load_angle_data(angle_folder, angle_window_sizes)

# Combine distance and angle data alternately
combined_data = combine_distance_and_angle(distance_data, angle_data, distance_window_sizes, angle_window_sizes)

# Preprocess and save data
preprocessing_kfold(combined_data, n_splits=5, base_folder="AE_Data/Unfiltered_Shuffled_Compaction_Angle_Data")

# Example of loading a specific fold (e.g., fold 1)
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Angle_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

Fold 1:
Training set shape: (32000, 4300)
Validation set shape: (8000, 4300)
Fold 2:
Training set shape: (32000, 4300)
Validation set shape: (8000, 4300)
Fold 3:
Training set shape: (32000, 4300)
Validation set shape: (8000, 4300)
Fold 4:
Training set shape: (32000, 4300)
Validation set shape: (8000, 4300)
Fold 5:
Training set shape: (32000, 4300)
Validation set shape: (8000, 4300)


# Autoencoder Training
----
Note: 
- Autoencoder trained on unfiltered, normalized local compaction data from 91-160C-Myc wildtype
- Each run has 5 autoencoders trained separately on the different exclusive test-train splits 

# Default AE Default Data (For Comparison)

In [5]:
# Save the model using the recommended Keras format
def save_model(model, model_path):
    if not model_path.endswith('.keras'):
        model_path += '.keras'
    model.save(model_path)
    
# Load the model
def load_existing_model(model_path):
    return load_model(model_path)

In [6]:
# Get autoencoder model
def AE_Model(train_data, LeReLU_alpha=0.01, batch_size=256):
    
    input_layer = Input(shape=(train_data.shape[1],), name='ae_input')
    
    encoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e1')(input_layer)
    encoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e2')(encoder)
    encoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e3')(encoder)
    encoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e4')(encoder)
    encoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e5')(encoder)
    encoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e6')(encoder)

    encoded = Dense(2, activation='linear', name='ae_latent')(encoder)
    
    decoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d6')(encoded)
    decoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d5')(decoder)
    decoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d4')(decoder)
    decoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d3')(decoder)
    decoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d2')(decoder)
    decoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d1')(decoder)

    output_layer = Dense(train_data.shape[1], activation='linear', name='ae_output')(decoder)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=1e-5), loss=MeanSquaredError())
    
    model.summary()
    
    return model

# Model 1

In [None]:
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

In [10]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = 'Default_AE_Default_Data/AE_Training_1'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
autoencoder = AE_Model(X_train_loaded)

# Training loop
for counts in tqdm(range(101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

  0%|                                                   | 0/101 [00:00<?, ?it/s]

  1%|▍                                      | 1/101 [12:56<21:34:58, 776.99s/it]

  2%|▊                                      | 2/101 [25:54<21:22:09, 777.06s/it]

  3%|█▏                                     | 3/101 [38:51<21:09:22, 777.17s/it]

  4%|█▌                                     | 4/101 [51:50<20:57:33, 777.87s/it]

  5%|█▊                                   | 5/101 [1:04:47<20:44:02, 777.52s/it]

  6%|██▏                                  | 6/101 [1:17:48<20:33:20, 778.96s/it]

  7%|██▌                                  | 7/101 [1:30:46<20:19:24, 778.35s/it]

  8%|██▉                                  | 8/101 [1:43:43<20:06:08, 778.16s/it]

  9%|███▎                                 | 9/101 [1:56:40<19:52:25, 777.67s/it]

 10%|███▌                                | 10/101 [2:09:38<19:39:25, 777.64s/it]

 11%|███▉                                | 11/101 [2:22:34<19:25:49, 777.21s/it]

 12%|████▎                               | 12/101 [2:35:34<19:14:25, 778.26s/it]

 13%|████▋                               | 13/101 [2:48:33<19:01:34, 778.34s/it]

 14%|████▉                               | 14/101 [3:01:30<18:48:00, 777.94s/it]

 15%|█████▎                              | 15/101 [3:14:29<18:35:22, 778.16s/it]

 16%|█████▋                              | 16/101 [3:27:26<18:21:51, 777.78s/it]

 17%|██████                              | 17/101 [3:40:22<18:08:21, 777.40s/it]

 18%|██████▍                             | 18/101 [3:53:21<17:56:12, 777.98s/it]

 19%|██████▊                             | 19/101 [4:06:21<17:43:45, 778.36s/it]

  plt.figure(figsize=(8, 6))
 20%|███████▏                            | 20/101 [4:19:18<17:30:32, 778.18s/it]

 21%|███████▍                            | 21/101 [4:32:18<17:17:58, 778.48s/it]

 22%|███████▊                            | 22/101 [4:45:15<17:04:42, 778.26s/it]

 23%|████████▏                           | 23/101 [4:58:15<16:52:17, 778.68s/it]

 24%|████████▌                           | 24/101 [5:11:15<16:39:56, 779.18s/it]

 25%|████████▉                           | 25/101 [5:24:14<16:26:52, 779.11s/it]

 26%|█████████▎                          | 26/101 [5:37:13<16:13:44, 779.00s/it]

 27%|█████████▌                          | 27/101 [5:50:09<15:59:49, 778.24s/it]

 28%|█████████▉                          | 28/101 [6:03:06<15:46:21, 777.83s/it]

 29%|██████████▎                         | 29/101 [6:16:05<15:33:36, 778.01s/it]

 30%|██████████▋                         | 30/101 [6:29:06<15:21:46, 778.97s/it]

 31%|███████████                         | 31/101 [6:42:06<15:09:01, 779.17s/it]

 32%|███████████▍                        | 32/101 [6:55:05<14:56:02, 779.17s/it]

 33%|███████████▊                        | 33/101 [7:08:03<14:42:51, 779.00s/it]

 34%|████████████                        | 34/101 [7:20:59<14:28:48, 778.04s/it]

 35%|████████████▍                       | 35/101 [7:33:57<14:15:44, 777.95s/it]

 36%|████████████▊                       | 36/101 [7:46:55<14:02:58, 778.13s/it]

 37%|█████████████▏                      | 37/101 [7:59:55<13:50:21, 778.46s/it]

 38%|█████████████▌                      | 38/101 [8:12:54<13:37:37, 778.70s/it]

 39%|█████████████▉                      | 39/101 [8:25:52<13:24:23, 778.44s/it]

 40%|██████████████▎                     | 40/101 [8:38:50<13:11:15, 778.28s/it]

 41%|██████████████▌                     | 41/101 [8:51:49<12:58:42, 778.71s/it]

 42%|██████████████▉                     | 42/101 [9:04:49<12:46:07, 779.11s/it]

 43%|███████████████▎                    | 43/101 [9:18:02<12:37:00, 783.11s/it]

 44%|███████████████▋                    | 44/101 [9:31:13<12:26:15, 785.53s/it]

 45%|████████████████                    | 45/101 [9:44:24<12:14:34, 787.04s/it]

 46%|████████████████▍                   | 46/101 [9:57:34<12:02:23, 788.07s/it]

 47%|████████████████▎                  | 47/101 [10:10:44<11:49:43, 788.59s/it]

 48%|████████████████▋                  | 48/101 [10:23:55<11:37:09, 789.23s/it]

 49%|████████████████▉                  | 49/101 [10:37:08<11:25:02, 790.44s/it]

 50%|█████████████████▎                 | 50/101 [10:50:21<11:12:37, 791.32s/it]

 50%|█████████████████▋                 | 51/101 [11:03:34<10:59:53, 791.87s/it]

 51%|██████████████████                 | 52/101 [11:16:45<10:46:23, 791.49s/it]

 52%|██████████████████▎                | 53/101 [11:29:54<10:32:29, 790.60s/it]

 53%|██████████████████▋                | 54/101 [11:43:02<10:18:54, 790.09s/it]

 54%|███████████████████                | 55/101 [11:56:18<10:06:54, 791.63s/it]

 55%|███████████████████▉                | 56/101 [12:09:29<9:53:39, 791.54s/it]

 56%|████████████████████▎               | 57/101 [12:22:40<9:40:21, 791.40s/it]

 57%|████████████████████▋               | 58/101 [12:35:52<9:27:14, 791.50s/it]

 58%|█████████████████████               | 59/101 [12:49:01<9:13:36, 790.87s/it]

 59%|█████████████████████▍              | 60/101 [13:02:10<9:00:02, 790.30s/it]

 60%|█████████████████████▋              | 61/101 [13:15:26<8:47:53, 791.83s/it]

 61%|██████████████████████              | 62/101 [13:28:38<8:34:49, 792.03s/it]

 62%|██████████████████████▍             | 63/101 [13:41:50<8:21:37, 792.04s/it]

 63%|██████████████████████▊             | 64/101 [13:55:02<8:08:25, 792.05s/it]

 64%|███████████████████████▏            | 65/101 [14:08:13<7:55:05, 791.81s/it]

 65%|███████████████████████▌            | 66/101 [14:21:25<7:41:45, 791.60s/it]

 66%|███████████████████████▉            | 67/101 [14:34:40<7:29:09, 792.62s/it]

 67%|████████████████████████▏           | 68/101 [14:47:50<7:15:37, 792.04s/it]

 68%|████████████████████████▌           | 69/101 [15:01:02<7:02:19, 791.87s/it]

 69%|████████████████████████▉           | 70/101 [15:14:12<6:48:53, 791.40s/it]

 70%|█████████████████████████▎          | 71/101 [15:27:21<6:35:21, 790.73s/it]

 71%|█████████████████████████▋          | 72/101 [15:40:31<6:22:03, 790.47s/it]

 72%|██████████████████████████          | 73/101 [15:53:42<6:08:54, 790.52s/it]

 73%|██████████████████████████▍         | 74/101 [16:06:55<5:56:09, 791.45s/it]

 74%|██████████████████████████▋         | 75/101 [16:20:08<5:43:09, 791.89s/it]

 75%|███████████████████████████         | 76/101 [16:33:19<5:29:46, 791.46s/it]

 76%|███████████████████████████▍        | 77/101 [16:46:29<5:16:28, 791.19s/it]

 77%|███████████████████████████▊        | 78/101 [16:59:39<5:03:06, 790.73s/it]

 78%|████████████████████████████▏       | 79/101 [17:12:48<4:49:43, 790.17s/it]

 79%|████████████████████████████▌       | 80/101 [17:26:03<4:37:02, 791.54s/it]

 80%|████████████████████████████▊       | 81/101 [17:39:15<4:23:57, 791.88s/it]

 81%|█████████████████████████████▏      | 82/101 [17:52:27<4:10:42, 791.72s/it]

 82%|█████████████████████████████▌      | 83/101 [18:05:36<3:57:19, 791.08s/it]

 83%|█████████████████████████████▉      | 84/101 [18:18:47<3:44:05, 790.92s/it]

 84%|██████████████████████████████▎     | 85/101 [18:31:47<3:30:01, 787.60s/it]

 85%|██████████████████████████████▋     | 86/101 [18:44:47<3:16:23, 785.58s/it]

 86%|███████████████████████████████     | 87/101 [18:57:48<3:02:56, 784.02s/it]

 87%|███████████████████████████████▎    | 88/101 [19:10:48<2:49:36, 782.84s/it]

 88%|███████████████████████████████▋    | 89/101 [19:23:45<2:36:12, 781.03s/it]

 89%|████████████████████████████████    | 90/101 [19:36:43<2:23:02, 780.21s/it]

 90%|████████████████████████████████▍   | 91/101 [19:49:41<2:09:55, 779.57s/it]

 91%|████████████████████████████████▊   | 92/101 [20:02:42<1:56:59, 779.89s/it]

 92%|█████████████████████████████████▏  | 93/101 [20:15:42<1:43:59, 779.94s/it]

 93%|█████████████████████████████████▌  | 94/101 [20:28:41<1:30:59, 779.88s/it]

 94%|█████████████████████████████████▊  | 95/101 [20:41:40<1:17:57, 779.61s/it]

 95%|██████████████████████████████████▏ | 96/101 [20:54:38<1:04:55, 779.12s/it]

 96%|████████████████████████████████████▍ | 97/101 [21:07:38<51:56, 779.25s/it]

 97%|████████████████████████████████████▊ | 98/101 [21:20:37<38:57, 779.06s/it]

 98%|█████████████████████████████████████▏| 99/101 [21:33:38<25:59, 779.79s/it]

 99%|████████████████████████████████████▋| 100/101 [21:46:36<12:59, 779.22s/it]

100%|█████████████████████████████████████| 101/101 [21:59:37<00:00, 783.94s/it]


<Figure size 640x480 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

# Model 2

In [None]:
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=2, base_folder=base_folder)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = 'Default_AE_Default_Data/AE_Training_2'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
autoencoder = AE_Model(X_train_loaded)

# Training loop
for counts in tqdm(range(101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

# Model 3

In [None]:
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=3, base_folder=base_folder)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = 'Default_AE_Default_Data/AE_Training_3'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
autoencoder = AE_Model(X_train_loaded)

# Training loop
for counts in tqdm(range(101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

# Model 4

In [None]:
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=4, base_folder=base_folder)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = 'Default_AE_Default_Data/AE_Training_4'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
autoencoder = AE_Model(X_train_loaded)

# Training loop
for counts in tqdm(range(101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

# Model 5

In [None]:
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=5, base_folder=base_folder)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = 'Default_AE_Default_Data/AE_Training_5'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
autoencoder = AE_Model(X_train_loaded)

# Training loop
for counts in tqdm(range(101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

# UnShuffled Raw Distance Data

In [3]:
# Save the model using the recommended Keras format
def save_model(model, model_path):
    if not model_path.endswith('.keras'):
        model_path += '.keras'
    model.save(model_path)
    
# Load the model
def load_existing_model(model_path):
    return load_model(model_path)

In [4]:
# Get autoencoder model
def AE_Model(train_data, LeReLU_alpha=0.01, batch_size=256):
    
    input_layer = Input(shape=(train_data.shape[1],), name='ae_input')
    
    encoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e1')(input_layer)
    encoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e2')(encoder)
    encoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e3')(encoder)
    encoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e4')(encoder)
    encoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e5')(encoder)
    encoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e6')(encoder)

    encoded = Dense(2, activation='linear', name='ae_latent')(encoder)
    
    decoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d6')(encoded)
    decoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d5')(decoder)
    decoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d4')(decoder)
    decoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d3')(decoder)
    decoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d2')(decoder)
    decoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d1')(decoder)

    output_layer = Dense(train_data.shape[1], activation='linear', name='ae_output')(decoder)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=1e-5), loss=MeanSquaredError())
    
    model.summary()
    
    return model

In [5]:
base_folder = "AE_Data/Unfiltered_UnShuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

In [7]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = 'Default_AE_UnShuffled_Data/AE_Training_1'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
autoencoder = AE_Model(X_train_loaded)

# Training loop
for counts in tqdm(range(101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

I0000 00:00:1732886411.338201    3919 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732886411.412068    3919 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732886411.412208    3919 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732886411.413644    3919 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

I0000 00:00:1732886413.620915    4310 service.cc:146] XLA service 0x7a17bc019420 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732886413.620941    4310 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 4080 SUPER, Compute Capability 8.9
2024-11-29 13:20:13.653291: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-11-29 13:20:13.784695: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907




I0000 00:00:1732886415.094406    4310 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  1%|▍                                      | 1/101 [12:47<21:18:51, 767.32s/it]

  2%|▊                                      | 2/101 [25:30<21:02:31, 765.17s/it]

  3%|█▏                                     | 3/101 [38:14<20:48:44, 764.54s/it]

  4%|█▌                                     | 4/101 [50:57<20:34:44, 763.76s/it]

  5%|█▊                                   | 5/101 [1:03:44<20:23:57, 764.97s/it]

  6%|██▏                                  | 6/101 [1:16:27<20:10:19, 764.42s/it]

  7%|██▌                                  | 7/101 [1:29:12<19:57:29, 764.36s/it]

  8%|██▉                                  | 8/101 [1:41:55<19:44:31, 764.21s/it]

  9%|███▎                                 | 9/101 [1:54:41<19:32:15, 764.52s/it]

 10%|███▌                                | 10/101 [2:07:25<19:19:17, 764.36s/it]

 11%|███▉                                | 11/101 [2:20:09<19:06:45, 764.50s/it]

 12%|████▎                               | 12/101 [2:32:54<18:54:14, 764.66s/it]

 13%|████▋                               | 13/101 [2:45:39<18:41:37, 764.75s/it]

 14%|████▉                               | 14/101 [2:58:22<18:27:53, 764.07s/it]

 15%|█████▎                              | 15/101 [3:11:07<18:15:25, 764.25s/it]

 16%|█████▋                              | 16/101 [3:23:53<18:03:45, 765.00s/it]

 17%|██████                              | 17/101 [3:36:39<17:51:08, 765.11s/it]

 18%|██████▍                             | 18/101 [3:49:25<17:39:01, 765.56s/it]

 19%|██████▊                             | 19/101 [4:02:11<17:26:20, 765.61s/it]

 20%|███████▏                            | 20/101 [4:14:58<17:13:58, 765.90s/it]

 21%|███████▍                            | 21/101 [4:27:47<17:02:39, 766.99s/it]

 22%|███████▊                            | 22/101 [4:40:34<16:50:00, 767.10s/it]

 23%|████████▏                           | 23/101 [4:53:22<16:37:28, 767.29s/it]

 24%|████████▌                           | 24/101 [5:06:09<16:24:33, 767.19s/it]

 25%|████████▉                           | 25/101 [5:18:59<16:12:40, 767.90s/it]

 26%|█████████▎                          | 26/101 [5:31:44<15:58:45, 767.00s/it]

 27%|█████████▌                          | 27/101 [5:44:33<15:46:42, 767.61s/it]

 28%|█████████▉                          | 28/101 [5:57:21<15:34:07, 767.77s/it]

 29%|██████████▎                         | 29/101 [6:10:07<15:20:42, 767.26s/it]

 30%|██████████▋                         | 30/101 [6:22:53<15:07:25, 766.84s/it]

 31%|███████████                         | 31/101 [6:35:39<14:54:21, 766.59s/it]

 32%|███████████▍                        | 32/101 [6:48:26<14:41:47, 766.78s/it]

 33%|███████████▊                        | 33/101 [7:01:13<14:29:09, 766.90s/it]

 34%|████████████                        | 34/101 [7:13:59<14:16:00, 766.58s/it]

 35%|████████████▍                       | 35/101 [7:26:54<14:05:54, 769.01s/it]

 36%|████████████▊                       | 36/101 [7:39:53<13:56:31, 772.18s/it]

 37%|█████████████▏                      | 37/101 [7:52:52<13:45:41, 774.09s/it]

 38%|█████████████▌                      | 38/101 [8:05:53<13:35:06, 776.30s/it]

 39%|█████████████▉                      | 39/101 [8:18:52<13:22:53, 776.99s/it]

 40%|██████████████▎                     | 40/101 [8:31:51<13:10:38, 777.69s/it]

 41%|██████████████▌                     | 41/101 [8:44:50<12:58:00, 778.02s/it]

 42%|██████████████▉                     | 42/101 [8:57:49<12:45:22, 778.35s/it]

 43%|███████████████▎                    | 43/101 [9:10:54<12:34:27, 780.47s/it]

 44%|███████████████▋                    | 44/101 [9:23:54<12:21:17, 780.31s/it]

 45%|████████████████                    | 45/101 [9:36:51<12:07:08, 779.09s/it]

 46%|████████████████▍                   | 46/101 [9:49:48<11:53:33, 778.43s/it]

 47%|████████████████▎                  | 47/101 [10:02:47<11:40:56, 778.82s/it]

 48%|████████████████▋                  | 48/101 [10:15:48<11:28:20, 779.25s/it]

 49%|████████████████▉                  | 49/101 [10:28:50<11:16:09, 780.18s/it]

 50%|█████████████████▎                 | 50/101 [10:41:49<11:02:58, 779.96s/it]

 50%|█████████████████▋                 | 51/101 [10:54:50<10:50:04, 780.09s/it]

 51%|██████████████████                 | 52/101 [11:07:49<10:36:51, 779.84s/it]

 52%|██████████████████▎                | 53/101 [11:20:48<10:23:37, 779.52s/it]

 53%|██████████████████▋                | 54/101 [11:33:52<10:11:46, 780.99s/it]

 54%|███████████████████▌                | 55/101 [11:46:54<9:58:54, 781.18s/it]

 55%|███████████████████▉                | 56/101 [11:59:54<9:45:38, 780.86s/it]

 56%|████████████████████▎               | 57/101 [12:12:54<9:32:25, 780.58s/it]

 57%|████████████████████▋               | 58/101 [12:25:54<9:19:13, 780.31s/it]

 58%|█████████████████████               | 59/101 [12:38:59<9:07:18, 781.87s/it]

 59%|█████████████████████▍              | 60/101 [12:52:00<8:54:07, 781.65s/it]

 60%|█████████████████████▋              | 61/101 [13:05:01<8:40:57, 781.43s/it]

 61%|██████████████████████              | 62/101 [13:17:59<8:27:16, 780.42s/it]

 62%|██████████████████████▍             | 63/101 [13:30:59<8:14:09, 780.25s/it]

 63%|██████████████████████▊             | 64/101 [13:43:57<8:00:46, 779.63s/it]

 64%|███████████████████████▏            | 65/101 [13:57:02<7:48:40, 781.14s/it]

 65%|███████████████████████▌            | 66/101 [14:10:04<7:35:50, 781.43s/it]

 66%|███████████████████████▉            | 67/101 [14:23:02<7:22:10, 780.32s/it]

 67%|████████████████████████▏           | 68/101 [14:36:01<7:08:56, 779.89s/it]

 68%|████████████████████████▌           | 69/101 [14:48:57<6:55:22, 778.84s/it]

 69%|████████████████████████▉           | 70/101 [15:01:57<6:42:33, 779.14s/it]

 70%|█████████████████████████▎          | 71/101 [15:15:01<6:30:18, 780.60s/it]

 71%|█████████████████████████▋          | 72/101 [15:28:01<6:17:17, 780.59s/it]

 72%|██████████████████████████          | 73/101 [15:41:00<6:04:04, 780.15s/it]

 73%|██████████████████████████▍         | 74/101 [15:53:59<5:50:49, 779.60s/it]

 74%|██████████████████████████▋         | 75/101 [16:06:59<5:37:51, 779.66s/it]

 75%|███████████████████████████         | 76/101 [16:20:05<5:25:44, 781.78s/it]

 76%|███████████████████████████▍        | 77/101 [16:33:08<5:12:48, 782.03s/it]

 77%|███████████████████████████▊        | 78/101 [16:46:08<4:59:31, 781.36s/it]

 78%|████████████████████████████▏       | 79/101 [16:59:08<4:46:25, 781.15s/it]

 79%|████████████████████████████▌       | 80/101 [17:12:07<4:33:08, 780.41s/it]

 80%|████████████████████████████▊       | 81/101 [17:25:05<4:19:51, 779.56s/it]

 81%|█████████████████████████████▏      | 82/101 [17:38:11<4:07:32, 781.72s/it]

 82%|█████████████████████████████▌      | 83/101 [17:51:12<3:54:23, 781.28s/it]

 83%|█████████████████████████████▉      | 84/101 [18:04:11<3:41:13, 780.80s/it]

 84%|██████████████████████████████▎     | 85/101 [18:17:08<3:27:54, 779.64s/it]

 85%|██████████████████████████████▋     | 86/101 [18:30:08<3:14:52, 779.51s/it]

 86%|███████████████████████████████     | 87/101 [18:43:06<3:01:49, 779.28s/it]

 87%|███████████████████████████████▎    | 88/101 [18:56:11<2:49:13, 781.01s/it]

 88%|███████████████████████████████▋    | 89/101 [19:09:14<2:36:17, 781.42s/it]

 89%|████████████████████████████████    | 90/101 [19:22:15<2:23:16, 781.49s/it]

 90%|████████████████████████████████▍   | 91/101 [19:35:05<2:09:40, 778.09s/it]

 91%|████████████████████████████████▊   | 92/101 [19:47:52<1:56:12, 774.76s/it]

 92%|█████████████████████████████████▏  | 93/101 [20:00:44<1:43:10, 773.76s/it]

 93%|█████████████████████████████████▌  | 94/101 [20:13:39<1:30:19, 774.26s/it]

 94%|█████████████████████████████████▊  | 95/101 [20:26:31<1:17:20, 773.39s/it]

 95%|██████████████████████████████████▏ | 96/101 [20:39:14<1:04:11, 770.26s/it]

 96%|████████████████████████████████████▍ | 97/101 [20:52:01<51:17, 769.36s/it]

 97%|████████████████████████████████████▊ | 98/101 [21:04:49<38:26, 768.93s/it]

 98%|█████████████████████████████████████▏| 99/101 [21:17:38<25:38, 769.11s/it]

 99%|████████████████████████████████████▋| 100/101 [21:30:29<12:49, 769.45s/it]

100%|█████████████████████████████████████| 101/101 [21:43:17<00:00, 774.23s/it]


# Batch Size 512

# Model 1

In [5]:
# Save the model using the recommended Keras format
def save_model(model, model_path):
    if not model_path.endswith('.keras'):
        model_path += '.keras'
    model.save(model_path)
    
# Load the model
def load_existing_model(model_path):
    return load_model(model_path)

In [6]:
# Get autoencoder model
def AE_Model(train_data, LeReLU_alpha=0.01, batch_size=512):
    
    input_layer = Input(shape=(train_data.shape[1],), name='ae_input')
    
    encoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e1')(input_layer)
    encoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e2')(encoder)
    encoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e3')(encoder)
    encoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e4')(encoder)
    encoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e5')(encoder)
    encoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e6')(encoder)

    encoded = Dense(2, activation='linear', name='ae_latent')(encoder)
    
    decoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d6')(encoded)
    decoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d5')(decoder)
    decoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d4')(decoder)
    decoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d3')(decoder)
    decoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d2')(decoder)
    decoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d1')(decoder)

    output_layer = Dense(train_data.shape[1], activation='linear', name='ae_output')(decoder)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=1e-5), loss=MeanSquaredError())
    
    model.summary()
    
    return model

In [7]:
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

In [9]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = 'Default_AE_BS_512_Default_Data/AE_Training_1'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
autoencoder = AE_Model(X_train_loaded)

# Training loop
for counts in tqdm(range(101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

  0%|                                                   | 0/101 [00:00<?, ?it/s]

  1%|▍                                      | 1/101 [12:56<21:34:00, 776.41s/it]

  2%|▊                                      | 2/101 [25:50<21:18:32, 774.87s/it]

  3%|█▏                                     | 3/101 [38:39<21:01:36, 772.41s/it]

  4%|█▌                                     | 4/101 [51:29<20:46:45, 771.19s/it]

  5%|█▊                                   | 5/101 [1:04:34<20:42:20, 776.46s/it]

  6%|██▏                                  | 6/101 [1:17:38<20:33:30, 779.06s/it]

  7%|██▌                                  | 7/101 [1:30:42<20:22:57, 780.61s/it]

  8%|██▉                                  | 8/101 [1:43:48<20:12:29, 782.25s/it]

  9%|███▎                                 | 9/101 [1:56:52<20:00:19, 782.82s/it]

 10%|███▌                                | 10/101 [2:09:58<19:48:48, 783.83s/it]

 11%|███▉                                | 11/101 [2:23:04<19:36:40, 784.45s/it]

 12%|████▎                               | 12/101 [2:36:05<19:21:54, 783.30s/it]

 13%|████▋                               | 13/101 [2:49:08<19:08:38, 783.17s/it]

 14%|████▉                               | 14/101 [3:02:08<18:54:15, 782.25s/it]

 15%|█████▎                              | 15/101 [3:15:12<18:42:08, 782.89s/it]

 16%|█████▋                              | 16/101 [3:28:13<18:28:19, 782.34s/it]

 17%|██████                              | 17/101 [3:41:16<18:15:33, 782.54s/it]

 18%|██████▍                             | 18/101 [3:54:19<18:02:41, 782.66s/it]

 19%|██████▊                             | 19/101 [4:07:24<17:50:35, 783.36s/it]

 20%|███████▏                            | 20/101 [4:20:28<17:37:46, 783.54s/it]

 21%|███████▍                            | 21/101 [4:33:31<17:24:27, 783.34s/it]

 22%|███████▊                            | 22/101 [4:46:37<17:12:35, 784.25s/it]

 23%|████████▏                           | 23/101 [4:59:38<16:58:11, 783.22s/it]

 24%|████████▌                           | 24/101 [5:12:41<16:45:02, 783.15s/it]

 25%|████████▉                           | 25/101 [5:25:45<16:32:18, 783.40s/it]

 26%|█████████▎                          | 26/101 [5:38:50<16:19:52, 783.89s/it]

 27%|█████████▌                          | 27/101 [5:51:54<16:06:53, 783.97s/it]

 28%|█████████▉                          | 28/101 [6:04:56<15:52:52, 783.18s/it]

 29%|██████████▎                         | 29/101 [6:17:59<15:39:56, 783.28s/it]

 30%|██████████▋                         | 30/101 [6:31:01<15:26:20, 782.82s/it]

 31%|███████████                         | 31/101 [6:44:05<15:13:42, 783.17s/it]

 32%|███████████▍                        | 32/101 [6:57:09<15:00:52, 783.37s/it]

 33%|███████████▊                        | 33/101 [7:10:11<14:47:23, 782.99s/it]

 34%|████████████                        | 34/101 [7:23:14<14:34:21, 783.01s/it]

 35%|████████████▍                       | 35/101 [7:36:17<14:21:31, 783.21s/it]

 36%|████████████▊                       | 36/101 [7:49:20<14:08:18, 783.06s/it]

 37%|█████████████▏                      | 37/101 [8:02:26<13:56:09, 783.89s/it]

 38%|█████████████▌                      | 38/101 [8:15:33<13:43:56, 784.70s/it]

 39%|█████████████▉                      | 39/101 [8:28:36<13:30:19, 784.19s/it]

 40%|██████████████▎                     | 40/101 [8:41:40<13:17:13, 784.15s/it]

 41%|██████████████▌                     | 41/101 [8:54:42<13:03:36, 783.60s/it]

 42%|██████████████▉                     | 42/101 [9:07:45<12:50:22, 783.43s/it]

 43%|███████████████▎                    | 43/101 [9:20:50<12:37:42, 783.83s/it]

 44%|███████████████▋                    | 44/101 [9:33:54<12:24:51, 784.05s/it]

 45%|████████████████                    | 45/101 [9:46:58<12:11:35, 783.85s/it]

 46%|███████████████▉                   | 46/101 [10:00:01<11:58:26, 783.76s/it]

 47%|████████████████▎                  | 47/101 [10:13:03<11:44:48, 783.13s/it]

 48%|████████████████▋                  | 48/101 [10:26:06<11:31:47, 783.17s/it]

 49%|████████████████▉                  | 49/101 [10:39:11<11:19:08, 783.63s/it]

 50%|█████████████████▎                 | 50/101 [10:52:17<11:06:43, 784.37s/it]

 50%|█████████████████▋                 | 51/101 [11:05:21<10:53:34, 784.29s/it]

 51%|██████████████████                 | 52/101 [11:18:24<10:40:08, 783.85s/it]

 52%|██████████████████▎                | 53/101 [11:31:27<10:26:51, 783.57s/it]

 53%|██████████████████▋                | 54/101 [11:44:31<10:13:52, 783.67s/it]

 54%|███████████████████                | 55/101 [11:57:38<10:01:32, 784.62s/it]

 55%|███████████████████▉                | 56/101 [12:10:43<9:48:33, 784.74s/it]

 56%|████████████████████▎               | 57/101 [12:23:45<9:34:51, 783.90s/it]

 57%|████████████████████▋               | 58/101 [12:36:47<9:21:30, 783.50s/it]

 58%|█████████████████████               | 59/101 [12:49:50<9:08:22, 783.39s/it]

 59%|█████████████████████▍              | 60/101 [13:02:53<8:55:16, 783.32s/it]

 60%|█████████████████████▋              | 61/101 [13:16:02<8:43:12, 784.80s/it]

 61%|██████████████████████              | 62/101 [13:29:07<8:30:11, 784.90s/it]

 62%|██████████████████████▍             | 63/101 [13:42:11<8:17:01, 784.79s/it]

 63%|██████████████████████▊             | 64/101 [13:55:09<8:02:36, 782.59s/it]

 64%|███████████████████████▏            | 65/101 [14:07:55<7:46:37, 777.71s/it]

 65%|███████████████████████▌            | 66/101 [14:20:42<7:31:42, 774.36s/it]

 66%|███████████████████████▉            | 67/101 [14:33:29<7:17:33, 772.18s/it]

 67%|████████████████████████▏           | 68/101 [14:46:21<7:04:40, 772.14s/it]

 68%|████████████████████████▌           | 69/101 [14:59:10<6:51:17, 771.18s/it]

 69%|████████████████████████▉           | 70/101 [15:11:57<6:37:48, 769.94s/it]

 70%|█████████████████████████▎          | 71/101 [15:24:49<6:25:21, 770.72s/it]

 71%|█████████████████████████▋          | 72/101 [15:37:34<6:11:41, 769.02s/it]

 72%|██████████████████████████          | 73/101 [15:50:27<5:59:25, 770.19s/it]

 73%|██████████████████████████▍         | 74/101 [16:03:16<5:46:24, 769.78s/it]

 74%|██████████████████████████▋         | 75/101 [16:16:06<5:33:32, 769.72s/it]

 75%|███████████████████████████         | 76/101 [16:28:53<5:20:22, 768.91s/it]

 76%|███████████████████████████▍        | 77/101 [16:41:41<5:07:29, 768.73s/it]

 77%|███████████████████████████▊        | 78/101 [16:54:28<4:54:26, 768.10s/it]

 78%|████████████████████████████▏       | 79/101 [17:07:14<4:41:27, 767.59s/it]

 79%|████████████████████████████▌       | 80/101 [17:20:02<4:28:43, 767.80s/it]

 80%|████████████████████████████▊       | 81/101 [17:32:51<4:15:59, 767.95s/it]

 81%|█████████████████████████████▏      | 82/101 [17:45:38<4:03:07, 767.77s/it]

 82%|█████████████████████████████▌      | 83/101 [17:58:29<3:50:36, 768.67s/it]

 83%|█████████████████████████████▉      | 84/101 [18:11:19<3:37:53, 769.04s/it]

 84%|██████████████████████████████▎     | 85/101 [18:24:07<3:25:01, 768.87s/it]

 85%|██████████████████████████████▋     | 86/101 [18:36:55<3:12:08, 768.59s/it]

 86%|███████████████████████████████     | 87/101 [18:49:44<2:59:21, 768.67s/it]

 87%|███████████████████████████████▎    | 88/101 [19:02:31<2:46:27, 768.27s/it]

 88%|███████████████████████████████▋    | 89/101 [19:15:19<2:33:37, 768.09s/it]

 89%|████████████████████████████████    | 90/101 [19:28:05<2:20:41, 767.44s/it]

 90%|████████████████████████████████▍   | 91/101 [19:40:51<2:07:51, 767.11s/it]

 91%|████████████████████████████████▊   | 92/101 [19:53:39<1:55:04, 767.16s/it]

 92%|█████████████████████████████████▏  | 93/101 [20:06:26<1:42:18, 767.26s/it]

 93%|█████████████████████████████████▌  | 94/101 [20:19:13<1:29:29, 767.14s/it]

 94%|█████████████████████████████████▊  | 95/101 [20:32:00<1:16:41, 766.99s/it]

 95%|██████████████████████████████████▏ | 96/101 [20:44:45<1:03:53, 766.68s/it]

 96%|████████████████████████████████████▍ | 97/101 [20:57:32<51:06, 766.73s/it]

 97%|████████████████████████████████████▊ | 98/101 [21:10:20<38:20, 766.89s/it]

 98%|█████████████████████████████████████▏| 99/101 [21:23:06<25:33, 766.83s/it]

 99%|████████████████████████████████████▋| 100/101 [21:35:54<12:47, 767.23s/it]

100%|█████████████████████████████████████| 101/101 [21:48:43<00:00, 777.46s/it]


# 8 Layer AE

# Model 1

In [None]:
# Save the model using the recommended Keras format
def save_model(model, model_path):
    if not model_path.endswith('.keras'):
        model_path += '.keras'
    model.save(model_path)
    
# Load the model
def load_existing_model(model_path):
    return load_model(model_path)

In [None]:
# Get autoencoder model
def AE_Model(train_data, LeReLU_alpha=0.01, batch_size=256):
    
    input_layer = Input(shape=(train_data.shape[1],), name='ae_input')

    encoder = Dense(4096, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e1')(input_layer)
    encoder = Dense(2048, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e2')(encoder)
    encoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e3')(encoder)
    encoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e4')(encoder)
    encoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e5')(encoder)
    encoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e6')(encoder)
    encoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e7')(encoder)
    encoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e8')(encoder)

    encoded = Dense(2, activation='linear', name='ae_latent')(encoder)
    
    decoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d8')(encoded)
    decoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d7')(decoder)
    decoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d6')(decoder)
    decoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d5')(decoder)
    decoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d4')(decoder)
    decoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d3')(decoder)
    decoder = Dense(2048, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d2')(decoder)
    decoder = Dense(4096, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d1')(decoder)
    

    output_layer = Dense(train_data.shape[1], activation='linear', name='ae_output')(decoder)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=1e-5), loss=MeanSquaredError())
    
    model.summary()
    
    return model

In [None]:
def load_datasets(fold, base_folder):
    """
    Load datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    
    X_train = pd.read_csv(os.path.join(folder_name, 'X_train_f.csv'), index_col=0)
    y_train = pd.read_csv(os.path.join(folder_name, 'y_train_f.csv'), index_col=0)
    X_valid = pd.read_csv(os.path.join(folder_name, 'X_valid_f.csv'), index_col=0)
    y_valid = pd.read_csv(os.path.join(folder_name, 'y_valid_f.csv'), index_col=0)
    
    return X_train, y_train, X_valid, y_valid

base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = '8_Layer_AE_Default_Data/AE_Training_1'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
autoencoder = AE_Model(X_train_loaded)

# Training loop
for counts in tqdm(range(20)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

# 7 Layer AE

# Model 1

In [None]:
# Save the model using the recommended Keras format
def save_model(model, model_path):
    if not model_path.endswith('.keras'):
        model_path += '.keras'
    model.save(model_path)
    
# Load the model
def load_existing_model(model_path):
    return load_model(model_path)

In [None]:
# Get autoencoder model
def AE_Model(train_data, LeReLU_alpha=0.01, batch_size=256):
    
    input_layer = Input(shape=(train_data.shape[1],), name='ae_input')

    encoder = Dense(2048, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e1')(input_layer)
    encoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e2')(encoder)
    encoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e3')(encoder)
    encoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e4')(encoder)
    encoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e5')(encoder)
    encoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e6')(encoder)
    encoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e7')(encoder)

    encoded = Dense(2, activation='linear', name='ae_latent')(encoder)
    
    decoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d7')(encoded)
    decoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d6')(decoder)
    decoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d5')(decoder)
    decoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d4')(decoder)
    decoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d3')(decoder)
    decoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d2')(decoder)
    decoder = Dense(2048, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d1')(decoder)
    

    output_layer = Dense(train_data.shape[1], activation='linear', name='ae_output')(decoder)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=1e-5), loss=MeanSquaredError())
    
    model.summary()
    
    return model

In [None]:
def load_datasets(fold, base_folder):
    """
    Load datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    
    X_train = pd.read_csv(os.path.join(folder_name, 'X_train_f.csv'), index_col=0)
    y_train = pd.read_csv(os.path.join(folder_name, 'y_train_f.csv'), index_col=0)
    X_valid = pd.read_csv(os.path.join(folder_name, 'X_valid_f.csv'), index_col=0)
    y_valid = pd.read_csv(os.path.join(folder_name, 'y_valid_f.csv'), index_col=0)
    
    return X_train, y_train, X_valid, y_valid

base_folder = "AE_Data/Filtered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = '7_Layer_AE_Default_Data/AE_Training_1'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
autoencoder = AE_Model(X_train_loaded)

# Training loop
for counts in tqdm(range(101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

# Reduce LR on Plateau

In [15]:
# Save the model using the recommended Keras format
def save_model(model, model_path):
    if not model_path.endswith('.keras'):
        model_path += '.keras'
    model.save(model_path)
    
# Load the model
def load_existing_model(model_path):
    return load_model(model_path)

In [16]:
# Get autoencoder model
def AE_Model(train_data, LeReLU_alpha=0.01, batch_size=256, learning_rate=1e-4):
    input_layer = Input(shape=(train_data.shape[1],), name='ae_input')
    
    encoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e1')(input_layer)
    encoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e2')(encoder)
    encoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e3')(encoder)
    encoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e4')(encoder)
    encoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e5')(encoder)
    encoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e6')(encoder)

    encoded = Dense(2, activation='linear', name='ae_latent')(encoder)
    
    decoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d6')(encoded)
    decoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d5')(decoder)
    decoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d4')(decoder)
    decoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d3')(decoder)
    decoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d2')(decoder)
    decoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d1')(decoder)

    output_layer = Dense(train_data.shape[1], activation='linear', name='ae_output')(decoder)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=MeanSquaredError())
    
    model.summary()
    
    return model

# Model 1

In [17]:
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

In [18]:
## Learning Rate Multiply by 0.5 if sum val_loss (E900-E1000) / sum val_loss(E0-E100) > 0.9
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = 'Default_AE_Default_Data_RLROP/AE_Training_1'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
initial_learning_rate = 1e-4
min_learning_rate = 1e-6
autoencoder = AE_Model(X_train_loaded, learning_rate=initial_learning_rate)

# Initialize variables for RLROP
current_lr = initial_learning_rate

# Training loop
for counts in tqdm(range(101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history['loss'], label='Train Loss')
        plt.plot(training_history['val_loss'], label='Validation Loss')
        plt.title(f"{name} (LR={current_lr:.1e})")
        plt.legend()
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

        # Implement RLROP: Calculate average validation loss reduction
        if counts > 0:
            first_100_avg = training_history['val_loss'][:100].mean()
            last_100_avg = training_history['val_loss'][-100:].mean()
            if (first_100_avg - last_100_avg) / first_100_avg < 0.1:
                current_lr = max(current_lr * 0.5, min_learning_rate)
                autoencoder.optimizer.learning_rate.assign(current_lr)

  0%|                                                   | 0/101 [00:00<?, ?it/s]

  1%|▍                                      | 1/101 [13:06<21:51:10, 786.71s/it]

  2%|▊                                      | 2/101 [26:07<21:32:11, 783.15s/it]

  3%|█▏                                     | 3/101 [39:10<21:19:02, 783.09s/it]

  4%|█▌                                     | 4/101 [52:14<21:06:32, 783.43s/it]

  5%|█▊                                   | 5/101 [1:05:17<20:53:27, 783.41s/it]

  6%|██▏                                  | 6/101 [1:18:23<20:41:32, 784.14s/it]

  7%|██▌                                  | 7/101 [1:31:24<20:27:11, 783.31s/it]

  8%|██▉                                  | 8/101 [1:44:31<20:15:33, 784.24s/it]

  9%|███▎                                 | 9/101 [1:57:32<20:01:22, 783.50s/it]

 10%|███▌                                | 10/101 [2:10:34<19:47:31, 782.98s/it]

 11%|███▉                                | 11/101 [2:23:37<19:34:09, 782.77s/it]

 12%|████▎                               | 12/101 [2:36:34<19:18:41, 781.14s/it]

 13%|████▋                               | 13/101 [2:49:35<19:05:29, 781.02s/it]

 14%|████▉                               | 14/101 [3:02:34<18:51:39, 780.45s/it]

 15%|█████▎                              | 15/101 [3:15:39<18:40:33, 781.78s/it]

 16%|█████▋                              | 16/101 [3:28:39<18:26:48, 781.27s/it]

 17%|██████                              | 17/101 [3:41:39<18:13:22, 780.98s/it]

 18%|██████▍                             | 18/101 [3:54:39<17:59:58, 780.71s/it]

 19%|██████▊                             | 19/101 [4:07:41<17:47:22, 781.01s/it]

 20%|███████▏                            | 20/101 [4:20:40<17:33:30, 780.37s/it]

 21%|███████▍                            | 21/101 [4:33:39<17:20:09, 780.11s/it]

 22%|███████▊                            | 22/101 [4:46:39<17:07:03, 780.04s/it]

 23%|████████▏                           | 23/101 [4:59:42<16:55:12, 780.93s/it]

 24%|████████▌                           | 24/101 [5:12:42<16:41:37, 780.49s/it]

 25%|████████▉                           | 25/101 [5:25:42<16:28:44, 780.59s/it]

 26%|█████████▎                          | 26/101 [5:38:44<16:15:55, 780.74s/it]

 27%|█████████▌                          | 27/101 [5:51:42<16:02:04, 780.06s/it]

 28%|█████████▉                          | 28/101 [6:04:42<15:49:06, 780.09s/it]

 29%|██████████▎                         | 29/101 [6:17:40<15:35:22, 779.48s/it]

 30%|██████████▋                         | 30/101 [6:30:39<15:22:11, 779.32s/it]

 31%|███████████                         | 31/101 [6:43:41<15:09:54, 779.93s/it]

 32%|███████████▍                        | 32/101 [6:56:39<14:56:27, 779.53s/it]

 33%|███████████▊                        | 33/101 [7:09:38<14:43:20, 779.42s/it]

 34%|████████████                        | 34/101 [7:22:36<14:29:53, 779.00s/it]

 35%|████████████▍                       | 35/101 [7:35:37<14:17:29, 779.54s/it]

 36%|████████████▊                       | 36/101 [7:48:36<14:04:20, 779.40s/it]

 37%|█████████████▏                      | 37/101 [8:01:37<13:51:51, 779.87s/it]

 38%|█████████████▌                      | 38/101 [8:14:40<13:39:56, 780.89s/it]

 39%|█████████████▉                      | 39/101 [8:27:39<13:26:06, 780.11s/it]

 40%|██████████████▎                     | 40/101 [8:40:40<13:13:33, 780.55s/it]

 41%|██████████████▌                     | 41/101 [8:53:37<12:59:22, 779.38s/it]

 42%|██████████████▉                     | 42/101 [9:06:37<12:46:39, 779.66s/it]

 43%|███████████████▎                    | 43/101 [9:19:37<12:33:34, 779.56s/it]

 44%|███████████████▋                    | 44/101 [9:32:35<12:20:07, 779.09s/it]

 45%|████████████████                    | 45/101 [9:45:33<12:06:55, 778.84s/it]

 46%|████████████████▍                   | 46/101 [9:58:33<11:54:14, 779.18s/it]

 47%|████████████████▎                  | 47/101 [10:11:34<11:41:44, 779.71s/it]

 48%|████████████████▋                  | 48/101 [10:24:33<11:28:30, 779.43s/it]

 49%|████████████████▉                  | 49/101 [10:37:30<11:15:02, 778.90s/it]

 50%|█████████████████▎                 | 50/101 [10:50:29<11:02:03, 778.90s/it]

 50%|█████████████████▋                 | 51/101 [11:03:26<10:48:37, 778.35s/it]

 51%|██████████████████                 | 52/101 [11:16:25<10:35:39, 778.36s/it]

 52%|██████████████████▎                | 53/101 [11:29:21<10:22:16, 777.84s/it]

 53%|██████████████████▋                | 54/101 [11:42:23<10:10:20, 779.17s/it]

 54%|███████████████████▌                | 55/101 [11:55:25<9:57:50, 779.80s/it]

 55%|███████████████████▉                | 56/101 [12:08:24<9:44:39, 779.55s/it]

 56%|████████████████████▎               | 57/101 [12:21:23<9:31:34, 779.42s/it]

 57%|████████████████████▋               | 58/101 [12:34:22<9:18:32, 779.35s/it]

 58%|█████████████████████               | 59/101 [12:47:20<9:05:11, 778.84s/it]

 59%|█████████████████████▍              | 60/101 [13:00:17<8:51:56, 778.45s/it]

 60%|█████████████████████▋              | 61/101 [13:13:14<8:38:41, 778.04s/it]

 61%|██████████████████████              | 62/101 [13:26:16<8:26:28, 779.18s/it]

 62%|██████████████████████▍             | 63/101 [13:39:17<8:13:52, 779.79s/it]

 63%|██████████████████████▊             | 64/101 [13:52:18<8:01:06, 780.18s/it]

 64%|███████████████████████▏            | 65/101 [14:05:18<7:47:57, 779.93s/it]

 65%|███████████████████████▌            | 66/101 [14:18:15<7:34:27, 779.08s/it]

 66%|███████████████████████▉            | 67/101 [14:31:13<7:21:19, 778.80s/it]

 67%|████████████████████████▏           | 68/101 [14:44:11<7:08:11, 778.52s/it]

 68%|████████████████████████▌           | 69/101 [14:57:10<6:55:17, 778.66s/it]

 69%|████████████████████████▉           | 70/101 [15:10:13<6:42:56, 779.89s/it]

 70%|█████████████████████████▎          | 71/101 [15:23:11<6:29:43, 779.45s/it]

 71%|█████████████████████████▋          | 72/101 [15:36:11<6:16:50, 779.69s/it]

 72%|██████████████████████████          | 73/101 [15:49:11<6:03:51, 779.70s/it]

 73%|██████████████████████████▍         | 74/101 [16:02:08<5:50:31, 778.94s/it]

 74%|██████████████████████████▋         | 75/101 [16:15:06<5:37:24, 778.64s/it]

 75%|███████████████████████████         | 76/101 [16:28:02<5:24:03, 777.75s/it]

 76%|███████████████████████████▍        | 77/101 [16:40:58<5:10:57, 777.40s/it]

 77%|███████████████████████████▊        | 78/101 [16:54:02<4:58:43, 779.30s/it]

 78%|████████████████████████████▏       | 79/101 [17:07:01<4:45:40, 779.10s/it]

 79%|████████████████████████████▌       | 80/101 [17:20:00<4:32:40, 779.09s/it]

 80%|████████████████████████████▊       | 81/101 [17:32:59<4:19:41, 779.08s/it]

 81%|█████████████████████████████▏      | 82/101 [17:45:58<4:06:40, 778.98s/it]

 82%|█████████████████████████████▌      | 83/101 [17:58:56<3:53:40, 778.94s/it]

 83%|█████████████████████████████▉      | 84/101 [18:11:54<3:40:32, 778.40s/it]

 84%|██████████████████████████████▎     | 85/101 [18:24:50<3:27:26, 777.93s/it]

 85%|██████████████████████████████▋     | 86/101 [18:37:50<3:14:38, 778.55s/it]

 86%|███████████████████████████████     | 87/101 [18:50:51<3:01:48, 779.20s/it]

 87%|███████████████████████████████▎    | 88/101 [19:03:51<2:48:51, 779.38s/it]

 88%|███████████████████████████████▋    | 89/101 [19:16:50<2:35:50, 779.24s/it]

 89%|████████████████████████████████    | 90/101 [19:29:50<2:22:55, 779.55s/it]

 90%|████████████████████████████████▍   | 91/101 [19:42:49<2:09:52, 779.27s/it]

 91%|████████████████████████████████▊   | 92/101 [19:55:47<1:56:50, 778.99s/it]

 92%|█████████████████████████████████▏  | 93/101 [20:08:45<1:43:50, 778.78s/it]

 93%|█████████████████████████████████▌  | 94/101 [20:21:48<1:31:00, 780.01s/it]

 94%|█████████████████████████████████▊  | 95/101 [20:34:50<1:18:03, 780.54s/it]

 95%|██████████████████████████████████▏ | 96/101 [20:47:51<1:05:02, 780.57s/it]

 96%|████████████████████████████████████▍ | 97/101 [21:00:56<52:08, 782.08s/it]

 97%|████████████████████████████████████▊ | 98/101 [21:14:02<39:09, 783.11s/it]

 98%|█████████████████████████████████████▏| 99/101 [21:27:01<26:03, 781.86s/it]

 99%|████████████████████████████████████▋| 100/101 [21:39:58<13:00, 780.57s/it]

100%|█████████████████████████████████████| 101/101 [21:52:58<00:00, 779.99s/it]


# Shuffled, Filtered Dataset

In [2]:
# Save the model using the recommended Keras format
def save_model(model, model_path):
    if not model_path.endswith('.keras'):
        model_path += '.keras'
    model.save(model_path)
    
# Load the model
def load_existing_model(model_path):
    return load_model(model_path)

In [3]:
# Get autoencoder model
def AE_Model(train_data, LeReLU_alpha=0.01, batch_size=256):
    
    input_layer = Input(shape=(train_data.shape[1],), name='ae_input')
    
    encoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e1')(input_layer)
    encoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e2')(encoder)
    encoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e3')(encoder)
    encoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e4')(encoder)
    encoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e5')(encoder)
    encoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='e6')(encoder)

    encoded = Dense(2, activation='linear', name='ae_latent')(encoder)
    
    decoder = Dense(32, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d6')(encoded)
    decoder = Dense(64, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d5')(decoder)
    decoder = Dense(128, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d4')(decoder)
    decoder = Dense(256, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d3')(decoder)
    decoder = Dense(512, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d2')(decoder)
    decoder = Dense(1024, activation=LeakyReLU(negative_slope=LeReLU_alpha), name='d1')(decoder)

    output_layer = Dense(train_data.shape[1], activation='linear', name='ae_output')(decoder)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=Adam(learning_rate=1e-5), loss=MeanSquaredError())
    
    model.summary()
    
    return model

In [4]:
def load_datasets(fold, base_folder):
    """
    Load datasets for a specific fold.
    """
    folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
    
    X_train = pd.read_csv(os.path.join(folder_name, 'X_train_f.csv'), index_col=0)
    y_train = pd.read_csv(os.path.join(folder_name, 'y_train_f.csv'), index_col=0)
    X_valid = pd.read_csv(os.path.join(folder_name, 'X_valid_f.csv'), index_col=0)
    y_valid = pd.read_csv(os.path.join(folder_name, 'y_valid_f.csv'), index_col=0)
    
    return X_train, y_train, X_valid, y_valid

base_folder = "AE_Data/Filtered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

In [5]:
print(X_train_loaded)

             10        26        29        30        38         7         8  \
32823  0.581820  0.580513  0.465041  0.219771  0.394883  0.369550  0.414636   
28505  0.605215  0.357156  0.805869  0.619109  0.487875  0.147931  0.142953   
6689   0.458832  0.222296  0.751903  0.273659  0.483952  0.260578  0.636252   
36572  0.242426  0.269379  0.401734  0.490664  0.503974  0.245061  0.803909   
12335  0.546329  0.397127  0.774965  0.125088  0.447993  0.149885  0.678769   
...         ...       ...       ...       ...       ...       ...       ...   
6265   0.625532  0.281942  0.681292  0.164125  0.666801  0.155468  0.574162   
11284  0.532060  0.722206  0.678063  0.189079  0.550389  0.267015  0.705913   
38158  0.247786  0.795716  0.586447  0.505211  0.549095  0.369128  0.542627   
860    0.467415  0.540923  0.864833  0.199565  0.800465  0.178709  0.074222   
15795  0.530086  0.638857  0.831911  0.242039  0.624472  0.318827  0.628040   

             28        43       8.1  ...         2 

In [6]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Define the output directory and subdirectories
output_dir = 'Default_AE_Filtered_Data/AE_Training_1'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Initialize the autoencoder with the loaded training data
autoencoder = AE_Model(X_train_loaded)

# Training loop
for counts in tqdm(range(101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        save_model(autoencoder, model_file_path)

I0000 00:00:1733076887.172368    4131 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1733076887.239965    4131 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1733076887.240081    4131 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1733076887.241761    4131 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

I0000 00:00:1733076888.812731    4256 service.cc:146] XLA service 0x708ef00198a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733076888.812747    4256 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 4080 SUPER, Compute Capability 8.9
2024-12-01 18:14:48.841904: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-01 18:14:48.971537: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
I0000 00:00:1733076890.097025    4256 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  1%|▍                                      | 1/101 [08:37<14:21:54, 517.15s/it]

  2%|▊                                      | 2/101 [17:14<14:13:23, 517.20s/it]

  3%|█▏                                     | 3/101 [25:49<14:02:51, 516.03s/it]

  4%|█▌                                     | 4/101 [34:27<13:55:52, 517.04s/it]

  5%|█▉                                     | 5/101 [43:01<13:45:41, 516.05s/it]

  6%|██▎                                    | 6/101 [51:35<13:35:49, 515.26s/it]

  7%|██▌                                  | 7/101 [1:00:11<13:27:47, 515.62s/it]

  8%|██▉                                  | 8/101 [1:08:47<13:19:00, 515.48s/it]

  9%|███▎                                 | 9/101 [1:17:22<13:10:10, 515.33s/it]

 10%|███▌                                | 10/101 [1:25:57<13:01:42, 515.41s/it]

 11%|███▉                                | 11/101 [1:34:36<12:54:35, 516.39s/it]

 12%|████▎                               | 12/101 [1:43:12<12:45:47, 516.26s/it]

 13%|████▋                               | 13/101 [1:51:48<12:37:10, 516.25s/it]

 14%|████▉                               | 14/101 [2:00:22<12:27:40, 515.64s/it]

 15%|█████▎                              | 15/101 [2:08:58<12:18:54, 515.52s/it]

 16%|█████▋                              | 16/101 [2:17:34<12:10:45, 515.83s/it]

 17%|██████                              | 17/101 [2:26:10<12:02:12, 515.87s/it]

 18%|██████▍                             | 18/101 [2:34:47<11:53:59, 516.13s/it]

 19%|██████▊                             | 19/101 [2:43:28<11:47:19, 517.56s/it]

 20%|███████▏                            | 20/101 [2:52:03<11:37:45, 516.86s/it]

 21%|███████▍                            | 21/101 [3:00:44<11:30:42, 518.03s/it]

 22%|███████▊                            | 22/101 [3:09:20<11:21:24, 517.52s/it]

 23%|████████▏                           | 23/101 [3:17:55<11:11:39, 516.66s/it]

 24%|████████▌                           | 24/101 [3:26:30<11:02:26, 516.18s/it]

 25%|████████▉                           | 25/101 [3:35:10<10:55:26, 517.46s/it]

 26%|█████████▎                          | 26/101 [3:44:00<10:51:32, 521.24s/it]

 27%|█████████▌                          | 27/101 [3:52:54<10:47:25, 524.94s/it]

 28%|█████████▉                          | 28/101 [4:01:48<10:41:59, 527.66s/it]

 29%|██████████▎                         | 29/101 [4:10:39<10:34:19, 528.60s/it]

 30%|██████████▋                         | 30/101 [4:19:31<10:26:50, 529.73s/it]

 31%|███████████                         | 31/101 [4:28:25<10:19:24, 530.92s/it]

 32%|███████████▍                        | 32/101 [4:37:20<10:12:02, 532.21s/it]

 33%|███████████▊                        | 33/101 [4:46:13<10:03:30, 532.50s/it]

 34%|████████████▍                        | 34/101 [4:55:06<9:54:42, 532.58s/it]

 35%|████████████▊                        | 35/101 [5:03:58<9:45:47, 532.54s/it]

 36%|█████████████▏                       | 36/101 [5:12:50<9:36:32, 532.19s/it]

 37%|█████████████▌                       | 37/101 [5:21:46<9:28:50, 533.29s/it]

 38%|█████████████▉                       | 38/101 [5:30:40<9:20:12, 533.54s/it]

 39%|██████████████▎                      | 39/101 [5:39:32<9:10:54, 533.13s/it]

 40%|██████████████▋                      | 40/101 [5:48:26<9:02:20, 533.45s/it]

 41%|███████████████                      | 41/101 [5:57:19<8:53:27, 533.46s/it]

 42%|███████████████▍                     | 42/101 [6:06:11<8:43:59, 532.87s/it]

 43%|███████████████▊                     | 43/101 [6:15:06<8:35:43, 533.50s/it]

 44%|████████████████                     | 44/101 [6:23:58<8:26:29, 533.15s/it]

 45%|████████████████▍                    | 45/101 [6:32:52<8:17:52, 533.43s/it]

 46%|████████████████▊                    | 46/101 [6:41:46<8:08:54, 533.36s/it]

 47%|█████████████████▏                   | 47/101 [6:50:36<7:59:15, 532.50s/it]

 48%|█████████████████▌                   | 48/101 [6:59:34<7:51:41, 534.00s/it]

 49%|█████████████████▉                   | 49/101 [7:08:27<7:42:36, 533.78s/it]

 50%|██████████████████▎                  | 50/101 [7:17:19<7:33:24, 533.42s/it]

 50%|██████████████████▋                  | 51/101 [7:26:12<7:24:25, 533.30s/it]

 51%|███████████████████                  | 52/101 [7:35:03<7:14:45, 532.36s/it]

 52%|███████████████████▍                 | 53/101 [7:43:54<7:05:39, 532.07s/it]

 53%|███████████████████▊                 | 54/101 [7:52:50<6:57:45, 533.31s/it]

 54%|████████████████████▏                | 55/101 [8:01:44<6:48:54, 533.35s/it]

 55%|████████████████████▌                | 56/101 [8:10:36<6:39:43, 532.97s/it]

 56%|████████████████████▉                | 57/101 [8:19:29<6:30:51, 532.99s/it]

 57%|█████████████████████▏               | 58/101 [8:28:21<6:21:51, 532.82s/it]

 58%|█████████████████████▌               | 59/101 [8:37:19<6:14:05, 534.41s/it]

 59%|█████████████████████▉               | 60/101 [8:46:14<6:05:09, 534.38s/it]

 60%|██████████████████████▎              | 61/101 [8:55:06<5:55:57, 533.93s/it]

 61%|██████████████████████▋              | 62/101 [9:03:59<5:46:41, 533.36s/it]

 62%|███████████████████████              | 63/101 [9:12:50<5:37:30, 532.90s/it]

 63%|███████████████████████▍             | 64/101 [9:21:47<5:29:20, 534.07s/it]

 64%|███████████████████████▊             | 65/101 [9:30:43<5:20:42, 534.51s/it]

 65%|████████████████████████▏            | 66/101 [9:39:38<5:11:59, 534.83s/it]

 66%|████████████████████████▌            | 67/101 [9:48:31<5:02:40, 534.12s/it]

 67%|████████████████████████▉            | 68/101 [9:57:22<4:53:21, 533.37s/it]

 68%|████████████████████████▌           | 69/101 [10:06:18<4:44:45, 533.93s/it]

 69%|████████████████████████▉           | 70/101 [10:15:16<4:36:33, 535.27s/it]

 70%|█████████████████████████▎          | 71/101 [10:24:11<4:27:39, 535.33s/it]

 71%|█████████████████████████▋          | 72/101 [10:33:03<4:18:13, 534.24s/it]

 72%|██████████████████████████          | 73/101 [10:41:56<4:09:09, 533.90s/it]

 73%|██████████████████████████▍         | 74/101 [10:50:50<4:00:10, 533.72s/it]

 74%|██████████████████████████▋         | 75/101 [10:59:41<3:51:02, 533.16s/it]

 75%|███████████████████████████         | 76/101 [11:08:38<3:42:36, 534.26s/it]

 76%|███████████████████████████▍        | 77/101 [11:17:35<3:34:03, 535.15s/it]

 77%|███████████████████████████▊        | 78/101 [11:26:29<3:25:00, 534.79s/it]

 78%|████████████████████████████▏       | 79/101 [11:35:22<3:15:51, 534.16s/it]

 79%|████████████████████████████▌       | 80/101 [11:44:14<3:06:43, 533.48s/it]

 80%|████████████████████████████▊       | 81/101 [11:53:11<2:58:08, 534.43s/it]

 81%|█████████████████████████████▏      | 82/101 [12:02:07<2:49:25, 535.01s/it]

 82%|█████████████████████████████▌      | 83/101 [12:11:01<2:40:23, 534.63s/it]

 83%|█████████████████████████████▉      | 84/101 [12:19:54<2:31:23, 534.35s/it]

 84%|██████████████████████████████▎     | 85/101 [12:28:47<2:22:22, 533.90s/it]

 85%|██████████████████████████████▋     | 86/101 [12:37:39<2:13:17, 533.14s/it]

 86%|███████████████████████████████     | 87/101 [12:46:37<2:04:45, 534.65s/it]

 87%|███████████████████████████████▎    | 88/101 [12:55:31<1:55:48, 534.51s/it]

 88%|███████████████████████████████▋    | 89/101 [13:04:25<1:46:51, 534.30s/it]

 89%|████████████████████████████████    | 90/101 [13:13:17<1:37:50, 533.67s/it]

 90%|████████████████████████████████▍   | 91/101 [13:22:08<1:28:48, 532.89s/it]

 91%|████████████████████████████████▊   | 92/101 [13:31:00<1:19:54, 532.73s/it]

 92%|█████████████████████████████████▏  | 93/101 [13:39:59<1:11:16, 534.52s/it]

 93%|█████████████████████████████████▌  | 94/101 [13:48:54<1:02:22, 534.63s/it]

 94%|███████████████████████████████████▋  | 95/101 [13:57:48<53:27, 534.52s/it]

 95%|████████████████████████████████████  | 96/101 [14:06:43<44:33, 534.70s/it]

 96%|████████████████████████████████████▍ | 97/101 [14:15:35<35:35, 533.89s/it]

 97%|████████████████████████████████████▊ | 98/101 [14:24:32<26:43, 534.59s/it]

 98%|█████████████████████████████████████▏| 99/101 [14:33:20<17:45, 532.80s/it]

 99%|████████████████████████████████████▋| 100/101 [14:41:56<08:47, 527.67s/it]

100%|█████████████████████████████████████| 101/101 [14:50:32<00:00, 529.03s/it]


# Restart Training

In [None]:
# base_folder = "AE_Data/Filtered_Shuffled_Compaction_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

In [None]:
def save_model(model, model_path):
    if not model_path.endswith('.keras'):
        model_path += '.keras'
    model.save(model_path)

def load_existing_model(model_path):
    return load_model(model_path)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import load_model
import numpy as np

# Define the output directory and subdirectories
# output_dir = 'Default_AE_Default_Data/AE_Training_X'
subfolders = {
    'AE_Latent_Space_Data_TXT': os.path.join(output_dir, 'AE_Latent_Space_Data_TXT'),
    'AE_Training_Loss_Data': os.path.join(output_dir, 'AE_Training_Loss_Data'),
    'AE_Training_Loss_Graphs': os.path.join(output_dir, 'AE_Training_Loss_Graphs'),
    'AE_Latent_Space_Graphs': os.path.join(output_dir, 'AE_Latent_Space_Graphs'),
    'AE_Latent_Space_Data_PKL': os.path.join(output_dir, 'AE_Latent_Space_Data_PKL'),
    'models': os.path.join(output_dir, 'models'),
}

# Create all subdirectories if they don't exist
for folder in subfolders.values():
    os.makedirs(folder, exist_ok=True)

# Load the model from the last saved checkpoint
# checkpoint_epoch_set = XXX
model_file_path = os.path.join(subfolders['models'], f"saved_model_{checkpoint_epoch_set}.keras")
autoencoder = load_model(model_file_path)

# Resume training loop starting from the specified epoch set
for counts in tqdm(range(checkpoint_epoch_set + 1, 101)):
    name = "Autoencoder_Training"

    # Save the latent space predictions log
    txt_file_path = os.path.join(subfolders['AE_Latent_Space_Data_TXT'], f"{counts}_Latent_Space_Predictions_Log.txt")
    with open(txt_file_path, "w") as file:
        # Train the model
        history = autoencoder.fit(
            X_train_loaded, X_train_loaded,
            epochs=1000,
            validation_data=(X_valid_loaded, X_valid_loaded),
            verbose=0
        )

        # Save training loss history
        training_history = pd.DataFrame(history.history)
        history_file_path = os.path.join(subfolders['AE_Training_Loss_Data'], f"{counts}_{name}_History.pkl")
        training_history.to_pickle(history_file_path)

        # Save training loss graph
        plt.plot(training_history)
        graph_file_path = os.path.join(subfolders['AE_Training_Loss_Graphs'], f"{counts}_{name}_History.png")
        plt.savefig(graph_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Define the latent space model
        dr_model = tf.keras.models.Model(
            inputs=autoencoder.input,
            outputs=autoencoder.get_layer('ae_latent').output
        )
        dr_model.summary(print_fn=lambda x: file.write(x + '\n'))

        # Initialize latent space variables
        x, y, z, indices = [], [], [], []

        # Process validation data in batches
        batch_size = 32
        for batch_start in range(0, len(X_valid_loaded), batch_size):
            batch_end = min(batch_start + batch_size, len(X_valid_loaded))
            X_batch = np.array(X_valid_loaded.iloc[batch_start:batch_end])
            y_batch = y_valid_loaded.iloc[batch_start:batch_end].values.flatten()

            # Predict latent space values
            op_batch = dr_model.predict(X_batch, verbose=0)
            for i, op in enumerate(op_batch):
                z.append(y_batch[i])
                x.append(op[0])
                y.append(op[1])
                indices.append(y_valid_loaded.index[batch_start + i])
                file.write(f"Prediction {batch_start + i}: {op}\n")

        # Save latent space scatter plot
        df = pd.DataFrame({'x': x, 'y': y, 'z': [f"trajectory-{k}" for k in z], 'index': indices})
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)
        scatter_file_path = os.path.join(subfolders['AE_Latent_Space_Graphs'], f"{counts}_Latent_Space.png")
        plt.savefig(scatter_file_path, dpi=300)
        plt.clf()
        plt.close()

        # Save latent space data as pickle
        pkl_file_path = os.path.join(subfolders['AE_Latent_Space_Data_PKL'], f"{counts}_Latent_Space.pkl")
        df.to_pickle(pkl_file_path)

        # Save the model
        model_file_path = os.path.join(subfolders['models'], f"saved_model_{counts}.keras")
        autoencoder.save(model_file_path)

# Original Training Code

# Generating Full Latent Space Representations:

In [None]:
# Concatenate training and validation datasets
X_loaded = pd.concat([X_train_loaded, X_valid_loaded], axis=0)

In [None]:
# Function to create the latent space representations and save to CSV
def generate_and_save_latent_space_representations(model_path, data, output_dir, output_filename):
    # Load the saved model
    autoencoder = load_model(model_path)

    # Create a model to get the latent space representation
    dr_model = tf.keras.models.Model(inputs=autoencoder.get_layer('ae_input').input,
                                     outputs=autoencoder.get_layer('ae_latent').output)
    
    # Generate latent space representations for the entire dataset
    latent_space_representations = dr_model.predict(data, verbose=1)

    # Create a DataFrame to store the representations with the index from the input data
    df_latent_space = pd.DataFrame(latent_space_representations, columns=['x', 'y'], index=data.index)
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Save the DataFrame to a CSV file
    output_filepath = os.path.join(output_dir, output_filename)
    df_latent_space.to_pickle(output_filepath)

    print(f"Latent space representations saved to {output_filepath}")

# Specify the path to the saved model
model_path = 'AE_Training_1/models/saved_model_20.keras'

# Specify the output directory and filename
output_dir = 'Full_LSPs_Same_Training_Data'
output_filename = 'AE1_ES_20_LSR_Full'

# Generate and save the latent space representations
generate_and_save_latent_space_representations(model_path, X_loaded, output_dir, output_filename)

In [None]:
# Function to create the latent space representations and save to CSV
def generate_and_save_latent_space_representations(model_path, data, output_dir, output_filename):
    # Load the saved model
    autoencoder = load_model(model_path)

    # Create a model to get the latent space representation
    dr_model = tf.keras.models.Model(inputs=autoencoder.get_layer('ae_input').input,
                                     outputs=autoencoder.get_layer('ae_latent').output)
    
    # Generate latent space representations for the entire dataset
    latent_space_representations = dr_model.predict(data, verbose=1)

    # Create a DataFrame to store the representations with the index from the input data
    df_latent_space = pd.DataFrame(latent_space_representations, columns=['x', 'y'], index=data.index)
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Save the DataFrame to a CSV file
    output_filepath = os.path.join(output_dir, output_filename)
    df_latent_space.to_pickle(output_filepath)

    print(f"Latent space representations saved to {output_filepath}")

# Specify the path to the saved model
model_path = 'AE_Training_2/models/saved_model_20.keras'

# Specify the output directory and filename
output_dir = 'Full_LSPs_Same_Training_Data'
output_filename = 'AE2_ES_20_LSR_Full'

# Generate and save the latent space representations
generate_and_save_latent_space_representations(model_path, X_loaded, output_dir, output_filename)

In [None]:
# Function to create the latent space representations and save to CSV
def generate_and_save_latent_space_representations(model_path, data, output_dir, output_filename):
    # Load the saved model
    autoencoder = load_model(model_path)

    # Create a model to get the latent space representation
    dr_model = tf.keras.models.Model(inputs=autoencoder.get_layer('ae_input').input,
                                     outputs=autoencoder.get_layer('ae_latent').output)
    
    # Generate latent space representations for the entire dataset
    latent_space_representations = dr_model.predict(data, verbose=1)

    # Create a DataFrame to store the representations with the index from the input data
    df_latent_space = pd.DataFrame(latent_space_representations, columns=['x', 'y'], index=data.index)
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Save the DataFrame to a CSV file
    output_filepath = os.path.join(output_dir, output_filename)
    df_latent_space.to_pickle(output_filepath)

    print(f"Latent space representations saved to {output_filepath}")

# Specify the path to the saved model
model_path = 'AE_Training_3/models/saved_model_20.keras'

# Specify the output directory and filename
output_dir = 'Full_LSPs_Same_Training_Data'
output_filename = 'AE3_ES_20_LSR_Full'

# Generate and save the latent space representations
generate_and_save_latent_space_representations(model_path, X_loaded, output_dir, output_filename)

In [None]:
# Function to create the latent space representations and save to CSV
def generate_and_save_latent_space_representations(model_path, data, output_dir, output_filename):
    # Load the saved model
    autoencoder = load_model(model_path)

    # Create a model to get the latent space representation
    dr_model = tf.keras.models.Model(inputs=autoencoder.get_layer('ae_input').input,
                                     outputs=autoencoder.get_layer('ae_latent').output)
    
    # Generate latent space representations for the entire dataset
    latent_space_representations = dr_model.predict(data, verbose=1)

    # Create a DataFrame to store the representations with the index from the input data
    df_latent_space = pd.DataFrame(latent_space_representations, columns=['x', 'y'], index=data.index)
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Save the DataFrame to a CSV file
    output_filepath = os.path.join(output_dir, output_filename)
    df_latent_space.to_pickle(output_filepath)

    print(f"Latent space representations saved to {output_filepath}")

# Specify the path to the saved model
model_path = 'AE_Training_4/models/saved_model_20.keras'

# Specify the output directory and filename
output_dir = 'Full_LSPs_Same_Training_Data'
output_filename = 'AE4_ES_20_LSR_Full'

# Generate and save the latent space representations
generate_and_save_latent_space_representations(model_path, X_loaded, output_dir, output_filename)

# Load Latent Space Data For Clustering

In [None]:
import os
import gc
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from matplotlib.image import imread
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, fcluster

import MDAnalysis as mda
from itertools import combinations
from mpl_toolkits.mplot3d import Axes3D
from MDAnalysis.analysis import align, rms, distances

# Load the topology and trajectory files for both MD simulations
u1 = mda.Universe("myc_091-160_stripped.prmtop", "aMD_091-160_no1-4_stripped.nc")
u2 = mda.Universe("myc_091-160_D132-H_stripped.prmtop", "aMD_091-160_D132-H_no1-4_stripped.nc")

In [None]:
# Latent Space 1
folder_name_1 = 'Full_LSPs'
file_name_1 = 'AE_Fold_1_ES_6.csv'
latent_1 = pd.read_csv(f'{folder_name_1}/{file_name_1}')

# Latent Space 2
folder_name_2 = 'Full_LSPs'
file_name_2 = 'AE_Fold_2_ES_6.csv'
latent_2 = pd.read_csv(f'{folder_name_2}/{file_name_2}')

# Latent Space 3
folder_name_3 = 'Full_LSPs'
file_name_3 = 'AE_Fold_3_ES_6.csv'
latent_3 = pd.read_csv(f'{folder_name_3}/{file_name_3}')

# Latent Space 4
folder_name_4 = 'Full_LSPs'
file_name_4 = 'AE_Fold_4_ES_6.csv'
latent_4 = pd.read_csv(f'{folder_name_4}/{file_name_4}')

# Latent Space 5
folder_name_5 = 'Full_LSPs'
file_name_5 = 'AE_Fold_5_ES_6.csv'
latent_5 = pd.read_csv(f'{folder_name_5}/{file_name_5}')

# Clustering (Batched)

In [None]:
threshold_val = 1.0
length_val = 1

# Function to perform clustering in batches
def batch_clustering(data, batch_size, threshold):
    clusters = []
    start = 0
    while start < data.shape[0]:
        end = min(start + batch_size, data.shape[0])
        batch = data[start:end]
        linkage_matrix = linkage(batch, method='ward')
        batch_clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')
        clusters.extend(batch_clusters)
        start = end
    return np.array(clusters)

# Define a function to process and cluster each latent space in batches
def process_and_cluster_in_batches(latent_df, scaler, batch_size, threshold=threshold_val):
    coords = latent_df[['x', 'y']].values
    coords_scaled = scaler.fit_transform(coords)
    clusters = batch_clustering(coords_scaled, batch_size, threshold)
    return clusters

# Initialize the scaler
scaler = StandardScaler()

# List of latent DataFrames
latent_dfs = [latent_1, latent_2, latent_3, latent_4, latent_5]

# Dictionary to store clusters
clusters_dict = {}

# Define batch size and threshold
batch_size = 20000
threshold = threshold_val

# Process and cluster each latent DataFrame in batches
for i, latent_df in enumerate(latent_dfs, start=1):
    clusters_dict[f'clusters{i}'] = process_and_cluster_in_batches(latent_df, scaler, batch_size, threshold)

# Combine the cluster assignments from all latent spaces
combined_clusters = np.vstack(list(clusters_dict.values())).T

# Find consensus clusters
consensus_clusters = {}
for i, cluster_tuple in enumerate(combined_clusters):
    cluster_tuple = tuple(cluster_tuple)
    if cluster_tuple not in consensus_clusters:
        consensus_clusters[cluster_tuple] = []
    consensus_clusters[cluster_tuple].append(int(latent_1.iloc[i, 0]))

# Filter out singleton clusters (clusters with only one member)
consensus_clusters = {k: v for k, v in consensus_clusters.items() if len(v) > length_val}

# Calculate cluster sizes for plotting
cluster_sizes = {k: len(v) for k, v in consensus_clusters.items()}

# Visualization function
def plot_clusters(latent, clusters, cluster_sizes, title):
    cluster_count = pd.Series(clusters).value_counts()
    sizes = cluster_count.loc[clusters].values
    
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(latent['x'], latent['y'], c=clusters, s=sizes, cmap='viridis', marker='o', alpha=0.6)
    plt.title(title)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.colorbar(scatter, label='Cluster')
    plt.show()

# Add cluster labels to the DataFrames for plotting and plot each
for i, latent_df in enumerate(latent_dfs, start=1):
    latent_df['cluster'] = clusters_dict[f'clusters{i}']
    plot_clusters(latent_df, latent_df['cluster'], cluster_sizes, f'Clusters in Latent Space {i}')

In [None]:
# Group the DataFrame by 'cluster' and print the indices sorted by the cluster
clustered_indices = latent_1.groupby('cluster', group_keys=False).apply(lambda df: df.index.tolist(), include_groups=False)

# Print the indices sorted by the cluster
for cluster_id, indices in clustered_indices.items():
    print(f"Cluster {cluster_id}: {indices}")

In [None]:
# Group the DataFrame by 'cluster' and count the number of indices in each cluster
cluster_counts = latent_1['cluster'].value_counts().sort_values()

# Print the clusters and their counts sorted by the number of indices in ascending order
for cluster_id, count in cluster_counts.items():
    print(f"Cluster {cluster_id}: {count}")

# Generate Cluster Statistics

In [None]:
import os
import pandas as pd
import numpy as np
import MDAnalysis as mda
from MDAnalysis.analysis import align
from tqdm import tqdm

threshold_val = 1.0
length_val = 1
minimum_indices = 1

# Create the directory to save the files
output_dir = f"N4_Threshold_{threshold_val}_Length_{length_val}"
os.makedirs(output_dir, exist_ok=True)

# Function to extract frames for residues 15-30
def extract_frames(indices, universe1, universe2, switch_index=40000, selection="resid 15-30 and name CA"):
    frames = []
    for idx in indices:
        universe = universe1 if idx < switch_index else universe2
        frame_idx = idx if idx < switch_index else idx - switch_index
        universe.trajectory[frame_idx]
        frames.append(universe.select_atoms(selection).positions.copy())
    return frames

# Function to create a temporary universe
def create_temp_universe(positions, universe, selection="resid 15-30 and name CA"):
    temp_universe = mda.Merge(universe.select_atoms(selection))
    temp_universe.atoms.positions = positions
    return temp_universe

# Function to align frames and calculate normalized RMSD
def align_and_calculate_rmsd(reference_universe, frames, select_for_alignment):
    rmsd_list = []
    for frame_positions in frames:
        temp_universe = create_temp_universe(frame_positions, reference_universe, selection=select_for_alignment)
        aligner = align.AlignTraj(temp_universe, reference_universe, select=select_for_alignment, in_memory=True)
        aligner.run()
        rmsd = align.rms.rmsd(temp_universe.atoms.positions, reference_universe.atoms.positions)
        rmsd_normalized = rmsd / np.linalg.norm(reference_universe.atoms.positions)
        rmsd_list.append(np.mean(rmsd_normalized))
    return np.mean(rmsd_list)

# Define selection string for alignment
select_residues_15_30 = "resid 15-30 and name CA"

# Prepare the alignment CSV file
alignment_file = os.path.join(output_dir, "Latent_Alignment.csv")
alignment_cols = ['index', 'x', 'y', 'cluster', 'reference_frame']

# Initialize the reference frames dictionary
reference_frames = {}

# Check if the file exists and load existing data
if os.path.exists(alignment_file):
    existing_data = pd.read_csv(alignment_file)
    last_cluster_id = existing_data['cluster'].max()
else:
    with open(alignment_file, 'w') as f:
        f.write(','.join(alignment_cols) + '\n')
    existing_data = pd.DataFrame(columns=alignment_cols)
    last_cluster_id = -1

# Calculate reference frames and update the alignment CSV file incrementally
for cluster_id, cluster_data in tqdm(latent_1.groupby('cluster'), desc="Analyzing clusters", unit="cluster"):
    if cluster_id <= last_cluster_id:
        continue

    indices = cluster_data.index.tolist()
    if len(indices) < minimum_indices:
        continue 

    frames = extract_frames(indices, u1, u2, selection=select_residues_15_30)
    
    # Determine the reference frame that minimizes the RMSD
    min_rmsd = float('inf')
    reference_frame_idx = -1
    for i, ref_frame_positions in enumerate(frames):
        reference_universe = create_temp_universe(ref_frame_positions, u1, selection=select_residues_15_30)
        rmsd = align_and_calculate_rmsd(reference_universe, frames, select_residues_15_30)
        if rmsd < min_rmsd:
            min_rmsd = rmsd
            reference_frame_idx = indices[i]
    
    reference_frames[cluster_id] = reference_frame_idx
    
    # Update the alignment data
    cluster_data['reference_frame'] = reference_frame_idx
    cluster_data['index'] = cluster_data.index
    
    # Append the alignment data to the CSV file
    cluster_data[alignment_cols].to_csv(alignment_file, mode='a', header=False, index=False)

# Perform statistical analysis on the clusters
latent_1_sorted = latent_1.sort_index()
cluster_durations = []
current_cluster = None
current_start_index = None

# Iterate through the sorted DataFrame to find cluster durations
for idx, row in latent_1_sorted.iterrows():
    cluster = row['cluster']
    if cluster == current_cluster:
        current_end_index = idx
    else:
        if current_cluster is not None:
            duration = current_end_index - current_start_index + 1
            cluster_durations.append((current_cluster, duration))
        current_cluster = cluster
        current_start_index = idx
        current_end_index = idx

# Append the last cluster duration
if current_cluster is not None:
    duration = current_end_index - current_start_index + 1
    cluster_durations.append((current_cluster, duration))

# Convert durations to DataFrame
durations_df = pd.DataFrame(cluster_durations, columns=['Cluster', 'Duration'])

# Group by cluster and calculate summary metrics
summary_df = durations_df.groupby('Cluster').agg(
    Entries=('Cluster', 'size'),
    Tot_Dur=('Duration', 'sum'),
    Max_Dur=('Duration', 'max'),
    Av_Dur=('Duration', 'mean')
).reset_index().sort_values(by='Cluster')

# Save the summary DataFrame to CSV
statistics_file = os.path.join(output_dir, "Cluster_Statistics.csv")
summary_df.to_csv(statistics_file, index=False)

# Display the summary DataFrame
pd.set_option('display.max_rows', None)
print(summary_df)
pd.reset_option('display.max_rows')

In [None]:
import os
import numpy as np
import pandas as pd
import MDAnalysis as mda
from MDAnalysis.analysis import align, rms, distances
import gc
from tqdm import tqdm

threshold_val = 1.0
length_val = 1
minimum_indices = 1

# Define the output directories
output_dir = f"N4_Threshold_{threshold_val}_Length_{length_val}"
expanded_csv_path = os.path.join(output_dir, "Cluster_Statistics_Expanded.csv")

# Load the CSV files
latent_1_alignment = pd.read_csv(os.path.join(output_dir, "Latent_Alignment.csv"))
latent_1_statistics = pd.read_csv(os.path.join(output_dir, "Cluster_Statistics.csv"))

# Load the MD simulation data
u1 = mda.Universe("myc_091-160_stripped.prmtop", "aMD_091-160_no1-4_stripped.nc")
u2 = mda.Universe("myc_091-160_D132-H_stripped.prmtop", "aMD_091-160_D132-H_no1-4_stripped.nc")

# Define the function to extract frames from both universes
def extract_frames(indices, universe1, universe2, switch_index=40000):
    frames = []
    for idx in indices:
        if idx < switch_index:
            universe1.trajectory[idx]
            frames.append(universe1.select_atoms("name CA").positions.copy())
        else:
            universe2.trajectory[idx - switch_index]
            frames.append(universe2.select_atoms("name CA").positions.copy())
    return frames

# Create a temporary Universe for alignment
def create_temp_universe(positions, universe):
    temp_universe = mda.Merge(universe.select_atoms("name CA"))
    temp_universe.atoms.positions = positions
    return temp_universe

# Align frames using AlignTraj
def align_frames(reference_universe, frames, select_for_alignment):
    aligned_frames = []
    for frame_positions in frames:
        temp_universe = create_temp_universe(frame_positions, reference_universe)
        aligner = align.AlignTraj(temp_universe, reference_universe, select=select_for_alignment, in_memory=False)
        aligner.run()
        aligned_frames.append(temp_universe.atoms.positions.copy())
        del temp_universe  # Explicitly delete to free memory
        gc.collect()
    return aligned_frames

# Calculate RMSD for aligned frames
def calculate_rmsd(reference_positions, frames):
    rmsd_values = [rms.rmsd(frame, reference_positions) for frame in frames]
    return np.mean(rmsd_values), np.std(rmsd_values)

# Measure distances between residues
def calculate_distances(universe, res1, res2):
    distance_values = distances.distance_array(
        universe.select_atoms(f"resid {res1} and name CA").positions,
        universe.select_atoms(f"resid {res2} and name CA").positions
    )
    mean_distance = np.mean(distance_values)
    return mean_distance

# Selection string for residues 15-30 to ensure alignment of corresponding residues
select_residues_15_30 = "resid 15-30 and name CA"

# Create a function to calculate cluster statistics
def calculate_cluster_statistics(cluster_id, indices, reference_idx):
    # Extract frames for the cluster
    frames = extract_frames(indices, u1, u2)
    
    # Extract the reference frame for alignment
    if reference_idx < 40000:
        u1.trajectory[reference_idx]
        reference_positions = u1.select_atoms("name CA").positions.copy()
    else:
        u2.trajectory[reference_idx - 40000]
        reference_positions = u2.select_atoms("name CA").positions.copy()
    
    reference_universe = create_temp_universe(reference_positions, u1)
    
    # Align frames using the specific residues 15-30
    aligned_frames = align_frames(reference_universe, frames, select_residues_15_30)
    
    # Calculate RMSD
    mean_rmsd, std_rmsd = calculate_rmsd(reference_positions, aligned_frames)
    
    # Measure distances
    mean_dist_15_22 = calculate_distances(reference_universe, 15, 22)
    mean_dist_22_30 = calculate_distances(reference_universe, 22, 30)
    mean_dist_15_30 = calculate_distances(reference_universe, 15, 30)
    
    # Save the statistics to a list for the new DataFrame
    cluster_stats = {
        "Cluster": cluster_id,
        "Frames": len(indices),
        "Mean_RMSD": mean_rmsd,
        "Std_RMSD": std_rmsd,
        "Mean_Dist_15_22": mean_dist_15_22,
        "Mean_Dist_22_30": mean_dist_22_30,
        "Mean_Dist_15_30": mean_dist_15_30,
    }
    
    return cluster_stats

# Function to load existing cluster statistics if the CSV file exists
def load_existing_stats(file_path):
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
    else:
        return pd.DataFrame()

# Load existing statistics
existing_stats_df = load_existing_stats(expanded_csv_path)
processed_clusters = set(existing_stats_df['Cluster']) if not existing_stats_df.empty else set()

# Group the DataFrame by 'cluster' and get the indices for each cluster
cluster_indices = latent_1_alignment.groupby('cluster').apply(lambda df: df['index'].tolist()).to_dict()

# Get the best reference frames for each cluster
reference_frames = latent_1_alignment.groupby('cluster')['reference_frame'].first().to_dict()

# Sort the summary DataFrame by cluster number
sorted_summary_df = latent_1_statistics.sort_values(by='Cluster')

# Calculate statistics for each cluster in order of cluster number with a simple progress bar
for cluster_id in tqdm(sorted_summary_df['Cluster'], desc="Processing clusters", unit="cluster"):
    if cluster_id in processed_clusters:
        continue
    indices = cluster_indices[cluster_id]
    reference_idx = reference_frames[cluster_id]
    cluster_stats = calculate_cluster_statistics(cluster_id, indices, reference_idx)
    
    # Convert the dictionary to a DataFrame and concatenate it with the existing DataFrame
    cluster_stats_df = pd.DataFrame([cluster_stats])
    existing_stats_df = pd.concat([existing_stats_df, cluster_stats_df], ignore_index=True)
    existing_stats_df.to_csv(expanded_csv_path, index=False)

print("Processing complete. Cluster statistics have been updated.")

# Generate Figures:

In [None]:
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import MDAnalysis as mda
import pandas as pd
from MDAnalysis.analysis import align

threshold_val = 1.0
length_val = 1
minimum_indices = 1

# Define the output directories
output_dir = f"N4_Threshold_{threshold_val}_Length_{length_val}"

# Load the CSV files
latent_1_alignment = pd.read_csv(os.path.join(output_dir, "Latent_Alignment.csv"))
latent_1_statistics = pd.read_csv(os.path.join(output_dir, "Cluster_Statistics.csv"))

# Load the MD simulation data
u1 = mda.Universe("myc_091-160_stripped.prmtop", "aMD_091-160_no1-4_stripped.nc")
u2 = mda.Universe("myc_091-160_D132-H_stripped.prmtop", "aMD_091-160_D132-H_no1-4_stripped.nc")

# Define the function to extract frames from both universes
def extract_frames(indices, universe1, universe2, switch_index=40000):
    frames = []
    for idx in indices:
        if idx < switch_index:
            universe1.trajectory[idx]
            frames.append(universe1.select_atoms("name CA").positions.copy())
        else:
            universe2.trajectory[idx - switch_index]
            frames.append(universe2.select_atoms("name CA").positions.copy())
    return frames

# Create a temporary Universe for alignment
def create_temp_universe(positions, universe):
    temp_universe = mda.Merge(universe.select_atoms("name CA"))
    temp_universe.atoms.positions = positions
    return temp_universe

# Align frames using AlignTraj
def align_frames(reference_universe, frames, select_for_alignment):
    aligned_frames = []
    for frame_positions in frames:
        temp_universe = create_temp_universe(frame_positions, reference_universe)
        aligner = align.AlignTraj(temp_universe, reference_universe, select=select_for_alignment, in_memory=False)
        aligner.run()
        aligned_frames.append(temp_universe.atoms.positions.copy())
    return aligned_frames

# Selection string for residues 15-30 to ensure alignment of corresponding residues
select_residues_15_30 = "resid 15-30 and name CA"

# Function to plot aligned frames for a given cluster and save the figures
def plot_aligned_frames(cluster_id, indices, best_reference_idx, save_dir):
    # Define the filename
    filename = f"Cluster_{cluster_id}_N_{len(indices)}.png"
    filepath = os.path.join(save_dir, filename)
    
    # Check if the file already exists
    if (os.path.exists(filepath)):
        print(f"Skipping cluster {cluster_id}, file already exists.")
        return
    
    # Extract frames for the cluster
    frames = extract_frames(indices, u1, u2)
    
    # Find the position of best_reference_idx within the cluster's indices
    reference_idx_in_cluster = indices.index(best_reference_idx)
    
    # Create reference universe for alignment
    reference_universe = create_temp_universe(frames[reference_idx_in_cluster], u1)
    
    # Align frames using the specific residues 15-30
    aligned_frames = align_frames(reference_universe, frames, select_residues_15_30)
    
    # Set the opacity value with a minimum threshold
    opacity = max(1 / len(aligned_frames), 0.002)
    
    # Plot the aligned frames from different angles
    angles = [(0, 0), (90, 0), (0, 90), (90, 90)]
    
    fig, axs = plt.subplots(2, 2, subplot_kw={'projection': '3d'}, figsize=(15, 15))
    
    for ax, angle in zip(axs.flatten(), angles):
        # Plot cluster frames with lines connecting the alpha carbons
        for positions in aligned_frames:
            for i in range(len(positions) - 1):
                if i == 14:
                    color = 'red'  # alpha carbon 105
                elif i == 29:
                    color = 'green'  # alpha carbon 120
                else:
                    color = 'grey'
                ax.plot(positions[i:i+2, 0], positions[i:i+2, 1], positions[i:i+2, 2], 
                        'o-', color=color, markersize=1, markeredgewidth=0.8, alpha=opacity)
        
        ax.view_init(elev=angle[0], azim=angle[1])
        ax.set_axis_off() 
        ax.grid(False)
    
    # Save the figure to the specified directory with the appropriate filename
    plt.savefig(filepath, bbox_inches='tight', pad_inches=0, dpi=300)
    plt.close(fig)

# Create the directory to save the figures if it doesn't exist
save_dir = os.path.join(output_dir, f"Cluster_N4_Full_Threshold_{threshold_val}_All_Angles")
os.makedirs(save_dir, exist_ok=True)

# Group the DataFrame by 'cluster' and get the indices for each cluster
cluster_indices = latent_1_alignment.groupby('cluster').apply(lambda df: df['index'].tolist()).to_dict()

# Sort the summary DataFrame by cluster number
sorted_summary_df = latent_1_statistics.sort_values(by='Cluster')

# Plot aligned frames for each cluster in order of cluster number with a simple progress bar
for cluster_id in tqdm(sorted_summary_df['Cluster'], desc="Processing clusters", unit="cluster"):
    indices = cluster_indices[cluster_id]
    best_reference_idx = latent_1_alignment.loc[latent_1_alignment['cluster'] == cluster_id, 'reference_frame'].values[0]
    plot_aligned_frames(cluster_id, indices, best_reference_idx, save_dir)

# Display Figures:

In [None]:
# Load the CSV files
threshold_val = 1.0
length_val = 1
minimum_indices = 1

cluster_stats_path = f"N4_Threshold_{threshold_val}_Length_{length_val}/Cluster_Statistics.csv"
expanded_stats_path = f"N4_Threshold_{threshold_val}_Length_{length_val}/Cluster_Statistics_Expanded.csv"
latent_alignment_path = f"N4_Threshold_{threshold_val}_Length_{length_val}/Latent_Alignment.csv"

cluster_stats_df = pd.read_csv(cluster_stats_path)
expanded_stats_df = pd.read_csv(expanded_stats_path)
latent_alignment_df = pd.read_csv(latent_alignment_path)

# Filter clusters with at least 20 frames
filtered_clusters_df = cluster_stats_df[cluster_stats_df['Tot_Dur'] >= 20]

# Merge the filtered clusters with the expanded statistics
merged_df = filtered_clusters_df.merge(expanded_stats_df, on='Cluster')

# Calculate the average values for distance measurements
mean_std_rmsd_avg = merged_df['Std_RMSD'].mean()
mean_dist_15_22_avg = merged_df['Mean_Dist_15_22'].mean()
mean_dist_22_30_avg = merged_df['Mean_Dist_22_30'].mean()
mean_dist_15_30_avg = merged_df['Mean_Dist_15_30'].mean()

# Define the scoring function
def calculate_score(row, weights):
    score = weights['RMSD'] * row['Mean_RMSD']
    score += weights['Std_RMSD_w'] * (row['Std_RMSD'] - mean_std_rmsd_avg)
    score += weights['Dist_15_22'] * (row['Mean_Dist_15_22'] - mean_dist_15_22_avg)
    score += weights['Dist_22_30'] * (row['Mean_Dist_22_30'] - mean_dist_22_30_avg)
    score += weights['Dist_15_30'] * (row['Mean_Dist_15_30'] - mean_dist_15_30_avg)
    return score

# Set weights
weights = {
    'RMSD': 1.0,
    'Std_RMSD_w': 0.5, 
    'Dist_15_22': 0.5,  
    'Dist_22_30': 0.5, 
    'Dist_15_30': 0.5,  
}

# Calculate scores and ranks
tqdm.pandas(desc="Calculating scores")
merged_df['Score'] = merged_df.progress_apply(calculate_score, axis=1, weights=weights)
merged_df['Rank'] = merged_df['Score'].rank(ascending=True)

# Save the results to a CSV file
output_csv_path = f"N4_Threshold_{threshold_val}_Length_{length_val}/Cluster_Rankings.csv"
merged_df.to_csv(output_csv_path, index=False)

# Display the figures
image_folder = f"N4_Threshold_{threshold_val}_Length_{length_val}/Cluster_N4_Full_Threshold_{threshold_val}_All_Angles"

# Sort by rank to display in order
merged_df = merged_df.sort_values(by='Rank', ascending=True)

for _, row in merged_df.iterrows():
    cluster_number = row['Cluster']
    num_frames = row['Tot_Dur']
    rank = row['Rank']
    score = row['Score']
    
    mean_rmsd = row['Mean_RMSD']
    std_rmsd = row['Std_RMSD']
    mean_dist_15_22 = row['Mean_Dist_15_22']
    mean_dist_22_30 = row['Mean_Dist_22_30']
    mean_dist_15_30 = row['Mean_Dist_15_30']
    
    weighted_std_rmsd = weights['Std_RMSD_w'] * (std_rmsd - mean_std_rmsd_avg)
    weighted_dist_15_22 = weights['Dist_15_22'] * (mean_dist_15_22 - mean_dist_15_22_avg)
    weighted_dist_22_30 = weights['Dist_22_30'] * (mean_dist_22_30 - mean_dist_22_30_avg)
    weighted_dist_15_30 = weights['Dist_15_30'] * (mean_dist_15_30 - mean_dist_15_30_avg)
    
    image_path = os.path.join(image_folder, f"Cluster_{int(cluster_number)}_N_{int(num_frames)}.png")
    if os.path.exists(image_path):
        img = plt.imread(image_path)
        plt.figure(figsize=(10, 10))
        plt.imshow(img)
        plt.title(f"Cluster: {cluster_number}, Frames: {num_frames}, Rank: {rank:.2f}, Score: {score:.2f}\n"
                  f"RMSD: {mean_rmsd:.2f}, Weighted Std RMSD: {weighted_std_rmsd:.2f}, "
                  f"Weighted Dist 15-22: {weighted_dist_15_22:.2f}, "
                  f"Weighted Dist 22-30: {weighted_dist_22_30:.2f}, "
                  f"Weighted Dist 15-30: {weighted_dist_15_30:.2f}")
        plt.axis('off')
        plt.show()
    else:
        print(f"Image for Cluster {cluster_number} not found.")