### Data Preparation:
- Unfiltered, Shuffled Distance Measurement Data
- Unfiltered, Shuffled Angle Measurement Data
- Unfiltered, Shuffled RMSD Measurement Data
- Unfiltered, UnShuffled Distance Measurement Data
- Filtered, Shuffled Distance Measurement Data
- Unfiltered, Shuffled Angle and Distance Measurement Data

# Unfiltered Distance Measurement Data

In [None]:
import glob
import pandas as pd
import os
import re

# Window sizes to load: 2 through 51 inclusive (the upper bound of range() is exclusive).
# import_lcc_data() below reads this module-level name when it is called.
window_range = range(2, 52)

def import_lcc_data(lccdata_folder, window_sizes=None, filename_template="WT_Simulation_WS_{window_size}.csv"):
    """
    Imports Local Compaction data files (.csv) and assigns them to a dictionary.

    Parameters:
    - lccdata_folder: The folder where Local Compaction data files are stored.
    - window_sizes: Iterable of window sizes to load. When omitted, the
      module-level ``window_range`` is used, so existing calls behave as before.
    - filename_template: File name pattern inside lccdata_folder;
      ``{window_size}`` is replaced by each window size.

    Returns:
    - A dictionary with window sizes as keys and pandas DataFrames as values.
      Window sizes whose file does not exist are reported with a warning and
      left out of the dictionary.
    """
    if window_sizes is None:
        # Looked up at call time: whatever window_range holds right now.
        window_sizes = window_range

    data_dict = {}

    for window_size in window_sizes:
        file_path = os.path.join(lccdata_folder, filename_template.format(window_size=window_size))
        if not os.path.exists(file_path):
            print(f"Warning: File not found for Window Size {window_size}: {file_path}")
            continue

        # The first CSV column holds the row labels; drop its header name.
        df = pd.read_csv(file_path, index_col=0)
        df.index.name = None
        data_dict[window_size] = df

    return data_dict


# Folder containing the unfiltered Local Compaction CSV files (path is relative
# to the current working directory)
lccdata_folder = 'Local_Compaction/Local_Compaction_Data'

# Import LCC data for the wild-type protein: dict of window size -> DataFrame.
# Window sizes without a file are only warned about, so wt_dict may hold fewer entries.
wt_dict = import_lcc_data(lccdata_folder)

# Unfiltered, Shuffled Distance Measurement Data

In [None]:
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder):
    """
    Write the four splits of one cross-validation fold to CSV.

    The files are placed in ``<base_folder>/Training_Set_<fold>/`` (created
    if missing) under the names ``X_train_f.csv``, ``y_train_f.csv``,
    ``X_valid_f.csv`` and ``y_valid_f.csv``.
    """
    fold_dir = os.path.join(base_folder, f'Training_Set_{fold}')
    os.makedirs(fold_dir, exist_ok=True)

    # (file name, frame) pairs, written in this order.
    outputs = (
        ('X_train_f.csv', X_train),
        ('y_train_f.csv', y_train),
        ('X_valid_f.csv', X_valid),
        ('y_valid_f.csv', y_valid),
    )
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(fold_dir, file_name))

def load_datasets(fold, base_folder):
    """
    Read back the four CSV splits of one cross-validation fold.

    Files are looked up in ``<base_folder>/Training_Set_<fold>/``; the first
    CSV column becomes the index of each DataFrame.

    Returns:
    - (X_train, y_train, X_valid, y_valid) as pandas DataFrames.
    """
    fold_dir = os.path.join(base_folder, f'Training_Set_{fold}')
    file_names = ('X_train_f.csv', 'y_train_f.csv', 'X_valid_f.csv', 'y_valid_f.csv')

    X_train, y_train, X_valid, y_valid = (
        pd.read_csv(os.path.join(fold_dir, file_name), index_col=0)
        for file_name in file_names
    )
    return X_train, y_train, X_valid, y_valid

def normalize_to_range_0_1(data):
    """
    Min-max scale data to the range [0, 1] and round to 6 decimal places.

    For a DataFrame every column is scaled with its own minimum and maximum.
    A constant column (max == min) would be divided by zero and become NaN;
    such columns are mapped to 0.0 instead.

    Parameters:
    - data: pandas DataFrame or Series of numeric values.

    Returns:
    - Object of the same shape as data, scaled and rounded to 6 decimals.
    """
    lowest = data.min()
    span = data.max() - lowest
    if isinstance(span, pd.Series):
        # DataFrame input: one span per column; replace only the zero spans.
        span = span.where(span != 0, 1)
    elif span == 0:
        # Series input: the span is a single scalar.
        span = 1
    return ((data - lowest) / span).round(6)

def preprocessing_kfold(wt_data, n_splits=5, base_folder="AE_Data/Unfiltered_Shuffled_Compaction_Data"):
    """
    Split the data into K shuffled folds, min-max normalize each split, and save it.

    Each fold is written by save_datasets() to
    ``<base_folder>/Training_Set_<fold>/`` (folds are numbered from 1). A
    fold is skipped only when all four of its CSV files already exist, so a
    fold left incomplete by an interrupted run is regenerated.

    Parameters:
    - wt_data: DataFrame whose rows are split into folds; cells that cannot
      be parsed as numbers become NaN.
    - n_splits: Number of cross-validation folds.
    - base_folder: Root output folder (created if missing).

    Returns:
    - None. All results are written to disk.
    """
    expected_files = ('X_train_f.csv', 'y_train_f.csv', 'X_valid_f.csv', 'y_valid_f.csv')

    # Ensure all data is numeric
    wt_data = wt_data.apply(pd.to_numeric, errors='coerce')

    # Shuffle the DataFrame rows without resetting the index
    wt_data = wt_data.sample(frac=1, random_state=42)

    os.makedirs(base_folder, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold, (train_index, valid_index) in enumerate(kf.split(wt_data), start=1):
        folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
        if all(os.path.exists(os.path.join(folder_name, name)) for name in expected_files):
            print(f"Data for fold {fold} already exists, skipping generation.")
            continue

        X_train, X_valid = wt_data.iloc[train_index], wt_data.iloc[valid_index]
        # Every row receives the same label 0
        y_train = pd.DataFrame({'class': [0] * len(X_train)}, index=X_train.index)
        y_valid = pd.DataFrame({'class': [0] * len(X_valid)}, index=X_valid.index)

        # Normalize training and validation data to [0, 1] and round to 6 decimal places.
        # NOTE(review): each split is scaled with its own min/max, so the same raw
        # value can map to different normalized values in the two files — confirm
        # this is intended.
        X_train = normalize_to_range_0_1(X_train)
        X_valid = normalize_to_range_0_1(X_valid)

        print(f"Fold {fold}:")
        print(f"Training set shape: {X_train.shape}")
        print(f"Validation set shape: {X_valid.shape}")

        save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder)

In [None]:
# Window sizes to be used (not referenced again in this cell)
window_sizes = list(range(2, 52))

# Output folder for this dataset. It is passed explicitly below because
# preprocessing_kfold is redefined in later cells with a different default
# base_folder; relying on the default would make the output location depend
# on which cell was executed last.
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Data"

# Concatenate all DataFrames in wt_dict into a single DataFrame
wt_combined = pd.concat(wt_dict.values(), axis=1)

# Preprocess and save data
preprocessing_kfold(wt_combined, n_splits=5, base_folder=base_folder)

# Example of loading a specific fold (e.g., fold 1)
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

# Unfiltered, UnShuffled Distance Measurement Data

In [None]:
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder):
    """
    Write the four splits of one cross-validation fold to CSV.

    The files are placed in ``<base_folder>/Training_Set_<fold>/`` (created
    if missing) under the names ``X_train_f.csv``, ``y_train_f.csv``,
    ``X_valid_f.csv`` and ``y_valid_f.csv``.
    """
    fold_dir = os.path.join(base_folder, f'Training_Set_{fold}')
    os.makedirs(fold_dir, exist_ok=True)

    # (file name, frame) pairs, written in this order.
    outputs = (
        ('X_train_f.csv', X_train),
        ('y_train_f.csv', y_train),
        ('X_valid_f.csv', X_valid),
        ('y_valid_f.csv', y_valid),
    )
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(fold_dir, file_name))

def load_datasets(fold, base_folder):
    """
    Read back the four CSV splits of one cross-validation fold.

    Files are looked up in ``<base_folder>/Training_Set_<fold>/``; the first
    CSV column becomes the index of each DataFrame.

    Returns:
    - (X_train, y_train, X_valid, y_valid) as pandas DataFrames.
    """
    fold_dir = os.path.join(base_folder, f'Training_Set_{fold}')
    file_names = ('X_train_f.csv', 'y_train_f.csv', 'X_valid_f.csv', 'y_valid_f.csv')

    X_train, y_train, X_valid, y_valid = (
        pd.read_csv(os.path.join(fold_dir, file_name), index_col=0)
        for file_name in file_names
    )
    return X_train, y_train, X_valid, y_valid

def normalize_to_range_0_1(data):
    """
    Min-max scale data to the range [0, 1] and round to 6 decimal places.

    For a DataFrame every column is scaled with its own minimum and maximum.
    A constant column (max == min) would be divided by zero and become NaN;
    such columns are mapped to 0.0 instead.

    Parameters:
    - data: pandas DataFrame or Series of numeric values.

    Returns:
    - Object of the same shape as data, scaled and rounded to 6 decimals.
    """
    lowest = data.min()
    span = data.max() - lowest
    if isinstance(span, pd.Series):
        # DataFrame input: one span per column; replace only the zero spans.
        span = span.where(span != 0, 1)
    elif span == 0:
        # Series input: the span is a single scalar.
        span = 1
    return ((data - lowest) / span).round(6)

def preprocessing_kfold(wt_data, n_splits=5, base_folder="AE_Data/Unfiltered_UnShuffled_Compaction_Data"):
    """
    Split the data into K folds in its existing row order, min-max normalize each split, and save it.

    Each fold is written by save_datasets() to
    ``<base_folder>/Training_Set_<fold>/`` (folds are numbered from 1). A
    fold is skipped only when all four of its CSV files already exist, so a
    fold left incomplete by an interrupted run is regenerated.

    Parameters:
    - wt_data: DataFrame whose rows are split into folds; cells that cannot
      be parsed as numbers become NaN.
    - n_splits: Number of cross-validation folds.
    - base_folder: Root output folder (created if missing).

    Returns:
    - None. All results are written to disk.
    """
    expected_files = ('X_train_f.csv', 'y_train_f.csv', 'X_valid_f.csv', 'y_valid_f.csv')

    # Ensure all data is numeric
    wt_data = wt_data.apply(pd.to_numeric, errors='coerce')

    os.makedirs(base_folder, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=False)  # Do not shuffle
    for fold, (train_index, valid_index) in enumerate(kf.split(wt_data), start=1):
        folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
        if all(os.path.exists(os.path.join(folder_name, name)) for name in expected_files):
            print(f"Data for fold {fold} already exists, skipping generation.")
            continue

        X_train, X_valid = wt_data.iloc[train_index], wt_data.iloc[valid_index]
        # Every row receives the same label 0
        y_train = pd.DataFrame({'class': [0] * len(X_train)}, index=X_train.index)
        y_valid = pd.DataFrame({'class': [0] * len(X_valid)}, index=X_valid.index)

        # Normalize training and validation data to [0, 1] and round to 6 decimal places.
        # NOTE(review): each split is scaled with its own min/max, so the same raw
        # value can map to different normalized values in the two files — confirm
        # this is intended.
        X_train = normalize_to_range_0_1(X_train)
        X_valid = normalize_to_range_0_1(X_valid)

        print(f"Fold {fold}:")
        print(f"Training set shape: {X_train.shape}")
        print(f"Validation set shape: {X_valid.shape}")

        save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder)

In [None]:
# Window sizes to be used (not referenced again in this cell)
window_sizes = list(range(2, 52))

# Output folder for this dataset. It is passed explicitly below because
# preprocessing_kfold is defined in several cells with different default
# base_folder values; relying on the default would make the output location
# depend on which cell was executed last.
base_folder = "AE_Data/Unfiltered_UnShuffled_Compaction_Data"

# Concatenate all DataFrames in wt_dict into a single DataFrame
wt_combined = pd.concat(wt_dict.values(), axis=1)

# Preprocess and save data
preprocessing_kfold(wt_combined, n_splits=5, base_folder=base_folder)

# Example of loading a specific fold (e.g., fold 1)
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

# Filtered, Shuffled Distance Measurement Data

In [None]:
import glob
import pandas as pd
import os
import re

# Window sizes to load for the filtered data: 2 through 48 inclusive (the upper
# bound of range() is exclusive). This rebinds the module-level window_range
# that was set to range(2, 52) for the unfiltered data.
window_range = range(2, 49)

def import_lcc_data(lccdata_folder, window_sizes=None, filename_template="WT_{window_size}_f.csv"):
    """
    Imports filtered Local Compaction data files (.csv) and assigns them to a dictionary.

    Parameters:
    - lccdata_folder: The folder where Local Compaction data files are stored.
    - window_sizes: Iterable of window sizes to load. When omitted, the
      module-level ``window_range`` is used, so existing calls behave as before.
    - filename_template: File name pattern inside lccdata_folder;
      ``{window_size}`` is replaced by each window size.

    Returns:
    - A dictionary with window sizes as keys and pandas DataFrames as values.
      Window sizes whose file does not exist are reported with a warning and
      left out of the dictionary.
    """
    if window_sizes is None:
        # Looked up at call time: whatever window_range holds right now.
        window_sizes = window_range

    data_dict = {}

    for window_size in window_sizes:
        file_path = os.path.join(lccdata_folder, filename_template.format(window_size=window_size))
        if not os.path.exists(file_path):
            print(f"Warning: File not found for Window Size {window_size}: {file_path}")
            continue

        # The first CSV column holds the row labels; drop its header name.
        df = pd.read_csv(file_path, index_col=0)
        df.index.name = None
        data_dict[window_size] = df

    return data_dict


# Folder containing the filtered Local Compaction CSV files (path is relative
# to the current working directory)
lccdata_folder = 'XGB_High_vs_Low_Energy/Filtered_Local_Compaction_Data'

# Import LCC data for the wild-type protein: dict of window size -> DataFrame.
# This rebinds wt_dict, replacing the unfiltered data loaded earlier.
wt_dict = import_lcc_data(lccdata_folder)

In [None]:
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder):
    """
    Write the four splits of one cross-validation fold to CSV.

    The files are placed in ``<base_folder>/Training_Set_<fold>/`` (created
    if missing) under the names ``X_train_f.csv``, ``y_train_f.csv``,
    ``X_valid_f.csv`` and ``y_valid_f.csv``.
    """
    fold_dir = os.path.join(base_folder, f'Training_Set_{fold}')
    os.makedirs(fold_dir, exist_ok=True)

    # (file name, frame) pairs, written in this order.
    outputs = (
        ('X_train_f.csv', X_train),
        ('y_train_f.csv', y_train),
        ('X_valid_f.csv', X_valid),
        ('y_valid_f.csv', y_valid),
    )
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(fold_dir, file_name))

def load_datasets(fold, base_folder):
    """
    Read back the four CSV splits of one cross-validation fold.

    Files are looked up in ``<base_folder>/Training_Set_<fold>/``; the first
    CSV column becomes the index of each DataFrame.

    Returns:
    - (X_train, y_train, X_valid, y_valid) as pandas DataFrames.
    """
    fold_dir = os.path.join(base_folder, f'Training_Set_{fold}')
    file_names = ('X_train_f.csv', 'y_train_f.csv', 'X_valid_f.csv', 'y_valid_f.csv')

    X_train, y_train, X_valid, y_valid = (
        pd.read_csv(os.path.join(fold_dir, file_name), index_col=0)
        for file_name in file_names
    )
    return X_train, y_train, X_valid, y_valid

def normalize_to_range_0_1(data):
    """
    Min-max scale data to the range [0, 1] and round to 6 decimal places.

    For a DataFrame every column is scaled with its own minimum and maximum.
    A constant column (max == min) would be divided by zero and become NaN;
    such columns are mapped to 0.0 instead.

    Parameters:
    - data: pandas DataFrame or Series of numeric values.

    Returns:
    - Object of the same shape as data, scaled and rounded to 6 decimals.
    """
    lowest = data.min()
    span = data.max() - lowest
    if isinstance(span, pd.Series):
        # DataFrame input: one span per column; replace only the zero spans.
        span = span.where(span != 0, 1)
    elif span == 0:
        # Series input: the span is a single scalar.
        span = 1
    return ((data - lowest) / span).round(6)

def preprocessing_kfold(wt_data, n_splits=5, base_folder="AE_Data/Filtered_Shuffled_Compaction_Data"):
    """
    Split the data into K shuffled folds, min-max normalize each split, and save it.

    Each fold is written by save_datasets() to
    ``<base_folder>/Training_Set_<fold>/`` (folds are numbered from 1). A
    fold is skipped only when all four of its CSV files already exist, so a
    fold left incomplete by an interrupted run is regenerated.

    Parameters:
    - wt_data: DataFrame whose rows are split into folds; cells that cannot
      be parsed as numbers become NaN.
    - n_splits: Number of cross-validation folds.
    - base_folder: Root output folder (created if missing).

    Returns:
    - None. All results are written to disk.
    """
    expected_files = ('X_train_f.csv', 'y_train_f.csv', 'X_valid_f.csv', 'y_valid_f.csv')

    # Ensure all data is numeric
    wt_data = wt_data.apply(pd.to_numeric, errors='coerce')

    # Shuffle the DataFrame rows without resetting the index
    wt_data = wt_data.sample(frac=1, random_state=42)

    os.makedirs(base_folder, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold, (train_index, valid_index) in enumerate(kf.split(wt_data), start=1):
        folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
        if all(os.path.exists(os.path.join(folder_name, name)) for name in expected_files):
            print(f"Data for fold {fold} already exists, skipping generation.")
            continue

        X_train, X_valid = wt_data.iloc[train_index], wt_data.iloc[valid_index]
        # Every row receives the same label 0
        y_train = pd.DataFrame({'class': [0] * len(X_train)}, index=X_train.index)
        y_valid = pd.DataFrame({'class': [0] * len(X_valid)}, index=X_valid.index)

        # Normalize training and validation data to [0, 1] and round to 6 decimal places.
        # NOTE(review): each split is scaled with its own min/max, so the same raw
        # value can map to different normalized values in the two files — confirm
        # this is intended.
        X_train = normalize_to_range_0_1(X_train)
        X_valid = normalize_to_range_0_1(X_valid)

        print(f"Fold {fold}:")
        print(f"Training set shape: {X_train.shape}")
        print(f"Validation set shape: {X_valid.shape}")

        save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder)

In [None]:
# Window sizes to be used (not referenced again in this cell)
# NOTE(review): this lists 2-51, while the filtered import above loads
# window_range = range(2, 49), i.e. 2-48 — confirm which range is intended.
window_sizes = list(range(2, 52))

# Output folder for this dataset. It is passed explicitly below because
# preprocessing_kfold is defined in several cells with different default
# base_folder values; relying on the default would make the output location
# depend on which cell was executed last.
base_folder = "AE_Data/Filtered_Shuffled_Compaction_Data"

# Concatenate all DataFrames in wt_dict into a single DataFrame
wt_combined = pd.concat(wt_dict.values(), axis=1)

# Preprocess and save data
preprocessing_kfold(wt_combined, n_splits=5, base_folder=base_folder)

# Example of loading a specific fold (e.g., fold 1)
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(fold=1, base_folder=base_folder)

# Unfiltered, Shuffled Angle Data

In [None]:
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder):
    """
    Write the four splits of one cross-validation fold to CSV.

    The files are placed in ``<base_folder>/Training_Set_<fold>/`` (created
    if missing) under the names ``X_train_f.csv``, ``y_train_f.csv``,
    ``X_valid_f.csv`` and ``y_valid_f.csv``.
    """
    fold_dir = os.path.join(base_folder, f'Training_Set_{fold}')
    os.makedirs(fold_dir, exist_ok=True)

    # (file name, frame) pairs, written in this order.
    outputs = (
        ('X_train_f.csv', X_train),
        ('y_train_f.csv', y_train),
        ('X_valid_f.csv', X_valid),
        ('y_valid_f.csv', y_valid),
    )
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(fold_dir, file_name))

def load_datasets(fold, base_folder):
    """
    Read back the four CSV splits of one cross-validation fold.

    Files are looked up in ``<base_folder>/Training_Set_<fold>/``; the first
    CSV column becomes the index of each DataFrame.

    Returns:
    - (X_train, y_train, X_valid, y_valid) as pandas DataFrames.
    """
    fold_dir = os.path.join(base_folder, f'Training_Set_{fold}')
    file_names = ('X_train_f.csv', 'y_train_f.csv', 'X_valid_f.csv', 'y_valid_f.csv')

    X_train, y_train, X_valid, y_valid = (
        pd.read_csv(os.path.join(fold_dir, file_name), index_col=0)
        for file_name in file_names
    )
    return X_train, y_train, X_valid, y_valid

def normalize_to_range_0_1(data):
    """
    Min-max scale data to the range [0, 1] and round to 6 decimal places.

    For a DataFrame every column is scaled with its own minimum and maximum.
    A constant column (max == min) would be divided by zero and become NaN;
    such columns are mapped to 0.0 instead.

    Parameters:
    - data: pandas DataFrame or Series of numeric values.

    Returns:
    - Object of the same shape as data, scaled and rounded to 6 decimals.
    """
    lowest = data.min()
    span = data.max() - lowest
    if isinstance(span, pd.Series):
        # DataFrame input: one span per column; replace only the zero spans.
        span = span.where(span != 0, 1)
    elif span == 0:
        # Series input: the span is a single scalar.
        span = 1
    return ((data - lowest) / span).round(6)

def load_angle_data(angle_folder, angle_window_sizes):
    """
    Read the per-window-size angle CSV files into a dictionary.

    For every window size the file ``Angles_WS_<window_size>.csv`` inside
    angle_folder is loaded with its first column as the index. A missing
    file only produces a warning and is left out of the result.

    Returns:
    - dict mapping window size -> pandas DataFrame (the frames are not combined here).
    """
    loaded = {}
    for window_size in angle_window_sizes:
        csv_path = os.path.join(angle_folder, f'Angles_WS_{window_size}.csv')
        if not os.path.exists(csv_path):
            print(f"Warning: Angle data file not found for window size {window_size}.")
            continue
        loaded[window_size] = pd.read_csv(csv_path, index_col=0)
    return loaded

def combine_angle_data_only(angle_data, angle_window_sizes):
    """
    Combine all angle data into a single DataFrame, in the order of angle_window_sizes.

    Parameters:
    - angle_data: dict mapping window size -> DataFrame.
    - angle_window_sizes: Window sizes in the column order wanted. Sizes that
      are not keys of angle_data are reported with a warning and skipped.

    Returns:
    - DataFrame holding the columns of every available window size side by side.

    Raises:
    - ValueError: if angle_data is empty (previously an opaque IndexError).
    """
    if not angle_data:
        raise ValueError("angle_data is empty: there are no angle DataFrames to combine.")

    # Start with the index of one DataFrame (assuming all indices are consistent)
    first_frame = next(iter(angle_data.values()))
    frames = [pd.DataFrame(index=first_frame.index)]

    for window_size in angle_window_sizes:
        if window_size in angle_data:
            frames.append(angle_data[window_size])
        else:
            print(f"Warning: Missing angle data for window size {window_size}. Skipping.")

    # A single concat avoids re-copying the growing result once per window size.
    return pd.concat(frames, axis=1)

def preprocessing_kfold(angle_data_only, n_splits=5, base_folder="AE_Data/Unfiltered_Shuffled_Angle_Data"):
    """
    Split the angle data into K shuffled folds, min-max normalize each split, and save it.

    Each fold is written by save_datasets() to
    ``<base_folder>/Training_Set_<fold>/`` (folds are numbered from 1). A
    fold is skipped only when all four of its CSV files already exist, so a
    fold left incomplete by an interrupted run is regenerated.

    Parameters:
    - angle_data_only: DataFrame whose rows are split into folds; cells that
      cannot be parsed as numbers become NaN.
    - n_splits: Number of cross-validation folds.
    - base_folder: Root output folder (created if missing).

    Returns:
    - None. All results are written to disk.
    """
    expected_files = ('X_train_f.csv', 'y_train_f.csv', 'X_valid_f.csv', 'y_valid_f.csv')

    # Ensure all data is numeric
    angle_data_only = angle_data_only.apply(pd.to_numeric, errors='coerce')

    # Shuffle the DataFrame rows without resetting the index
    angle_data_only = angle_data_only.sample(frac=1, random_state=42)

    os.makedirs(base_folder, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold, (train_index, valid_index) in enumerate(kf.split(angle_data_only), start=1):
        folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
        if all(os.path.exists(os.path.join(folder_name, name)) for name in expected_files):
            print(f"Data for fold {fold} already exists, skipping generation.")
            continue

        X_train, X_valid = angle_data_only.iloc[train_index], angle_data_only.iloc[valid_index]
        # Every row receives the same label 0
        y_train = pd.DataFrame({'class': [0] * len(X_train)}, index=X_train.index)
        y_valid = pd.DataFrame({'class': [0] * len(X_valid)}, index=X_valid.index)

        # Normalize training and validation data to [0, 1] and round to 6 decimal places.
        # NOTE(review): each split is scaled with its own min/max, so the same raw
        # value can map to different normalized values in the two files — confirm
        # this is intended.
        X_train = normalize_to_range_0_1(X_train)
        X_valid = normalize_to_range_0_1(X_valid)

        print(f"Fold {fold}:")
        print(f"Training set shape: {X_train.shape}")
        print(f"Validation set shape: {X_valid.shape}")

        save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder)

In [None]:
# Location of the angle CSV files and the window sizes they cover (1 through 50)
angle_folder = "Local_Angles/Angle_Data"
angle_window_sizes = list(range(1, 51))

# Read every available angle file, then join the frames column-wise
angle_data = load_angle_data(angle_folder, angle_window_sizes)
combined_angle_data = combine_angle_data_only(angle_data, angle_window_sizes)

# Create the 5-fold splits on disk (folds that already exist are skipped)
preprocessing_kfold(
    combined_angle_data,
    n_splits=5,
    base_folder="AE_Data/Unfiltered_Shuffled_Angle_Data",
)

# Read fold 1 back as an example of how to load a saved fold
base_folder = "AE_Data/Unfiltered_Shuffled_Angle_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(
    fold=1,
    base_folder=base_folder,
)

# Unfiltered, Shuffled RMSD Data

# Unfiltered, Shuffled Angle and Distance Data

In [None]:
import os
import pandas as pd
from sklearn.model_selection import KFold

def save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder):
    """
    Write the four splits of one cross-validation fold to CSV.

    The files are placed in ``<base_folder>/Training_Set_<fold>/`` (created
    if missing) under the names ``X_train_f.csv``, ``y_train_f.csv``,
    ``X_valid_f.csv`` and ``y_valid_f.csv``.
    """
    fold_dir = os.path.join(base_folder, f'Training_Set_{fold}')
    os.makedirs(fold_dir, exist_ok=True)

    # (file name, frame) pairs, written in this order.
    outputs = (
        ('X_train_f.csv', X_train),
        ('y_train_f.csv', y_train),
        ('X_valid_f.csv', X_valid),
        ('y_valid_f.csv', y_valid),
    )
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(fold_dir, file_name))

def load_datasets(fold, base_folder):
    """
    Read back the four CSV splits of one cross-validation fold.

    Files are looked up in ``<base_folder>/Training_Set_<fold>/``; the first
    CSV column becomes the index of each DataFrame.

    Returns:
    - (X_train, y_train, X_valid, y_valid) as pandas DataFrames.
    """
    fold_dir = os.path.join(base_folder, f'Training_Set_{fold}')
    file_names = ('X_train_f.csv', 'y_train_f.csv', 'X_valid_f.csv', 'y_valid_f.csv')

    X_train, y_train, X_valid, y_valid = (
        pd.read_csv(os.path.join(fold_dir, file_name), index_col=0)
        for file_name in file_names
    )
    return X_train, y_train, X_valid, y_valid

def normalize_to_range_0_1(data):
    """
    Min-max scale data to the range [0, 1] and round to 6 decimal places.

    For a DataFrame every column is scaled with its own minimum and maximum.
    A constant column (max == min) would be divided by zero and become NaN;
    such columns are mapped to 0.0 instead.

    Parameters:
    - data: pandas DataFrame or Series of numeric values.

    Returns:
    - Object of the same shape as data, scaled and rounded to 6 decimals.
    """
    lowest = data.min()
    span = data.max() - lowest
    if isinstance(span, pd.Series):
        # DataFrame input: one span per column; replace only the zero spans.
        span = span.where(span != 0, 1)
    elif span == 0:
        # Series input: the span is a single scalar.
        span = 1
    return ((data - lowest) / span).round(6)

def load_angle_data(angle_folder, window_sizes):
    """
    Read the per-window-size angle CSV files into a dictionary.

    For every window size the file ``Angles_WS_<window_size>.csv`` inside
    angle_folder is loaded with its first column as the index. A missing
    file only produces a warning and is left out of the result.

    Returns:
    - dict mapping window size -> pandas DataFrame (the frames are not combined here).
    """
    loaded = {}
    for window_size in window_sizes:
        csv_path = os.path.join(angle_folder, f'Angles_WS_{window_size}.csv')
        if not os.path.exists(csv_path):
            print(f"Warning: Angle data file not found for window size {window_size}.")
            continue
        loaded[window_size] = pd.read_csv(csv_path, index_col=0)
    return loaded

def combine_distance_and_angle(distance_data, angle_data, distance_window_sizes, angle_window_sizes):
    """
    Interleave distance and angle data column-wise in ascending window-size order.

    For each window size the distance frame (if that size is listed in
    distance_window_sizes and present in distance_data) is placed before the
    angle frame (same rule with the angle arguments). The output therefore
    starts with whichever kind owns the smallest window size: with angles
    starting at 1 and distances at 2, the first columns are angle window size 1.

    Parameters:
    - distance_data: dict mapping window size -> distance DataFrame.
    - angle_data: dict mapping window size -> angle DataFrame.
    - distance_window_sizes: Window sizes whose distance data should be included.
    - angle_window_sizes: Window sizes whose angle data should be included.

    Returns:
    - DataFrame with all selected frames side by side.

    Raises:
    - ValueError: if distance_data is empty (previously an opaque IndexError).
    """
    if not distance_data:
        raise ValueError("distance_data is empty: cannot determine the index of the combined data.")

    # Start with the index of one DataFrame (assuming all indices are consistent)
    first_frame = next(iter(distance_data.values()))
    frames = [pd.DataFrame(index=first_frame.index)]

    # Sets give constant-time membership tests inside the loop.
    wanted_distance = set(distance_window_sizes)
    wanted_angle = set(angle_window_sizes)

    # Visit every requested window size in ascending order; sizes below 1 are
    # no longer silently ignored.
    for window_size in sorted(wanted_distance | wanted_angle):
        if window_size in wanted_distance and window_size in distance_data:
            frames.append(distance_data[window_size])
        if window_size in wanted_angle and window_size in angle_data:
            frames.append(angle_data[window_size])

    # A single concat avoids re-copying the growing result once per frame.
    return pd.concat(frames, axis=1)



def preprocessing_kfold(combined_data, n_splits=5, base_folder="AE_Data/Unfiltered_Shuffled_Compaction_Angle_Data"):
    """
    Split the combined data into K shuffled folds, min-max normalize each split, and save it.

    Each fold is written by save_datasets() to
    ``<base_folder>/Training_Set_<fold>/`` (folds are numbered from 1). A
    fold is skipped only when all four of its CSV files already exist, so a
    fold left incomplete by an interrupted run is regenerated.

    Parameters:
    - combined_data: DataFrame whose rows are split into folds; cells that
      cannot be parsed as numbers become NaN.
    - n_splits: Number of cross-validation folds.
    - base_folder: Root output folder (created if missing).

    Returns:
    - None. All results are written to disk.
    """
    expected_files = ('X_train_f.csv', 'y_train_f.csv', 'X_valid_f.csv', 'y_valid_f.csv')

    # Ensure all data is numeric
    combined_data = combined_data.apply(pd.to_numeric, errors='coerce')

    # Shuffle the DataFrame rows without resetting the index
    combined_data = combined_data.sample(frac=1, random_state=42)

    os.makedirs(base_folder, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold, (train_index, valid_index) in enumerate(kf.split(combined_data), start=1):
        folder_name = os.path.join(base_folder, f'Training_Set_{fold}')
        if all(os.path.exists(os.path.join(folder_name, name)) for name in expected_files):
            print(f"Data for fold {fold} already exists, skipping generation.")
            continue

        X_train, X_valid = combined_data.iloc[train_index], combined_data.iloc[valid_index]
        # Every row receives the same label 0
        y_train = pd.DataFrame({'class': [0] * len(X_train)}, index=X_train.index)
        y_valid = pd.DataFrame({'class': [0] * len(X_valid)}, index=X_valid.index)

        # Normalize training and validation data to [0, 1] and round to 6 decimal places.
        # NOTE(review): each split is scaled with its own min/max, so the same raw
        # value can map to different normalized values in the two files — confirm
        # this is intended.
        X_train = normalize_to_range_0_1(X_train)
        X_valid = normalize_to_range_0_1(X_valid)

        print(f"Fold {fold}:")
        print(f"Training set shape: {X_train.shape}")
        print(f"Validation set shape: {X_valid.shape}")

        save_datasets(X_train, y_train, X_valid, y_valid, fold, base_folder)

In [None]:
# Window sizes covered by each measurement type: angles 1-50, distances 2-51
angle_window_sizes = list(range(1, 51))
distance_window_sizes = list(range(2, 52))

# Angle frames: files that are missing are skipped with a warning
angle_folder = "Local_Angles/Angle_Data"
angle_data = load_angle_data(angle_folder, angle_window_sizes)

# Distance frames: read directly, so every listed file has to exist
distance_data = {
    ws: pd.read_csv(f"Local_Compaction/Local_Compaction_Data/WT_Simulation_WS_{ws}.csv", index_col=0)
    for ws in distance_window_sizes
}

# Join distance and angle frames column-wise, ordered by window size
combined_data = combine_distance_and_angle(
    distance_data,
    angle_data,
    distance_window_sizes,
    angle_window_sizes,
)

# Create the 5-fold splits on disk (folds that already exist are skipped)
preprocessing_kfold(
    combined_data,
    n_splits=5,
    base_folder="AE_Data/Unfiltered_Shuffled_Compaction_Angle_Data",
)

# Read fold 1 back as an example of how to load a saved fold
base_folder = "AE_Data/Unfiltered_Shuffled_Compaction_Angle_Data"
X_train_loaded, y_train_loaded, X_valid_loaded, y_valid_loaded = load_datasets(
    fold=1,
    base_folder=base_folder,
)