# Pre-process the CB513, CASP14, TS115 and CASP12 datasets

In [2]:
# Import necessary libraries
import gzip
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# Directory that holds the raw dataset files; the preprocessed CSV files are
# written to the same place. File paths are built from it by plain string
# concatenation, so the trailing slash is required.
dataset_dir = '../Data/Original/'

## Define functions and arrays

## Preprocess CB513

In [None]:
# Symbols used to decode the one-hot residue columns, listed in column order.
# '_' is the filler symbol that the decoder strips from the end of a sequence.
amino_acid_residues = list('ACEDGFIHKMLNQPSRTWVYX_')

# Symbols used to decode the one-hot 8-state secondary structure columns,
# listed in column order; '_' is again the trailing filler symbol.
secondary_structure_labels = list('LBEGIHST_')
def residue_to_string(residue):
    """Decode one protein's one-hot residue features into a letter string.

    For every row, the position of the largest value among the first 22
    feature columns selects a symbol from ``amino_acid_residues``. The
    symbols are concatenated and the run of '_' at the end is dropped.

    Args:
        residue: 2-D array indexed as (position, feature); only the first
            22 feature columns are read.

    Returns:
        The decoded sequence as a ``str`` without trailing underscores.
    """
    symbol_ids = np.argmax(residue[:, :22], axis=-1)
    decoded = ''.join(amino_acid_residues[idx] for idx in symbol_ids)
    # Only the trailing run of '_' is removed; an earlier '_' would be kept.
    return decoded.rstrip('_')

def secondary_structure_to_string(secondary_structure):
    """Decode one protein's one-hot 8-state labels into a label string.

    For every row, the position of the largest value among feature columns
    22 to 30 (inclusive) selects a symbol from
    ``secondary_structure_labels``. The symbols are concatenated and the
    run of '_' at the end is dropped.

    Args:
        secondary_structure: 2-D array indexed as (position, feature); only
            feature columns 22..30 are read.

    Returns:
        The decoded label sequence as a ``str`` without trailing underscores.
    """
    label_ids = np.argmax(secondary_structure[:, 22:31], axis=-1)
    decoded = ''.join(secondary_structure_labels[idx] for idx in label_ids)
    # Only the trailing run of '_' is removed; an earlier '_' would be kept.
    return decoded.rstrip('_')

def convert_8state_to_3state(ss):
    """Collapse a sequence of 8-state labels into 3-state labels.

    The input is a label string (not a one-hot array). Mapping:
    'H', 'G', 'I' -> 'H'; 'E', 'B' -> 'E'; every other symbol -> 'C'.

    Args:
        ss: Iterable of one-character labels, typically a ``str``.

    Returns:
        A ``str`` with one of 'H', 'E' or 'C' per input label.
    """
    to_3state = {'H': 'H', 'G': 'H', 'I': 'H', 'E': 'E', 'B': 'E'}
    # A single join avoids rebuilding the string once per label.
    return ''.join(to_3state.get(label, 'C') for label in ss)

In [None]:
def process_protein_data_cb513(input_file, output_file):
    """
    Preprocess the CB513 dataset by reshaping, converting to string representations,
    and restructuring the data.

    Args:
        input_file: Path to the gzip-compressed ``.npy`` file with the
            flattened per-protein features.
        output_file: Path of the CSV file to write. It gets one row per
            protein with the columns ``length``, ``residue``, ``dssp8`` and
            ``dssp3``; the index column is labelled ``id``.
    """
    # Load CB513 data from the compressed numpy file
    with gzip.GzipFile(input_file, "r") as f:
        cb513_data = np.load(f)

    # Every protein occupies 700 positions x 57 features. The protein count
    # is inferred (-1) instead of being fixed at 514, so files holding a
    # different number of proteins can be processed as well.
    cb513_reshaped = cb513_data.reshape((-1, 700, 57))

    preprocessed_data = []

    for protein_features in cb513_reshaped:
        # Convert features to string representations
        residue_sequence = residue_to_string(protein_features)
        dssp8_sequence = secondary_structure_to_string(protein_features)

        # Append the processed data to the list
        preprocessed_data.append({
            'length': len(residue_sequence),
            'residue': residue_sequence,
            'dssp8': dssp8_sequence,
            # The 3-state labels are derived from the decoded 8-state string.
            'dssp3': convert_8state_to_3state(dssp8_sequence),
        })

    df = pd.DataFrame(preprocessed_data)
    df.to_csv(output_file, index_label="id")


# Run the CB513 preprocessing. NOTE: despite their names, input_dir and
# output_dir hold file paths, not directories.
input_dir = f"{dataset_dir}cb513+profile_split1.npy.gz"
output_dir = f"{dataset_dir}CB513_preprocessed_2.csv"
process_protein_data_cb513(input_file=input_dir, output_file=output_dir)

  cb513_data = np.load(f)


## Preprocess CASP14

In [None]:
def process_protein_data_casp14(input_file, output_file):
    """
    Preprocess the CASP14 dataset: merge the rows that share a PDB code into
    whole-sequence strings and save one CSV row per PDB code.

    Args:
        input_file: Path to a CSV file with the columns ``pdb``, ``aa``,
            ``q8`` and ``q3``.
        output_file: Path of the CSV file to write, with the columns
            ``length``, ``residue``, ``dssp8`` and ``dssp3``; the index
            column is labelled ``id``.
    """
    def merge_rows(rows):
        # Concatenate the per-row fragments in the order they appear.
        sequence = ''.join(rows['aa'].tolist())
        return {
            'length': len(sequence),
            'residue': sequence,
            'dssp8': ''.join(rows['q8'].tolist()),
            'dssp3': ''.join(rows['q3'].tolist()),
        }

    table = pd.read_csv(input_file)

    # One record per PDB code; groupby yields the codes in sorted order.
    records = [merge_rows(rows) for _, rows in table.groupby('pdb')]

    pd.DataFrame(records).to_csv(output_file, index_label="id")


# Run the CASP14 preprocessing on the raw CSV stored in dataset_dir.
input_file = f"{dataset_dir}CASP14_dataset.csv"
output_file = f"{dataset_dir}CASP14_preprocessed.csv"
process_protein_data_casp14(input_file=input_file, output_file=output_file)

## Preprocess TS115

In [None]:
def process_protein_data_ts115(input_file, output_file):
    """
    Preprocess the TS115 dataset: add a sequence-length column, rename the
    sequence column and keep only the columns shared by all datasets.

    Args:
        input_file: Path to a CSV file with the columns ``input``, ``dssp8``
            and ``dssp3``.
        output_file: Path of the CSV file to write, with the columns
            ``length``, ``residue``, ``dssp8`` and ``dssp3``; the index
            column is labelled ``id``.
    """
    raw = pd.read_csv(input_file)

    # Length is measured on the sequence column before it is renamed.
    with_length = raw.assign(length=raw['input'].str.len())
    renamed = with_length.rename(columns={'input': 'residue'})

    selected = renamed[['length', 'residue', 'dssp8', 'dssp3']]
    selected.to_csv(output_file, index_label="id")

# Run the TS115 preprocessing on the raw CSV stored in dataset_dir.
input_file = f"{dataset_dir}TS115.csv"
output_file = f"{dataset_dir}TS115_preprocessed.csv"
process_protein_data_ts115(input_file=input_file, output_file=output_file)

## Preprocess CASP12

In [6]:
def process_protein_data_CASP12(input_file, output_file):
    """
    Preprocess the CASP12 dataset by cleaning,
    restructuring the data, and replacing unknown residues.

    Args:
        input_file: Path to a CSV file with the columns ``seq``, ``sst3``
            and ``sst8``.
        output_file: Path of the CSV file to write, with the columns
            ``length``, ``residue``, ``dssp8`` and ``dssp3``; the index
            column is labelled ``id``.
    """
    df = pd.read_csv(input_file)

    # Length is measured on the raw sequence; the one-for-one replacement
    # below does not change it.
    df['length'] = df['seq'].str.len()
    df = df.rename(columns={'seq': 'residue', 'sst3': 'dssp3', 'sst8': 'dssp8'})

    # Replace X with A for compatibility. regex=False pins a literal
    # replacement: the default of `regex` differs between pandas 1.x and 2.x.
    df['residue'] = df['residue'].str.replace('X', 'A', regex=False)

    df = df[['length', 'residue', 'dssp8', 'dssp3']]

    # Save
    df.to_csv(output_file, index_label="id")

# Run the CASP12 preprocessing on the raw CSV stored in dataset_dir.
input_file = f"{dataset_dir}CASP12.csv"
output_file = f"{dataset_dir}CASP12_preprocessed.csv"
process_protein_data_CASP12(input_file=input_file, output_file=output_file)