# Preprocess CB513

In [1]:
# Import necessary libraries
import gzip
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load cb513 data from the gzip-compressed numpy file.
# The context manager closes the handle even if np.load raises.
with gzip.GzipFile('../Data/Original/cb513+profile_split1.npy.gz', "r") as f:
    cb513 = np.load(f)

print(cb513.shape)
# View every flat row as a 700 x 57 matrix; -1 lets numpy infer the number
# of proteins from the data instead of hard-coding it.
cb513_reshaped = cb513.reshape((-1, 700, 57))
print(cb513_reshaped.shape)

# example of the first protein in the reshaped data
print(cb513_reshaped[0])
print(cb513_reshaped[0].shape)

(514, 39900)
(514, 700, 57)
[[0.         0.         0.         ... 0.26894142 0.9234378  0.        ]
 [0.         0.         0.         ... 0.11920292 0.00919371 0.        ]
 [0.         0.         0.         ... 0.5        0.02508696 0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         1.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
(700, 57)


  cb513 = np.load(f)


In [3]:
# Residue alphabet; the decoders index into it with the argmax of feature
# columns 0-21, and '_' (last entry) is what they strip from the end.
amino_acid_residues = list("ACEDGFIHKMLNQPSRTWVYX_")

# Secondary structure alphabet (feature columns 22-30), '_' last
secondary_structure_labels = list("LBEGIHST_")

# N- and C- terminal markers
terminals = list("NC")

# Sequence profile alphabet (ACDEF,GHIKL,MNPQR,STVWX,Y), '_' last
sequence_profile_residues = list("ACDEFGHIKLMNPQRSTVWXY_")

In [4]:
# Decode the one-hot residue columns into a sequence string without trailing underscores
def residue_to_string(residue):
    """Return the residue sequence encoded in columns 0-21 of ``residue``.

    Each row is mapped to the ``amino_acid_residues`` entry at the position
    of its largest value; underscores at the end of the result are removed.
    """
    one_hot = residue[:, :22]
    letters = (amino_acid_residues[pos] for pos in np.argmax(one_hot, axis=-1))
    return ''.join(letters).rstrip('_')

# Decode the one-hot secondary structure columns into a label string
def secondary_structure_to_string(secondary_structure):
    """Return the secondary structure labels encoded in columns 22-30.

    Each row is mapped to the ``secondary_structure_labels`` entry at the
    position of its largest value; trailing underscores are removed.
    """
    one_hot = secondary_structure[:, 22:31]
    labels = (secondary_structure_labels[pos] for pos in np.argmax(one_hot, axis=-1))
    return ''.join(labels).rstrip('_')

# Decode the terminal flag columns into a string of 'N' / 'C' markers
def terminals_to_string(terminals_one_hot):
    """Return one marker per row whose columns 31-32 flag a terminal.

    A row equal to ``[1, 0]`` yields ``'N'`` and a row equal to ``[0, 1]``
    yields ``'C'``; every other row contributes nothing to the result.
    """
    markers = []
    for flags in terminals_one_hot[:, 31:33]:
        if np.array_equal(flags, [1, 0]):
            markers.append('N')
        elif np.array_equal(flags, [0, 1]):
            markers.append('C')
    return ''.join(markers)

# Render the two solvent accessibility columns as a single text line
def solvent_accessibility_to_string(solvent_accessibility):
    """Return column 33 (relative) and column 34 (absolute) as list literals."""
    relative, absolute = (
        solvent_accessibility[:, column].tolist() for column in (33, 34)
    )
    return f"Relative: {relative}, Absolute: {absolute}"

# Reduce the sequence profile columns to the highest-scoring residue per position
def sequence_profile_to_string(sequence_profile):
    """Return the per-row argmax of columns 35-56 as a residue string.

    Rows are mapped through ``sequence_profile_residues``; trailing
    underscores are removed from the result.
    """
    profile = sequence_profile[:, 35:57]
    letters = (sequence_profile_residues[pos] for pos in np.argmax(profile, axis=-1))
    return ''.join(letters).rstrip('_')

# Build a multi-line text summary of every feature group of one protein
def features_to_string(features):
    """Return a five-line summary of ``features``, one line per feature group.

    The groups are decoded in this order: residue, secondary structure,
    terminals, solvent accessibility, sequence profile.
    """
    lines = [
        f"Residue: {residue_to_string(features)}",
        f"Secondary Structure: {secondary_structure_to_string(features)}",
        f"Terminals: {terminals_to_string(features)}",
        f"Solvent Accessibility: {solvent_accessibility_to_string(features)}",
        f"Sequence Profile: {sequence_profile_to_string(features)}",
    ]
    return "\n".join(lines)

# Example usage: decode the first protein (index 0) of the reshaped array
# and print its text summary
features = cb513_reshaped[0]
print(features_to_string(features))

Residue: VPSLATISLENSWSGLSKQIQLAQGNNGIFRTPIVLVDNKGNRVQITNVTSKVVTSNIQLLLNTRNI
Secondary Structure: LLLHHHHHHHHHHHHHHHHHHHHTTTTTEEEEEEEELLSSSSLEEEEETTSHHHHHTBLLBLLGGGL
Terminals: NC
Solvent Accessibility: Relative: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [5]:
def convert_8state_to_3state(ss):
    """Collapse an 8-state secondary structure string to 3 states.

    'H', 'G' and 'I' become 'H'; 'E' and 'B' become 'E'; every other
    symbol becomes 'C'.
    """
    helix_like = ('H', 'G', 'I')
    strand_like = ('E', 'B')
    return ''.join(
        'H' if label in helix_like else 'E' if label in strand_like else 'C'
        for label in ss
    )

In [8]:
# Directory prefix (with trailing slash) from which the dataset input and
# output paths in the cells below are built by string concatenation
dataset_dir = '../Data/Original/'

In [None]:
# def process_protein_data_cb513(file_path):
#     # Load cb513 data from compressed numpy file
#     f = gzip.GzipFile(file_path, "r")
#     cb513 = np.load(f)
#     f.close()

#     # convert all to 700 x 57
#     cb513_reshaped = cb513.reshape((514, 700, 57))

#     # variable to store the preprocessed data
#     data = []

#     # loop through all proteins
#     for i in range(len(cb513_reshaped)):
#         # crop the arrays for features and labels according to the length of the protein
#         features = cb513_reshaped[i]

#         # convert the features to string
#         residue_len = len(residue_to_string(features))
#         residue_str = residue_to_string(features)
#         dssp8_ss = secondary_structure_to_string(features)
#         dssp3_ss = convert_8state_to_3state(dssp8_ss)

#         # Append the combined data to the list
#         data.append({'length': residue_len, 
#                                   'residue': residue_str,
#                                   'dssp8': dssp8_ss,
#                                    'dssp3': dssp3_ss})
#     # create a dataframe from the list
#     df = pd.DataFrame(data)

#     # save the dataframe to a csv file
#     df.to_csv(output_dir, index_label="id")

# input_dir = dataset_dir + 'cb513+profile_split1.npy.gz'
# output_dir = dataset_dir + 'CB513_dataset_preprocessed.csv'
# process_protein_data_cb513(input_dir)

  cb513 = np.load(f)


# Preprocess CASP14

In [10]:
def _drop_invalid_positions(aa, q3, q8):
    """Remove every position whose residue is "!" from all three strings."""
    keep = [i for i, residue in enumerate(aa) if residue != "!"]
    return (
        ''.join(aa[i] for i in keep),
        ''.join(q3[i] for i in keep),
        ''.join(q8[i] for i in keep),
    )


def process_protein_data_casp14(file_path, output_path=None):
    """Merge the CASP14 rows of each PDB code and write them to a CSV file.

    The input CSV must provide the columns ``pdb``, ``aa``, ``q3`` and
    ``q8``. Positions whose residue is ``"!"`` are dropped together with the
    ``q3``/``q8`` labels at the same index. The rows of each ``pdb`` group
    are then concatenated in file order, producing one output row per PDB
    code with the columns ``length``, ``residue``, ``dssp8``, ``dssp3`` and
    ``pdb`` (the index column is labelled ``id``).

    Args:
        file_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write. When omitted, the
            module-level variable ``output_path`` is used, which is how
            existing callers supply the destination.

    Raises:
        ValueError: If no output path was passed and no module-level
            ``output_path`` is defined.
    """
    if output_path is None:
        # Backward compatibility: the parameter shadows the global of the
        # same name, so the global has to be looked up explicitly.
        output_path = globals().get('output_path')
    if output_path is None:
        raise ValueError(
            "output_path must be passed or defined as a module-level variable"
        )

    # Read the dataset
    data = pd.read_csv(file_path)

    # Clean the sequences first, so that grouping only ever sees cleaned
    # data instead of relying on a groupby object created before the edit.
    cleaned = pd.DataFrame(
        [
            _drop_invalid_positions(aa, q3, q8)
            for aa, q3, q8 in zip(data['aa'], data['q3'], data['q8'])
        ],
        columns=['aa', 'q3', 'q8'],
    )
    cleaned.insert(0, 'pdb', data['pdb'].to_numpy())

    # Combine the sequences and structures of each protein
    combined_data = []
    for pdb_code, group in cleaned.groupby('pdb'):
        combined_aa = ''.join(group['aa'])
        combined_data.append({
            'length': len(combined_aa),
            'residue': combined_aa,
            'dssp8': ''.join(group['q8']),
            'dssp3': ''.join(group['q3']),
            'pdb': pdb_code,
        })

    # Explicit columns keep the header stable even when there are no rows
    df = pd.DataFrame(
        combined_data,
        columns=['length', 'residue', 'dssp8', 'dssp3', 'pdb'],
    )

    # Save the new dataframe
    df.to_csv(output_path, index_label="id")

# Build the CASP14 input/output paths under dataset_dir and run the preprocessing
file_path = f"{dataset_dir}CASP14_dataset.csv"
output_path = f"{dataset_dir}CASP14_preprocessed.csv"
process_protein_data_casp14(file_path)

# Preprocess TS115

In [26]:
def process_protein_data_ts115(file_path, output_path=None):
    """Add a length column to the TS115 CSV and rename ``input`` to ``residue``.

    The output keeps every input column. ``length`` (the number of
    characters of each ``input`` value) is inserted as the first column and
    the index column is labelled ``id``.

    Args:
        file_path: Path of the CSV file to read; it must contain an
            ``input`` column.
        output_path: Path of the CSV file to write. When omitted, the
            module-level variable ``output_path`` is used, which is how
            existing callers supply the destination.

    Raises:
        ValueError: If no output path was passed and no module-level
            ``output_path`` is defined.
    """
    if output_path is None:
        # Backward compatibility: the parameter shadows the global of the
        # same name, so the global has to be looked up explicitly.
        output_path = globals().get('output_path')
    if output_path is None:
        raise ValueError(
            "output_path must be passed or defined as a module-level variable"
        )

    # Read the dataset
    df = pd.read_csv(file_path)

    # Add length column to left of dataframe
    df.insert(0, 'length', df['input'].map(len))

    # Rename column "input" to "residue"
    df = df.rename(columns={'input': 'residue'})

    # Save the new dataframe
    df.to_csv(output_path, index_label="id")

# Build the TS115 input/output paths under dataset_dir and run the preprocessing
file_path = f"{dataset_dir}TS115.csv"
output_path = f"{dataset_dir}TS115_preprocessed.csv"
process_protein_data_ts115(file_path)