In [16]:
# Update: 5 May 2025 

# Post Processing Stage 1: Get the HIGHEST rank Structure for each Protein predicted by AlphaFold3
- Goal: Convert every prediction to DSSP format and store the results in a specified directory.
- Input: AlphaFold3 predictions in mmCIF format 
- Output: DSSP format files

In [17]:
import os
import shutil

# Root folders: raw predictions (input) and post-processed files (output)
prediction_path, output_path = '../Data/Predictions', '../Data/Postprocessed'

# One output sub-folder per benchmark dataset
cb513_path, ts115_path, casp10_path, casp12_path = (
    os.path.join(output_path, _dataset_dir)
    for _dataset_dir in ('CB513', 'TS115', 'CASP10', 'CASP12')
)

# Make sure the output root and every dataset sub-folder exist
for _directory in (output_path, cb513_path, ts115_path, casp10_path, casp12_path):
    os.makedirs(_directory, exist_ok=True)

In [18]:
def get_highest_confident_predictions(dataset_name):
    """
    Copy the highest-confidence prediction of every protein in a dataset.

    Scans ``<prediction_path>/<dataset_name>/`` for sub-folders whose name
    starts with ``dataset_name`` and copies each file named
    ``fold_<dataset_name>_<index>_model_0.cif`` (model 0 is the highest
    confident one) into ``<output_path>/<dataset_name>/``. Progress messages and
    a summary of the indices that were not found are printed.

    Args:
        dataset_name (str): One of 'cb513', 'ts115', 'casp10', 'casp12'.

    Raises:
        ValueError: If ``dataset_name`` is not one of the known datasets.
    """
    # Highest expected protein index for each dataset.
    # NOTE(review): the completeness check below expects indices
    # 0..max_index inclusive -- confirm these values are maximum indices and
    # not protein counts, otherwise the last index is always reported missing.
    dataset_info = {
        'cb513': 513,
        'ts115': 115,
        'casp10': 122,
        'casp12': 20,
    }

    # Fail early with a readable message instead of a TypeError further down
    if dataset_name not in dataset_info:
        raise ValueError(
            f"Unknown dataset '{dataset_name}'; expected one of {sorted(dataset_info)}"
        )
    max_index = dataset_info[dataset_name]

    # Indices we expect to see vs. indices actually copied
    expected_indices = set(range(max_index + 1))
    found_indices = set()

    dataset_path = os.path.join(prediction_path, dataset_name)
    destination_dir = os.path.join(output_path, dataset_name)
    print(f'Processing dataset: {dataset_name}')
    print(f'Looking for files in: {dataset_path}')

    # The destination must exist before copying. On case-sensitive filesystems
    # a lower-case dataset name does not match an upper-case folder such as
    # 'CB513', so create the exact folder that is written to.
    # NOTE(review): later cells read from the upper-case folders -- confirm
    # which casing the pipeline should standardise on.
    os.makedirs(destination_dir, exist_ok=True)

    file_prefix = f'fold_{dataset_name}'
    # Name format: fold_{dataset_name}_{index}_model_0 -> 0 is the highest confident
    file_suffix = '_model_0.cif'

    # Sorted listings keep the log output deterministic across platforms
    for folder in sorted(os.listdir(dataset_path)):
        folder_path = os.path.join(dataset_path, folder)
        if not (folder.startswith(dataset_name) and os.path.isdir(folder_path)):
            continue

        for file in sorted(os.listdir(folder_path)):
            if not (file.startswith(file_prefix) and file.endswith(file_suffix)):
                continue

            # The index is the third underscore-separated token of the name
            try:
                index = int(file.split('_')[2])
            except (IndexError, ValueError):
                index = None
                print(f"Failed to extract index from file name: {file}")

            src = os.path.join(folder_path, file)
            dst = os.path.join(destination_dir, file)
            # shutil.copy raises on failure, so report the error here and keep
            # going; the index then shows up in the missing-indices summary.
            try:
                shutil.copy(src, dst)
            except OSError as error:
                print(f'Failed to copy file to {dst}: {error}')
                continue

            print(f'File successfully copied to {dst}')
            if index is not None:
                found_indices.add(index)

    # Report which expected indices were never copied
    missing_indices = expected_indices - found_indices
    if missing_indices:
        print(f"Missing indices for {dataset_name}: {sorted(missing_indices)}")
    else:
        print(f"All indices are present for {dataset_name}.")

In [19]:
# Copy every `*_model_0.cif` prediction of the cb513 dataset into the output directory
get_highest_confident_predictions('cb513')

Processing dataset: cb513
Looking for files in: ../Data/Predictions/cb513
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_0_model_0.cif
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_1_model_0.cif
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_10_model_0.cif
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_100_model_0.cif
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_101_model_0.cif
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_102_model_0.cif
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_103_model_0.cif
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_104_model_0.cif
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_105_model_0.cif
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_106_model_0.cif
File successfully copied to ../Data/Postprocessed/cb513/fold_cb513_107_model_0.cif
File successfully 

In [20]:
# get_highest_confident_predictions('ts115')

In [21]:
# get_highest_confident_predictions('casp10')

In [22]:
# get_highest_confident_predictions('casp12')

# Post Processing Stage 2: Convert tertiary structure predictions to secondary structure

In [23]:
import matplotlib.pyplot as plt
import os
import pandas as pd
from Bio.PDB import MMCIFParser, DSSP

## Landasan Teori DSSP

Biopython DSSP Secondary structure classification

| Code | Structure                     |
|------|-------------------------------|
| H    | Alpha helix (4-12)            |
| B    | Isolated beta-bridge residue  |
| E    | Strand                        |
| G    | 3-10 helix                    |
| I    | Pi helix                      |
| T    | Turn                          |
| S    | Bend                          |
| -    | None                          |

PDB-REDO DSSP Secondary structure classification
| DSSP Code | mmCIF Code      | Description   |
|-----------|-----------------|---------------|
| H         | HELX_RH_AL_P    | Alphahelix    |
| B         | STRN            | Betabridge    |
| E         | STRN            | Strand        |
| G         | HELX_RH_3T_P    | Helix_3       |
| I         | HELX_RH_PI_P    | Helix_5       |
| P         | HELX_LH_PP_P    | Helix_PPII    |
| T         | TURN_TY1_P      | Turn          |
| S         | BEND            | Bend          |
| ' ' (space)| OTHER          | Loop          |

Biopython DSSP Tuple Output per Residue
| Tuple Index | Value                     |
|-------------|---------------------------|
| 0           | DSSP index               |
| 1           | Amino acid               |
| 2           | Secondary structure      |
| 3           | Relative ASA             |
| 4           | Phi                      |
| 5           | Psi                      |
| 6           | NH–>O_1_relidx           |
| 7           | NH–>O_1_energy           |
| 8           | O–>NH_1_relidx           |
| 9           | O–>NH_1_energy           |
| 10          | NH–>O_2_relidx           |
| 11          | NH–>O_2_energy           |
| 12          | O–>NH_2_relidx           |
| 13          | O–>NH_2_energy           |


In [24]:
# Parse one post-processed mmCIF prediction and run DSSP on its first model
# through the external `mkdssp` executable.
parser = MMCIFParser(QUIET=True)
file_path = "../Data/Postprocessed/CASP10/fold_casp10_0_model_0.cif"
struktur = parser.get_structure('struktur', file_path)
dssp = DSSP(model=struktur[0], in_file=file_path, dssp='mkdssp')
# Print the 14 DSSP features of every amino acid
for data_dssp in dssp:
    print(data_dssp)

(1, 'L', '-', 1.0, 360.0, 132.6, 0, 0.0, 2, -0.2, 0, 0.0, 0, 0.0)
(2, 'L', '-', 0.6890243902439024, -68.8, 134.0, 1, -0.1, 2, -0.1, 2, -0.0, 0, 0.0)
(3, 'S', '-', 0.23076923076923078, -73.9, 154.0, -2, -0.2, 4, -2.1, 1, -0.1, 28, -0.2)
(4, 'T', 'H', 0.1267605633802817, -61.4, -38.4, 27, -0.5, 4, -3.0, 1, -0.2, 5, -0.2)
(5, 'D', 'H', 0.38650306748466257, -61.4, -40.1, 25, -2.2, 4, -2.4, 2, -0.2, -1, -0.2)
(6, 'I', 'H', 0.5207100591715976, -68.4, -40.9, 24, -0.4, 4, -2.2, -3, -0.2, -1, -0.2)
(7, 'W', 'H', 0.2026431718061674, -62.7, -45.5, -4, -2.1, 4, -2.3, 2, -0.2, -2, -0.2)
(8, 'V', 'H', 0.02112676056338028, -61.0, -45.2, -4, -3.0, 4, -2.6, 2, -0.2, 5, -0.2)
(9, 'A', 'H', 0.5471698113207547, -61.0, -39.0, -4, -2.4, 4, -2.6, 1, -0.2, -1, -0.2)
(10, 'A', 'H', 0.4528301886792453, -67.6, -39.1, -4, -2.2, 4, -2.1, 2, -0.2, -2, -0.2)
(11, 'L', 'H', 0.0, -62.1, -49.8, -4, -2.3, 4, -2.4, 2, -0.2, -2, -0.2)
(12, 'I', 'H', 0.16568047337278108, -56.7, -50.2, -4, -2.6, 4, -2.8, 1, -0.2, 5, -0.2)
(

## Hitung DSSP dengan Biopython (lama)

In [25]:
# def calculate_dssp(dataset_name):
#     """
#     Calculates DSSP features for each protein (mmCIF file) in the specified dataset and saves the results to a CSV file.

#     Args:
#         dataset_name (str): The name of the dataset (e.g., 'casp10', 'cb513', 'ts115').

#     Returns:
#         None
#     """
#     # Initialize the parser for reading mmCIF files
#     parser = MMCIFParser(QUIET=True)
    
#     # Define the directories for input and output
#     input_directory = f"../Data/Postprocessed/{dataset_name}/"
#     output_file = f"../Data/Postprocessed/DSSP_{dataset_name}.csv"
    
#     protein_data = []

#     # Iterate through all files in the input directory
#     for file_name in os.listdir(input_directory):
#         # Process only mmCIF files
#         if file_name.endswith('.cif'):
#             file_path = os.path.join(input_directory, file_name)
#             print(f"Processing file: {file_path}...")
#             try:
#                 # Parse the structure from the mmCIF file
#                 structure = parser.get_structure('structure', file_path)
#                 # Compute secondary structure using DSSP
#                 dssp = DSSP(model=structure[0], in_file=file_path, dssp='mkdssp')
#                 # Append the DSSP results to the list
#                 protein_data.append([file_name, dssp])
#             except Exception as e:
#                 print(f"Error processing file {file_name}: {e}")

#     feature_data = []

#     # Process DSSP results for each protein
#     for protein in protein_data:
#         file_name, dssp = protein
#         residues = ''
#         secondary_structure = ''
#         # Concatenate all residues and secondary structure elements into strings
#         for dssp_entry in dssp:
#             residues += dssp_entry[1]
#             secondary_structure += dssp_entry[2]
#         feature_data.append([file_name, len(residues), residues, secondary_structure])

#     # Convert the data into a pandas DataFrame
#     df = pd.DataFrame(feature_data, columns=['file_name', 'length', 'residues', 'secondary_structure'])
#     # Extract the ID from the file name
#     df['id'] = df['file_name'].apply(lambda x: int(x.split('_')[2]))
#     # Drop the file_name column
#     df.drop(columns=['file_name'], inplace=True)
#     # Reorder columns to place 'id' at the front
#     df = df[['id', 'length', 'residues', 'secondary_structure']]
#     # Sort the DataFrame by 'id'
#     df.sort_values(by=['id'], inplace=True)
#     # Save the DataFrame to a CSV file
#     df.to_csv(output_file, index=False)

#     print(f"DSSP data successfully saved to {output_file}.")

In [26]:
# # DSSP dataset casp10
# data_protein_casp10 = calculate_dssp('CASP10')

In [27]:
# # DSSP dataset cb513
# data_protein_casp10 = calculate_dssp('CB513')

In [28]:
# # DSSP dataset ts115
# data_protein_casp10 = calculate_dssp('TS115')

## Hitung DSSP via Script bash

Script ada di `convert.sh` dan `convert_dsspcif.sh`

## Function to extract secondary structure from `.dssp` file

In [32]:
# Sample `.dssp` file (cb513 entry 0) used to try out the parser defined below
dssp_file_path = '../Data/Postprocessed/CB513/fold_cb513_0_model_0.dssp'

# Sequence the parsed residues are printed next to for comparison
# (presumably the original CB513 sequence of entry 0 -- verify against the dataset)
real_residue_cb513_0 = 'VPSLATISLENSWSGLSKQIQLAQGNNGIFRTPIVLVDNKGNRVQITNVTSKVVTSNIQLLLNTRNI'

In [None]:
def extract_secondary_structure(dssp_file):
    """
    Extracts the residue and secondary structure sequence from a DSSP file.

    Only the per-residue table that follows the ``  #  RESIDUE`` header line
    is read. For each row the amino-acid letter (column 14) and the
    secondary-structure code (column 17) are collected; a blank structure
    code is written as ``-``.

    Two DSSP conventions are handled so the returned sequence matches the
    real amino-acid sequence:
      * rows whose amino-acid column is ``!`` mark a chain break, not a
        residue, and are skipped;
      * lower-case amino-acid letters denote cysteines in a disulfide bridge
        and are reported as ``C``.

    Args:
        dssp_file (str): Path to a classic-format ``.dssp`` file.

    Returns:
        tuple[str, str]: ``(residues, secondary_structure)``.
    """
    residue_letters = []
    structure_codes = []
    in_residue_table = False

    # Stream the file line by line instead of loading it fully into memory
    with open(dssp_file, 'r') as file:
        for line in file:
            if line.startswith("  #  RESIDUE"):  # Start reading after this line
                in_residue_table = True
                continue
            if not in_residue_table or line.strip() == "":
                continue

            # Extract residue (AA column) and secondary structure (STRUCTURE column)
            residue = line[13].strip()  # from left, column 14
            structure = line[16]  # from left, column 17

            # '!' is DSSP's chain-break marker: there is no residue on this row
            if residue == "!":
                continue
            # Lower-case letters label disulfide-bonded cysteines (a, b, c, ...)
            if residue.islower():
                residue = "C"
            # If the structure is a space, set it "-"
            if structure == " ":
                structure = "-"

            residue_letters.append(residue)
            structure_codes.append(structure)

    return "".join(residue_letters), "".join(structure_codes)

In [33]:
# Example usage: parse one DSSP file and show it next to the reference sequence
residue, secondary_structure = extract_secondary_structure(dssp_file_path)
print(f"xx: {real_residue_cb513_0}")
print(f"aa: {residue}")
print(f"ss: {secondary_structure}")

# Fail loudly instead of relying on a visual comparison of the printed lines
assert residue == real_residue_cb513_0, "parsed residues differ from the reference sequence"
assert len(secondary_structure) == len(residue), "residue and secondary-structure lengths differ"

xx: VPSLATISLENSWSGLSKQIQLAQGNNGIFRTPIVLVDNKGNRVQITNVTSKVVTSNIQLLLNTRNI
aa: VPSLATISLENSWSGLSKQIQLAQGNNGIFRTPIVLVDNKGNRVQITNVTSKVVTSNIQLLLNTRNI
ss: ---HHHHHHHHHHHHHHHHHHHHTTTTTEEEEEEEEE-TTS-EEEEEETTSHHHHHHHHHHHHHHH-


## Extract SS from every dssp files inside a dataset folder
- Extract the secondary structures of every protein in dataset
- Secondary structure is stored in `.dssp` files previously generated by `convert.sh` bash script
- Save them into a single CSV file

  \- columns are: id, length, residue, predicted_dssp8

- The input dssp file name format: fold_{dataset_name}_{index}_model_0.dssp

  \- for example: fold_casp10_0_model_0.dssp

In [34]:
path = '../Data/Postprocessed/'

# Folders holding the per-protein `.dssp` files of each dataset
cb513_dssp_path, ts115_dssp_path, casp10_dssp_path, casp12_dssp_path = (
    os.path.join(path, _folder)
    for _folder in ('CB513', 'TS115', 'CASP10', 'CASP12')
)

# Combined CSV file written for each dataset
cb513_output_path, ts115_output_path, casp10_output_path, casp12_output_path = (
    os.path.join(path, _csv_name)
    for _csv_name in ('cb513_dssp.csv', 'ts115_dssp.csv', 'casp10_dssp.csv', 'casp12_dssp.csv')
)

In [37]:
def extract_dssp_data(dssp_path, output_path):
    """
    Collect the secondary structure of every `.dssp` file in one folder into a CSV.

    The third underscore-separated token of each file name is parsed as the
    integer row id; files whose id cannot be parsed are reported and skipped.
    Rows are written in ascending id order below the header
    ``id,length,residue,predicted_dssp8``.
    """
    records = []

    for file_name in os.listdir(dssp_path):
        # Anything that is not a DSSP file is ignored
        if not file_name.endswith('.dssp'):
            continue
        print(f"Processing file: {file_name}...")

        # The protein id sits between the second and third underscore
        name_parts = file_name.split('_')
        try:
            protein_id = int(name_parts[2])
        except (IndexError, ValueError):
            print(f"Failed to extract index from file name: {file_name}")
            continue

        # Parse the sequence and its secondary structure from the file
        sequence, structure = extract_secondary_structure(
            os.path.join(dssp_path, file_name))
        records.append((protein_id, len(sequence), sequence, structure))

    # Header first, then one line per protein ordered by id
    csv_lines = ["id,length,residue,predicted_dssp8\n"]
    for protein_id, length, sequence, structure in sorted(records, key=lambda record: record[0]):
        csv_lines.append(f"{protein_id},{length},{sequence},{structure}\n")

    with open(output_path, 'w') as outfile:
        outfile.writelines(csv_lines)

In [None]:
# Combine the `.dssp` files in the CB513 folder into cb513_dssp.csv
extract_dssp_data(cb513_dssp_path, cb513_output_path)

In [None]:
# Combine the `.dssp` files in the TS115 folder into ts115_dssp.csv
extract_dssp_data(ts115_dssp_path, ts115_output_path)

In [None]:
# Combine the `.dssp` files in the CASP10 folder into casp10_dssp.csv
extract_dssp_data(casp10_dssp_path, casp10_output_path)

In [38]:
# Combine the `.dssp` files in the CASP12 folder into casp12_dssp.csv
extract_dssp_data(casp12_dssp_path, casp12_output_path)

Processing file: fold_casp12_0_model_0.dssp...
Processing file: fold_casp12_10_model_0.dssp...
Processing file: fold_casp12_11_model_0.dssp...
Processing file: fold_casp12_12_model_0.dssp...
Processing file: fold_casp12_13_model_0.dssp...
Processing file: fold_casp12_14_model_0.dssp...
Processing file: fold_casp12_15_model_0.dssp...
Processing file: fold_casp12_16_model_0.dssp...
Processing file: fold_casp12_17_model_0.dssp...
Processing file: fold_casp12_18_model_0.dssp...
Processing file: fold_casp12_19_model_0.dssp...
Processing file: fold_casp12_1_model_0.dssp...
Processing file: fold_casp12_2_model_0.dssp...
Processing file: fold_casp12_3_model_0.dssp...
Processing file: fold_casp12_4_model_0.dssp...
Processing file: fold_casp12_5_model_0.dssp...
Processing file: fold_casp12_6_model_0.dssp...
Processing file: fold_casp12_7_model_0.dssp...
Processing file: fold_casp12_8_model_0.dssp...
Processing file: fold_casp12_9_model_0.dssp...
