# Final

# Import Required Libraries
Import necessary libraries, including gzip and numpy.

In [1]:
# Import necessary libraries
import gzip
import numpy as np

# Load Data from Files
Load the cb513_sequence and cb513_testlabel data from the compressed numpy files using gzip.

In [2]:
# Load Data from Files

def load_gzipped_npy(path):
    """Load a NumPy array from a gzip-compressed ``.npy`` file.

    Args:
        path: Location of the ``.npy.gz`` file.

    Returns:
        The array stored in the file, as returned by ``np.load``.

    The file is opened in a ``with`` block so the handle is released even
    when ``np.load`` raises (e.g. on a corrupt or truncated file).
    """
    with gzip.GzipFile(path, "r") as f:
        return np.load(f)


# Load cb513_sequence data from compressed numpy file
cb513_sequence = load_gzipped_npy('../Data/Preprocessed/cb513_sequence_MASK.npy.gz')

# Load cb513_testlabel data from compressed numpy file
cb513_testlabel = load_gzipped_npy('../Data/Preprocessed/cb513_testlabel_MASK.npy.gz')

In [3]:
# Show the dimensions of both loaded arrays (sequences first, then labels)
for loaded_array in (cb513_sequence, cb513_testlabel):
    print(loaded_array.shape)

(514, 700, 22)
(514, 700, 9)


In [4]:
# Print the length of the feature vector of the first residue in the first sequence
first_residue_vector = cb513_sequence[0][0]
print(len(first_residue_vector))

22


# Convert One hot to Label

## Sequence Decoding

In [5]:
# Residue alphabet: position k holds the character for one-hot index k.
# Index 0 ('_') is the symbol stripped from the end of decoded sequences.
amino_acid_residues = list('_ACEDGFIHKMLNQPSRTWVYX')


def residue_to_string(sequence):
    """Decode a one-hot encoded sequence into a residue string.

    Args:
        sequence: 2-D array-like; each row is the one-hot vector of one
            position (argmax over the last axis selects the residue).

    Returns:
        The decoded string with trailing '_' characters removed.
    """
    decoded = []
    for residue_index in np.argmax(sequence, axis=-1):
        decoded.append(amino_acid_residues[residue_index])
    # Only trailing underscores are dropped; interior ones are kept.
    return ''.join(decoded).rstrip('_')


# Decode every one-hot sequence in cb513_sequence into a residue string
cb513_sequence_string = list(map(residue_to_string, cb513_sequence))

# Preview the first two decoded sequences together with their lengths
for preview_index in (0, 1):
    decoded_sequence = cb513_sequence_string[preview_index]
    print(decoded_sequence)
    print(len(decoded_sequence))

VPSLATISLENSWSGLSKQIQLAQGNNGIFRTPIVLVDNKGNRVQITNVTSKVVTSNIQLLLNTRNI
67
MFKVYGYDSNIHKCVYCDNAKRLLTVKKQPFEFINIMPEKGVFDDEKIAELLTKLGRDTQIGLTMPQVFAPDGSHIGGFDQLREYFK
87


## Label Decoding

In [6]:
# One-hot index -> secondary-structure character.
# Named DSSP_LABELS (not DSSP) because a later cell runs
# `from Bio.PDB import MMCIFParser, DSSP`, which would rebind a global called
# DSSP to the Bio.PDB class and break any later call to label_to_string.
DSSP_LABELS = {0: '_', 1: 'C', 2: 'B', 3: 'E', 4: 'G', 5: 'I', 6: 'H', 7: 'S', 8: 'T'}

def label_to_string(label):
    """Decode a one-hot encoded label array into a secondary-structure string.

    Args:
        label: 2-D array-like; each row is the one-hot vector of one position
            (argmax over the last axis selects the state).

    Returns:
        The decoded string with trailing '_' characters removed.
    """
    indices = np.argmax(label, axis=-1)
    decoded = ''.join(DSSP_LABELS[i] for i in indices)
    return decoded.rstrip('_')  # Remove trailing underscores

# Decode every one-hot label array in cb513_testlabel into a string.
# NOTE: this rebinds cb513_testlabel from the loaded array to a list of
# strings, so the cell cannot be re-run without reloading the data first.
cb513_testlabel = list(map(label_to_string, cb513_testlabel))
print(cb513_testlabel[0])

CCCHHHHHHHHHHHHHHHHHHHHTTTTTEEEEEEEECCSSSSCEEEEETTSHHHHHTBCCBCCGGGC


In [9]:
# Show the first decoded sequence with its decoded label on the next line
print(cb513_sequence_string[0], cb513_testlabel[0], sep='\n')

VPSLATISLENSWSGLSKQIQLAQGNNGIFRTPIVLVDNKGNRVQITNVTSKVVTSNIQLLLNTRNI
CCCHHHHHHHHHHHHHHHHHHHHTTTTTEEEEEEEECCSSSSCEEEEETTSHHHHHTBCCBCCGGGC


In [23]:
# NOTE: this whole cell is commented out and does not run.
# The disabled helper reads a DSSP text file, starts after the "  #  RESIDUE"
# header line, takes the character at index 16 of each record as the state,
# and maps a blank state to "C".
# def extract_secondary_structure(dssp_file):
#     secondary_structure = ""
#     with open(dssp_file, 'r') as file:
#         lines = file.readlines()
#         start_reading = False
#         for line in lines:
#             if line.startswith("  #  RESIDUE"):
#                 start_reading = True
#                 continue
#             if start_reading:
#                 if line.strip() == "":
#                     continue
#                 structure = line[16]
#                 if structure == " ":
#                     structure = "C"  # Coil
#                 secondary_structure += structure
#     return secondary_structure

# dssp_file = 'fold_cb513_0_model_0.dssp'
# secondary_structure = extract_secondary_structure(dssp_file)
# print(secondary_structure)

In [None]:
from Bio.PDB import MMCIFParser, DSSP

# The same mmCIF file is parsed for its structure and then handed to DSSP
cif_path = "fold_cb513_0/fold_cb513_0_model_0.cif"

# Load mmCIF file
parser = MMCIFParser()
structure = parser.get_structure("model", cif_path)

# Run DSSP on the first model of the structure
dssp = DSSP(structure[0], cif_path, dssp='mkdssp')

# Collect every DSSP record once, then build two strings from them:
#   residue - element 1 of each record (the residue letter)
#   ss      - element 2 of each record (the assigned secondary structure)
dssp_records = [dssp[key] for key in dssp.keys()]
residue = "".join(record[1] for record in dssp_records)
ss = "".join(record[2] for record in dssp_records)

print(residue)
print(ss)

VPSLATISLENSWSGLSKQIQLAQGNNGIFRTPIVLVDNKGNRVQITNVTSKVVTSNIQLLLNTRNI
---HHHHHHHHHHHHHHHHHHHHTTTTTEEEEEEEEE-TTS-EEEEEETTSHHHHHHHHHHHHHHH-


In [12]:
# Rewrite DSSP's '-' symbol as 'C' so ss uses the same alphabet as the labels
dash_to_coil = str.maketrans("-", "C")
ss = ss.translate(dash_to_coil)
print(ss)

CCCHHHHHHHHHHHHHHHHHHHHTTTTTEEEEEEEEECTTSCEEEEEETTSHHHHHHHHHHHHHHHC


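$$Q_s = \frac{\sum_{i=1}^{s} C_i}{n} \times 100\%$$

where $s$ is the number of secondary-structure states (8 for Q8, 3 for Q3), $C_i$ is the number of correctly predicted residues in state $i$, and $n$ is the total number of residues.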

In [None]:
# Per-residue accuracy between two secondary-structure strings (Q8 or Q3,
# depending on the alphabet of the inputs)
def calculate_accuracy(predicted, actual):
    """Return the fraction of positions where ``predicted`` equals ``actual``.

    Args:
        predicted: Predicted secondary-structure sequence.
        actual: Reference secondary-structure sequence.

    Returns:
        A float in [0, 1]: matching positions divided by the sequence length.
        Multiply by 100 for a percentage. Two empty sequences give 0.0
        instead of raising ZeroDivisionError.

    Raises:
        ValueError: If the sequences differ in length. ``zip`` would otherwise
            silently drop the extra positions and the score would be computed
            over misaligned or incomplete data.
    """
    total = len(predicted)
    if total != len(actual):
        raise ValueError(
            f"Length mismatch: predicted has {total} positions, "
            f"actual has {len(actual)}"
        )
    if total == 0:
        return 0.0
    correct = sum(1 for p, a in zip(predicted, actual) if p == a)
    return correct / total

# Q8 accuracy of the DSSP-derived string (ss) against the reference label.
# Keyword arguments make the roles explicit: the original positional call
# passed the reference label as `predicted` and the prediction as `actual`.
accuracy = calculate_accuracy(predicted=ss, actual=cb513_testlabel[0])
print(accuracy)

0.7761194029850746


In [21]:
# Collapse an 8-state secondary-structure string into the 3-state alphabet
def convert_8state_to_3state(ss):
    """Map each 8-state symbol in ``ss`` to its 3-state class.

    'H', 'G' and 'I' become 'H'; 'E' and 'B' become 'E'; every other symbol
    becomes 'C'.

    Args:
        ss: Secondary-structure string to convert.

    Returns:
        A string of the same length containing only 'H', 'E' and 'C'.
    """
    helix_symbols = ('H', 'G', 'I')
    strand_symbols = ('E', 'B')
    return "".join(
        'H' if symbol in helix_symbols
        else 'E' if symbol in strand_symbols
        else 'C'
        for symbol in ss
    )

# Convert every reference label to 3 states. The original copied the 8-state
# list and converted only element 0, leaving all other entries 8-state under
# a name that claims otherwise.
cb513_testlabel_3state = [
    convert_8state_to_3state(label) for label in cb513_testlabel
]
ss_3state = convert_8state_to_3state(ss)

print(cb513_testlabel_3state[0])
print(ss_3state)

CCCHHHHHHHHHHHHHHHHHHHHCCCCCEEEEEEEECCCCCCCEEEEECCCHHHHHCECCECCHHHC
CCCHHHHHHHHHHHHHHHHHHHHCCCCCEEEEEEEEECCCCCEEEEEECCCHHHHHHHHHHHHHHHC


In [22]:
# Calculate the accuracy of the predicted secondary structure (Q3 Score).
# calculate_accuracy is already defined in the Q8 cell; the identical
# duplicate definition that used to live here has been removed so there is a
# single implementation. Keyword arguments keep the prediction and the
# reference label in their intended roles.
accuracy = calculate_accuracy(
    predicted=ss_3state,
    actual=cb513_testlabel_3state[0],
)
print(accuracy)

0.8656716417910447
