# Final

# Import Required Libraries
Import necessary libraries, including gzip and numpy.

In [3]:
# Import necessary libraries
import gzip
import numpy as np

# Load Data from Files
Load the cb513_sequence and cb513_testlabel data from the compressed numpy files using gzip.

In [4]:
# Load Data from Files

# Load cb513_sequence data from the compressed numpy file.
# The context manager closes the gzip stream even if np.load raises.
with gzip.GzipFile('cb513_sequence_MASK.npy.gz', "r") as f:
    cb513_sequence = np.load(f)

# Load cb513_testlabel data from the compressed numpy file the same way.
with gzip.GzipFile('cb513_testlabel_MASK.npy.gz', "r") as f:
    cb513_testlabel = np.load(f)

In [5]:
# Show the dimensions of both loaded arrays, one shape per line
print(cb513_sequence.shape, cb513_testlabel.shape, sep='\n')

(514, 700, 22)
(514, 700, 9)


# Convert One-Hot to Label

## Sequence Decoding

In [6]:
# Residue alphabet: position in this list == one-hot index (index 0 is '_')
amino_acid_residues = list('_ACEDGFIHKMLNQPSRTWVYX')


def residue_to_string(sequence):
    """Decode a one-hot encoded sequence into a residue string.

    The argmax over the last axis picks, for every position, the index of the
    character to take from ``amino_acid_residues``. Trailing '_' characters
    are stripped from the result; '_' characters elsewhere are kept.
    """
    residue_ids = np.argmax(sequence, axis=-1)
    decoded = ''.join(amino_acid_residues[idx] for idx in residue_ids)
    return decoded.rstrip('_')


# Decode every one-hot sequence in cb513_sequence into its residue string.
# NOTE: this rebinds cb513_sequence from the loaded array to a list of strings,
# so re-running the cell without reloading would pass strings to residue_to_string.
cb513_sequence = list(map(residue_to_string, cb513_sequence))
print(cb513_sequence[0])

VPSLATISLENSWSGLSKQIQLAQGNNGIFRTPIVLVDNKGNRVQITNVTSKVVTSNIQLLLNTRNI


## Label Decoding

In [7]:
# Class index -> secondary-structure character (index 0 is '_')
DSSP = dict(enumerate('_CBEGIHST'))


def label_to_string(label):
    """Decode a one-hot encoded label array into a secondary-structure string.

    The argmax over the last axis gives one class index per position, which is
    looked up in ``DSSP``. Trailing '_' characters are stripped from the
    result; '_' characters elsewhere are kept.
    """
    class_ids = np.argmax(label, axis=-1)
    decoded = ''.join(DSSP[idx] for idx in class_ids)
    return decoded.rstrip('_')

# Decode every one-hot label array in cb513_testlabel into its structure string.
# NOTE: this rebinds cb513_testlabel from the loaded array to a list of strings,
# so re-running the cell without reloading would pass strings to label_to_string.
cb513_testlabel = list(map(label_to_string, cb513_testlabel))
print(cb513_testlabel[0])

CCCHHHHHHHHHHHHHHHHHHHHTTTTTEEEEEEEECCSSSSCEEEEETTSHHHHHTBCCBCCGGGC


In [8]:
# Show the first decoded sequence with its label directly underneath
print(cb513_sequence[0], cb513_testlabel[0], sep='\n')

VPSLATISLENSWSGLSKQIQLAQGNNGIFRTPIVLVDNKGNRVQITNVTSKVVTSNIQLLLNTRNI
CCCHHHHHHHHHHHHHHHHHHHHTTTTTEEEEEEEECCSSSSCEEEEETTSHHHHHTBCCBCCGGGC


In [9]:
def extract_secondary_structure(dssp_file):
    """Read the per-residue secondary-structure string from a DSSP file.

    Every line up to and including the ``  #  RESIDUE`` column-header line is
    ignored. For each record after it, the character at column index 16 is
    collected; a blank in that column is reported as ``"C"`` (coil).

    Args:
        dssp_file: Path of the DSSP output file to read.

    Returns:
        str: One character per residue record, in file order. Empty if the
        file has no ``  #  RESIDUE`` header or no records after it.

    Raises:
        OSError: If the file cannot be opened.
    """
    states = []
    in_records = False
    with open(dssp_file, 'r') as file:
        # Stream the file line by line instead of loading it with readlines().
        for line in file:
            if line.startswith("  #  RESIDUE"):
                in_records = True
                continue
            if not in_records:
                continue
            record = line.rstrip('\r\n')
            # Skip blank or truncated lines: they have no structure column,
            # and indexing them would raise IndexError.
            if len(record) <= 16 or not record.strip():
                continue
            # DSSP marks chain breaks with '!' in the amino-acid column
            # (index 13). Those records are not residues, and their blank
            # structure column must not be counted as coil.
            if record[13] == '!':
                continue
            state = record[16]
            states.append("C" if state == " " else state)
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(states)

# Path of the DSSP file to parse, relative to the working directory.
# NOTE(review): the name suggests DSSP output for a structure model of CB513 entry 0 — confirm.
dssp_file = 'fold_cb513_0_model_0.dssp'
# Extract the per-residue structure string and display it.
secondary_structure = extract_secondary_structure(dssp_file)
print(secondary_structure)

CCCHHHHHHHHHHHHHHHHHHHHTTTTTEEEEEEEEECTTSCEEEEEETTSHHHHHHHHHHHHHHHC


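$$Q_s = \frac{\sum_{i=1}^{s} C_i}{n} \times 100\%$$

where $s$ is the number of secondary-structure classes, $C_i$ is the number of correctly predicted residues in class $i$, and $n$ is the total number of residues.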

In [11]:
# Calculate the accuracy of the predicted secondary structure (Q Score)
def calculate_accuracy(predicted, actual):
    """Return the fraction of positions where ``predicted`` equals ``actual``.

    The result is a fraction in [0, 1]; multiply by 100 for a percentage.

    Args:
        predicted: Sequence of predicted states (e.g. a string).
        actual: Sequence of reference states with the same length.

    Returns:
        float: Matching positions divided by the sequence length, or 0.0 when
        both sequences are empty.

    Raises:
        ValueError: If the two sequences differ in length. Comparing them
            position by position would silently ignore the extra elements
            and give a meaningless score.
    """
    total = len(predicted)
    if total != len(actual):
        raise ValueError(
            f"length mismatch: predicted has {total} elements, "
            f"actual has {len(actual)}"
        )
    if total == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    correct = sum(p == a for p, a in zip(predicted, actual))
    return correct / total

# Calculate the accuracy of the predicted secondary structure.
# Keyword arguments put each string in the role the function's parameter names
# give it: presumably the DSSP-derived string is the prediction and the CB513
# test label is the ground truth — confirm.
accuracy = calculate_accuracy(predicted=secondary_structure, actual=cb513_testlabel[0])
print(accuracy)

0.7761194029850746
