The objective of this task is to classify amino acids according to their secondary structure, using the observed counts provided in the trainCounts.txt file. For each amino acid, the secondary structure with the highest count (the predominant structure) is assigned as its class. This methodology is employed to achieve accurate classification.

In [40]:
# This code aims to output the percentage frequency for all the amino acids in the trainCounts for the secondary structures:
def frequency_structure(filename):
    """Print the C/H/E percentage breakdown for every amino acid in a counts file.

    Each non-blank line of ``filename`` is read as whitespace-separated
    tokens: the first token starts with the amino-acid letter followed by a
    structure letter, and the second token is an integer count (for example
    ``AH 1234``). Counts are summed per amino acid and per structure
    ("C", "H" or "E"); any other structure letter is ignored.

    Three rows are printed per amino acid, in the order C, H, E. The
    "Classification" column repeats the amino acid's predominant structure
    on all three rows, while the "Percentage" column holds the share of the
    C, H and E counts respectively.

    Blank lines are skipped, and an amino acid whose counts sum to zero is
    left out of the table because no percentage can be computed for it.

    Args:
        filename: Path of the counts file to read.
    """
    counts = {}
    with open(filename, "r") as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line (e.g. a trailing newline).
                continue
            key = tokens[0]
            structure_counts = counts.setdefault(
                key[0], {"C": 0, "H": 0, "E": 0}
            )
            if len(key) > 1 and key[1] in structure_counts:
                structure_counts[key[1]] += int(tokens[1])

    print("First Letter\tClassification\tPercentage")
    for letter, letter_counts in counts.items():
        total = sum(letter_counts.values())
        if total == 0:
            # Nothing was counted for this letter; avoid dividing by zero.
            continue
        percentages = {k: v / total * 100 for k, v in letter_counts.items()}
        max_key = max(percentages, key=percentages.get)
        for percentage in percentages.values():
            print(f"{letter}\t\t{max_key}\t\t{percentage:.2f}%")

In [41]:
# Print the per-structure percentage table for the training counts file.
frequency_structure("./trainCounts.txt")

First Letter	Classification	Percentage
A		H		31.34%
A		H		51.92%
A		H		16.74%
C		C		40.03%
C		C		31.01%
C		C		28.96%
D		C		54.66%
D		C		33.07%
D		C		12.27%
E		H		34.36%
E		H		50.22%
E		H		15.42%
F		H		30.55%
F		H		38.43%
F		H		31.03%
G		C		66.52%
G		C		18.61%
G		C		14.87%
H		C		44.60%
H		C		33.55%
H		C		21.85%
K		H		39.20%
K		H		43.05%
K		H		17.76%
L		H		26.78%
L		H		49.25%
L		H		23.97%
M		H		32.55%
M		H		46.22%
M		H		21.23%
N		C		56.90%
N		C		29.31%
N		C		13.79%
P		C		70.12%
P		C		19.81%
P		C		10.07%
Q		H		35.09%
Q		H		47.93%
Q		H		16.98%
R		H		35.40%
R		H		44.34%
R		H		20.26%
S		C		49.91%
S		C		31.37%
S		C		18.72%
T		C		43.64%
T		C		29.25%
T		C		27.11%
V		E		25.47%
V		E		33.23%
V		E		41.30%
W		H		29.82%
W		H		40.83%
W		H		29.35%
X		H		25.26%
X		H		42.01%
X		H		32.73%
Y		H		30.96%
Y		H		37.37%
Y		H		31.67%


In this next approach, the aim is to assign a secondary structure to each amino acid based solely on the frequency of its predominant secondary structure.

In [29]:
def classify_structure(filename):
    """Print the predominant secondary structure of every amino acid.

    Each non-blank line of ``filename`` is read as whitespace-separated
    tokens: the first token starts with the amino-acid letter followed by a
    structure letter, and the second token is an integer count (for example
    ``AH 1234``). Counts are summed per amino acid and per structure
    ("C", "H" or "E"); any other structure letter is ignored.

    One row is printed per amino acid: the letter, the structure with the
    highest count, and that structure's share of the letter's total count.
    When several structures tie, the first one in C, H, E order is reported.

    Blank lines are skipped, and an amino acid whose counts sum to zero is
    left out of the table because no percentage can be computed for it.

    Args:
        filename: Path of the counts file to read.
    """
    counts = {}
    with open(filename, "r") as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line (e.g. a trailing newline).
                continue
            key = tokens[0]
            structure_counts = counts.setdefault(
                key[0], {"C": 0, "H": 0, "E": 0}
            )
            if len(key) > 1 and key[1] in structure_counts:
                structure_counts[key[1]] += int(tokens[1])

    print("Amino acids\tAssoc_Sec_Structures\tPercentage")
    for letter, letter_counts in counts.items():
        total = sum(letter_counts.values())
        if total == 0:
            # Nothing was counted for this letter; avoid dividing by zero.
            continue
        percentages = {k: v / total * 100 for k, v in letter_counts.items()}
        max_key = max(percentages, key=percentages.get)
        print(f"{letter}\t\t{max_key}\t\t\t{percentages[max_key]:.2f}%")

In [70]:
# Print each amino acid's predominant structure for the training counts file.
classify_structure("./trainCounts.txt")

Amino acids	Assoc_Sec_Structures	Percentage
A		H			51.92%
C		C			40.03%
D		C			54.66%
E		H			50.22%
F		H			38.43%
G		C			66.52%
H		C			44.60%
K		H			43.05%
L		H			49.25%
M		H			46.22%
N		C			56.90%
P		C			70.12%
Q		H			47.93%
R		H			44.34%
S		C			49.91%
T		C			43.64%
V		E			41.30%
W		H			40.83%
X		H			42.01%
Y		H			37.37%


In [43]:
# Print each amino acid's predominant structure for the test counts file.
classify_structure("./testCounts.txt")

Amino acids	Assoc_Sec_Structures	Percentage
A		H			51.57%
C		C			39.56%
D		C			54.95%
E		H			49.82%
F		H			37.36%
G		C			67.14%
H		C			44.61%
K		H			42.81%
L		H			48.70%
M		H			44.91%
N		C			56.51%
P		C			70.31%
Q		H			47.01%
R		H			43.68%
S		C			50.08%
T		C			43.54%
V		E			41.62%
W		H			39.93%
X		H			43.59%
Y		H			36.58%


Using this approach, we can see that the amino acids are classified in a similar way in both the train and test sets when the frequency approach is used. This shows that this is a much more reliable approach, as there is consistency in the assignment of the associated secondary structures.

Test for accuracy of the predictor:

In [69]:
# Read trainCounts and testCounts files
def _load_counts(path):
    """Return ``{amino_acid: {structure: count}}`` read from a counts file.

    Every non-blank line must hold exactly two whitespace-separated fields:
    a key whose first character is the amino acid and whose second character
    is the structure label, followed by an integer count. Blank lines are
    skipped. If the same key appears more than once its counts are summed,
    which matches how the other counting cells in this notebook aggregate.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            the count is not an integer.
        IndexError: If a key is shorter than two characters.
    """
    table = {}
    with open(path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines such as a trailing newline.
                continue
            aa_ss, count = fields
            aa, ss = aa_ss[0], aa_ss[1]
            per_structure = table.setdefault(aa, {})
            per_structure[ss] = per_structure.get(ss, 0) + int(count)
    return table


train_counts = _load_counts('./trainCounts.txt')
test_counts = _load_counts('./testCounts.txt')

# Define function to predict secondary structure of a protein sequence
def predict_ss(protein_seq, counts):
    """Predict one structure label per residue of ``protein_seq``.

    Args:
        protein_seq: Iterable of amino-acid symbols.
        counts: Mapping of amino acid -> {structure label: count}.

    Returns:
        A string with one label per residue: the label with the highest
        count for that amino acid, or 'C' (coil) when the amino acid is not
        present in ``counts``.
    """
    def most_frequent_label(residue):
        if residue in counts:
            label_counts = counts[residue]
            return max(label_counts, key=label_counts.get)
        return 'C'  # default to coil

    return ''.join(most_frequent_label(residue) for residue in protein_seq)

# Calculate accuracy on the training and test sets
def _score_predictions(path, counts, n_proteins):
    """Count correctly predicted residues in a four-line-per-protein file.

    Each record in ``path`` is read as four consecutive lines: protein name,
    residue count, tab-separated amino acids, tab-separated structure labels.
    At most ``n_proteins`` records are read; reading stops early if the file
    ends first.

    Args:
        path: File holding the protein records.
        counts: Mapping of amino acid -> {structure label: count} passed on
            to ``predict_ss``.
        n_proteins: Maximum number of records to read.

    Returns:
        A ``(correct, total)`` tuple of residue counts.
    """
    correct = 0
    total = 0
    with open(path) as f:
        for _ in range(n_proteins):
            name_line = f.readline()
            if not name_line:
                # readline() returns '' only at end of file.
                break
            protein_length = int(f.readline().strip())
            protein_seq = f.readline().strip().split('\t')
            protein_ss = f.readline().strip().split('\t')
            predicted_ss = predict_ss(protein_seq, counts)
            for j in range(protein_length):
                if protein_ss[j] == predicted_ss[j]:
                    correct += 1
            total += protein_length
    return correct, total


# Calculate accuracy on training set
train_correct, train_total = _score_predictions(
    './trainSS.txt', train_counts, 11860
)
train_accuracy = train_correct / train_total
print('Training accuracy:', train_accuracy)

# Calculate accuracy on test set.
# The predictor is built from the TRAINING counts here: scoring the test
# proteins with counts derived from the test set itself would leak the test
# labels into the model and make the result meaningless as a held-out check.
test_correct, test_total = _score_predictions(
    './testSS.txt', train_counts, 3954
)
test_accuracy = test_correct / test_total
print('Test accuracy:', test_accuracy)

Training accuracy: 0.49057580602822454
Test accuracy: 0.4893581866060825


In [73]:
def parse_counts(filename):
    """Read a counts file into ``{amino_acid: {"C": n, "H": n, "E": n}}``.

    Each non-blank line is read as whitespace-separated tokens: the first
    token starts with the amino-acid letter followed by a structure letter,
    and the second token is an integer count (for example ``AH 1234``).
    Counts for the same amino acid and structure are summed. An amino acid
    always gets an entry with all three structures, even when its line has
    no structure letter or one other than C, H or E (such counts are
    ignored). Blank lines are skipped.

    Args:
        filename: Path of the counts file to read.

    Returns:
        Dict mapping each amino-acid letter to its C/H/E count dict.
    """
    counts = {}
    with open(filename, "r") as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line (e.g. a trailing newline).
                continue
            key = tokens[0]
            structure_counts = counts.setdefault(
                key[0], {"C": 0, "H": 0, "E": 0}
            )
            if len(key) > 1 and key[1] in structure_counts:
                structure_counts[key[1]] += int(tokens[1])
    return counts

def compute_accuracy(predictor, counts):
    """Return the fraction of counted residues that ``predictor`` labels correctly.

    For every amino acid and every structure in ("C", "H", "E"), the count
    stored in ``counts`` is the number of residues with that true structure.
    A residue is correct when ``predictor(letter)`` equals its structure.

    Args:
        predictor: Callable taking an amino-acid letter and returning a
            structure label.
        counts: Mapping of amino acid -> {structure label: count}. Missing
            structure keys are treated as a count of zero.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when ``counts`` holds no
        residues at all (accuracy is undefined there, so 0.0 is reported
        instead of raising ZeroDivisionError).
    """
    total = 0
    correct = 0
    for letter, letter_counts in counts.items():
        for classification in ("C", "H", "E"):
            n = letter_counts.get(classification, 0)
            if n <= 0:
                continue
            # Every one of the n residues receives the same prediction, so
            # add them in one step instead of looping n times.
            total += n
            if predictor(letter) == classification:
                correct += n
    if total == 0:
        return 0.0
    return correct / total

# Parse the training and test sets
# (this rebinds train_counts / test_counts, which the earlier accuracy cell also assigns)
train_counts = parse_counts("./trainCounts.txt")
test_counts = parse_counts("./testCounts.txt")

# Define a basic predictor that always predicts "E", whatever the amino acid
def basic_predictor(letter):
    """Return the constant structure label "E"; ``letter`` is ignored."""
    return "E"

# Compute the accuracy of the basic predictor on the training set and test set
train_accuracy = compute_accuracy(basic_predictor, train_counts)
test_accuracy = compute_accuracy(basic_predictor, test_counts)

# Print the results
print(f"Basic predictor accuracy on training set: {train_accuracy:.2f}")
print(f"Basic predictor accuracy on test set: {test_accuracy:.2f}")

Basic predictor accuracy on training set: 0.22
Basic predictor accuracy on test set: 0.22
