# Aritifical Neural Network for PBL

Prediction of the secondary structure of a protein using an ANN in order to predict the states of every amino acid in the sequence. This then gives an overview on how a protein could look like in reality only given the sequence of amino acids.

In [1]:
import numpy as num
import pandas as pnds
import math
import sys
import datetime
import sklearn.metrics as metrics
import csv

from joblib import dump, load

from sklearn import datasets
from sklearn.preprocessing import Normalizer
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from scipy.stats import sem, t
from scipy import mean

# Import of data

Import data from text files.

In [2]:
# open files
file_test_sequence = open("/home/annette/Data/Studium/Module/PBL/Dataset/test_sequences.fasta")
file_training_sequence = open("/home/annette/Data/Studium/Module/PBL/Dataset/training_sequences.fasta")
file_test_structure = open("/home/annette/Data/Studium/Module/PBL/Dataset/test_structure.fasta")
file_training_structure = open("/home/annette/Data/Studium/Module/PBL/Dataset/training_structure.fasta")
file_test_disorder = open("/home/annette/Data/Studium/Module/PBL/Dataset/test_pdb_disorder.fasta")
file_training_disorder = open("/home/annette/Data/Studium/Module/PBL/Dataset/training_disorder.fasta")

# delete \n from file
list_test_sequence = file_test_sequence.readlines()
for number in range(len(list_test_sequence)):
    element = list_test_sequence[number]
    list_test_sequence[number] = element[0:len(element) - 1]
    
list_training_sequence = file_training_sequence.readlines()
for number in range(len(list_training_sequence)):
    element = list_training_sequence[number]
    list_training_sequence[number] = element[0:len(element) - 1]

list_test_structure = file_test_structure.readlines()
for number in range(len(list_test_structure)):
    element = list_test_structure[number]
    list_test_structure[number] = element[0:len(element) - 1]
    
list_training_structure = file_training_structure.readlines()
for number in range(len(list_training_structure)):
    element = list_training_structure[number]
    list_training_structure[number] = element[0:len(element) - 1]
    
list_test_disorder = file_test_disorder.readlines()
for number in range(len(list_test_disorder)):
    element = list_test_disorder[number]
    list_test_disorder[number] = element[0:len(element) - 1]
    
list_training_disorder = file_training_disorder.readlines()
for number in range(len(list_training_disorder)):
    element = list_training_disorder[number]
    list_training_disorder[number] = element[0:len(element) - 1]

# close files
file_test_sequence.close()
file_training_sequence.close()
file_test_structure.close()
file_training_structure.close()
file_test_disorder.close()
file_training_disorder.close()

# Preparation of data

Change from given 8 states to needed 3 states

labels:
* sheet: 0
* coil: 1
* helix: 2

Test set

In [3]:
# Change to 3 states
for index_list, prot_sequ in enumerate(list_test_structure):
    
    if prot_sequ[0:1] == ">" or len(prot_sequ) == 0:
        continue
        
    test_structure = [] 
    for index in range(len(prot_sequ)):
        if prot_sequ[index] == ' ':
            if list_test_disorder[index_list][index] == '-':
                test_structure.append("C")
            else:
                test_structure.append(" ")
        elif prot_sequ[index] == 'H':
            test_structure.append("H")
        elif prot_sequ[index] == 'E':
            test_structure.append("E")
        elif prot_sequ[index] == 'T':
            test_structure.append("C")
        elif prot_sequ[index] == 'S':
            test_structure.append("C")
        elif prot_sequ[index] == 'G':
            test_structure.append("H")
        elif prot_sequ[index] == 'B':
            test_structure.append("E")
        elif prot_sequ[index] == 'I':
            test_structure.append("H")
        elif prot_sequ[index] == 'C':
            test_structure.append("C")
            
    structure3_test = ''.join(test_structure)
    list_test_structure[index_list] = structure3_test

print("Ready.")
# print(list_test_structure)


list_test_labels = []

# Change to numbers = labels
for index_list, prot_sequ in enumerate(list_test_structure):

    if prot_sequ[0:1] == ">" or len(prot_sequ) == 0:
        list_test_labels.append(prot_sequ)
        continue

    test_structure = []
    for index in range(len(prot_sequ)):
        if prot_sequ[index] == 'H':
            test_structure.append("2")
        elif prot_sequ[index] == 'E':
            test_structure.append("0")
        elif prot_sequ[index] == ' ':
            test_structure.append(" ")
        elif prot_sequ[index] == 'C':
            test_structure.append("1")
    
    labels_test = ''.join(test_structure)
    list_test_labels.append(labels_test)

print("Ready.")

Ready.
Ready.


Training set

In [4]:
# Change to 3 states
for index_list, prot_sequ in enumerate(list_training_structure):
     
    if prot_sequ[0:1] == ">" or len(prot_sequ) == 0:
        continue
     
    training_structure = []
    for index in range(len(prot_sequ)):
        if prot_sequ[index] == ' ':
            if list_training_disorder[index_list][index] == '-':
                training_structure.append("C")
            else:
                training_structure.append(" ")
        elif prot_sequ[index] == 'H':
            training_structure.append("H")
        elif prot_sequ[index] == 'E':
            training_structure.append("E")
        elif prot_sequ[index] == 'T':
            training_structure.append("C")
        elif prot_sequ[index] == 'S':
            training_structure.append("C")
        elif prot_sequ[index] == 'G':
            training_structure.append("H")
        elif prot_sequ[index] == 'B':
            training_structure.append("E")
        elif prot_sequ[index] == 'I':
            training_structure.append("H")
        elif prot_sequ[index] == 'C':
            training_structure.append("C")

    
    structure3_training = ''.join(training_structure)
    list_training_structure[index_list] = structure3_training

print("Ready.")

list_training_labels = []

# Change to numbers = labels
for index_list, prot_sequ in enumerate(list_training_structure):

    if prot_sequ[0:1] == ">" or len(prot_sequ) == 0:
        list_training_labels.append(prot_sequ)
        continue

    training_structure = []
    for index in range(len(prot_sequ)):
        if prot_sequ[index] == 'H':
            training_structure.append("2")
        elif prot_sequ[index] == 'E':
            training_structure.append("0")
        elif prot_sequ[index] == ' ':
            training_structure.append(" ")
        elif prot_sequ[index] == 'C':
            training_structure.append("1")
    
    labels_training = ''.join(training_structure)
    list_training_labels.append(labels_training)

print("Ready.")

Ready.
Ready.


Connection of name, sequence and labels of protein into dictionary

order in the end:

* name = key
* amino acid sequence
* states
* number of amino acids with state
* target values for prediction
* predicted values
* list with accuracy, f1, MCC, precision

In [5]:
# list_test_sequence. list_training_sequence, list_test_labels, list_training_labels
whole_data_test = {}
whole_data_training = {}

# test data
for index in range(0, len(list_test_sequence), 3):
    key = list_test_sequence[index]
    key = key[1:len(key)]
    value1 = list_test_sequence[index + 1]
    value2 = list_test_labels[index + 1]
    value = [value1, value2]
    whole_data_test[key] = value


# training data
for index in range(0, len(list_training_sequence), 3):
    key = list_training_sequence[index]
    key = key[1:len(key)]
    value1 = list_training_sequence[index + 1]
    value2 = list_training_labels[index + 1]
    value = [value1, value2]
    whole_data_training[key] = value

print("Ready.")

Ready.


# Input features

In [6]:
class_names = ['coil', 'sheet', 'helix']
helix_proba = {'A': 1.32, 'B': 0.86, 'C': 1.22, 'D': 0.86, 'E': 1.2, 'F': 1.02, 'G': 0.5, 'H': 0.9, 'I': 1.08, 'J': 1.2, 'K': 1.06, 'L': 1.32, 'M': 1.34, 'N': 0.86, 'O': 1., 'P': 0.3, 'Q': 1.24, 'R': 1.16, 'S': 0.84, 'T': 0.84, 'U': 1., 'V': 0.9, 'W': 1.06, 'X': 1., 'Y': 0.98, 'Z': 1.22}
sheet_proba = {'A': 0.78, 'B': 0.43, 'C': 1.36, 'D': 0.42, 'E': 0.64, 'F': 1.52, 'G': 0.64, 'H': 0.98, 'I': 1.78, 'J': 1.48, 'K': 0.7, 'L': 1.18, 'M': 1.06, 'N': 0.44, 'O': 0.96, 'P': 0.02, 'Q': 0.74, 'R': 0.86, 'S': 0.76, 'T': 1.04, 'U': 0.96, 'V': 1.92, 'W': 1.22, 'X': 0.96, 'Y': 1.1, 'Z': 0.69}
hydrophob = {'A': 1.8, 'B': -3.5, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8, 'G': -0.4, 'H': -3.2, 'I': 4.5, 'J': 4.15, 'K': -3.9, 'L': 3.8, 'M': 1.9, 'N': -3.5, 'O': -3., 'P': -1.6, 'Q': -3.5, 'R': -4.5, 'S': -0.8, 'T': -0.7, 'U': 1., 'V': 4.2, 'W': -0.9, 'X': -0.49, 'Y': -1.3, 'Z': -3.5}
isoelec_point = {'A': 6.1, 'B': 4.13, 'C': 5.05, 'D': 2.85, 'E': 3.22, 'F': 5.84, 'G': 5.97, 'H': 7.47, 'I': 5.94, 'J': 5.96, 'K': 9.59, 'L': 5.98, 'M': 5.74, 'N': 5.41, 'O': 7., 'P': 6.3, 'Q': 5.65, 'R': 11.76, 'S': 5.68, 'T': 5.6, 'U': 6., 'V': 5.96, 'W': 5.64, 'X': 6.07, 'Y': 5.66, 'Z': 4.44}
amino_acids = {'A': 0.1, 'C': 0.1, 'D': 0.1, 'E': 0.1, 'F': 0.1, 'G': 0.1, 'H': 0.1, 'I': 0.1, 'K': 0.1, 'L': 0.1, 'M': 0.1, 'N': 0.1, 'P': 0.1, 'Q': 0.1, 'R': 0.1, 'S': 0.1, 'T': 0.1, 'V': 0.1, 'W': 0.1, 'Y': 0.1, 'B': 0.2, 'J': 0.2, 'Z': 0.2, 'O': 0.4, 'U': 0.4, 'X': 0.6}


Normalization of input features

In [7]:
array = num.empty((5,26))

# helix
for index, aa in enumerate(helix_proba):
    array[0][index] = helix_proba.get(aa)
    
# sheet
for index, aa in enumerate(sheet_proba):
    array[1][index] = sheet_proba.get(aa)

# hydrophob
for index, aa in enumerate(hydrophob):
    array[2][index] = hydrophob.get(aa)

# isoelec
for index, aa in enumerate(isoelec_point):
    array[3][index] = isoelec_point.get(aa)

# amino acids
for index, aa in enumerate(amino_acids):
    array[4][index] = amino_acids.get(aa)

# Normalizer, StandardScaler
array_transformed = Normalizer().fit_transform(array)

# back to dictionary
# helix
for index, aa in enumerate(helix_proba):
    helix_proba[aa] = array_transformed[0][index]
print(helix_proba)

# sheet
for index, aa in enumerate(sheet_proba):
    sheet_proba[aa] = array_transformed[1][index]
print(sheet_proba)

# hydrophob
for index, aa in enumerate(hydrophob):
    hydrophob[aa] = array_transformed[2][index]
print(hydrophob)

# isoelec
for index, aa in enumerate(isoelec_point):
    isoelec_point[aa] = array_transformed[3][index]
print(isoelec_point)

# amino acids
for index, aa in enumerate(amino_acids):
    amino_acids[aa] = array_transformed[4][index]
print(amino_acids)

{'A': 0.24934615190204282, 'B': 0.1624527959361794, 'C': 0.23045629190946382, 'D': 0.1624527959361794, 'E': 0.226678319910948, 'F': 0.19267657192430582, 'G': 0.094449299962895, 'H': 0.170008739933211, 'I': 0.2040104879198532, 'J': 0.226678319910948, 'K': 0.20023251592133742, 'L': 0.24934615190204282, 'M': 0.2531241239005586, 'N': 0.1624527959361794, 'O': 0.18889859992579, 'P': 0.056669579977737, 'Q': 0.2342342639079796, 'R': 0.2191223759139164, 'S': 0.1586748239376636, 'T': 0.1586748239376636, 'U': 0.18889859992579, 'V': 0.170008739933211, 'W': 0.20023251592133742, 'X': 0.18889859992579, 'Y': 0.1851206279272742, 'Z': 0.23045629190946382}
{'A': 0.14730044172548845, 'B': 0.08120408966917952, 'C': 0.2568315394188004, 'D': 0.07931562246757069, 'E': 0.12086190090296488, 'F': 0.2870470146445416, 'G': 0.12086190090296488, 'H': 0.18506978575766497, 'I': 0.33614716188637106, 'J': 0.27949314583810625, 'K': 0.13219270411261783, 'L': 0.22283912978984147, 'M': 0.2001775233705356, 'N': 0.08309255687

# Generating the input vectors and the target array

Windows of sequences: not if space for middle AA or not one from 20 letters, else: 7 on each side

structure: for every AA: value of
* amino_acids
* hydrophob
* isoelec_point
* helix_proba
* sheet_proba

Test set

In [8]:
vectors_test = []
target_test_list = []

for name in whole_data_test:
    data = whole_data_test.get(name)
    sequence = data[0]
    structure = data[1]
    counter_amino_acids = 0
    target_test = []
    
    for aa in range(len(sequence)):
        if sequence[aa] == ('B' or 'J' or 'O' or 'U' or 'X' or 'Z'):
            continue
            
        if structure[aa] == ' ':
            continue
                
        counter_amino_acids = counter_amino_acids + 1
        vector = []
        window = []
        # generating the amino acids in the input vector of length 15
        if aa < 7:
            for index in range(0, 7 - aa):
                window.append(sequence[0])
            for index in range(0, aa + 8):
                window.append(sequence[index])
        elif aa > len(sequence) - 8:
            for index in range(aa - 7, len(sequence)):
                window.append(sequence[index])
            for index in range(len(window), 15):
                window.append(sequence[len(sequence) - 1])
        else:
            for index in range(aa - 7, aa + 8):
                window.append(sequence[index])
        
        # Generation of the target array
        target_test_list.append(structure[aa])
        target_test.append(structure[aa])
        
        # generating the input vector with properties
        for acid in window:
            vector.append(amino_acids.get(acid))
            vector.append(hydrophob.get(acid))
            vector.append(isoelec_point.get(acid))
            vector.append(helix_proba.get(acid))
            vector.append(sheet_proba.get(acid))
            
        vectors_test.append(vector)
        
    whole_data_test[name].append(counter_amino_acids)
    whole_data_test[name].append(target_test)
        
input_vectors_test = num.array(vectors_test, dtype=object)
for index in range(len(input_vectors_test)):
        vector = input_vectors_test[index]
        input_vectors_test[index] = num.array(vector, dtype=object)
target_test = num.array(target_test_list)
print("Ready.")

Ready.


Training set

In [9]:
vectors_training = []
target_training_list = []

for name in whole_data_training:
    data = whole_data_training.get(name)
    sequence = data[0]
    structure = data[1]
    counter_amino_acids = 0
    target_training = []
    
    for aa in range(len(sequence)):
        if sequence[aa] == 'B' or sequence[aa] == 'J' or sequence[aa] == 'O' or sequence[aa] == 'U' or sequence[aa] == 'X' or sequence[aa] == 'Z':
            continue

        if structure[aa] == ' ':
            continue

        counter_amino_acids = counter_amino_acids + 1
        vector = []
        window = []
        if aa < 7:
            for index in range(0, 7 - aa):
                window.append(sequence[0])
            for index in range(0, aa + 8):
                window.append(sequence[index])
        elif aa > len(sequence) - 8:
            for index in range(aa - 7, len(sequence)):
                window.append(sequence[index])
            for index in range(len(window), 15):
                window.append(sequence[len(sequence) - 1])
        else:
            for index in range(aa - 7, aa + 8):
                window.append(sequence[index])
        
        # Generation the target array
        target_training_list.append(structure[aa])
        target_training.append(structure[aa])
        
        for acid in window:
            vector.append(amino_acids.get(acid))
            vector.append(hydrophob.get(acid))
            vector.append(isoelec_point.get(acid))
            vector.append(helix_proba.get(acid))
            vector.append(sheet_proba.get(acid))
            
        vectors_training.append(vector)
        
    whole_data_training[name].append(counter_amino_acids)
    whole_data_training[name].append(target_training)
        
input_vectors_training = num.array(vectors_training, dtype=object)
for index in range(len(input_vectors_training)):
        vector = input_vectors_training[index]
        input_vectors_training[index] = num.array(vector, dtype=object)
target_training = num.array(target_training_list)
print("Ready.")

0
Ready.


# Performance with confidence intervals

add predicted values to whole_data_test and whole_data_training

calculate three performance measurements:

* accuracy
* f1-score
* MCC

In [10]:
# predict values with best ANN
best_nn = load("/home/annette/Data/Studium/Module/PBL/Neural_Networks/Best_ANN_0.659490.lib")

prediction_values_test = best_nn.predict(input_vectors_test)
prediction_score_test = best_nn.score(input_vectors_test, target_test)
prediction_proba_test = best_nn.predict_proba(input_vectors_test)

prediction_values_training = best_nn.predict(input_vectors_training)
prediction_score_training = best_nn.score(input_vectors_training, target_training)
print("Ready.")

Ready.


In [11]:
# test set
index = 0
accuracy_test = []
f1_test = []
mcc_test = []
accuracy_helix = []
accuracy_sheet = []
accuracy_coil = []
accuracy_helix_to_sheet = []
accuracy_helix_to_coil = []
accuracy_sheet_to_helix = []
accuracy_sheet_to_coil = []
accuracy_coil_to_helix = []
accuracy_coil_to_sheet = []

# add predicted values + calculate preformance measurements for each protein
for name in whole_data_test:
    data = whole_data_test.get(name)
    length = data[2]
    # predicted values
    predicted_test = prediction_values_test[index : index + length]
    whole_data_test[name].append(predicted_test)
    index = index + length
    
for name in whole_data_test:
    # calculate
    performance_test = []
    data = whole_data_test.get(name)
    target = data[3]
    predicted = data[4]
    accuracy = metrics.accuracy_score(target, predicted) # accuracy
    accuracy_test.append(accuracy)
    f1 = metrics.f1_score(target, predicted, average='weighted') # f1-score, different averages: weighted, macro
    f1_test.append(f1)
    mcc = metrics.matthews_corrcoef(target, predicted) # Matthews Correlation Coefficient
    mcc_test.append(mcc)
    performance_test.append(accuracy)
    performance_test.append(f1)
    performance_test.append(mcc)
    whole_data_test[name].append(performance_test)    
    
    helix_right_test = 0
    helix_test = 0
    sheet_right_test = 0
    sheet_test = 0
    coil_right_test = 0
    coil_test = 0
    helix_to_sheet_test = 0
    helix_to_coil_test = 0
    sheet_to_helix_test = 0
    sheet_to_coil_test = 0
    coil_to_helix_test = 0
    coil_to_sheet_test = 0
    
    # counter for confusion matrix: right state predicted?
    for index in range(len(target)):
        if target[index] == '0':
            sheet_test = sheet_test + 1
            if predicted[index] == '0':
                sheet_right_test = sheet_right_test + 1
            elif predicted[index] == '1':
                sheet_to_coil_test = sheet_to_coil_test + 1
            elif predicted[index] == '2':
                sheet_to_helix_test = sheet_to_helix_test + 1
        elif target[index] == '1':
            coil_test = coil_test + 1
            if predicted[index] == '0':
                coil_to_sheet_test = coil_to_sheet_test + 1
            elif predicted[index] == '1':
                coil_right_test = coil_right_test + 1
            elif predicted[index] == '2':
                coil_to_helix_test = coil_to_helix_test + 1
        elif target[index] == '2':
            helix_test = helix_test + 1
            if predicted[index] == '0':
                helix_to_sheet_test = helix_to_sheet_test + 1
            elif predicted[index] == '1':
                helix_to_coil_test = helix_to_coil_test + 1
            elif predicted[index] == '2':
                helix_right_test = helix_right_test + 1
    
    counter = [helix_right_test, helix_test, sheet_right_test, sheet_test, coil_right_test, coil_test,
               helix_to_sheet_test, helix_to_coil_test, sheet_to_helix_test, sheet_to_coil_test,
               coil_to_helix_test, coil_to_sheet_test]
    
    # accuracy for helix, sheet, coil for confidence intervals
    if helix_test != 0:
        helix_accuracy = helix_right_test / helix_test
        helix_to_sheet_accuracy = helix_to_sheet_test / helix_test
        helix_to_coil_accuracy = helix_to_coil_test / helix_test
    else:
        helix_accuracy = 0.0
        helix_to_sheet_accuracy = 0.0
        helix_to_coil_accuracy = 0.0
    if sheet_test != 0:
        sheet_accuracy = sheet_right_test / sheet_test
        sheet_to_helix_accuracy = sheet_to_helix_test / sheet_test
        sheet_to_coil_accuracy = sheet_to_coil_test / sheet_test
    else:
        sheet_accuracy = 0.0
        sheet_to_helix_accuracy = 0.0
        sheet_to_coil_accuracy = 0.0
    if coil_test != 0:
        coil_accuracy = coil_right_test / coil_test
        coil_to_helix_accuracy = coil_to_helix_test / coil_test
        coil_to_sheet_accuracy = coil_to_sheet_test / coil_test
    else:
        coil_accuracy = 0.0
        coil_to_helix_accuracy = 0.0
        coil_to_sheet_accuracy = 0.0
    accuracy_all = (helix_right_test + sheet_right_test + coil_right_test) / len(target)
    
    accuracy_helix.append(helix_accuracy)
    accuracy_sheet.append(sheet_accuracy)
    accuracy_coil.append(coil_accuracy)
    accuracy_helix_to_sheet.append(helix_to_sheet_accuracy)
    accuracy_helix_to_coil.append(helix_to_coil_accuracy)
    accuracy_sheet_to_helix.append(sheet_to_helix_accuracy)
    accuracy_sheet_to_coil.append(sheet_to_coil_accuracy)
    accuracy_coil_to_helix.append(coil_to_helix_accuracy)
    accuracy_coil_to_sheet.append(coil_to_sheet_accuracy)
    accuracy_hec = [helix_accuracy, sheet_accuracy, coil_accuracy, accuracy_all]
    
    whole_data_test[name].append(counter)
    whole_data_test[name].append(accuracy_hec)
    
print("Ready.")

Ready.


Evaluation test set

In [13]:
# accuracy
accuracy = ["Accuracy:"]
accuracy_mean = mean(accuracy_test)
accuracy_sem = sem(accuracy_test)
accuracy_interval = accuracy_sem * t.ppf((1 + 0.95) / 2, len(accuracy_test) - 1)
accuracy.append(accuracy_mean)
accuracy.append(accuracy_interval)
print(accuracy)

# f1-score
f1 = ["F1-score:"]
f1_mean = mean(f1_test)
f1_sem = sem(f1_test)
f1_interval = f1_sem * t.ppf((1 + 0.95) / 2, len(f1_test) - 1)
f1.append(f1_mean)
f1.append(f1_interval)
print(f1)

# mcc
mcc = ["MCC:"]
mcc_mean = mean(mcc_test)
mcc_sem = sem(mcc_test)
mcc_interval = mcc_sem * t.ppf((1 + 0.95) / 2, len(mcc_test) - 1)
mcc.append(mcc_mean)
mcc.append(mcc_interval)
print(mcc)

0.6469456054145243
0.004294808796163721
0.008521832416836654
['Accuracy:', 0.6469456054145243, 0.008521832416836654]
['F1-score:', 0.6486689391024677, 0.00932724878480278]
['MCC:', 0.4316142735940744, 0.011419160792964312]


  accuracy_mean = mean(accuracy_test)
  f1_mean = mean(f1_test)
  mcc_mean = mean(mcc_test)


In [27]:
# confusion matrix test set
# helix
accuracy_helix_test = ["Accuracy for helix:"]
accuracy_mean_helix = mean(accuracy_helix)
accuracy_sem_helix = sem(accuracy_helix)
accuracy_interval_helix = accuracy_sem_helix * t.ppf((1 + 0.95) / 2, len(accuracy_helix) - 1)
accuracy_helix_test.append(accuracy_mean_helix)
accuracy_helix_test.append(accuracy_interval_helix)
print(accuracy_helix_test)

# sheet
accuracy_sheet_test = ["Accuracy for sheet:"]
accuracy_mean_sheet = mean(accuracy_sheet)
accuracy_sem_sheet = sem(accuracy_sheet)
accuracy_interval_sheet = accuracy_sem_sheet * t.ppf((1 + 0.95) / 2, len(accuracy_sheet) - 1)
accuracy_sheet_test.append(accuracy_mean_sheet)
accuracy_sheet_test.append(accuracy_interval_sheet)
print(accuracy_sheet_test)

# coil
accuracy_coil_test = ["Accuracy for coil:"]
accuracy_mean_coil = mean(accuracy_coil)
accuracy_sem_coil = sem(accuracy_coil)
accuracy_interval_coil = accuracy_sem_coil * t.ppf((1 + 0.95) / 2, len(accuracy_coil) - 1)
accuracy_coil_test.append(accuracy_mean_coil)
accuracy_coil_test.append(accuracy_interval_coil)
print(accuracy_coil_test)

# helix to sheet
accuracy_helix_to_sheet_test = ["Accuracy for helix to sheet:"]
accuracy_mean_helix_to_sheet = mean(accuracy_helix_to_sheet)
accuracy_sem_helix_to_sheet = sem(accuracy_helix_to_sheet)
accuracy_interval_helix_to_sheet = accuracy_sem_helix_to_sheet * t.ppf((1 + 0.95) / 2, len(accuracy_helix_to_sheet) - 1)
accuracy_helix_to_sheet_test.append(accuracy_mean_helix_to_sheet)
accuracy_helix_to_sheet_test.append(accuracy_interval_helix_to_sheet)
print(accuracy_helix_to_sheet_test)

# sheet to helix
accuracy_sheet_to_helix_test = ["Accuracy for sheet to helix:"]
accuracy_mean_sheet_to_helix = mean(accuracy_sheet_to_helix)
accuracy_sem_sheet_to_helix = sem(accuracy_sheet_to_helix)
accuracy_interval_sheet_to_helix = accuracy_sem_sheet_to_helix * t.ppf((1 + 0.95) / 2, len(accuracy_sheet_to_helix) - 1)
accuracy_sheet_to_helix_test.append(accuracy_mean_sheet_to_helix)
accuracy_sheet_to_helix_test.append(accuracy_interval_sheet_to_helix)
print(accuracy_sheet_to_helix_test)

# coil to helix
accuracy_coil_to_helix_test = ["Accuracy for coil to helix:"]
accuracy_mean_coil_to_helix = mean(accuracy_coil_to_helix)
accuracy_sem_coil_to_helix = sem(accuracy_coil_to_helix)
accuracy_interval_coil_to_helix = accuracy_sem_coil_to_helix * t.ppf((1 + 0.95) / 2, len(accuracy_coil_to_helix) - 1)
accuracy_coil_to_helix_test.append(accuracy_mean_coil_to_helix)
accuracy_coil_to_helix_test.append(accuracy_interval_coil_to_helix)
print(accuracy_coil_to_helix_test)

# helix to coil
accuracy_helix_to_coil_test = ["Accuracy for helix to coil:"]
accuracy_mean_helix_to_coil = mean(accuracy_helix_to_coil)
accuracy_sem_helix_to_coil = sem(accuracy_helix_to_coil)
accuracy_interval_helix_to_coil = accuracy_sem_helix_to_coil * t.ppf((1 + 0.95) / 2, len(accuracy_helix_to_coil) - 1)
accuracy_helix_to_coil_test.append(accuracy_mean_helix_to_coil)
accuracy_helix_to_coil_test.append(accuracy_interval_helix_to_coil)
print(accuracy_helix_to_coil_test)

# sheet to coil
accuracy_sheet_to_coil_test = ["Accuracy for sheet to coil:"]
accuracy_mean_sheet_to_coil = mean(accuracy_sheet_to_coil)
accuracy_sem_sheet_to_coil = sem(accuracy_sheet_to_coil)
accuracy_interval_sheet_to_coil = accuracy_sem_sheet_to_coil * t.ppf((1 + 0.95) / 2, len(accuracy_sheet_to_coil) - 1)
accuracy_sheet_to_coil_test.append(accuracy_mean_sheet_to_coil)
accuracy_sheet_to_coil_test.append(accuracy_interval_sheet_to_coil)
print(accuracy_sheet_to_coil_test)

# coil to sheet
accuracy_coil_to_sheet_test = ["Accuracy for coil to sheet:"]
accuracy_mean_coil_to_sheet = mean(accuracy_coil_to_sheet)
accuracy_sem_coil_to_sheet = sem(accuracy_coil_to_sheet)
accuracy_interval_coil_to_sheet = accuracy_sem_coil_to_sheet * t.ppf((1 + 0.95) / 2, len(accuracy_coil_to_sheet) - 1)
accuracy_coil_to_sheet_test.append(accuracy_mean_coil_to_sheet)
accuracy_coil_to_sheet_test.append(accuracy_interval_coil_to_sheet)
print(accuracy_coil_to_sheet_test)

['Accuracy for helix:', 0.6403828542525609, 0.023678463922768567]
['Accuracy for sheet:', 0.47036805991636127, 0.020846296453831355]
['Accuracy for coil:', 0.7151736063549271, 0.010272399410769321]
['Accuracy for helix to sheet:', 0.08909827859530224, 0.008210085256723208]
['Accuracy for sheet to helix:', 0.17447970259708126, 0.013150078109620501]
['Accuracy for coil to helix:', 0.17732684813003177, 0.010614186122537915]
['Accuracy for helix to coil:', 0.27051886715213685, 0.021687752581968846]
['Accuracy for sheet to coil:', 0.3351522374865575, 0.016496450639916914]
['Accuracy for coil to sheet:', 0.10749954551504105, 0.005751409217542964]


  accuracy_mean_helix = mean(accuracy_helix)
  accuracy_mean_sheet = mean(accuracy_sheet)
  accuracy_mean_coil = mean(accuracy_coil)
  accuracy_mean_helix_to_sheet = mean(accuracy_helix_to_sheet)
  accuracy_mean_sheet_to_helix = mean(accuracy_sheet_to_helix)
  accuracy_mean_coil_to_helix = mean(accuracy_coil_to_helix)
  accuracy_mean_helix_to_coil = mean(accuracy_helix_to_coil)
  accuracy_mean_sheet_to_coil = mean(accuracy_sheet_to_coil)
  accuracy_mean_coil_to_sheet = mean(accuracy_coil_to_sheet)


In [28]:
# training set
index = 0
accuracy_training = []
f1_training = []
mcc_training = []
accuracy_helix = []
accuracy_sheet = []
accuracy_coil = []
accuracy_helix_to_sheet = []
accuracy_helix_to_coil = []
accuracy_sheet_to_helix = []
accuracy_sheet_to_coil = []
accuracy_coil_to_helix = []
accuracy_coil_to_sheet = []

# add predicted values
for name in whole_data_training:
    data = whole_data_training.get(name)
    length = data[2]
    # predicted values
    predicted_training = prediction_values_training[index : index + length]
    whole_data_training[name].append(predicted_training)
    index = index + length

for name in whole_data_training:
    # calculate
    performance_training = []
    data = whole_data_training.get(name)
    target = data[3]
    predicted = data[4]
    accuracy = metrics.accuracy_score(target, predicted) # accuracy
    accuracy_training.append(accuracy)
    f1 = metrics.f1_score(target, predicted, average='weighted') # f1-score, different averages: weighted, macro
    f1_training.append(f1)
    mcc = metrics.matthews_corrcoef(target, predicted) # Matthews Correlation Coefficient
    mcc_training.append(mcc)
    performance_training.append(accuracy)
    performance_training.append(f1)
    performance_training.append(mcc)
    whole_data_training[name].append(performance_training)    
    
    helix_right_training = 0
    helix_training = 0
    sheet_right_training = 0
    sheet_training = 0
    coil_right_training = 0
    coil_training = 0
    helix_to_sheet_training = 0
    helix_to_coil_training = 0
    sheet_to_helix_training = 0
    sheet_to_coil_training = 0
    coil_to_helix_training = 0
    coil_to_sheet_training = 0
    
    # counter for confusion matrix: right state predicted?
    for index in range(len(target)):
        if target[index] == '0':
            sheet_training = sheet_training + 1
            if predicted[index] == '0':
                sheet_right_training = sheet_right_training + 1
            elif predicted[index] == '1':
                sheet_to_coil_training = sheet_to_coil_training + 1
            elif predicted[index] == '2':
                sheet_to_helix_training = sheet_to_helix_training + 1
        elif target[index] == '1':
            coil_training = coil_training + 1
            if predicted[index] == '0':
                coil_to_sheet_training = coil_to_sheet_training + 1
            elif predicted[index] == '1':
                coil_right_training = coil_right_training + 1
            elif predicted[index] == '2':
                coil_to_helix_training = coil_to_helix_training + 1
        elif target[index] == '2':
            helix_training = helix_training + 1
            if predicted[index] == '0':
                helix_to_sheet_training = helix_to_sheet_training + 1
            elif predicted[index] == '1':
                helix_to_coil_training = helix_to_coil_training + 1
            elif predicted[index] == '2':
                helix_right_training = helix_right_training + 1
    
    counter = [helix_right_training, helix_training, sheet_right_training, sheet_training, coil_right_training,
               coil_training, helix_to_sheet_training, helix_to_coil_training, sheet_to_helix_training,
               sheet_to_coil_training, coil_to_helix_training, coil_to_sheet_training]
    
    # accuracy for helix, sheet, coil for confidence intervals
    if helix_training != 0:
        helix_accuracy = helix_right_training / helix_training
        helix_to_sheet_accuracy = helix_to_sheet_training / helix_training
        helix_to_coil_accuracy = helix_to_coil_training / helix_training
    else:
        helix_accuracy = 0.0
        helix_to_sheet_accuracy = 0.0
        helix_to_coil_accuracy = 0.0
    if sheet_training != 0:
        sheet_accuracy = sheet_right_training / sheet_training
        sheet_to_helix_accuracy = sheet_to_helix_training / sheet_training
        sheet_to_coil_accuracy = sheet_to_coil_training / sheet_training
    else:
        sheet_accuracy = 0.0
        sheet_to_helix_accuracy = 0.0
        sheet_to_coil_accuracy = 0.0
    if coil_training != 0:
        coil_accuracy = coil_right_training / coil_training
        coil_to_helix_accuracy = coil_to_helix_training / coil_training
        coil_to_sheet_accuracy = coil_to_sheet_training / coil_training
    else:
        coil_accuracy = 0.0
        coil_to_helix_accuracy = 0.0
        coil_to_sheet_accuracy = 0.0
    accuracy_all = (helix_right_training + sheet_right_training + coil_right_training) / len(target)
    
    accuracy_helix.append(helix_accuracy)
    accuracy_sheet.append(sheet_accuracy)
    accuracy_coil.append(coil_accuracy)
    accuracy_helix_to_sheet.append(helix_to_sheet_accuracy)
    accuracy_helix_to_coil.append(helix_to_coil_accuracy)
    accuracy_sheet_to_helix.append(sheet_to_helix_accuracy)
    accuracy_sheet_to_coil.append(sheet_to_coil_accuracy)
    accuracy_coil_to_helix.append(coil_to_helix_accuracy)
    accuracy_coil_to_sheet.append(coil_to_sheet_accuracy)
    accuracy_hec = [helix_accuracy, sheet_accuracy, coil_accuracy, accuracy_all]
    
    whole_data_training[name].append(counter)
    whole_data_training[name].append(accuracy_hec)

print("Ready.")

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Ready.


Evaluation training set

In [35]:
performance_training = []

# accuracy
accuracy = ["Accuracy:"]
accuracy_mean = mean(accuracy_training)
accuracy_sem = sem(accuracy_training)
accuracy_interval = accuracy_sem * t.ppf((1 + 0.95) / 2, len(accuracy_training) - 1)
accuracy.append(accuracy_mean)
accuracy.append(accuracy_interval)
performance_training.append(accuracy)
print(accuracy)

# f1-score
f1 = ["F1-score:"]
f1_mean = mean(f1_training)
f1_sem = sem(f1_training)
f1_interval = f1_sem * t.ppf((1 + 0.95) / 2, len(f1_training) - 1)
f1.append(f1_mean)
f1.append(f1_interval)
performance_training.append(f1)
print(f1)

# mcc
mcc = ["MCC:"]
mcc_mean = mean(mcc_training)
mcc_sem = sem(mcc_training)
mcc_interval = mcc_sem * t.ppf((1 + 0.95) / 2, len(mcc_training) - 1)
mcc.append(mcc_mean)
mcc.append(mcc_interval)
performance_training.append(mcc)
print(mcc)

['Accuracy:', 0.6762892387860027, 0.0020726525961255876]
['F1-score:', 0.6848216863551136, 0.0021525952826641583]
['MCC:', 0.4572879248215813, 0.002959880130851302]


  accuracy_mean = mean(accuracy_training)
  f1_mean = mean(f1_training)
  mcc_mean = mean(mcc_training)


In [27]:
# confusion matrix validation set
# helix
accuracy_helix_training = ["Accuracy for helix:"]
accuracy_mean_helix = mean(accuracy_helix)
accuracy_sem_helix = sem(accuracy_helix)
accuracy_interval_helix = accuracy_sem_helix * t.ppf((1 + 0.95) / 2, len(accuracy_helix) - 1)
accuracy_helix_training.append(accuracy_mean_helix)
accuracy_helix_training.append(accuracy_interval_helix)
print(accuracy_helix_training)

# sheet
accuracy_sheet_training = ["Accuracy for sheet:"]
accuracy_mean_sheet = mean(accuracy_sheet)
accuracy_sem_sheet = sem(accuracy_sheet)
accuracy_interval_sheet = accuracy_sem_sheet * t.ppf((1 + 0.95) / 2, len(accuracy_sheet) - 1)
accuracy_sheet_training.append(accuracy_mean_sheet)
accuracy_sheet_training.append(accuracy_interval_sheet)
print(accuracy_sheet_training)

# coil
accuracy_coil_training = ["Accuracy for coil:"]
accuracy_mean_coil = mean(accuracy_coil)
accuracy_sem_coil = sem(accuracy_coil)
accuracy_interval_coil = accuracy_sem_coil * t.ppf((1 + 0.95) / 2, len(accuracy_coil) - 1)
accuracy_coil_training.append(accuracy_mean_coil)
accuracy_coil_training.append(accuracy_interval_coil)
print(accuracy_coil_training)

# helix to sheet
accuracy_helix_to_sheet_training = ["Accuracy for helix to sheet:"]
accuracy_mean_helix_to_sheet = mean(accuracy_helix_to_sheet)
accuracy_sem_helix_to_sheet = sem(accuracy_helix_to_sheet)
accuracy_interval_helix_to_sheet = accuracy_sem_helix_to_sheet * t.ppf((1 + 0.95) / 2, len(accuracy_helix_to_sheet) - 1)
accuracy_helix_to_sheet_training.append(accuracy_mean_helix_to_sheet)
accuracy_helix_to_sheet_training.append(accuracy_interval_helix_to_sheet)
print(accuracy_helix_to_sheet_training)

# sheet to helix
accuracy_sheet_to_helix_training = ["Accuracy for sheet to helix:"]
accuracy_mean_sheet_to_helix = mean(accuracy_sheet_to_helix)
accuracy_sem_sheet_to_helix = sem(accuracy_sheet_to_helix)
accuracy_interval_sheet_to_helix = accuracy_sem_sheet_to_helix * t.ppf((1 + 0.95) / 2, len(accuracy_sheet_to_helix) - 1)
accuracy_sheet_to_helix_training.append(accuracy_mean_sheet_to_helix)
accuracy_sheet_to_helix_training.append(accuracy_interval_sheet_to_helix)
print(accuracy_sheet_to_helix_training)

# coil to helix
accuracy_coil_to_helix_training = ["Accuracy for coil to helix:"]
accuracy_mean_coil_to_helix = mean(accuracy_coil_to_helix)
accuracy_sem_coil_to_helix = sem(accuracy_coil_to_helix)
accuracy_interval_coil_to_helix = accuracy_sem_coil_to_helix * t.ppf((1 + 0.95) / 2, len(accuracy_coil_to_helix) - 1)
accuracy_coil_to_helix_training.append(accuracy_mean_coil_to_helix)
accuracy_coil_to_helix_training.append(accuracy_interval_coil_to_helix)
print(accuracy_coil_to_helix_training)

# helix to coil
accuracy_helix_to_coil_training = ["Accuracy for helix to coil:"]
accuracy_mean_helix_to_coil = mean(accuracy_helix_to_coil)
accuracy_sem_helix_to_coil = sem(accuracy_helix_to_coil)
accuracy_interval_helix_to_coil = accuracy_sem_helix_to_coil * t.ppf((1 + 0.95) / 2, len(accuracy_helix_to_coil) - 1)
accuracy_helix_to_coil_training.append(accuracy_mean_helix_to_coil)
accuracy_helix_to_coil_training.append(accuracy_interval_helix_to_coil)
print(accuracy_helix_to_coil_training)

# sheet to coil
accuracy_sheet_to_coil_training = ["Accuracy for sheet to coil:"]
accuracy_mean_sheet_to_coil = mean(accuracy_sheet_to_coil)
accuracy_sem_sheet_to_coil = sem(accuracy_sheet_to_coil)
accuracy_interval_sheet_to_coil = accuracy_sem_sheet_to_coil * t.ppf((1 + 0.95) / 2, len(accuracy_sheet_to_coil) - 1)
accuracy_sheet_to_coil_training.append(accuracy_mean_sheet_to_coil)
accuracy_sheet_to_coil_training.append(accuracy_interval_sheet_to_coil)
print(accuracy_sheet_to_coil_training)

# coil to sheet
accuracy_coil_to_sheet_training = ["Accuracy for coil to sheet:"]
accuracy_mean_coil_to_sheet = mean(accuracy_coil_to_sheet)
accuracy_sem_coil_to_sheet = sem(accuracy_coil_to_sheet)
accuracy_interval_coil_to_sheet = accuracy_sem_coil_to_sheet * t.ppf((1 + 0.95) / 2, len(accuracy_coil_to_sheet) - 1)
accuracy_coil_to_sheet_training.append(accuracy_mean_coil_to_sheet)
accuracy_coil_to_sheet_training.append(accuracy_interval_coil_to_sheet)
print(accuracy_coil_to_sheet_training)

['Accuracy for helix:', 0.6938171928517788, 0.004915727473729923]
['Accuracy for sheet:', 0.3999870971468214, 0.0058164388376394805]
['Accuracy for coil:', 0.7036307613559525, 0.0027624341004211313]
['Accuracy for helix to sheet:', 0.07441532003489548, 0.001840384329931617]
['Accuracy for sheet to helix:', 0.15499086685050603, 0.0036127367332149623]
['Accuracy for coil to helix:', 0.19657803551172381, 0.0027286429065484013]
['Accuracy for helix to coil:', 0.20917197648451638, 0.0036794736283841697]
['Accuracy for sheet to coil:', 0.2920563752326414, 0.004975101487168443]
['Accuracy for coil to sheet:', 0.09979120313232373, 0.0015333694249989405]


  accuracy_mean_helix = mean(accuracy_helix)
  accuracy_mean_sheet = mean(accuracy_sheet)
  accuracy_mean_coil = mean(accuracy_coil)
  accuracy_mean_helix_to_sheet = mean(accuracy_helix_to_sheet)
  accuracy_mean_sheet_to_helix = mean(accuracy_sheet_to_helix)
  accuracy_mean_coil_to_helix = mean(accuracy_coil_to_helix)
  accuracy_mean_helix_to_coil = mean(accuracy_helix_to_coil)
  accuracy_mean_sheet_to_coil = mean(accuracy_sheet_to_coil)
  accuracy_mean_coil_to_sheet = mean(accuracy_coil_to_sheet)
