# Basic Stuff
#### Author: Carl Winkler

## Baseslines

In [362]:
import pandas as pd

In [363]:
def parse_counts(fileName, absolutes = False):
    # We always expect the order C, E, H
    colums_pred = ['AminoAcid','Coil_p','Strand_p','Helix_p']
    data = []
    with open(fileName) as f:
        lines = f.readlines() # list containing lines of file
        i = 0
        A_row = []

        for line in lines:
            elems = line.split('\t')
            if i == 0:
                # Amino is first column
                A_row.append(line[0])
                #Count for Coil
                A_row.append(int(elems[1]))
            elif i == 1:
                #Count for Strand
                A_row.append(int(elems[1]))
            elif i == 2:
                #Count for Helix
                A_row.append(int(elems[1]))
                data.append(A_row) 

            i = i + 1
            if i > 2:
                i = 0
                A_row = []

    # Calculate the probabilites
    if absolutes == False:
        for elem in data:
            total_Count = sum(elem[1:])
            elem[1] = elem[1] / total_Count
            elem[2] = elem[2] / total_Count
            elem[3] = elem[3] / total_Count

    # Work with pd data frame here
    return pd.DataFrame(data, columns = colums_pred)  

In [359]:
#These are the different ACIDs we use for the last part
parse_counts("testCounts.txt",False)

Unnamed: 0,AminoAcid,Coil_p,Strand_p,Helix_p
0,A,0.315058,0.169215,0.515727
1,C,0.395618,0.292801,0.311581
2,D,0.54946,0.125833,0.324708
3,E,0.345957,0.155869,0.498174
4,F,0.305693,0.320682,0.373625
5,G,0.671437,0.146378,0.182185
6,H,0.44607,0.223546,0.330384
7,K,0.393129,0.178781,0.42809
8,L,0.269999,0.242968,0.487033
9,M,0.329096,0.221852,0.449053


In [364]:
def parse_sequences(fileName):
    colums_pred = ['ID','length','Sequence','Target']
    data = []
    
    with open(fileName) as f:
        lines = f.readlines() # list containing lines of file
        i = 0
        A_row = []

        for line in lines:
            line = line.strip()
            elems = line.split('\t')
            if i == 0:
                # ID
                A_row.append(elems[0])
            elif i == 1:
                #length
                A_row.append(int(elems[0]))
            elif i == 2:
                # Whole list of tokens - sequence
                A_row.append(elems)
            elif i == 3:
                # Whole list of tokens - target
                A_row.append(elems)
                data.append(A_row) 

            #Logic for parsing every 4 things as a row
            i = i + 1
            if i > 3:
                i = 0
                A_row = []
            
    return pd.DataFrame(data, columns = colums_pred) 


In [366]:
def getPredDict(dataFrame):
    dict_p = {}
    dataFrame = dataFrame.reset_index()  # make sure indexes pair with number of rows
    
    for index, row in dataFrame.iterrows():
        token = ""
        if row['Coil_p'] > row['Helix_p'] and row['Coil_p'] > row['Strand_p']:
            token = "C"
        elif row['Strand_p'] > row['Helix_p']:
            token = "E"
        else:
            token = "H"
        dict_p[row['AminoAcid']] = token
        
    return dict_p

In [198]:
df = parse_counts("trainCounts.txt")
print("The naive approach maps the tokens as follows:", getPredDict(df))

The naive approach maps the tokens as follows: {'A': 'H', 'C': 'C', 'D': 'C', 'E': 'H', 'F': 'H', 'G': 'C', 'H': 'C', 'K': 'H', 'L': 'H', 'M': 'H', 'N': 'C', 'P': 'C', 'Q': 'H', 'R': 'H', 'S': 'C', 'T': 'C', 'V': 'E', 'W': 'H', 'X': 'H', 'Y': 'H'}


In [365]:
def number_of_correct_pred(prediction, target_seq):
    count = 0
    for idx, tok in enumerate(prediction):
        if tok == target_seq[idx]:
            count = count + 1
    return count
    

In [367]:
# Predicts the secondary structure which the acid is seen in mostly
class basic_naive_predict:
    
    def __init__(self, dict_p): 
        self.dict_p = dict_p
    def get_name(self):
        return "Naive-Greedy"
    
    def predict(self, sequence):
        prediction = []
        for token in sequence:
            prediction.append(self.dict_p[token])
        return prediction  


In [223]:
# Always predicts c
class always_c_predict:
        
    def get_name(self):
        return "Always-C"
    
    def predict(self, sequence):
        prediction = []
        for token in sequence:
            prediction.append('C')
        return prediction  

In [224]:
import random
# Predicts randomly
class random_predict:
        
    def get_name(self):
        return "Random-Token"

    def predict(self, sequence):
        allowed_tok = ["H","E","C"]
        prediction = []
        for token in sequence:
            prediction.append(random.choice(allowed_tok))
        return prediction  

In [340]:
def evalute_basic_predictor(predictor, filenames):
    for filename in filenames:
        print("-------------------------------------------------")
        print("Evaluating predictior:", predictor.get_name(), " on: ", filename, "\n")
        
        data_df = parse_sequences(filename)
        
        n_of_predicted_tokens = 0
        n_of_correct_predictions = 0
        
        data_df = data_df.reset_index()
        
        for index, row in data_df.iterrows():
            prediction = predictor.predict(row['Sequence'])
            
            n_of_predicted_tokens += len(prediction)
            n_of_correct_predictions += number_of_correct_pred(prediction, row['Target'])

        acc = n_of_correct_predictions / n_of_predicted_tokens
        print("The accuracy is:", acc, "\n")
        

In [233]:
counts_train_df = parse_counts("trainCounts.txt")
counts_test_df = parse_counts("testCounts.txt")
pred_dict_train = getPredDict(counts_train_df)
pred_dict_test = getPredDict(counts_test_df)

In [234]:
# Evaluate naive approach based on "trainCounts.txt"
predictor = basic_naive_predict(pred_dict_train)
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Naive-Greedy  on:  trainSS.txt 

The accuracy is: 0.49057580602822454 

-------------------------------------------------
Evaluating predictior: Naive-Greedy  on:  testSS.txt 

The accuracy is: 0.4893581866060825 



In [235]:
# Evaluate naive approach based on "testCounts.txt"
predictor = basic_naive_predict(pred_dict_test)
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Naive-Greedy  on:  trainSS.txt 

The accuracy is: 0.49057580602822454 

-------------------------------------------------
Evaluating predictior: Naive-Greedy  on:  testSS.txt 

The accuracy is: 0.4893581866060825 



Now we evaluate the two other baselines.

In [236]:
# Evaluate always_h_predict, It doesnt even need testCounts however its just a baseline so I didn't optimize that
predictor = always_c_predict()
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Always-C  on:  trainSS.txt 

The accuracy is: 0.3984553719206006 

-------------------------------------------------
Evaluating predictior: Always-C  on:  testSS.txt 

The accuracy is: 0.39915113337119845 



In [237]:
# Evaluate random_predict, It doesnt even need testCounts however its just a baseline so I didn't optimize that as well
predictor = random_predict()
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Random-Token  on:  trainSS.txt 

The accuracy is: 0.33366033231835346 

-------------------------------------------------
Evaluating predictior: Random-Token  on:  testSS.txt 

The accuracy is: 0.3329219806888005 



## Using Windows :D

In [361]:
# Always predicts c
class window_predict():
    
    def __init__(self, count_df, window_size, weights = -1): 
        self.count_df = count_df
        #self.dict_p = dict_p
        self.window_size = window_size
        countdict = {}
        
        for index, row in self.count_df.iterrows():
            #format C E H
            countdict[row['AminoAcid']] = [row['Coil_p'],row['Strand_p'],row['Helix_p']]

        self.dict_decision = countdict
        
        # Use uniform window if no window is given
        if weights == -1:
            self.weights = list = [1/window_size] * window_size 
        else:
            self.weights = weights
            
    def get_name(self):
        return "Window-Predictor"
    
    def predict(self, sequence):
        prediction = []
        for idx, token in enumerate(sequence):
            # Check if we are on the edges and predict with window_size 1 then
            
            if idx < self.window_size/2 or idx > len(sequence)-self.window_size/2: 
                row = self.count_df.loc[self.count_df['AminoAcid'] == token].iloc[0]
                token = ""
                if row['Coil_p'] > row['Helix_p'] and row['Coil_p'] > row['Strand_p']:
                    token = "C"
                elif row['Strand_p'] > row['Helix_p']:
                    token = "E"
                else:
                    token = "H"
                prediction.append(token)
                
            else:
                elem_in_window = sequence[idx - self.window_size // 2: idx + 1 + self.window_size // 2]
                wind_probs = {'C': 0, 'E': 0, 'H': 0}
                
                # Iterate over window for each element
                for wind_pos, weight in enumerate(self.weights):
                    counts = self.dict_decision[elem_in_window[wind_pos]]
                    wind_probs['C'] +=  counts[0] * weight
                    wind_probs['E'] +=  counts[1] * weight
                    wind_probs['H'] +=  counts[2] * weight
                    
                prediction.append(max(wind_probs, key=wind_probs.get))
                #print("Elements in window: ", elem_in_window)
                #print("Dict: ", wind_probs)
                #print("Decision: ", max(wind_probs, key=wind_probs.get))
                
        return prediction  

In [341]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=3)
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

{'A': [71295, 38096, 118122], 'C': [15219, 11012, 11788], 'D': [91108, 20452, 55118], 'E': [67859, 30453, 99177], 'F': [35731, 36291, 44951], 'G': [131513, 29389, 36800], 'H': [29394, 14397, 22110], 'K': [66098, 29944, 72590], 'L': [73825, 66075, 135789], 'M': [15908, 10376, 22587], 'N': [70299, 17040, 36206], 'P': [90246, 12963, 25491], 'Q': [38836, 18797, 53058], 'R': [52829, 30235, 66170], 'S': [87485, 32805, 54994], 'T': [67458, 41901, 45222], 'V': [50802, 82384, 66285], 'W': [11710, 11523, 16033], 'X': [50033, 64830, 83208], 'Y': [31418, 32141, 37932]}
-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5301336673369625 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5286301440594826 



In [343]:
predictor = window_predict(counts_train_df, window_size=5)
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5372367926913126 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5358393142975164 



In [344]:
predictor = window_predict(counts_train_df, window_size=7)
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5352453237931466 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5338720503950019 



In [345]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=3,weights = [0.25,0.5,0.25])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5276979930307257 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5258615170134765 



In [349]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=3,weights = [0.4,0.1,0.4])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5107432863779435 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5096060308772654 



In [350]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=3,weights = [0.1,0.8,0.1])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5046371091486549 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5037001084318686 



In [351]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=3,weights = [0.2,0.6,0.2])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5204780080178902 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5192389115505758 



In [352]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=5,weights = [0.05,0.15,0.5,0.15,0.05])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5244033828963927 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5228089017400733 



In [348]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=5,weights = [0.1,0.2,0.5,0.2,0.1])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5334542848136886 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5317757009345795 



In [375]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=5,weights = [0.12,0.18,0.4,0.18,0.12])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5375724607904637 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5359601383797181 



In [374]:
sum([0.12,0.18,0.4,0.18,0.12])

0.9799999999999999

In [373]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=5,weights = [0.15,0.2,0.3,0.2,0.15])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.54233873973967 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5405504208189188 



In [371]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=5,weights = [0.1,0.1,0.4,0.2,0.2])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5369458572210773 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5351030102752105 



In [372]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=5,weights = [0.15,0.15,0.3,0.2,0.2])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5420228372207375 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5401982754169463 



In [369]:
# Evaluate naive approach based on "testCounts.txt"
counts_train_df = parse_counts("trainCounts.txt", True)
predictor = window_predict(counts_train_df, window_size=7,weights = [0.5,0.1,0.2,0.4,0.2,0.1,0.5])
evalute_basic_predictor(predictor, ["trainSS.txt", "testSS.txt"])

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  trainSS.txt 

The accuracy is: 0.5071334672538085 

-------------------------------------------------
Evaluating predictior: Window-Predictor  on:  testSS.txt 

The accuracy is: 0.5061661589301389 



In [None]:
#Archive


#Calculate the accuracy from the counts directly
def acc_naive_fromCounts(fileName):
    dataFrame = parse_counts(fileName)
    dataFrame = dataFrame.reset_index()  # make sure indexes pair with number of rows
    acc = 0
    
    for index, row in dataFrame.iterrows():
        token = ""
        if row['Coil_p'] > row['Helix_p'] and row['Coil_p'] > row['Strand_p']:
            acc += row['Coil_p']
        elif row['Strand_p'] > row['Helix_p']:
            acc += row['Strand_p']
        else:
            acc += row['Helix_p']
        
    acc = acc / len(dataFrame['Coil_p'])  
    return acc

# Here we compare with the accuracies from the counts directly to see that the implementation is correct
print("ACC with trainCounts:", acc_naive_fromCounts("trainCounts.txt"))
print("ACC with testCounts:", acc_naive_fromCounts("testCounts.txt"))