In [61]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve

In [62]:
def labeling(subset):
    """Return one binary label per row of ``subset``.

    A row is labelled 0 when its 'SP cleavage' value equals the string
    'False' and 1 otherwise.

    Side effect: a 'positive/negative' column holding '-' (label 0) or
    '+' (label 1) is written onto ``subset`` itself.
    """
    is_negative = subset['SP cleavage'] == 'False'
    subset['positive/negative'] = np.where(is_negative, '-', '+')
    return [1 if sign == "+" else 0 for sign in subset['positive/negative']]

In [63]:
# Tab-separated input files, one per cross-validation subset.
subset_list = [
    'subset1_arricchito.tsv',
    'subset2_arricchito.tsv',
    'subset3_arricchito.tsv',
    'subset4_arricchito.tsv',
    'subset5_arricchito.tsv',
]

# Load every file into a DataFrame stored under "subset_1" ... "subset_5".
subset = {}
for position, filename in enumerate(subset_list, start=1):
    subset[f'subset_{position}'] = pd.read_csv(filename, sep="\t")

In [64]:
# Labels are stored in validation order: file_set_creator validates fold i on
# the subset at position (i + 3) mod 5, so entry i of list_labels must come
# from subset_4, subset_5, subset_1, subset_2, subset_3 respectively.
subset_for_labels = {
    'subset_1': subset['subset_4'],
    'subset_2': subset['subset_5'],
    'subset_3': subset['subset_1'],
    'subset_4': subset['subset_2'],
    'subset_5': subset['subset_3'],
}
list_labels = [labeling(frame) for frame in subset_for_labels.values()]

In [65]:
def _cleavage_windows(frame, upstream, downstream):
    """Return the window around the cleavage site of every positive row of ``frame``."""
    windows = []
    positives = frame[frame['SP cleavage'] != 'False']
    for sequence, site in zip(positives["Sequence"], positives["SP cleavage"]):
        cleavage = int(site)
        start = cleavage - upstream
        end = cleavage + downstream
        # A negative start would make the slice wrap around the end of the
        # string and yield an empty or misaligned window; an end past the
        # sequence would yield a short one.  Neither can be aligned to the
        # fixed-width matrix, so such rows are left out of the training set.
        if start < 0 or end > len(sequence):
            continue
        windows.append(sequence[start:end])
    return windows


def _truncated_sequences(frame, max_len):
    """Return the first ``max_len`` characters of every sequence in ``frame``."""
    return [sequence[:max_len] for sequence in frame["Sequence"]]


def file_set_creator(subset_dict, upstream=13, downstream=2, max_len=90):
    """Build rotating training / validation / testing sets from the subsets.

    For fold ``i`` (one fold per key of ``subset_dict``, in key order, with
    wrap-around):

    * training   = subsets at positions i, i+1, i+2, concatenated; only rows
      whose 'SP cleavage' differs from the string 'False' are used, and each
      contributes ``sequence[cleavage - upstream : cleavage + downstream]``;
    * validation = subset at position i+3, sequences cut to ``max_len``;
    * testing    = subset at position i+4, sequences cut to ``max_len``.

    Parameters
    ----------
    subset_dict : dict
        Maps a subset name to a DataFrame with 'Sequence' and 'SP cleavage'
        columns.  The rotation only yields disjoint sets with five subsets.
    upstream, downstream : int
        Residues taken before / after the cleavage position (default 13 + 2,
        i.e. 15-residue windows).
    max_len : int
        Maximum length kept for validation and testing sequences.

    Returns
    -------
    tuple of three lists
        ``(training_list, validation_list, testing_list)``, each with one
        list of strings per fold.  Positive rows whose window does not fit
        entirely inside the sequence are skipped.
    """
    training_list = []
    validation_list = []
    testing_list = []
    keys = list(subset_dict.keys())   # e.g. ["subset_1", ..., "subset_5"]
    n = len(keys)

    for i in range(n):
        # --- Training: three consecutive subsets
        training_keys = [keys[(i + offset) % n] for offset in range(3)]
        training = pd.concat([subset_dict[k] for k in training_keys], ignore_index=True)
        training_list.append(_cleavage_windows(training, upstream, downstream))

        # --- Validation: the next subset
        validation = subset_dict[keys[(i + 3) % n]]
        validation_list.append(_truncated_sequences(validation, max_len))

        # --- Testing: the remaining subset
        testing = subset_dict[keys[(i + 4) % n]]
        testing_list.append(_truncated_sequences(testing, max_len))

    return training_list, validation_list, testing_list

In [66]:
# Build the rotating folds: each returned list holds one entry per key of `subset`
# (training windows, validation sequences and testing sequences respectively).
trainingset,validationset,testingset = file_set_creator(subset)

In [67]:
# Sanity check: print any validation sequence longer than 90 residues
# (nothing is printed when the truncation worked).
for fold_sequences in validationset:
    too_long = [seq for seq in fold_sequences if len(seq) > 90]
    for seq in too_long:
        print(seq)

In [68]:
def init_matrix(window):
    """Create the initial count matrix for a window of ``window`` positions.

    Returns a dict with one key per amino-acid letter (20 keys); each value
    is its own list of ``window`` ones, i.e. every cell starts with a
    pseudocount of 1.
    """
    aminoacids = ("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
                  "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
    return {residue: [1] * window for residue in aminoacids}

In [74]:
def compute_pswm(matrix, list_seq):
    """Turn a count matrix into log2-odds scores using ``list_seq``.

    Every residue of every sequence increments ``matrix[residue][position]``;
    residues that are not keys of ``matrix`` are ignored.  Each cell is then
    replaced by ``log2(count / ((len(list_seq) + 20) * background))`` where
    ``background`` is the hard-coded per-residue frequency below.

    ``matrix`` is modified in place and the same object is returned.
    """
    # Per-residue background frequencies (presumably SwissProt composition -- confirm).
    background = {'A':0.08, 'R':0.06, 'N':0.04, 'D':0.06, 'C':0.01, 'Q':0.04, 'E':0.07, 'G':0.07, 'H':0.02, 'I':0.06, 'L':0.10, 'K':0.06, 'M':0.02, 'F':0.04, 'P':0.05, 'S':0.07, 'T':0.05, 'W':0.01, 'Y':0.03, 'V':0.07}

    # Accumulate positional counts on top of whatever the matrix already holds.
    for sequence in list_seq:
        for position, residue in enumerate(sequence):
            if residue in matrix:
                matrix[residue][position] += 1

    # Number of sequences plus 20 (one per amino-acid letter).
    total = len(list_seq) + 20

    for residue, counts in matrix.items():
        matrix[residue] = [np.log2(count / (total * background[residue])) for count in counts]

    return matrix

In [93]:
def score_window(matrix, seq_window):
    """Sum the matrix scores of ``seq_window``, position by position.

    The residue at position ``p`` contributes ``matrix[residue][p]``;
    residues that are not keys of ``matrix`` contribute nothing.
    """
    total = 0
    for position, residue in enumerate(seq_window):
        if residue not in matrix:
            continue
        total += matrix[residue][position]
    return total


In [94]:
def train_and_score(training, validation, labels=None, window=15):
    """Fit a PSWM on ``training`` and pick the best-F1 threshold on ``validation``.

    Parameters
    ----------
    training : list of str
        Aligned training windows passed to ``compute_pswm``.
    validation : list of str
        Validation sequences; each is scanned with a sliding window and
        represented by its highest window score.
    labels : list of int, optional
        One 0/1 label per validation sequence, in the same order.  When
        omitted, the notebook-level global ``current_labels`` is used, which
        keeps existing two-argument calls working.
    window : int
        Width of the matrix and of the sliding window (default 15).

    Returns
    -------
    (PSWM, optimal_threshold)
        The fitted matrix and the score threshold with the highest F1.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of validation
        sequences.
    """
    if labels is None:
        labels = current_labels  # notebook global set by the calling cell
    if len(labels) != len(validation):
        raise ValueError(
            f"got {len(labels)} labels for {len(validation)} validation sequences"
        )

    PSWM = compute_pswm(init_matrix(window), training)

    kept_labels = []
    final_score = []
    for label, seq in zip(labels, validation):
        # Sequences shorter than the window cannot be scored; drop the label
        # together with the sequence so labels and scores stay aligned.
        if len(seq) < window:
            continue
        best = max(
            score_window(PSWM, seq[start:start + window])
            for start in range(len(seq) - window + 1)
        )
        kept_labels.append(label)
        final_score.append(best)

    precision, recall, thresholds = precision_recall_curve(kept_labels, final_score)

    # precision/recall carry one trailing point (precision 1, recall 0) that has
    # no matching threshold, so it is excluded from the search.
    precision = precision[:-1]
    recall = recall[:-1]
    numerator = 2 * precision * recall
    denominator = precision + recall
    # Where precision + recall is 0 the F-score is defined as 0 here; a plain
    # division would give NaN, which np.argmax would then select.
    fscore = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator, dtype=float),
        where=denominator > 0,
    )
    optimal_threshold = thresholds[np.argmax(fscore)]
    return PSWM, optimal_threshold

In [133]:
# Fit one matrix and one threshold per fold, stored under
# "PSWM_matrix<i>" / "optimal_threshold<i>".
PSWM_matrix_final = {}
optimal_threshold = {}

for i in range(len(trainingset)):
    # train_and_score reads the validation labels from this global.
    current_labels = list_labels[i]
    fold_result = train_and_score(trainingset[i], validationset[i])
    PSWM_matrix_final[f"PSWM_matrix{i}"] = fold_result[0]
    optimal_threshold[f"optimal_threshold{i}"] = fold_result[1]

# Shorthand for the fold-0 model.
W = PSWM_matrix_final['PSWM_matrix0']
Th = optimal_threshold['optimal_threshold0']


In [134]:
# Show the fold-0 scoring matrix (one list of per-position scores per residue).
print(W)

{'A': [0.2816642246736351, 0.5310236936103524, 0.10833262178807318, 0.33510348363509546, 0.9966872659591642, 1.2541834882515281, 0.33510348363509546, 0.9631347062481377, 0.6412066113607754, 0.16845361422564425, 1.7136151068888255, 0.3086312722739044, 2.6358928114643074, 0.9800085248125339, -0.7458165117484719], 'R': [-2.2238638085531157, -5.03121873061072, -4.03121873061072, -2.7092906357233577, -3.446256229889564, -3.446256229889564, -2.2238638085531157, -3.03121873061072, -1.446256229889564, -1.1243281350022014, -3.03121873061072, -0.031218730610720058, -3.03121873061072, -0.5717871119734227, -0.07702242022384478], 'N': [-4.446256229889564, -2.8612937291684077, -3.4462562298895643, -3.4462562298895643, -1.8612937291684077, -2.8612937291684077, -2.1243281350022016, -2.8612937291684077, -0.8612937291684077, -1.8612937291684077, -2.1243281350022016, -0.9868246112522668, -3.4462562298895643, -0.6389013078319601, -0.3587933886392246], 'D': [-2.7092906357233577, -4.03121873061072, -5.03121

In [135]:
# Show the score threshold selected for fold 0.
print(Th)

9.783363992057446


In [144]:
def testing(matrix, th, testing_set):
    """
    Calcola lo score massimo (su finestre di lunghezza 15) per ogni sequenza del set di test.
    Ritorna una lista di score, uno per sequenza.
    """
    final_score = []
    for seq in testing_set:  # lista di sequenze (ognuna lunga ~90 aa)
        if len(seq) < 15:
            continue
        scores_seq = []
        for i in range(len(seq) - 14):
            window = seq[i:i+15]
            score = score_window(matrix, window)
            scores_seq.append(score)
        max_score = max(scores_seq)
        final_score.append(max_score)
        y_pred_test = [int(s >= th) for s in final_score]
    return final_score, y_pred_test


In [145]:
# Evaluate each fold on its own held-out test set using the matrix and
# threshold fitted for that same fold.  Reusing the fold-0 model everywhere
# would score subsets that fold 0 was trained or validated on.
for i in range(len(testingset)):
    fold_matrix = PSWM_matrix_final[f"PSWM_matrix{i}"]
    fold_threshold = optimal_threshold[f"optimal_threshold{i}"]
    f, l = testing(fold_matrix, fold_threshold, testingset[i])
    print(f)
    print(l)



[9.435512394458527, 11.08847797446635, 9.1407366746978, 17.623394378658972, 10.379898373090253, 8.71450443973984, 16.924909990192948, 10.515907227892, 19.645224796140877, 19.24217764529871, 8.248335066835264, 15.29303194912492, 17.91857334264858, 14.00669838831305, 13.493372397368525, 10.288649062583552, 13.432200751695667, 15.343340612369213, 13.489333354392139, 10.909282192740529, 9.725740429831125, 4.234271805375119, 11.859488416190183, 11.661137844320438, 12.158526175216487, 11.603334971284488, 8.491908121485615, 8.09621169271139, 13.072976532688417, 18.137272817010462, 14.24261268420539, 19.372654586074205, 15.296819034992705, 12.322207233829722, 9.36386681172984, 15.247575799683256, 10.640416054691665, 8.616048705525834, 11.59325652985143, 9.806771858695278, 11.458445766522612, 9.395382236838202, 11.483881626323946, 12.100679047200888, 6.304006722975061, 15.004221519111447, 7.185580349346038, 10.497785282885355, 17.452101755190313, 14.338646870956314, 13.794202711068904, 11.12709