In [1]:
%cd ..

/home/d/PycharmProjects/protein_properties/src


In [2]:
import pandas as pd
from Bio import ExPASy
from Bio import SwissProt
import re
import numpy as np
from fasta import Fasta
from utils import seed_all
import random
import copy
seed_all(13)

Global seed set to 13


In [3]:
# position of glyco site is one based -> will be converted to zero based
def fetch_protein_sequence_pdb(uniprot_id):
    handle = ExPASy.get_sprot_raw(uniprot_id)
    record = SwissProt.read(handle)
    return record.sequence

- load data
- merge train 
- remove test from train 
- get sequences 
    - for O-linked dataset: site positions are part of the fasta header -> one header per protein is enough 
    - for the rest: group the entries by PID and accumulate glyco site positions 
- map sites to sequences
- write to fasta file 
    - one fasta for training and one for RR, containing the PID and sequences 

### Load Data

In [4]:
import os
import os
os.getcwd()

'/home/d/PycharmProjects/protein_properties/src'

The negative samples of Captor are not usable, since they do not provide the exact position of the window -> would mean I have to align and map the labels to the sequence = pain

In [5]:
N_LMNgly_train_df = pd.read_csv('../data/LMNglyPred/Train_Data_NGlycositeAtlas.csv', usecols=["label","PID","Position"])
N_LMNgly_test_df = pd.read_csv('../data/LMNglyPred/Test_Data_NGlycositeAtlas.csv', usecols=["label","PID","Position"])

In [6]:
O_train_pos = pd.read_csv("../data/OglyPred/Feature_Extraction_O_linked_Training_Positive_4885_Sites_less.txt", 
                          usecols=[0, 1, 4], header=None, names=["Position", "PID", "AA"])
O_train_neg = pd.read_csv("../data/OglyPred/Feature_Extraction_O_linked_Training_Negative_114307_Sites_less.txt", 
                          usecols=[0, 1, 4], header=None, names=["Position", "PID", "AA"])
O_test_pos = pd.read_csv("../data/OglyPred/Feature_Extraction_O_linked_Testing_Positive_375_Sites_less.txt", 
                         usecols=[0, 1, 4], header=None, names=["Position", "PID", "AA"])
O_test_neg = pd.read_csv("../data/OglyPred/Feature_Extraction_O_linked_Testing_Negative_11466_Sites_less.txt", 
                         usecols=[0, 1, 4], header=None, names=["Position", "PID", "AA"])

In [7]:
N_LMNgly_train_df["AA"] = "N"
N_LMNgly_test_df["AA"] = "N"
O_train_pos["label"] = 2
O_train_neg["label"] = 0
O_test_pos["label"] = 2
O_test_neg["label"] = 0

### Clean Data

In [8]:
merged_test_df = pd.concat([N_LMNgly_test_df, O_test_pos, O_test_neg], ignore_index=True)
O_merged_test_df = pd.concat([O_test_pos, O_test_neg], ignore_index=True)

In [9]:
N_merged_train_df = pd.concat([N_LMNgly_train_df, O_train_pos, O_train_neg], ignore_index=True)
N_merged_train_df.duplicated().sum()


0

In [10]:
print(f"Lenght {len(N_merged_train_df)}\nClass distribution {N_merged_train_df.label.value_counts()}")

Lenght 143294
Class distribution label
0    130004
1      8405
2      4885
Name: count, dtype: int64


Lenght 143294
Class distribution label
0    130004
1      8405
2      4885
Name: count, dtype: int64

In [11]:
# drop the proteins that are in the either of the two test set
N_merged_train_df = N_merged_train_df[~N_merged_train_df["PID"].isin(merged_test_df["PID"])]
N_merged_train_df["PID"] = N_merged_train_df["PID"].astype(str)
print(f"Lenght {len(N_merged_train_df)}\nClass distribution {N_merged_train_df.label.value_counts()}")

Lenght 134617
Class distribution label
0    121657
1      8245
2      4715
Name: count, dtype: int64


In [31]:
N_merged_train_df.to_csv("../data/glyco/train_df.csv", index=False, header=True)
merged_test_df.to_csv("../data/glyco/test_df.csv", index=False, header=True)
O_merged_test_df.to_csv("../data/glyco/O_test_df.csv", index=False, header=True)
N_LMNgly_test_df.to_csv("../data/glyco/N_test_df.csv", index=False, header=True)

In [23]:
from Bio import Entrez
from Bio.Seq import Seq
import concurrent.futures

import requests

def get_protein_sequences(protein_ids: list, exiting_fasta: Fasta = None):
    sequences = {}
    protein_ids = set(protein_ids)

    if exiting_fasta:
        existing_pids = [pid for pid in protein_ids if pid in exiting_fasta]
        for pid in existing_pids:
            sequences[pid] = exiting_fasta[pid]   
        protein_ids = protein_ids - set(existing_pids)        
        print(f"Length of existing fasta {len(existing_pids)}")     
                
    # Separate UniProt and NCBI IDs
    uniprot_ids = [id for id in protein_ids if not id.startswith('NP_')]
    ncbi_ids = [id for id in protein_ids if id.startswith('NP_')]
    
    # Fetch sequences for UniProt IDs
    if uniprot_ids:
        uniprot_sequences = fetch_uniprot_sequences(uniprot_ids)  # Fetch UniProt sequences
        sequences.update(uniprot_sequences)
    
    # Fetch sequences for NCBI IDs
    if ncbi_ids:
        ncbi_sequences = fetch_ncbi_sequences(ncbi_ids)  # Fetch NCBI sequences
        sequences.update(ncbi_sequences)
    
    return sequences

def fetch_uniprot_sequences(uniprot_ids):
    sequences = {}
    
    for uniprot_id in uniprot_ids:
        
        # Make a request to UniProt for the FASTA sequence
        url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
        response = requests.get(url)
        
        if response.ok:
            sequences[uniprot_id] = [''.join(response.text.split('\n')[1:])]
        else:
            print(f'Failed to fetch sequence for UniProt ID {uniprot_id}')
            print("Skipping...")
    
    return sequences

def fetch_ncbi_sequences(ncbi_ids):
    Entrez.email = 'd.hasler@tum.de'  # Set your email address here
    sequences = {}
    
    def fetch_sequence(ncbi_id):
        handle = Entrez.efetch(db='protein', id=ncbi_id, rettype='fasta', retmode='text')
        record = handle.read()
        handle.close()
        sequences[ncbi_id] = [record.split('\n', 1)[1].replace('\n', '')]
    
    # Fetch sequences using concurrent futures
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(fetch_sequence, ncbi_ids)
    
    return sequences

In [24]:
# this will take around 90 minutes
all_PID = set((N_merged_train_df["PID"].tolist() + 
               N_LMNgly_test_df["PID"].tolist() + 
               O_merged_test_df["PID"].tolist()))
sequences = get_protein_sequences(all_PID, exiting_fasta=Fasta("../data/glyco/base/glyco_all.fasta"))

Length of existing fasta 10648


In [27]:
print(f"Number of sequences {len(sequences)}\nNumber of proteins that are not fetched {len(all_PID) - len(sequences)}")

Number of sequences 10657
Number of proteins that are not fetched 0


In [26]:
Fasta(sequences=sequences).write_fasta("../../data/glyco/base/glyco_all.fasta", overwrite=True)

In [15]:
sequences = Fasta("../data/glyco/base/glyco_all.fasta")

In [32]:
train_fasta = {pid: sequences[pid] for pid in N_merged_train_df["PID"].tolist()}
test_fasta = {pid: sequences[pid] for pid in merged_test_df["PID"].tolist()}
O_test_fasta = {pid: sequences[pid] for pid in O_merged_test_df["PID"].tolist()}
N_test_fasta = {pid: sequences[pid] for pid in N_LMNgly_test_df["PID"].tolist()}
Fasta(sequences=train_fasta).write_fasta("../data/glyco/train.fasta", overwrite=True)
Fasta(sequences=test_fasta).write_fasta("../data/glyco/test.fasta", overwrite=True)
Fasta(sequences=O_test_fasta).write_fasta("../data/glyco/O_test.fasta", overwrite=True)
Fasta(sequences=N_test_fasta).write_fasta("../data/glyco/N_test.fasta", overwrite=True)

In [16]:
merged_train_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
count_dict = {0: 0, 1: 0, 2: 0, 5: 0}
not_found = []
for i in set(N_merged_train_df["PID"]):
    try:
        seq = sequences[i][0]
    except KeyError:
        not_found.append(i)
        continue
    position = N_merged_train_df[N_merged_train_df["PID"] == i]["Position"].tolist()
    true_labels = N_merged_train_df[N_merged_train_df["PID"] == i]["label"].tolist()
    labels = np.zeros(len(seq)) + 5
    for idx, pos in enumerate(position):
        try:
            labels[pos] = true_labels[idx]
        except IndexError:
            continue
    for l in labels:
        count_dict[l] += 1
    merged_train_dict[i] = [seq, ["".join([label_dict[l] for l in labels.tolist()])]]

print(len(merged_train_dict.keys()))
print(len(merged_train_dict.keys()) / len(N_merged_train_df["PID"].unique()))

10706
0.9231697852892989


In [17]:
set(not_found).intersection(set(O_captor_train_df["PID"]))

{'P33827-1', 'Q13275.2'}

In [18]:
shit_ids = []
for item in merged_train_dict.items():
    if len(set(list(item[1][1][0]))) == 1:
        shit_ids.append(item[0])
for id in shit_ids:
    merged_train_dict.pop(id)

In [19]:
print(len(merged_train_dict.keys()))
print(len(merged_train_dict.keys()) / len(N_merged_train_df["PID"].unique()))

10668
0.9198930757954643


In [69]:

merged_N_test_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
for i in set(N_LMNgly_test_df["PID"]):
    try:
        seq = sequences[i][0]
    except KeyError:
        print(f"Skipping {i}")
        continue
    position = N_LMNgly_test_df[N_LMNgly_test_df["PID"] == i]["Position"].tolist()
    true_labels = N_LMNgly_test_df[N_LMNgly_test_df["PID"] == i]["label"].tolist()
    labels = np.zeros(len(seq)) + 5
    for idx, pos in enumerate(position):
        try:
            labels[pos] = true_labels[idx]
        except IndexError:
            print(f"Skipping {i} because of IndexError")
            continue
    merged_N_test_dict[i] = [seq, ["".join([label_dict[l] for l in labels.tolist()])]]
shit_ids = []
for item in merged_N_test_dict.items():
    if len(set(list(item[1][1][0]))) == 1:
        shit_ids.append(item[0])
for id in shit_ids:
    merged_N_test_dict.pop(id)

In [70]:
len(set(N_LMNgly_test_df["PID"]))

955

In [71]:
len(merged_N_test_dict)

955

In [78]:
Fasta(sequences=merged_N_test_dict).write_fasta("../../data/glyco/N_test.o", overwrite=True)

In [72]:
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for id in merged_train_dict.keys():
    try:
        negative_sites += merged_train_dict[id][1][0].count("T")
        postive_O_sites += merged_train_dict[id][1][0].count("O")
        postive_N_sites += merged_train_dict[id][1][0].count("N")
    except:
        continue
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 15778
N-sites: 11057
O-sites: 4637


### Split O-Linked test from train

In [27]:
# select a number of protein that contain only O-sites
o_prot = {}
for id in merged_train_dict.keys():
    try:
        t = merged_train_dict[id][1][0].count("T")
        o = merged_train_dict[id][1][0].count("O")
        n = merged_train_dict[id][1][0].count("N")
        if o > 0 and t == 0 and n == 0:
            o_prot[id] = o
    except:
        continue

In [28]:
len(o_prot.keys())

1227

In [29]:
o_prot_ids = []
for id in o_prot.keys():
    if "." in id:
        o_prot_ids.append(id.split(".")[0])
    else:
        o_prot_ids.append(id)

In [30]:


pids = list(o_prot.keys())
random.shuffle(pids)
test_ids = []
num_in_test = 0
for i in pids:
    if num_in_test < 200:
        test_ids.append(i)
        num_in_test += o_prot[i]
    else:
        break
test_ids

['Q13790.2',
 'Q92187.1',
 'Q9BT09.1',
 'Q99618.1',
 'Q92508.4',
 'Q6P9A2.2',
 'P52594.2',
 'P07204.2',
 'O95972.2',
 'Q2UY09.2',
 'Q9NPR9.3',
 'P01024.2',
 'Q9NSC7.1',
 'Q9UKU9.1',
 'P08318.1',
 'O95631.2',
 'Q86X29.4',
 'O60609.2',
 'Q08397.2',
 'P31150.2',
 'P04141.1',
 'Q8IVY1.1',
 'P05362.2',
 'Q96BF3.2',
 'P28300.2',
 'Q9UBG0.2',
 'P02787.3',
 'Q8WWX8.1',
 'Q9BRP8.1',
 'Q9HAB3.1',
 'Q9Y624.1',
 'Q8IZ83.2',
 'Q13231.1',
 'Q99941.2',
 'Q9P0U3.2',
 'Q9NVR5.2',
 'Q12841.1',
 'P55157.1',
 'Q6UWL6.2',
 'Q63HQ2.2',
 'O43399.2',
 'Q9GZZ8.1',
 'O60353.2',
 'Q4G148.2',
 'O60502.2',
 'Q9UMX5.1',
 'Q76M96.1',
 'P49641.3',
 'P21453.2',
 'Q15125.3',
 'Q8N1G4.1',
 'P08697.3',
 'P01127.1',
 'Q14697.3',
 'O75631.3',
 'Q14435.2',
 'Q14669.1',
 'O75821.2',
 'Q86VZ4.2',
 'Q9NP72.1',
 'Q99795.1',
 'Q9Y2H0.3',
 'Q9NPY3.3',
 'Q9HBJ8.1',
 'Q5VU43.3',
 'A5D8T8.3',
 'Q9UM21.1',
 'Q96RP7.1',
 'P08709.1']

In [31]:
O_test_dict = {i: merged_train_dict.pop(i) for i in test_ids}

In [32]:
set(O_test_dict.keys()) & set(merged_train_dict.keys())

set()

### Add negative samples

In [3]:
def generate_negative_samples(data_dict, total_num_negative, o_or_n="O"):
    num_negative_samples = 0
    ids_shuffled = list(data_dict.keys())
    random.shuffle(ids_shuffled)
    while num_negative_samples <= total_num_negative:
        # select random id
        id = random.choice(ids_shuffled)
        seq = data_dict[id][0]
        labels = data_dict[id][1]
        num_negative = random.randint(1, 8) if total_num_negative > 5 else total_num_negative
        if o_or_n == "O":
            potential_idx = [i for i, x in enumerate(list(seq)) if x == "S" or x == "T"]
        else:
            potential_idx = [i for i, x in enumerate(list(seq)) if x == "N"]
            
        potential_idx = list(set(potential_idx) - set([i for i, x in enumerate(list(labels)) if x in ["N", "T", "O"]]))

        for i in range(num_negative):
            while True:
                if potential_idx == 0 or i >= len(potential_idx):
                    break
                idx = random.choice(potential_idx)
                if not list(labels)[idx] in ["N", "T", "O"]:
                    labels = labels[:idx] + "T" + labels[idx + 1 :]
                    break
        num_negative_samples += num_negative
        print(f"{num_negative_samples}", end=" ")
        data_dict[id][1] = labels
        
       
    return data_dict

In [4]:
merged_train_dict = Fasta("../data/glyco/train.o")

In [5]:
merged_train_dict["P33527"]

['MALRGFCSADGSDPLWDWNVTWNTSNPDFTKCFQNTVLVWVPCFYLWACFPFYFLYLSRHDRGYIQMTPLNKTKTALGFLLWIVCWADLFYSFWERSRGIFLAPVFLVSPTLLGITMLLATFLIQLERRKGVQSSGIMLTFWLVALVCALAILRSKIMTALKEDAQVDLFRDITFYVYFSLLLIQLVLSCFSDRSPLFSETIHDPNPCPESSASFLSRITFWWITGLIVRGYRQPLEGSDLWSLNKEDTSEQVVPVLVKNWKKECAKTRKQPVKVVYSSKDPAQPKESSKVDANEEVEALIVKSPQKEWNPSLFKVLYKTFGPYFLMSFFFKAIHDLMMFSGPQILKLLIKFVNDTKAPDWQGYFYTVLLFVTACLQTLVLHQYFHICFVSGMRIKTAVIGAVYRKALVITNSARKSSTVGEIVNLMSVDAQRFMDLATYINMIWSAPLQVILALYLLWLNLGPSVLAGVAVMVLMVPVNAVMAMKTKTYQVAHMKSKDNRIKLMNEILNGIKVLKLYAWELAFKDKVLAIRQEELKVLKKSAYLSAVGTFTWVCTPFLVALCTFAVYVTIDENNILDAQTAFVSLALFNILRFPLNILPMVISSIVQASVSLKRLRIFLSHEELEPDSIERRPVKDGGGTNSITVRNATFTWARSDPPTLNGITFSIPEGALVAVVGQVGCGKSSLLSALLAEMDKVEGHVAIKGSVAYVPQQAWIQNDSLRENILFGCQLEEPYYRSVIQACALLPDLEILPSGDRTEIGEKGVNLSGGQKQRVSLARAVYSNADIYLFDDPLSAVDAHVGKHIFENVIGPKGMLKNKTRILVTHSMSYLPQVDVIIVMSGGKISEMGSYQELLARDGAFAEFLRTYASTEQEQDAEENGVTGVSGPGKEAKQMENGMLVTDSAGKQLQRQLSSSSSYSGDISRHHNSTAELQKAEAKKEETWKLMEADKAQTGQVKLSVYWDYMKAIGLFISFLSIFLFMCNHVSALASNYWLSL

In [6]:
# get number of negative samples -> n and o sites
negative_o_sites = 0
negative_n_sites = 0
for id in merged_train_dict.keys():
    indices_neg = [i for i, x in enumerate(merged_train_dict[id][1]) if x == "T"]
    indices_o = [i for i, x in enumerate(merged_train_dict[id][0]) if x == "S" or x == "T"]
    indices_n = [i for i, x in enumerate(merged_train_dict[id][0]) if x == "N"]
    negative_o_sites += len(set(indices_neg).intersection(set(indices_o)))
    negative_n_sites += len(set(indices_neg).intersection(set(indices_n)))
print(f"Negative O sites: {negative_o_sites}\nNegative N sites: {negative_n_sites}")
print(f"Total negative sites: {negative_o_sites + negative_n_sites}")

Negative O sites: 9148
Negative N sites: 10899
Total negative sites: 20047


In [7]:
sum([merged_train_dict[i][1].count("T") for i in merged_train_dict.keys()])

20245

In [8]:
a = generate_negative_samples(copy.deepcopy(merged_train_dict), 5000, "N")

1 3 5 7 11 12 14 16 20 25 26 29 30 35 37 42 47 50 52 54 59 61 64 66 71 73 76 80 82 87 92 96 97 102 105 108 110 111 112 113 115 120 122 123 124 127 128 132 136 141 146 147 151 155 157 162 166 171 173 178 182 185 187 189 193 196 198 199 202 207 209 212 217 220 224 227 230 231 234 236 239 241 245 246 248 251 253 255 259 264 268 272 276 281 283 288 293 298 302 307 309 312 314 317 322 327 329 332 336 341 346 348 351 355 359 361 366 369 374 375 380 381 385 386 390 394 397 398 401 405 407 411 412 416 420 425 427 431 434 438 439 440 443 448 450 454 455 457 462 466 469 474 479 484 489 490 491 493 496 497 500 505 508 511 513 514 518 522 527 529 530 532 535 539 541 546 551 554 559 564 569 572 576 580 581 586 588 593 597 602 605 609 612 613 615 617 622 624 626 627 632 637 641 645 649 653 657 662 664 666 671 672 674 679 681 683 688 693 694 696 698 701 702 706 711 712 714 719 723 727 731 734 738 740 745 746 751 753 758 760 762 765 768 773 774 775 780 784 788 793 795 800 805 807 810 813 815 817 819 8

In [9]:
sum([a[i][1].count("T") for i in a.keys()])

25106

In [12]:
negative_o_sites = 0
negative_n_sites = 0
for id in a.keys():
    indices_neg = [i for i, x in enumerate(a[id][1]) if x == "T"]
    indices_o = [i for i, x in enumerate(a[id][0]) if x == "S" or x == "T"]
    indices_n = [i for i, x in enumerate(a[id][0]) if x == "N"]
    negative_o_sites += len(set(indices_neg).intersection(set(indices_o)))
    negative_n_sites += len(set(indices_neg).intersection(set(indices_n)))
print(f"Negative O sites: {negative_o_sites}\nNegative N sites: {negative_n_sites}")
print(f"Total negative sites: {negative_o_sites + negative_n_sites}")

Negative O sites: 14153
Negative N sites: 15760
Total negative sites: 29913


In [11]:
a = generate_negative_samples(a, 5000, "O")

3 4 5 9 14 15 18 23 26 30 33 34 39 44 46 50 51 55 57 58 60 65 69 73 77 81 84 89 90 93 94 99 102 106 108 111 112 114 116 118 119 122 127 131 132 136 138 140 142 145 150 152 153 158 160 165 170 171 175 178 183 187 190 192 197 200 202 207 209 211 215 220 223 228 232 237 238 241 242 246 251 253 258 262 263 265 268 271 275 276 277 281 283 287 291 296 298 299 300 302 307 312 314 315 318 321 324 327 331 335 337 338 339 343 344 349 352 353 354 358 362 366 371 374 375 379 380 383 386 390 394 399 402 405 409 410 412 415 419 424 426 430 432 437 440 444 448 452 454 456 458 462 464 468 469 471 476 479 482 487 488 492 496 500 501 504 506 510 511 512 514 515 520 522 523 524 529 532 533 535 538 542 546 551 552 556 557 561 564 566 569 570 572 577 579 583 588 591 596 601 602 603 604 609 611 615 620 621 622 627 630 634 638 642 643 648 649 653 658 663 666 670 672 675 676 680 683 686 689 691 696 700 703 708 713 715 720 723 725 727 730 734 737 739 740 744 746 748 750 755 759 763 766 771 776 780 782 784 788 

In [14]:
Fasta(sequences=a).write_fasta("../data/glyco/train_more_neg.o", overwrite=True)

### Split validation from train

In [50]:
pids = list(merged_train_dict.keys())
random.shuffle(pids)
val_ids = []
num_in_val = 0
for i in pids:
    if num_in_val < 400:
        val_ids.append(i)
        num_in_val += 1
    else:
        break
num_in_val

400

In [57]:
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for id in merged_train_dict.keys():
    try:
        negative_sites += merged_train_dict[id][1][0].count("T")
        postive_O_sites += merged_train_dict[id][1][0].count("O")
        postive_N_sites += merged_train_dict[id][1][0].count("N")
    except:
        continue
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 15778
N-sites: 11057
O-sites: 4637


In [52]:
val_dict = {i: merged_train_dict.pop(i) for i in val_ids}

In [53]:
generate_negative_samples(val_dict, 250)

{'Q93063.1': ['MCASVKYNIRGPALIPRMKTKHRIYYITLFSIVLLGLIATGMFQFWPHSIESSNDWNVEKRSIRDVPVVRLPADSPIPERGDLSCRMHTCFDVYRCGFNPKNKIKVYIYALKKYVDDFGVSVSNTISREYNELLMAISDSDYYTDDINRACLFVPSIDVLNQNTLRIKETAQAMAQLSRWDRGTNHLLFNMLPGGPPDYNTALDVPRDRALLAGGGFSTWTYRQGYDVSIPVYSPLSAEVDLPEKGPGPRQYFLLSSQVGLHPEYREDLEALQVKHGESVLVLDKCTNLSEGVLSVRKRCHKHQVFDYPQVLQEATFCVVLRGARLGQAVLSDVLQAGCVPVVIADSYILPFSEVLDWKRASVVVPEEKMSDVYSILQSIPQRQIEEMQRQARWFWEAYFQSIKAIALATLQIINDRIYPYAAISYEEWNDPPAVKWGSVSNPLFLPLIPPQSQGFTAIVLTYDRVESLFRVITEVSKVPSLSKLLVVWNNQNKNPPEDSLWPKIRVPLKVVRTAENKLSNRFFPYDEIETEAVLAIDDDIIMLTSDELQFGYEVWREFPDRLVGYPGRLHLWDHEMNKWKYESEWTNEVSMVLTGAAFYHKYFNYLYTYKMPGDIKNWVDAHMNCEDIAMNFLVANVTGKAVIKVTPRKKFKCPECTAIDGLSLDQTHMVERSECINKFASVFGTMPLKVVEHRADPVLYKDDFPEKLKSFPNIGSL',
  ['XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

In [54]:
Fasta(sequences=val_dict).write_fasta("../data/glyco/val.o", overwrite=True)

In [56]:
generate_negative_samples(merged_train_dict, 5100)

{'Q9Y5Y9': ['MEFPIGSLETNNFRRFTPESLVEIEKQIAAKQGTKKAREKHREQKDQEEKPRPQLDLKACNQLPKFYGELPAELIGEPLEDLDPFYSTHRTFMVLNKGRTISRFSATRALWLFSPFNLIRRTAIKVSVHSWFSLFITVTILVNCVCMTRTDLPEKIEYVFTVIYTFEALIKILARGFCLNEFTYLRDPWNWLDFSVITLAYVGTAIDLRGISGLRTFRVLRALKTVSVIPGLKVIVGALIHSVKKLADVTILTIFCLSVFALVGLQLFKGNLKNKCVKNDMAVNETTNYSSHRKPDIYINKRGTSDPLLCGNGSDSGHCPDGYICLKTSDNPDFNYTSFDSFAWAFLSLFRLMTQDSWERLYQQTLRTSGKIYMIFFVLVIFLGSFYLVNLILAVVTMAYEEQNQATTDEIEAKEKKFQEALEMLRKEQEVLAALGIDTTSLHSHNGSPLTSKNASERRHRIKPRVSEGSTEDNKSPRSDPYNQRRMSFLGLASGKRRASHGSVFHFRSPGRDISLPEGVTDDGVFPGDHESHRGSLLLGGGAGQQGPLPRSPLPQPSNPDSRHGEDEHQPPPTSELAPGAVDVSAFDAGQKKTFLSAEYLDEPFRAQRAMSVVSIITSVLEELEESEQKCPPCLTSLSQKYLIWDCCPMWVKLKTILFGLVTDPFAELTITLCIVVNTIFMAMEHHGMSPTFEAMLQIGNIVFTIFFTAEMVFKIIAFDPYYYFQKKWNIFDCIIVTVSLLELGVAKKGSLSVLRSFRLLRVFKLAKSWPTLNTLIKIIGNSVGALGNLTIILAIIVFVFALVGKQLLGENYRNNRKNISAPHEDWPRWHMHDFFHSFLIVFRILCGEWIENMWACMEVGQKSICLILFLTVMVLGNLVVLNLFIALLLNSFSADNLTAPEDDGEVNNLQVALARIQVFGHRTKQALCSFFSRSCPFPQPKAEPELVVKLPLSSSKAENHIAANTARGSSGGLQAPRGPRDEHSDF

In [58]:
Fasta(sequences=merged_train_dict).write_fasta("../data/glyco/train.o", overwrite=True)

### After RR

In [73]:
ids = []
with open("../data/glyco/rr_wscores_and_config_glyco_all.fasta", "r") as f:
    for line in f.readlines():
        if line.startswith(">"):
            ids.append(line[1:].strip())

In [74]:
t = []
for i in ids:
    try:
        merged_N_test_dict[i]
        t.append(i)
    except:
        continue
len(t)

766

In [77]:
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for id in t:
    try:
        negative_sites += merged_N_test_dict[id][1][0].count("T")
        postive_O_sites += merged_N_test_dict[id][1][0].count("O")
        postive_N_sites += merged_N_test_dict[id][1][0].count("N")
    except:
        continue
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 1154
N-sites: 580
O-sites: 0


In [68]:
merged_N_test_dict

{}

In [76]:
merged_N_test_dict = {k:merged_N_test_dict[k] for k in t if k in merged_N_test_dict}

In [23]:

l = []
for i in ids:
    try:
        merged_train_dict[i]
        l.append(i)
    except:
        continue
len(l)

8852

In [24]:
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for id in l:
    try:
        negative_sites += merged_train_dict[id][1][0].count("T")
        postive_O_sites += merged_train_dict[id][1][0].count("O")
        postive_N_sites += merged_train_dict[id][1][0].count("N")
    except:
        continue
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 11212
N-sites: 11642
O-sites: 5087


In [25]:
merged_train_dict = {k:merged_train_dict[k] for k in l if k in merged_train_dict}

In [48]:
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for id in l:
    try:
        negative_sites += test_dict[id][1][0].count("T")
        postive_O_sites += test_dict[id][1][0].count("O")
        postive_N_sites += test_dict[id][1][0].count("N")
    except:
        continue
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 0
N-sites: 0
O-sites: 0


In [45]:
o_site = 15
n_site = 3
negative_sample = 2
scores = {}
for id in sequences.get_headers():
    labels = None
    score[id] = 0
    if id in merged_train_dict:
        labels = merged_train_dict[id][1][0]
    elif id in merged_O_test_dict:
        labels = test_dict[id][1][0]
        scores[id] += 10
    elif id in val_dict:
        labels = val_dict[id][1][0]
    elif id in merged_N_test_dict:
        labels = merged_N_test_dict[id][1][0]
    else:
        scores[id] = 1
        continue

    if labels.count("T") != 0:
        scores[id] = negative_sample
        continue
    score = labels.count("O") * o_site + labels.count("N") * n_site + labels.count("T") * negative_sample
    scores[id] = score

with open("../../data/glyco/rr_scores.tsv", "w") as f:
    for id, score in scores.items():
        f.write(f"{id}\t{score}\n")

In [79]:
len(set(merged_train_dict.keys()).intersection(set(merged_N_test_dict.keys())))

0

In [47]:
len(set(merged_train_dict.keys()).intersection(set(merged_O_test_dict.keys())))

0