In [1]:
%cd .. 

/home/d/PycharmProjects/protein_properties/src


In [2]:
import pandas as pd
from Bio import ExPASy
from Bio import SwissProt
import re
import numpy as np
from fasta import Fasta
from utils import seed_all
seed_all(13)

Global seed set to 13


In [3]:
# position of glyco site is one based -> will be converted to zero based
def fetch_protein_sequence_pdb(uniprot_id):
    handle = ExPASy.get_sprot_raw(uniprot_id)
    record = SwissProt.read(handle)
    return record.sequence

- load data
- merge train 
- remove test from train 
- get sequences 
    - for O-linked dataset: site positions are part of the fasta header -> one header per protein is enough 
    - for the rest: group the entries by PID and accumulate glyco site positions 
- map sites to sequences
- write to fasta file 
    - one fasta for training and one for RR, containing the PID and sequences 

### Load Data

In [4]:
import os
import os
os.getcwd()

'/home/d/PycharmProjects/protein_properties/src'

The negative samples of Captor are not usable, since they do not provide the exact position of the window -> would mean I have to align and map the labels to the sequence = pain

In [5]:
O_captor_train = []
with open("../data/O_captor/Ptrain.fasta", "r") as f:
    for line in f.readlines():
        if line.startswith(">"):
            O_captor_train.append(line.strip("\t").strip("\s").strip("\s")[4:])
rgx = re.compile('[%s]' % "\s\t|")
O_captor_train_dict = {}
for i in range(len(O_captor_train)):
    O_captor_train[i] = rgx.sub("", O_captor_train[i]).split("#")
    O_captor_train_dict[O_captor_train[i][0]] = set([int(j) - 1 for j in O_captor_train[i][1:]])
O_captor_train_series = pd.Series(O_captor_train_dict.values(), O_captor_train_dict.keys())

O_captor_test = []
with open("../data/O_captor/Ptest.fasta", "r") as f:
    for line in f.readlines():
        if line.startswith(">"):
            O_captor_test.append(line.strip("\t").strip("\s").strip("\s")[4:])
O_captor_test_dict = {}
for i in range(len(O_captor_test)):
    O_captor_test[i] = rgx.sub("", O_captor_test[i]).split("#")
    O_captor_test_dict[O_captor_test[i][0]] = set([int(j) - 1 for j in O_captor_test[i][1:]])
O_captor_test_df = pd.Series(O_captor_test_dict.values(), O_captor_test_dict.keys())
# remove overlapping proteins from train and test set -> remove from test set
O_captor_test_df = O_captor_test_df.drop(O_captor_train_series.index, errors="ignore")

N_LMNgly_train_df = pd.read_csv('../data/LMNglyPred/Train_Data_NGlycositeAtlas.csv', usecols=["label","PID","Position"])
N_LMNgly_test_df = pd.read_csv('../data/LMNglyPred/Test_Data_NGlycositeAtlas.csv', usecols=["label","PID","Position"])
# skip weird proteins at the top of the file
N_taherzadeh_train_df = pd.read_csv('../data/N_taherzadeh/Datasets.csv', usecols=["Protein name","Position"], skiprows=range(1, 12))
N_taherzadeh_train_df["Protein name"] = N_taherzadeh_train_df["Protein name"].apply(lambda x: x.strip("'"))

In [6]:
O_captor_train_series = pd.concat([O_captor_train_series, O_captor_test_df])

In [7]:
O_captor_train_df = O_captor_train_series.to_frame()
O_captor_train_df.columns = ["Position"]
O_captor_train_df["PID"] = O_captor_train_df.index
O_captor_train_df["label"] = 2
O_captor_train_df.reset_index(drop=True, inplace=True)
O_captor_train_df = O_captor_train_df.explode("Position", ignore_index=True)

In [8]:
len(O_captor_train_df["PID"])

6825

In [9]:
N_taherzadeh_train_df["label"] = 1
N_taherzadeh_train_df.columns = ["PID", "Position", "label"]

### Clean Data

In [10]:
N_merged_train_df = pd.concat([N_LMNgly_train_df, N_taherzadeh_train_df, O_captor_train_df], ignore_index=True)
print(N_merged_train_df[N_merged_train_df["label"] == 2])
N_merged_train_df.drop_duplicates(inplace=True, keep="first")
N_merged_train_df

            PID Position  label
36847  Q92954.3     1038      2
36848  Q92954.3      528      2
36849  Q92954.3      531      2
36850  Q92954.3      539      2
36851  Q92954.3      540      2
...         ...      ...    ...
43667  Q15223.3      338      2
43668  Q86UP2.1      187      2
43669  Q9NXG2.2      321      2
43670  Q496F6.2      141      2
43671  P43308.1      124      2

[6825 rows x 3 columns]


Unnamed: 0,PID,Position,label
0,Q13425,167,1
1,Q9NZR2,1921,1
2,O43663,489,1
3,Q495M3,181,1
4,Q9BXP8,1408,1
...,...,...,...
43667,Q15223.3,338,2
43668,Q86UP2.1,187,2
43669,Q9NXG2.2,321,2
43670,Q496F6.2,141,2


In [11]:
# drop the proteins that are in the either of the two test set
N_merged_train_df = N_merged_train_df[~N_merged_train_df["PID"].isin(N_LMNgly_test_df["PID"])]
N_merged_train_df["PID"] = N_merged_train_df["PID"].astype(str)
N_merged_train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  N_merged_train_df["PID"] = N_merged_train_df["PID"].astype(str)


Unnamed: 0,PID,Position,label
0,Q13425,167,1
1,Q9NZR2,1921,1
2,O43663,489,1
3,Q495M3,181,1
4,Q9BXP8,1408,1
...,...,...,...
43667,Q15223.3,338,2
43668,Q86UP2.1,187,2
43669,Q9NXG2.2,321,2
43670,Q496F6.2,141,2


In [12]:
N_merged_train_df[N_merged_train_df["label"] == 2]

Unnamed: 0,PID,Position,label
36847,Q92954.3,1038,2
36848,Q92954.3,528,2
36849,Q92954.3,531,2
36850,Q92954.3,539,2
36851,Q92954.3,540,2
...,...,...,...
43667,Q15223.3,338,2
43668,Q86UP2.1,187,2
43669,Q9NXG2.2,321,2
43670,Q496F6.2,141,2


In [13]:
from Bio import Entrez
from Bio.Seq import Seq
import concurrent.futures

import requests

def get_protein_sequences(protein_ids):
    sequences = {}
    protein_ids = set(protein_ids)
    # Separate UniProt and NCBI IDs
    uniprot_ids = [id for id in protein_ids if not id.startswith('NP_')]
    ncbi_ids = [id for id in protein_ids if id.startswith('NP_')]
    
    # Fetch sequences for UniProt IDs
    if uniprot_ids:
        uniprot_sequences = fetch_uniprot_sequences(uniprot_ids)  # Fetch UniProt sequences
        sequences.update(uniprot_sequences)
    
    # Fetch sequences for NCBI IDs
    if ncbi_ids:
        ncbi_sequences = fetch_ncbi_sequences(ncbi_ids)  # Fetch NCBI sequences
        sequences.update(ncbi_sequences)
    
    return sequences

def fetch_uniprot_sequences(uniprot_ids):
    sequences = {}
    
    for uniprot_id in uniprot_ids:
        
        # Make a request to UniProt for the FASTA sequence
        url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
        response = requests.get(url)
        
        if response.ok:
            sequences[uniprot_id] = [''.join(response.text.split('\n')[1:])]
    
    return sequences

def fetch_ncbi_sequences(ncbi_ids):
    Entrez.email = 'd.hasler@tum.de'  # Set your email address here
    sequences = {}
    
    def fetch_sequence(ncbi_id):
        handle = Entrez.efetch(db='protein', id=ncbi_id, rettype='fasta', retmode='text')
        record = handle.read()
        handle.close()
        sequences[ncbi_id] = [record.split('\n', 1)[1].replace('\n', '')]
    
    # Fetch sequences using concurrent futures
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(fetch_sequence, ncbi_ids)
    
    return sequences

In [14]:
# this will take around 90 minutes
"""all_PID = set((N_merged_train_df["PID"].tolist() + 
               N_LMNgly_test_df["PID"].tolist() + 
               O_captor_test_df.index.tolist()))
sequences = get_protein_sequences(all_PID)"""

'all_PID = set((N_merged_train_df["PID"].tolist() + \n               N_LMNgly_test_df["PID"].tolist() + \n               O_captor_test_df.index.tolist()))\nsequences = get_protein_sequences(all_PID)'

In [None]:
"""Fasta(sequences=sequences).write_fasta("../../data/glyco/glyco_all.fasta", overwrite=True)"""

In [15]:
sequences = Fasta("../data/glyco/glyco_all.fasta")

In [16]:
merged_train_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
count_dict = {0: 0, 1: 0, 2: 0, 5: 0}
not_found = []
for i in set(N_merged_train_df["PID"]):
    try:
        seq = sequences[i][0]
    except KeyError:
        not_found.append(i)
        continue
    position = N_merged_train_df[N_merged_train_df["PID"] == i]["Position"].tolist()
    true_labels = N_merged_train_df[N_merged_train_df["PID"] == i]["label"].tolist()
    labels = np.zeros(len(seq)) + 5
    for idx, pos in enumerate(position):
        try:
            labels[pos] = true_labels[idx]
        except IndexError:
            continue
    for l in labels:
        count_dict[l] += 1
    merged_train_dict[i] = [seq, ["".join([label_dict[l] for l in labels.tolist()])]]

print(len(merged_train_dict.keys()))
print(len(merged_train_dict.keys()) / len(N_merged_train_df["PID"].unique()))

10706
0.9231697852892989


In [17]:
set(not_found).intersection(set(O_captor_train_df["PID"]))

{'P33827-1', 'Q13275.2'}

In [18]:
shit_ids = []
for item in merged_train_dict.items():
    if len(set(list(item[1][1][0]))) == 1:
        shit_ids.append(item[0])
for id in shit_ids:
    merged_train_dict.pop(id)

In [19]:
print(len(merged_train_dict.keys()))
print(len(merged_train_dict.keys()) / len(N_merged_train_df["PID"].unique()))

10668
0.9198930757954643


In [69]:

merged_N_test_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
for i in set(N_LMNgly_test_df["PID"]):
    try:
        seq = sequences[i][0]
    except KeyError:
        print(f"Skipping {i}")
        continue
    position = N_LMNgly_test_df[N_LMNgly_test_df["PID"] == i]["Position"].tolist()
    true_labels = N_LMNgly_test_df[N_LMNgly_test_df["PID"] == i]["label"].tolist()
    labels = np.zeros(len(seq)) + 5
    for idx, pos in enumerate(position):
        try:
            labels[pos] = true_labels[idx]
        except IndexError:
            print(f"Skipping {i} because of IndexError")
            continue
    merged_N_test_dict[i] = [seq, ["".join([label_dict[l] for l in labels.tolist()])]]
shit_ids = []
for item in merged_N_test_dict.items():
    if len(set(list(item[1][1][0]))) == 1:
        shit_ids.append(item[0])
for id in shit_ids:
    merged_N_test_dict.pop(id)

In [70]:
len(set(N_LMNgly_test_df["PID"]))

955

In [71]:
len(merged_N_test_dict)

955

In [78]:
Fasta(sequences=merged_N_test_dict).write_fasta("../../data/glyco/N_test.o", overwrite=True)

In [72]:
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for id in merged_train_dict.keys():
    try:
        negative_sites += merged_train_dict[id][1][0].count("T")
        postive_O_sites += merged_train_dict[id][1][0].count("O")
        postive_N_sites += merged_train_dict[id][1][0].count("N")
    except:
        continue
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 15778
N-sites: 11057
O-sites: 4637


### Split O-Linked test from train

In [27]:
# select a number of protein that contain only O-sites
o_prot = {}
for id in merged_train_dict.keys():
    try:
        t = merged_train_dict[id][1][0].count("T")
        o = merged_train_dict[id][1][0].count("O")
        n = merged_train_dict[id][1][0].count("N")
        if o > 0 and t == 0 and n == 0:
            o_prot[id] = o
    except:
        continue

In [28]:
len(o_prot.keys())

1227

In [29]:
o_prot_ids = []
for id in o_prot.keys():
    if "." in id:
        o_prot_ids.append(id.split(".")[0])
    else:
        o_prot_ids.append(id)

In [30]:
import random

pids = list(o_prot.keys())
random.shuffle(pids)
test_ids = []
num_in_test = 0
for i in pids:
    if num_in_test < 200:
        test_ids.append(i)
        num_in_test += o_prot[i]
    else:
        break
test_ids

['Q13790.2',
 'Q92187.1',
 'Q9BT09.1',
 'Q99618.1',
 'Q92508.4',
 'Q6P9A2.2',
 'P52594.2',
 'P07204.2',
 'O95972.2',
 'Q2UY09.2',
 'Q9NPR9.3',
 'P01024.2',
 'Q9NSC7.1',
 'Q9UKU9.1',
 'P08318.1',
 'O95631.2',
 'Q86X29.4',
 'O60609.2',
 'Q08397.2',
 'P31150.2',
 'P04141.1',
 'Q8IVY1.1',
 'P05362.2',
 'Q96BF3.2',
 'P28300.2',
 'Q9UBG0.2',
 'P02787.3',
 'Q8WWX8.1',
 'Q9BRP8.1',
 'Q9HAB3.1',
 'Q9Y624.1',
 'Q8IZ83.2',
 'Q13231.1',
 'Q99941.2',
 'Q9P0U3.2',
 'Q9NVR5.2',
 'Q12841.1',
 'P55157.1',
 'Q6UWL6.2',
 'Q63HQ2.2',
 'O43399.2',
 'Q9GZZ8.1',
 'O60353.2',
 'Q4G148.2',
 'O60502.2',
 'Q9UMX5.1',
 'Q76M96.1',
 'P49641.3',
 'P21453.2',
 'Q15125.3',
 'Q8N1G4.1',
 'P08697.3',
 'P01127.1',
 'Q14697.3',
 'O75631.3',
 'Q14435.2',
 'Q14669.1',
 'O75821.2',
 'Q86VZ4.2',
 'Q9NP72.1',
 'Q99795.1',
 'Q9Y2H0.3',
 'Q9NPY3.3',
 'Q9HBJ8.1',
 'Q5VU43.3',
 'A5D8T8.3',
 'Q9UM21.1',
 'Q96RP7.1',
 'P08709.1']

In [31]:
O_test_dict = {i: merged_train_dict.pop(i) for i in test_ids}

In [32]:
set(O_test_dict.keys()) & set(merged_train_dict.keys())

set()

### Add negative samples

In [33]:
def generate_negative_samples(data_dict, total_num_negative):
    num_negative_samples = 0
    ids_shuffled = list(data_dict.keys())
    random.shuffle(ids_shuffled)
    for id in ids_shuffled:
        seq = data_dict[id][0]
        labels = data_dict[id][1][0]
        num_negative = random.randint(1, 5) if total_num_negative > 5 else total_num_negative
        st_idx = [i for i, x in enumerate(list(seq)) if x == "S" or x == "T"]
        for i in range(num_negative):
            while True:
                idx = random.choice(st_idx)
                if not list(labels)[idx] in ["N", "T", "O"]:
                    labels = labels[:idx] + "T" + labels[idx + 1 :]
                    break
        num_negative_samples += num_negative
        data_dict[id][1][0] = labels
        if num_negative_samples >= total_num_negative:
            break
    return data_dict

In [96]:
test_dict = {"A": ["STASASAT", ["XXXNXXXN"]], "B": ["STASASAT", ["XXXNXXXN"]], "C": ["STASASAT", ["XXXNXXXN"]]}
generate_negative_samples(test_dict, 6)

In [34]:
sum([o_prot[i] for i in O_test_dict.keys()])

200

In [37]:
sum([O_test_dict[i][1][0].count("T") for i in O_test_dict.keys()])

204

In [36]:
generate_negative_samples(O_test_dict, 205)

{'Q13790.2': ['MIPVELLLCYLLLHPVDATSYGKQTNVLMHFPLSLESQTPSSDPLSCQFLHPKSLPGFSHMAPLPKFLVSLALRNALEEAGCQADVWALQLQLYRQGGVNATQVLIQHLRGLQKGRSTERNVSVEALASALQLLAREQQSTGRVGRSLPTEDCENEKEQAVHNVVQLLPGVGTFYNLGTALYYATQNCLGKARERGRDGAIDLGYDLLMTMAGMSGGPMGLAISAALKPALRSGVQQLIQYYQDQKDANISQPETTKEGLRAISDVSDLEETTTLASFISEVVSSAPYWGWAIIKSYDLDPGAGSLEI',
  ['XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTXXXXXXXXXXXXXXXXXXXTXXOXOXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXOXXXOOXXXXXXXXXXTXXXXXXXXXXXXXXXXOOXXXXX']],
 'Q92187.1': ['MRSIRKRWTICTISLLLIFYKTKEIARTEEHQETQLIGDGELSLSRSLVNSSDKIIRKAGSSIFQHNVEGWKINSSLVLEIRKNILRFLDAERDVSVVKSSFKPGDVIHYVLDRRRTLNISHDLHSLLPEVSPMKNRRFKTCAVVGNSGILLDSECGKEIDSHNFVIRCNLAPVVEFAADVGTKSDFITMNPSVVQRAFGGFRNESDREKFVHRLSMLNDSVLWIPAFMVKGGEKHVEWVNALILKNKLKVRTAYPSLRLIHAVRGYWLTNKVPIKRPSTGLLMYTLATRFCDEIHLYGFWPFPKDLNGKAVKYHYYDDLKYRYFSNASPHRMPLEFKTLNV

In [38]:
Fasta(sequences=O_test_dict).write_fasta("../data/glyco/O_test.o", overwrite=True)

### Split validation from train

In [50]:
pids = list(merged_train_dict.keys())
random.shuffle(pids)
val_ids = []
num_in_val = 0
for i in pids:
    if num_in_val < 400:
        val_ids.append(i)
        num_in_val += 1
    else:
        break
num_in_val

400

In [57]:
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for id in merged_train_dict.keys():
    try:
        negative_sites += merged_train_dict[id][1][0].count("T")
        postive_O_sites += merged_train_dict[id][1][0].count("O")
        postive_N_sites += merged_train_dict[id][1][0].count("N")
    except:
        continue
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 15778
N-sites: 11057
O-sites: 4637


In [52]:
val_dict = {i: merged_train_dict.pop(i) for i in val_ids}

In [53]:
generate_negative_samples(val_dict, 250)

{'Q93063.1': ['MCASVKYNIRGPALIPRMKTKHRIYYITLFSIVLLGLIATGMFQFWPHSIESSNDWNVEKRSIRDVPVVRLPADSPIPERGDLSCRMHTCFDVYRCGFNPKNKIKVYIYALKKYVDDFGVSVSNTISREYNELLMAISDSDYYTDDINRACLFVPSIDVLNQNTLRIKETAQAMAQLSRWDRGTNHLLFNMLPGGPPDYNTALDVPRDRALLAGGGFSTWTYRQGYDVSIPVYSPLSAEVDLPEKGPGPRQYFLLSSQVGLHPEYREDLEALQVKHGESVLVLDKCTNLSEGVLSVRKRCHKHQVFDYPQVLQEATFCVVLRGARLGQAVLSDVLQAGCVPVVIADSYILPFSEVLDWKRASVVVPEEKMSDVYSILQSIPQRQIEEMQRQARWFWEAYFQSIKAIALATLQIINDRIYPYAAISYEEWNDPPAVKWGSVSNPLFLPLIPPQSQGFTAIVLTYDRVESLFRVITEVSKVPSLSKLLVVWNNQNKNPPEDSLWPKIRVPLKVVRTAENKLSNRFFPYDEIETEAVLAIDDDIIMLTSDELQFGYEVWREFPDRLVGYPGRLHLWDHEMNKWKYESEWTNEVSMVLTGAAFYHKYFNYLYTYKMPGDIKNWVDAHMNCEDIAMNFLVANVTGKAVIKVTPRKKFKCPECTAIDGLSLDQTHMVERSECINKFASVFGTMPLKVVEHRADPVLYKDDFPEKLKSFPNIGSL',
  ['XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

In [54]:
Fasta(sequences=val_dict).write_fasta("../data/glyco/val.o", overwrite=True)

In [56]:
generate_negative_samples(merged_train_dict, 5100)

{'Q9Y5Y9': ['MEFPIGSLETNNFRRFTPESLVEIEKQIAAKQGTKKAREKHREQKDQEEKPRPQLDLKACNQLPKFYGELPAELIGEPLEDLDPFYSTHRTFMVLNKGRTISRFSATRALWLFSPFNLIRRTAIKVSVHSWFSLFITVTILVNCVCMTRTDLPEKIEYVFTVIYTFEALIKILARGFCLNEFTYLRDPWNWLDFSVITLAYVGTAIDLRGISGLRTFRVLRALKTVSVIPGLKVIVGALIHSVKKLADVTILTIFCLSVFALVGLQLFKGNLKNKCVKNDMAVNETTNYSSHRKPDIYINKRGTSDPLLCGNGSDSGHCPDGYICLKTSDNPDFNYTSFDSFAWAFLSLFRLMTQDSWERLYQQTLRTSGKIYMIFFVLVIFLGSFYLVNLILAVVTMAYEEQNQATTDEIEAKEKKFQEALEMLRKEQEVLAALGIDTTSLHSHNGSPLTSKNASERRHRIKPRVSEGSTEDNKSPRSDPYNQRRMSFLGLASGKRRASHGSVFHFRSPGRDISLPEGVTDDGVFPGDHESHRGSLLLGGGAGQQGPLPRSPLPQPSNPDSRHGEDEHQPPPTSELAPGAVDVSAFDAGQKKTFLSAEYLDEPFRAQRAMSVVSIITSVLEELEESEQKCPPCLTSLSQKYLIWDCCPMWVKLKTILFGLVTDPFAELTITLCIVVNTIFMAMEHHGMSPTFEAMLQIGNIVFTIFFTAEMVFKIIAFDPYYYFQKKWNIFDCIIVTVSLLELGVAKKGSLSVLRSFRLLRVFKLAKSWPTLNTLIKIIGNSVGALGNLTIILAIIVFVFALVGKQLLGENYRNNRKNISAPHEDWPRWHMHDFFHSFLIVFRILCGEWIENMWACMEVGQKSICLILFLTVMVLGNLVVLNLFIALLLNSFSADNLTAPEDDGEVNNLQVALARIQVFGHRTKQALCSFFSRSCPFPQPKAEPELVVKLPLSSSKAENHIAANTARGSSGGLQAPRGPRDEHSDF

In [58]:
Fasta(sequences=merged_train_dict).write_fasta("../data/glyco/train.o", overwrite=True)

### After RR

In [73]:
ids = []
with open("../data/glyco/rr_wscores_and_config_glyco_all.fasta", "r") as f:
    for line in f.readlines():
        if line.startswith(">"):
            ids.append(line[1:].strip())

In [74]:
t = []
for i in ids:
    try:
        merged_N_test_dict[i]
        t.append(i)
    except:
        continue
len(t)

766

In [77]:
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for id in t:
    try:
        negative_sites += merged_N_test_dict[id][1][0].count("T")
        postive_O_sites += merged_N_test_dict[id][1][0].count("O")
        postive_N_sites += merged_N_test_dict[id][1][0].count("N")
    except:
        continue
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 1154
N-sites: 580
O-sites: 0


In [68]:
merged_N_test_dict

{}

In [76]:
merged_N_test_dict = {k:merged_N_test_dict[k] for k in t if k in merged_N_test_dict}

In [23]:

l = []
for i in ids:
    try:
        merged_train_dict[i]
        l.append(i)
    except:
        continue
len(l)

8852

In [24]:
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for id in l:
    try:
        negative_sites += merged_train_dict[id][1][0].count("T")
        postive_O_sites += merged_train_dict[id][1][0].count("O")
        postive_N_sites += merged_train_dict[id][1][0].count("N")
    except:
        continue
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 11212
N-sites: 11642
O-sites: 5087


In [25]:
merged_train_dict = {k:merged_train_dict[k] for k in l if k in merged_train_dict}

In [48]:
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for id in l:
    try:
        negative_sites += test_dict[id][1][0].count("T")
        postive_O_sites += test_dict[id][1][0].count("O")
        postive_N_sites += test_dict[id][1][0].count("N")
    except:
        continue
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 0
N-sites: 0
O-sites: 0


In [45]:
o_site = 15
n_site = 3
negative_sample = 2
scores = {}
for id in sequences.get_headers():
    labels = None
    score[id] = 0
    if id in merged_train_dict:
        labels = merged_train_dict[id][1][0]
    elif id in merged_O_test_dict:
        labels = test_dict[id][1][0]
        scores[id] += 10
    elif id in val_dict:
        labels = val_dict[id][1][0]
    elif id in merged_N_test_dict:
        labels = merged_N_test_dict[id][1][0]
    else:
        scores[id] = 1
        continue

    if labels.count("T") != 0:
        scores[id] = negative_sample
        continue
    score = labels.count("O") * o_site + labels.count("N") * n_site + labels.count("T") * negative_sample
    scores[id] = score

with open("../../data/glyco/rr_scores.tsv", "w") as f:
    for id, score in scores.items():
        f.write(f"{id}\t{score}\n")

In [79]:
len(set(merged_train_dict.keys()).intersection(set(merged_N_test_dict.keys())))

0

In [47]:
len(set(merged_train_dict.keys()).intersection(set(merged_O_test_dict.keys())))

0