In [1]:
import pandas as pd
from Bio import ExPASy
from Bio import SwissProt
import re
import numpy as np
from fasta import Fasta

In [2]:
# position of glyco site is one based -> will be converted to zero based
def fetch_protein_sequence_pdb(uniprot_id):
    """Return the sequence of a single Swiss-Prot record.

    The record is requested with ``ExPASy.get_sprot_raw`` and parsed with
    ``SwissProt.read``; despite the ``_pdb`` suffix no PDB resource is
    involved.

    Args:
        uniprot_id: Accession passed unchanged to ``ExPASy.get_sprot_raw``.

    Returns:
        The ``sequence`` attribute of the parsed record.
    """
    handle = ExPASy.get_sprot_raw(uniprot_id)
    try:
        record = SwissProt.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()
    return record.sequence

- load data
- merge train 
- remove test from train 
- get sequences 
    - for O-linked dataset: site positions are part of the fasta header -> one header per protein is enough 
    - for the rest: group the entries by PID and accumulate glyco site positions 
- map sites to sequences
- write to fasta file 
    - one fasta for training and one for RR, containing the PID and sequences 

### Load Data

In [3]:
import os

# Every "../../data/..." path used below is resolved relative to this directory.
os.getcwd()

'/home/d/PycharmProjects/protein_properties/src/data'

The negative samples of Captor are not usable: they do not provide the exact position of each window, so I would have to align every window to its protein and map the labels back onto the sequence, which is too painful to be worth it.

In [4]:
# Removes all whitespace and "|" separators from a Captor FASTA header.
rgx = re.compile(r"[\s|]")


def _read_captor_headers(fasta_path):
    """Return the "#"-separated fields of every header line in a Captor FASTA file.

    The first four characters of each header are dropped (presumably a
    ">sp|"-style prefix -- TODO confirm against the file). After cleaning,
    field 0 is the protein ID and the remaining fields are site positions.
    """
    with open(fasta_path, "r") as f:
        return [
            rgx.sub("", line.strip()[4:]).split("#")
            for line in f
            if line.startswith(">")
        ]


def _sites_by_pid(headers):
    """Map protein ID -> set of site positions, shifted from one-based to zero-based."""
    return {fields[0]: {int(pos) - 1 for pos in fields[1:]} for fields in headers}


O_captor_train = _read_captor_headers("../../data/O_captor/Ptrain.fasta")
O_captor_train_dict = _sites_by_pid(O_captor_train)
O_captor_train_series = pd.Series(O_captor_train_dict.values(), O_captor_train_dict.keys())

O_captor_test = _read_captor_headers("../../data/O_captor/Ptest.fasta")
O_captor_test_dict = _sites_by_pid(O_captor_test)
# NOTE: despite the "_df" suffix this is a Series (index = PID, value = set of sites)
# until it is converted to a frame further down in the notebook.
O_captor_test_df = pd.Series(O_captor_test_dict.values(), O_captor_test_dict.keys())
# Proteins present in both Captor splits stay in train and are removed from test.
O_captor_test_df = O_captor_test_df.drop(O_captor_train_series.index, errors="ignore")

N_LMNgly_train_df = pd.read_csv('../../data/LMNglyPred/Train_Data_NGlycositeAtlas.csv', usecols=["label","PID","Position"])
N_LMNgly_test_df = pd.read_csv('../../data/LMNglyPred/Test_Data_NGlycositeAtlas.csv', usecols=["label","PID","Position"])
# skip weird proteins at the top of the file
N_taherzadeh_train_df = pd.read_csv('../../data/N_taherzadeh/Datasets.csv', usecols=["Protein name","Position"], skiprows=range(1, 12))
# Vectorised strip of the surrounding quotes; unlike a lambda it tolerates missing names.
N_taherzadeh_train_df["Protein name"] = N_taherzadeh_train_df["Protein name"].str.strip("'")

In [5]:
# Long format: one row per (protein, O-linked site), labelled 2.
O_captor_train_df = (
    O_captor_train_series
    .to_frame(name="Position")
    .assign(PID=lambda frame: frame.index, label=2)
    .reset_index(drop=True)
    .explode("Position", ignore_index=True)
)

In [6]:
# Number of O-linked training rows after exploding the per-protein site sets.
O_captor_train_df["PID"].size

6706

In [7]:
# Total number of O-linked test sites (the Series still holds one set per protein here).
O_captor_test_df.map(len).sum()

119

In [10]:
# Every Taherzadeh entry is labelled 1. Columns are renamed by name rather than
# by position, so the result does not depend on the column order of the CSV,
# and re-running the cell is harmless.
N_taherzadeh_train_df = (
    N_taherzadeh_train_df
    .rename(columns={"Protein name": "PID"})
    .assign(label=1)
    [["PID", "Position", "label"]]
)

### Clean Data

In [11]:
# Stack the three training sources, show the O-linked rows, then drop exact duplicate rows.
N_merged_train_df = pd.concat(
    [N_LMNgly_train_df, N_taherzadeh_train_df, O_captor_train_df],
    ignore_index=True,
)
print(N_merged_train_df[N_merged_train_df["label"] == 2])
N_merged_train_df = N_merged_train_df.drop_duplicates(keep="first")
N_merged_train_df

            PID Position  label
36847  Q92954.3     1038      2
36848  Q92954.3      528      2
36849  Q92954.3      531      2
36850  Q92954.3      539      2
36851  Q92954.3      540      2
...         ...      ...    ...
43548  Q03001.4     7238      2
43549  P31639.1        4      2
43550  Q9BT09.1      267      2
43551  Q9BT09.1      270      2
43552  Q9NT22.2      521      2

[6706 rows x 3 columns]


Unnamed: 0,PID,Position,label
0,Q13425,167,1
1,Q9NZR2,1921,1
2,O43663,489,1
3,Q495M3,181,1
4,Q9BXP8,1408,1
...,...,...,...
43548,Q03001.4,7238,2
43549,P31639.1,4,2
43550,Q9BT09.1,267,2
43551,Q9BT09.1,270,2


In [12]:
# O-linked rows (label 2) after merging and de-duplication.
N_merged_train_df.loc[N_merged_train_df["label"].eq(2)]

Unnamed: 0,PID,Position,label
36847,Q92954.3,1038,2
36848,Q92954.3,528,2
36849,Q92954.3,531,2
36850,Q92954.3,539,2
36851,Q92954.3,540,2
...,...,...,...
43548,Q03001.4,7238,2
43549,P31639.1,4,2
43550,Q9BT09.1,267,2
43551,Q9BT09.1,270,2


In [13]:
# Drop every training protein that also occurs in either of the two test sets.
# O_captor_test_df is a Series indexed by PID at this point, but a later cell
# turns it into a frame with a "PID" column; accept both so that re-running
# this cell afterwards still filters correctly.
# NOTE(review): IDs are compared verbatim, so "Q92954.3" and "Q92954" count as
# different proteins -- confirm that this cannot leak test proteins into train.
if isinstance(O_captor_test_df, pd.DataFrame):
    o_test_pids = O_captor_test_df["PID"]
else:
    o_test_pids = O_captor_test_df.index
in_test_set = (
    N_merged_train_df["PID"].isin(N_LMNgly_test_df["PID"])
    | N_merged_train_df["PID"].isin(o_test_pids)
)
# .assign on the filtered frame avoids writing into a slice (SettingWithCopyWarning).
N_merged_train_df = (
    N_merged_train_df
    .loc[~in_test_set]
    .assign(PID=lambda frame: frame["PID"].astype(str))
)
N_merged_train_df

Unnamed: 0,PID,Position,label
0,Q13425,167,1
1,Q9NZR2,1921,1
2,O43663,489,1
3,Q495M3,181,1
4,Q9BXP8,1408,1
...,...,...,...
43548,Q03001.4,7238,2
43549,P31639.1,4,2
43550,Q9BT09.1,267,2
43551,Q9BT09.1,270,2


In [14]:
# O-linked rows (label 2) that remain after removing the test proteins.
N_merged_train_df.loc[N_merged_train_df["label"].eq(2)]

Unnamed: 0,PID,Position,label
36847,Q92954.3,1038,2
36848,Q92954.3,528,2
36849,Q92954.3,531,2
36850,Q92954.3,539,2
36851,Q92954.3,540,2
...,...,...,...
43548,Q03001.4,7238,2
43549,P31639.1,4,2
43550,Q9BT09.1,267,2
43551,Q9BT09.1,270,2


In [15]:
from Bio import Entrez
from Bio.Seq import Seq
import concurrent.futures

import requests

def get_protein_sequences(protein_ids):
    """Fetch sequences for a mixed collection of UniProt and NCBI protein IDs.

    Duplicate IDs are collapsed first. IDs starting with ``'NP_'`` are sent to
    ``fetch_ncbi_sequences``; all others go to ``fetch_uniprot_sequences``.
    A fetcher is only called when it has at least one ID to resolve.

    Returns:
        dict combining the results of both fetchers.
    """
    ncbi_ids, uniprot_ids = [], []
    for protein_id in set(protein_ids):
        target = ncbi_ids if protein_id.startswith('NP_') else uniprot_ids
        target.append(protein_id)

    sequences = {}
    if uniprot_ids:
        sequences.update(fetch_uniprot_sequences(uniprot_ids))
    if ncbi_ids:
        sequences.update(fetch_ncbi_sequences(ncbi_ids))
    return sequences

def fetch_uniprot_sequences(uniprot_ids, timeout=30):
    """Download the FASTA record of each UniProt ID and return its sequence.

    Args:
        uniprot_ids: Iterable of IDs; each is inserted into the request URL.
        timeout: Seconds to wait per request. Without it a single stalled
            connection would block the whole (long-running) download.

    Returns:
        dict mapping each successfully fetched ID to a one-element list
        holding the sequence (all FASTA lines after the header, joined).
        IDs whose request fails, returns a non-OK status or yields an empty
        sequence are left out and reported in one summary line.
    """
    sequences = {}
    failed = []

    for uniprot_id in uniprot_ids:
        # Make a request to UniProt for the FASTA sequence
        url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # One network error must not discard everything fetched so far.
            failed.append(uniprot_id)
            continue

        if not response.ok:
            failed.append(uniprot_id)
            continue

        # Line 0 is the FASTA header; the rest is the wrapped sequence.
        sequence = ''.join(response.text.splitlines()[1:])
        if sequence:
            sequences[uniprot_id] = [sequence]
        else:
            failed.append(uniprot_id)

    if failed:
        print(f"Could not fetch {len(failed)} UniProt sequence(s): {failed}")

    return sequences

def fetch_ncbi_sequences(ncbi_ids, max_workers=None):
    """Download protein FASTA records from NCBI Entrez in parallel.

    Args:
        ncbi_ids: Iterable of IDs passed to ``Entrez.efetch(db='protein', ...)``.
        max_workers: Thread-pool size; ``None`` keeps the executor default.
            NOTE(review): NCBI rate-limits unauthenticated clients, so a small
            value may be needed -- confirm against the E-utilities guidelines.

    Returns:
        dict mapping each successfully fetched ID to a one-element list
        holding the sequence (record text after the header line, newlines
        removed). IDs whose request raises or yields an empty sequence are
        left out and reported in one summary line instead of being dropped
        silently.
    """
    Entrez.email = 'd.hasler@tum.de'  # Set your email address here

    def fetch_sequence(ncbi_id):
        handle = Entrez.efetch(db='protein', id=ncbi_id, rettype='fasta', retmode='text')
        try:
            record = handle.read()
        finally:
            handle.close()
        # Everything after the first newline is the (wrapped) sequence.
        _header, _sep, body = record.partition('\n')
        return body.replace('\n', '')

    sequences = {}
    failed = []

    # Results are collected in the calling thread, so the workers never touch
    # shared state and every exception raised by a worker is observed.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = {executor.submit(fetch_sequence, ncbi_id): ncbi_id for ncbi_id in ncbi_ids}
        for future in concurrent.futures.as_completed(pending):
            ncbi_id = pending[future]
            try:
                sequence = future.result()
            except Exception:
                failed.append(ncbi_id)
                continue
            if sequence:
                sequences[ncbi_id] = [sequence]
            else:
                failed.append(ncbi_id)

    if failed:
        print(f"Could not fetch {len(failed)} NCBI sequence(s): {sorted(failed)}")

    return sequences

In [None]:
# One-off download of the sequence of every protein referenced by the merged
# training frame and the two test sets; this will take around 90 minutes.
# The code is disabled by wrapping it in a string literal -- the notebook loads
# "../../data/glyco/glyco_all.fasta" from disk further down instead.
"""all_PID = set((N_merged_train_df["PID"].tolist() + 
               N_LMNgly_test_df["PID"].tolist() + 
               O_captor_test_df.index.tolist()))
sequences = get_protein_sequences(all_PID)"""

In [None]:
"""Fasta(sequences=sequences).write_fasta("../../data/glyco/glyco_all.fasta", overwrite=True)"""

In [17]:
# Load the previously downloaded sequences. Later cells index this object by
# protein ID (sequences[pid][0]) and treat a KeyError as "sequence not available".
sequences = Fasta("../../data/glyco/glyco_all.fasta")

In [18]:
# O-linked rows (label 2) that go into the label mapping below.
N_merged_train_df.loc[N_merged_train_df["label"].eq(2)]

Unnamed: 0,PID,Position,label
36847,Q92954.3,1038,2
36848,Q92954.3,528,2
36849,Q92954.3,531,2
36850,Q92954.3,539,2
36851,Q92954.3,540,2
...,...,...,...
43548,Q03001.4,7238,2
43549,P31639.1,4,2
43550,Q9BT09.1,267,2
43551,Q9BT09.1,270,2


In [21]:
# Build one label track per training protein: a string as long as the sequence
# with "X" everywhere except at annotated positions ("T", "N" or "O").
merged_train_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
count_dict = {0: 0, 1: 0, 2: 0, 5: 0}
not_found = []
# A single sorted groupby replaces two full-frame scans per protein and makes
# the order of the written FASTA reproducible (iterating a set of strings is not).
# NOTE(review): the Captor positions were shifted to zero-based when the headers
# were parsed, but the CSV positions are used as-is -- confirm they are zero-based too.
for pid, sites in N_merged_train_df.groupby("PID", sort=True):
    try:
        seq = sequences[pid][0]
    except KeyError:
        not_found.append(pid)
        continue
    labels = np.full(len(seq), 5.0)
    for pos, true_label in zip(sites["Position"], sites["label"]):
        try:
            if pos < 0:
                # A negative index would wrap around and label the wrong residue.
                continue
            labels[pos] = true_label
        except (IndexError, TypeError):
            # Position outside the sequence, or not usable as an index.
            continue
    for value in labels:
        count_dict[value] += 1
    merged_train_dict[pid] = [seq, ["".join(label_dict[value] for value in labels.tolist())]]

print(len(merged_train_dict.keys()))
print(len(merged_train_dict.keys()) / len(N_merged_train_df["PID"].unique()))

10613
0.9225486787204451


In [27]:
# O-linked training proteins for which no sequence was available.
set(not_found) & set(O_captor_train_df["PID"])

{'P33827-1', 'Q13275.2'}

In [28]:
# Remove proteins whose label track contains at most one distinct character,
# i.e. no annotated site could be placed on the sequence (or the track is empty).
single_label_ids = [
    pid
    for pid, entry in merged_train_dict.items()
    if len(set(entry[1][0])) <= 1
]
for pid in single_label_ids:
    merged_train_dict.pop(pid)

In [29]:
# Proteins kept for training, absolute and as a fraction of all training PIDs.
kept_train_proteins = len(merged_train_dict)
print(kept_train_proteins)
print(kept_train_proteins / N_merged_train_df["PID"].nunique(dropna=False))

10578
0.9195062586926287


In [30]:
# Persist the training set: each entry is [sequence, [label track]] keyed by PID.
# overwrite=True presumably replaces an existing file -- confirm in the Fasta helper.
Fasta(sequences=merged_train_dict).write_fasta("../../data/glyco/train_seq_labels.o", overwrite=True)

In [31]:
# Long format: one row per (protein, O-linked site), labelled 2.
# The guard makes the cell safe to re-run: once O_captor_test_df is a frame
# there is nothing left to convert (DataFrame has no .to_frame()).
if isinstance(O_captor_test_df, pd.Series):
    O_captor_test_df = (
        O_captor_test_df
        .to_frame(name="Position")
        .assign(PID=lambda frame: frame.index, label=2)
        .reset_index(drop=True)
        .explode("Position", ignore_index=True)
    )

In [32]:

# Build one label track per O-linked test protein ("X" = unlabelled residue).
merged_O_test_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
# A sorted groupby gives a reproducible output order and avoids re-scanning
# the frame for every protein.
for pid, sites in O_captor_test_df.groupby("PID", sort=True):
    try:
        seq = sequences[pid][0]
    except KeyError:
        # No sequence available for this protein.
        continue
    labels = np.full(len(seq), 5.0)
    for pos, true_label in zip(sites["Position"], sites["label"]):
        try:
            if pos < 0:
                # A negative index would wrap around and label the wrong residue.
                continue
            labels[pos] = true_label
        except (IndexError, TypeError):
            # Position outside the sequence, or not usable as an index.
            continue
    merged_O_test_dict[pid] = [seq, ["".join(label_dict[value] for value in labels.tolist())]]


In [33]:
# Remove proteins whose label track contains at most one distinct character,
# i.e. no annotated site could be placed on the sequence (or the track is empty).
single_label_ids = [
    pid
    for pid, entry in merged_O_test_dict.items()
    if len(set(entry[1][0])) <= 1
]
for pid in single_label_ids:
    merged_O_test_dict.pop(pid)
print(len(merged_O_test_dict.keys()))
print(len(merged_O_test_dict.keys()) / len(O_captor_test_df["PID"].unique()))

90
0.967741935483871


In [34]:
# Number of distinct O-linked test proteins before sequence lookup.
len(set(O_captor_test_df["PID"].tolist()))

93

In [35]:
# Persist the O-linked test set: each entry is [sequence, [label track]] keyed by PID.
Fasta(sequences=merged_O_test_dict).write_fasta("../../data/glyco/O_test_seq_labels.o", overwrite=True)

In [36]:

# Build one label track per N-linked test protein ("X" = unlabelled residue).
merged_N_test_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
# A sorted groupby gives a reproducible output order and avoids re-scanning
# the frame for every protein.
# NOTE(review): CSV positions are used as-is as zero-based indices -- confirm.
for pid, sites in N_LMNgly_test_df.groupby("PID", sort=True):
    try:
        seq = sequences[pid][0]
    except KeyError:
        print(f"Skipping {pid}")
        continue
    labels = np.full(len(seq), 5.0)
    for pos, true_label in zip(sites["Position"], sites["label"]):
        try:
            if pos < 0:
                # A negative index would wrap around and label the wrong residue.
                raise IndexError(pos)
            labels[pos] = true_label
        except (IndexError, TypeError):
            # Only this site is skipped; the protein itself is kept.
            print(f"Skipping {pid} because of IndexError")
            continue
    merged_N_test_dict[pid] = [seq, ["".join(label_dict[value] for value in labels.tolist())]]

# Remove proteins whose label track contains at most one distinct character,
# i.e. no annotated site could be placed on the sequence (or the track is empty).
single_label_ids = [
    pid
    for pid, entry in merged_N_test_dict.items()
    if len(set(entry[1][0])) <= 1
]
for pid in single_label_ids:
    merged_N_test_dict.pop(pid)

In [37]:
# Number of distinct N-linked test proteins before sequence lookup.
len(set(N_LMNgly_test_df["PID"].tolist()))

955

In [38]:
# Number of N-linked test proteins that received a label track and survived the filter.
len(merged_N_test_dict)

955

In [39]:
# Persist the N-linked test set: each entry is [sequence, [label track]] keyed by PID.
Fasta(sequences=merged_N_test_dict).write_fasta("../../data/glyco/N_test_seq_labels.o", overwrite=True)

In [40]:
# Collect the IDs listed in the FASTA file: each header line without its
# leading ">" and surrounding whitespace.
with open("../../data/glyco/rr_wscores_and_config_glyco_all.fasta", "r") as f:
    ids = [header[1:].strip() for header in f.readlines() if header.startswith(">")]

In [41]:
# IDs from the FASTA file that are part of the N-linked test set. A plain
# membership test replaces the bare `except:`, which would have hidden any
# unrelated error as well.
t = [pid for pid in ids if pid in merged_N_test_dict]
len(t)

766

In [42]:
# Count labelled residues over the selected N-linked test proteins.
# Every ID in `t` is a key of merged_N_test_dict, so no exception handling is
# needed; the former bare `except:` could only hide real errors and leave the
# three counters out of sync.
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for pid in t:
    label_track = merged_N_test_dict[pid][1][0]
    negative_sites += label_track.count("T")
    postive_O_sites += label_track.count("O")
    postive_N_sites += label_track.count("N")
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 1154
N-sites: 580
O-sites: 0


In [43]:

# IDs from the FASTA file that are part of the training set. A plain
# membership test replaces the bare `except:`, which would have hidden any
# unrelated error as well.
l = [pid for pid in ids if pid in merged_train_dict]
len(l)

8791

In [44]:
# Count labelled residues over the selected training proteins.
# Every ID in `l` is a key of merged_train_dict, so no exception handling is
# needed; the former bare `except:` could only hide real errors and leave the
# three counters out of sync.
negative_sites = 0
postive_N_sites = 0
postive_O_sites = 0
for pid in l:
    label_track = merged_train_dict[pid][1][0]
    negative_sites += label_track.count("T")
    postive_O_sites += label_track.count("O")
    postive_N_sites += label_track.count("N")
print(f"Negative sites: {negative_sites}\nN-sites: {postive_N_sites}\nO-sites: {postive_O_sites}")

Negative sites: 11212
N-sites: 11639
O-sites: 5011


In [45]:
# Per-protein score written to rr_scores.tsv:
#   * protein without a label track           -> 1
#   * track with at least one "T"             -> negative_sample
#   * otherwise                               -> 15 per "O" plus 3 per "N"
o_site = 15
n_site = 3
negative_sample = 2
scores = {}
# Lookup order matters: train first, then the O-linked test set, then the N-linked one.
labelled_sets = (merged_train_dict, merged_O_test_dict, merged_N_test_dict)
for header in sequences.get_headers():
    owner = next((labelled for labelled in labelled_sets if header in labelled), None)
    if owner is None:
        scores[header] = 1
        continue

    labels = owner[header][1][0]
    if labels.count("T") != 0:
        # NOTE(review): any negative site caps the score at `negative_sample`, whatever
        # the number of positive sites; the original sum also had a "T" term that this
        # early exit made unreachable -- confirm the capping is intended.
        scores[header] = negative_sample
    else:
        scores[header] = labels.count("O") * o_site + labels.count("N") * n_site

with open("../../data/glyco/rr_scores.tsv", "w") as f:
    f.writelines(f"{header}\t{score}\n" for header, score in scores.items())

In [46]:
# Sanity check: number of proteins shared by the training and N-linked test sets.
len(merged_train_dict.keys() & merged_N_test_dict.keys())

0

In [47]:
# Sanity check: number of proteins shared by the training and O-linked test sets.
len(merged_train_dict.keys() & merged_O_test_dict.keys())

0