In [1]:
import pandas as pd
from Bio import ExPASy
from Bio import SwissProt
import re
import numpy as np
from fasta import Fasta

In [2]:
# Glyco-site positions are one-based in the input data -> they are converted to zero-based when parsed.
def fetch_protein_sequence_pdb(uniprot_id):
    """Fetch the sequence of a single Swiss-Prot entry through ExPASy.

    Args:
        uniprot_id: Accession handed to ``ExPASy.get_sprot_raw``.

    Returns:
        The ``sequence`` attribute of the record parsed by ``SwissProt.read``.
    """
    handle = ExPASy.get_sprot_raw(uniprot_id)
    try:
        record = SwissProt.read(handle)
    finally:
        # Release the handle even when parsing the record fails.
        handle.close()
    return record.sequence

- load the data
- merge the training sets
- remove the test proteins from the training set
- get the sequences
    - for the O-linked dataset: the site positions are part of the fasta header -> one header per protein is enough
    - for the rest: group the entries by PID and accumulate the glyco site positions
- map the sites to the sequences
- write to fasta files
    - one fasta for training and one for RR, containing the PIDs and sequences

### Load Data

In [3]:
import os

# Show the working directory: every data path in this notebook is relative to it.
os.getcwd()

'/home/d/PycharmProjects/protein_properties/src/data'

The negative samples of Captor are not usable: they do not provide the exact position of the window, so the labels would have to be aligned and mapped onto the sequence by hand, which is not worth the effort.

In [4]:
# Removes whitespace (including tabs) and "|" from a header. A raw string is used
# because "\s" inside a normal string literal is an invalid escape sequence.
rgx = re.compile(r"[\s\t|]")


def _read_captor_sites(fasta_path):
    """Collect the glycosylation sites encoded in the headers of a Captor FASTA file.

    Only lines starting with ">" are used. The first four characters of such a
    line are dropped, whitespace and "|" are removed, and the remainder is split
    on "#": the first field is the protein ID, every further field is a
    one-based site position.

    Args:
        fasta_path: Path of the FASTA file to read.

    Returns:
        A tuple ``(headers, sites)``. ``headers`` is a list with one
        ``[pid, position, ...]`` list of strings per header line; ``sites`` maps
        each protein ID to the set of its positions converted to zero-based ints.
    """
    headers = []
    sites = {}
    with open(fasta_path, "r") as f:
        for line in f:
            if not line.startswith(">"):
                continue
            # The former .strip("\s") calls stripped the literal characters
            # "\" and "s", not whitespace; the regex already removes whitespace.
            fields = rgx.sub("", line[4:]).split("#")
            headers.append(fields)
            sites[fields[0]] = {int(position) - 1 for position in fields[1:]}
    return headers, sites


O_captor_train, O_captor_train_dict = _read_captor_sites("../../data/O_captor/Ptrain.fasta")
O_captor_train_series = pd.Series(
    list(O_captor_train_dict.values()), index=list(O_captor_train_dict.keys())
)

O_captor_test, O_captor_test_dict = _read_captor_sites("../../data/O_captor/Ptest.fasta")
# Still a Series at this point (index = PID); it is converted to a DataFrame further down.
O_captor_test_df = pd.Series(
    list(O_captor_test_dict.values()), index=list(O_captor_test_dict.keys())
)

# Both LMNglyPred tables are read with the same column selection.
_lmngly_columns = ["label", "PID", "Position"]
N_LMNgly_train_df = pd.read_csv(
    '../../data/LMNglyPred/Train_Data_NGlycositeAtlas.csv',
    usecols=_lmngly_columns,
)
N_LMNgly_test_df = pd.read_csv(
    '../../data/LMNglyPred/Test_Data_NGlycositeAtlas.csv',
    usecols=_lmngly_columns,
)

# Skip the weird proteins at the top of the file (rows 1-11; row 0 holds the header).
N_taherzadeh_train_df = pd.read_csv(
    '../../data/N_taherzadeh/Datasets.csv',
    usecols=["Protein name", "Position"],
    skiprows=range(1, 12),
)
# Drop the single quotes wrapped around the protein names.
_protein_names = N_taherzadeh_train_df["Protein name"]
N_taherzadeh_train_df["Protein name"] = _protein_names.apply(lambda name: name.strip("'"))

In [5]:
# Turn the PID -> {positions} Series into a long table with one row per
# (Position, PID) pair; every O-linked Captor site gets label 2.
O_captor_train_df = (
    O_captor_train_series
    .to_frame(name="Position")
    .assign(PID=lambda frame: frame.index, label=2)
    .reset_index(drop=True)
    .explode("Position", ignore_index=True)
)

In [6]:
# Inspect the exploded O-linked training table (one row per protein/site pair).
O_captor_train_df

Unnamed: 0,Position,PID,label
0,1038,Q92954.3,2
1,528,Q92954.3,2
2,531,Q92954.3,2
3,539,Q92954.3,2
4,540,Q92954.3,2
...,...,...,...
6701,7238,Q03001.4,2
6702,4,P31639.1,2
6703,267,Q9BT09.1,2
6704,270,Q9BT09.1,2


In [7]:
# Every Taherzadeh entry is labelled 1. The protein-name column is renamed by
# name rather than by position, so the result does not depend on the order in
# which read_csv returned the columns.
N_taherzadeh_train_df = (
    N_taherzadeh_train_df
    .rename(columns={"Protein name": "PID"})
    .assign(label=1)
)

### Clean Data

In [8]:
# Stack the three training tables and keep the first occurrence of every duplicated row.
_train_frames = [N_LMNgly_train_df, N_taherzadeh_train_df, O_captor_train_df]
N_merged_train_df = pd.concat(_train_frames, ignore_index=True).drop_duplicates(keep="first")
N_merged_train_df

Unnamed: 0,PID,Position,label
0,Q13425,167,1
1,Q9NZR2,1921,1
2,O43663,489,1
3,Q495M3,181,1
4,Q9BXP8,1408,1
...,...,...,...
43548,Q03001.4,7238,2
43549,P31639.1,4,2
43550,Q9BT09.1,267,2
43551,Q9BT09.1,270,2


In [9]:
# Drop the proteins that are in either of the two test sets.
# The O-linked test IDs are taken from O_captor_test_dict instead of
# O_captor_test_df.index: the latter becomes a plain RangeIndex once the test
# Series is converted to a DataFrame, so re-running this cell afterwards would
# silently stop filtering.
# NOTE(review): IDs are compared as exact strings; Captor IDs appear to carry a
# version suffix ("Q9BT09.1") while the other tables do not ("Q13425"), so the
# same protein under both spellings would not be matched - confirm.
_test_pids = set(N_LMNgly_test_df["PID"]) | set(O_captor_test_dict)
N_merged_train_df = (
    N_merged_train_df[~N_merged_train_df["PID"].isin(_test_pids)]
    # assign() works on a new frame, which avoids the SettingWithCopyWarning
    # raised when a column of a filtered slice is overwritten.
    .assign(PID=lambda frame: frame["PID"].astype(str))
)
N_merged_train_df

Unnamed: 0,PID,Position,label
0,Q13425,167,1
1,Q9NZR2,1921,1
2,O43663,489,1
3,Q495M3,181,1
4,Q9BXP8,1408,1
...,...,...,...
43548,Q03001.4,7238,2
43549,P31639.1,4,2
43550,Q9BT09.1,267,2
43551,Q9BT09.1,270,2


In [10]:
# Sanity check: rows with label 2 (the O-linked Captor sites) are still present
# after the test proteins were removed.
(N_merged_train_df["label"] == 2).any()

True

In [11]:
from Bio import Entrez
from Bio.Seq import Seq
import concurrent.futures

import requests

def get_protein_sequences(protein_ids):
    """Fetch the sequences for a mixed collection of UniProt and NCBI protein IDs.

    IDs starting with ``NP_`` are sent to ``fetch_ncbi_sequences``; all other
    IDs are sent to ``fetch_uniprot_sequences``.

    Args:
        protein_ids: Iterable of protein ID strings.

    Returns:
        dict: The merged results of both fetchers, keyed by protein ID.
    """
    # Materialise once: the IDs are scanned twice below, which would leave the
    # second scan empty for a one-shot iterable such as a generator.
    protein_ids = list(protein_ids)

    # Separate UniProt and NCBI IDs
    uniprot_ids = [pid for pid in protein_ids if not pid.startswith('NP_')]
    ncbi_ids = [pid for pid in protein_ids if pid.startswith('NP_')]

    sequences = {}
    if uniprot_ids:
        sequences.update(fetch_uniprot_sequences(uniprot_ids))
    if ncbi_ids:
        sequences.update(fetch_ncbi_sequences(ncbi_ids))
    return sequences

def fetch_uniprot_sequences(uniprot_ids, timeout=30):
    """Download the FASTA sequence of every given UniProt ID, one request per ID.

    Args:
        uniprot_ids: Iterable of UniProt ID strings.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises; without a timeout a stalled connection blocks forever.

    Returns:
        dict: ``{uniprot_id: [sequence]}`` for every ID whose request succeeded.
        IDs with a non-OK response are left out.
    """
    sequences = {}

    for uniprot_id in uniprot_ids:
        # Make a request to UniProt for the FASTA sequence
        url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
        response = requests.get(url, timeout=timeout)

        if response.ok:
            # Drop the header line and join the rest; splitlines() also copes
            # with "\r\n" line endings, which split('\n') would leave in place.
            sequences[uniprot_id] = [''.join(response.text.splitlines()[1:])]

    return sequences

def fetch_ncbi_sequences(ncbi_ids):
    """Download the FASTA sequence of every given NCBI protein ID in parallel.

    Args:
        ncbi_ids: Iterable of NCBI protein ID strings.

    Returns:
        dict: ``{ncbi_id: [sequence]}`` for every ID that could be fetched.
        IDs whose download failed are reported on stdout and left out.
    """
    Entrez.email = 'd.hasler@tum.de'  # Set your email address here

    def fetch_sequence(ncbi_id):
        """Return the sequence of one record with header and newlines removed."""
        handle = Entrez.efetch(db='protein', id=ncbi_id, rettype='fasta', retmode='text')
        try:
            record = handle.read()
        finally:
            handle.close()
        _, _, body = record.partition('\n')
        sequence = body.replace('\n', '')
        if not sequence:
            raise ValueError("record contains no sequence")
        return sequence

    ncbi_ids = list(ncbi_ids)
    sequences = {}

    # Fetch sequences using concurrent futures
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetch_sequence, ncbi_id) for ncbi_id in ncbi_ids]
        # The results are collected explicitly: the iterator returned by
        # executor.map() was never consumed, so exceptions raised in the worker
        # threads were discarded without any trace.
        for ncbi_id, future in zip(ncbi_ids, futures):
            try:
                sequences[ncbi_id] = [future.result()]
            except Exception as exc:
                print(f"Failed to fetch {ncbi_id}: {exc}")

    return sequences

In [13]:
# Disabled on purpose: fetching every sequence takes around 90 minutes.
# The code is wrapped in a string literal, so running this cell is a no-op;
# remove the triple quotes to download the sequences for all training and test PIDs.
"""all_PID = set((N_merged_train_df["PID"].tolist() + 
               N_LMNgly_test_df["PID"].tolist() + 
               O_captor_test_df.index.tolist()))
sequences = get_protein_sequences(all_PID)"""

KeyboardInterrupt: 

In [None]:
# Disabled on purpose (string literal, no-op): writes the fetched sequences to
# the FASTA file that is loaded in the next cell.
"""Fasta(sequences=sequences).write_fasta("../../data/glyco/glyco_all.fasta", overwrite=True)"""

In [13]:
# The sequences are cached on disk because fetching them takes around 90 minutes.
# When the cache file is missing it is rebuilt here, so the notebook also runs
# from a fresh checkout without editing any cell by hand.
_glyco_all_fasta = "../../data/glyco/glyco_all.fasta"
if not os.path.exists(_glyco_all_fasta):
    print(f"{_glyco_all_fasta} not found - fetching all sequences (this is slow)")
    all_PID = set(
        N_merged_train_df["PID"].tolist()
        + N_LMNgly_test_df["PID"].tolist()
        + list(O_captor_test_dict)
    )
    Fasta(sequences=get_protein_sequences(all_PID)).write_fasta(_glyco_all_fasta, overwrite=True)
sequences = Fasta(_glyco_all_fasta)

In [22]:
# Build, for every training protein with a known sequence, a label string of the
# same length as the sequence: "X" everywhere except at the annotated positions,
# which carry the letter of their label.
merged_train_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
# One groupby pass replaces two full boolean-mask scans of the table per protein.
for pid, sites in N_merged_train_df.groupby("PID", sort=False):
    try:
        seq = sequences[pid][0]
    except KeyError:
        print(f"Skipping {pid}")
        continue
    labels = np.full(len(seq), 5, dtype=int)
    for pos, site_label in zip(sites["Position"].tolist(), sites["label"].tolist()):
        # Only integer positions inside the sequence are accepted. Negative
        # values are rejected explicitly: numpy would otherwise wrap them
        # around and label a residue counted from the end of the sequence.
        is_index = isinstance(pos, (int, np.integer))
        if not is_index or not 0 <= pos < len(seq):
            print(f"Skipping {pid} because of IndexError")
            continue
        labels[pos] = site_label
    merged_train_dict[pid] = [seq, ["".join(label_dict[code] for code in labels.tolist())]]

Skipping NP_005715.1
Skipping Q03154
Skipping NP_001985.2
Skipping Q8N3X1
Skipping Q03001.4 because of IndexError
Skipping Q96ST8
Skipping P30305
Skipping Q9Y5B9
Skipping Q03721
Skipping Q96RS0
Skipping NP_004607.1
Skipping Q86UU0
Skipping Q7L311
Skipping Q5VY43
Skipping P01023
Skipping Q6UXB3
Skipping Q9Y2U8
Skipping Q9H5Y7.3 because of IndexError
Skipping Q9H5Y7.3 because of IndexError
Skipping Q58WW2
Skipping Q9H2X6
Skipping P55285
Skipping P35542
Skipping Q96JN0
Skipping Q96M93
Skipping Q9UMS5
Skipping Q96KB5
Skipping P28062
Skipping O75556
Skipping Q96S38
Skipping NP_000069.2
Skipping Q6IE36
Skipping Q30154
Skipping Q9UHR4
Skipping Q9ULZ3
Skipping Q9BZM6
Skipping NP_001191017.1
Skipping Q6ZRS4
Skipping Q9NPG4
Skipping Q9H6B1
Skipping Q71UI9
Skipping Q14993.3 because of IndexError
Skipping Q14993.3 because of IndexError
Skipping Q8WW33
Skipping P55197.2 because of IndexError
Skipping NP_036474.1
Skipping J3QSS3
Skipping O60563
Skipping P01857
Skipping Q9Y463
Skipping Q96KQ4
Skippin

In [23]:
# Remove the proteins whose label string consists of a single repeated character
# (presumably all "X", i.e. no site could be mapped onto the sequence).
# The loop variable is no longer called `id`, which rebound the builtin for the
# rest of the kernel session.
unlabelled_ids = [
    pid
    for pid, (_, label_strings) in merged_train_dict.items()
    if len(set(label_strings[0])) == 1
]
for pid in unlabelled_ids:
    merged_train_dict.pop(pid)

In [24]:
# Write the training sequences together with their label strings.
train_fasta = Fasta(sequences=merged_train_dict)
train_fasta.write_fasta("../../data/glyco/train_seq_labels.o", overwrite=True)

In [25]:
# Long table with one row per (Position, PID) pair for the O-linked test set;
# every site gets label 2.
# The table is built from O_captor_test_dict rather than from the previous value
# of O_captor_test_df, so the cell can be re-run: the old self-referential
# `O_captor_test_df.to_frame()` only worked while the name still held a Series.
O_captor_test_df = (
    pd.Series(list(O_captor_test_dict.values()), index=list(O_captor_test_dict.keys()))
    .to_frame(name="Position")
    .assign(PID=lambda frame: frame.index, label=2)
    .reset_index(drop=True)
    .explode("Position", ignore_index=True)
)

In [26]:

# Build, for every O-linked test protein with a known sequence, a label string of
# the same length as the sequence: "X" everywhere except at the annotated
# positions, which carry the letter of their label.
merged_O_test_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
# One groupby pass replaces two full boolean-mask scans of the table per protein.
for pid, sites in O_captor_test_df.groupby("PID", sort=False):
    try:
        seq = sequences[pid][0]
    except KeyError:
        print(f"Skipping {pid}")
        continue
    labels = np.full(len(seq), 5, dtype=int)
    for pos, site_label in zip(sites["Position"].tolist(), sites["label"].tolist()):
        # Only integer positions inside the sequence are accepted. Negative
        # values are rejected explicitly: numpy would otherwise wrap them
        # around and label a residue counted from the end of the sequence.
        is_index = isinstance(pos, (int, np.integer))
        if not is_index or not 0 <= pos < len(seq):
            print(f"Skipping {pid} because of IndexError")
            continue
        labels[pos] = site_label
    merged_O_test_dict[pid] = [seq, ["".join(label_dict[code] for code in labels.tolist())]]

Skipping P03952.1
Skipping Q7Z4P5.2
Skipping Q9Y5M8.3
Skipping P07360.3
Skipping O95831.1
Skipping P01215.1
Skipping P13693.1
Skipping Q96PQ0.3
Skipping Q6ZRP7.3 because of IndexError
Skipping Q6ZRP7.3 because of IndexError
Skipping Q6ZRP7.3 because of IndexError
Skipping Q6ZRP7.3 because of IndexError
Skipping Q6ZRP7.3 because of IndexError
Skipping Q6ZRP7.3 because of IndexError
Skipping Q02447.3
Skipping P04180.1
Skipping Q15223.3
Skipping Q04721.3 because of IndexError
Skipping Q04721.3 because of IndexError
Skipping Q04721.3 because of IndexError
Skipping Q04721.3 because of IndexError
Skipping P58397.2
Skipping Q8IZP9.2
Skipping Q8N436.3
Skipping Q9Y6Z7.2
Skipping P14625.1
Skipping Q86X52.3
Skipping Q86X27.1
Skipping O43155.1
Skipping Q9UHX1.1
Skipping Q9NZ20.2
Skipping O95782.3
Skipping P54108.1
Skipping Q15056.5
Skipping Q9UKU6.1
Skipping Q9BXS4.1
Skipping P08697.3 because of IndexError
Skipping Q8IWF2.1
Skipping P36507.1
Skipping P09486.1
Skipping Q29980.1
Skipping Q9ULI3.3 be

In [27]:
# Remove the proteins whose label string consists of a single repeated character
# (presumably all "X", i.e. no site could be mapped onto the sequence).
# The loop variable is no longer called `id`, which rebound the builtin for the
# rest of the kernel session.
unlabelled_ids = [
    pid
    for pid, (_, label_strings) in merged_O_test_dict.items()
    if len(set(label_strings[0])) == 1
]
for pid in unlabelled_ids:
    merged_O_test_dict.pop(pid)

In [28]:
# Number of distinct protein IDs in the O-linked test table.
len(set(O_captor_test_df["PID"]))

282

In [29]:
# Number of O-linked test proteins left after the sequence lookup and the filtering above.
len(merged_O_test_dict)

189

In [30]:
# Write the O-linked test sequences together with their label strings.
o_test_fasta = Fasta(sequences=merged_O_test_dict)
o_test_fasta.write_fasta("../../data/glyco/O_test_seq_labels.o", overwrite=True)

In [31]:

# Build, for every N-linked test protein with a known sequence, a label string of
# the same length as the sequence: "X" everywhere except at the annotated
# positions, which carry the letter of their label.
merged_N_test_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
# One groupby pass replaces two full boolean-mask scans of the table per protein.
for pid, sites in N_LMNgly_test_df.groupby("PID", sort=False):
    try:
        seq = sequences[pid][0]
    except KeyError:
        print(f"Skipping {pid}")
        continue
    labels = np.full(len(seq), 5, dtype=int)
    for pos, site_label in zip(sites["Position"].tolist(), sites["label"].tolist()):
        # Only integer positions inside the sequence are accepted. Negative
        # values are rejected explicitly: numpy would otherwise wrap them
        # around and label a residue counted from the end of the sequence.
        is_index = isinstance(pos, (int, np.integer))
        if not is_index or not 0 <= pos < len(seq):
            print(f"Skipping {pid} because of IndexError")
            continue
        labels[pos] = site_label
    merged_N_test_dict[pid] = [seq, ["".join(label_dict[code] for code in labels.tolist())]]

# Remove the proteins whose label string consists of a single repeated character
# (presumably all "X", i.e. no site could be mapped onto the sequence).
unlabelled_ids = [
    pid
    for pid, (_, label_strings) in merged_N_test_dict.items()
    if len(set(label_strings[0])) == 1
]
for pid in unlabelled_ids:
    merged_N_test_dict.pop(pid)

In [32]:
# Number of distinct protein IDs in the N-linked test table.
len(set(N_LMNgly_test_df["PID"]))

955

In [33]:
# Number of N-linked test proteins left after the sequence lookup and the filtering above.
len(merged_N_test_dict)

955

In [36]:
# Write the N-linked test sequences together with their label strings.
n_test_fasta = Fasta(sequences=merged_N_test_dict)
n_test_fasta.write_fasta("../../data/glyco/N_test_seq_labels.o", overwrite=True)

In [None]:
# Collect the header of every record in the FASTA file: the text after ">",
# with surrounding whitespace removed.
with open("../../data/glyco/rr_wscores_and_config_glyco_all.fasta", "r") as f:
    ids = [line[1:].strip() for line in f if line.startswith(">")]

In [None]:
# IDs from the FASTA headers that are also present in the O-linked test dictionary.
# A plain membership test replaces the bare `except:`, which would also have
# hidden unrelated errors.
t = [pid for pid in ids if pid in merged_O_test_dict]
len(t)

172

In [None]:
# Count the labelled residues over the O-linked test proteins selected in `t`:
# "T" = negative site, "N" = N-linked site, "O" = O-linked site.
# IDs missing from the dictionary are skipped by an explicit membership test
# instead of a bare `except:`.
_label_strings = [merged_O_test_dict[pid][1][0] for pid in t if pid in merged_O_test_dict]
negative_sites = sum(labels.count("T") for labels in _label_strings)
positive_O_sites = sum(labels.count("O") for labels in _label_strings)
positive_N_sites = sum(labels.count("N") for labels in _label_strings)
print(f"Negative sites: {negative_sites}\nN-sites: {positive_N_sites}\nO-sites: {positive_O_sites}")

Negative sites: 0
N-sites: 0
O-sites: 2241


In [None]:

# IDs from the FASTA headers that are also present in the training dictionary.
# A plain membership test replaces the bare `except:`, which would also have
# hidden unrelated errors. The name `l` is kept because the next cell reads it.
l = [pid for pid in ids if pid in merged_train_dict]
len(l)

8596

In [None]:
# Count the labelled residues over the training proteins selected in `l`:
# "T" = negative site, "N" = N-linked site, "O" = O-linked site.
# IDs missing from the dictionary are skipped by an explicit membership test
# instead of a bare `except:`.
_label_strings = [merged_train_dict[pid][1][0] for pid in l if pid in merged_train_dict]
negative_sites = sum(labels.count("T") for labels in _label_strings)
positive_O_sites = sum(labels.count("O") for labels in _label_strings)
positive_N_sites = sum(labels.count("N") for labels in _label_strings)
print(f"Negative sites: {negative_sites}\nN-sites: {positive_N_sites}\nO-sites: {positive_O_sites}")

Negative sites: 11194
N-sites: 12116
O-sites: 2551


In [None]:
# Weights used to score a protein from its label string.
o_site = 15
n_site = 3
negative_sample = 2

# The dictionaries are consulted in this order; the first one that knows the header wins.
_label_sources = (merged_train_dict, merged_O_test_dict, merged_N_test_dict)

scores = {}
for header in sequences.get_headers():
    labels = next(
        (source[header][1][0] for source in _label_sources if header in source),
        None,
    )
    if labels is None:
        # Header without any label string.
        scores[header] = 1
    elif "T" in labels:
        # NOTE(review): a protein with at least one "T" gets the flat
        # negative_sample score even when it also carries "N"/"O" sites.
        # The original sum had a `count("T") * negative_sample` term that was
        # always zero on the remaining branch - confirm this is intended.
        scores[header] = negative_sample
    else:
        scores[header] = labels.count("O") * o_site + labels.count("N") * n_site

# One "<header>\t<score>" line per sequence.
with open("../../data/glyco/rr_scores.tsv", "w") as f:
    f.writelines(f"{header}\t{score}\n" for header, score in scores.items())

In [None]:
# Leakage check: number of protein IDs present in both the training dictionary
# and the N-linked test dictionary (keys are compared as exact strings).
len(set(merged_train_dict.keys()).intersection(set(merged_N_test_dict.keys())))

0

In [35]:
# Leakage check: number of protein IDs present in both the training dictionary
# and the O-linked test dictionary (keys are compared as exact strings).
len(set(merged_train_dict.keys()).intersection(set(merged_O_test_dict.keys())))

0