In [1]:
import pandas as pd
from Bio import ExPASy
from Bio import SwissProt
import re
import numpy as np
from fasta import Fasta

In [2]:
# position of glyco site is one based -> will be converted to zero based
def fetch_protein_sequence_pdb(uniprot_id):
    handle = ExPASy.get_sprot_raw(uniprot_id)
    record = SwissProt.read(handle)
    return record.sequence

- load data
- merge train 
- remove test from train 
- get sequences 
    - for O-linked dataset: site positions are part of the fasta header -> one header per protein is enough 
    - for the rest: group the entries by PID and accumulate glyco site positions 
- map sites to sequences
- write to fasta file 
    - one fasta for training and one for RR, containing the PID and sequences 

### Load Data

In [3]:
import os
import os
os.getcwd()

'/home/d/PycharmProjects/protein_properties/src/data'

The negative samples of Captor are not usable, since they do not provide the exact position of the window -> would mean I have to align and map the labels to the sequence = pain

In [121]:
O_captor_train = []
with open("../../data/O_captor/Ptrain.fasta", "r") as f:
    for line in f.readlines():
        if line.startswith(">"):
            O_captor_train.append(line.strip("\t").strip("\s").strip("\s")[4:])
rgx = re.compile('[%s]' % "\s\t|")
O_captor_train_dict = {}
for i in range(len(O_captor_train)):
    O_captor_train[i] = rgx.sub("", O_captor_train[i]).split("#")
    O_captor_train_dict[O_captor_train[i][0]] = set([int(j) - 1 for j in O_captor_train[i][1:]])
O_captor_train_series = pd.Series(O_captor_train_dict.values(), O_captor_train_dict.keys())

O_captor_test = []
with open("../../data/O_captor/Ptrain.fasta", "r") as f:
    for line in f.readlines():
        if line.startswith(">"):
            O_captor_test.append(line.strip("\t").strip("\s").strip("\s")[4:])
O_captor_test_dict = {}
for i in range(len(O_captor_test)):
    O_captor_test[i] = rgx.sub("", O_captor_test[i]).split("#")
    O_captor_test_dict[O_captor_test[i][0]] = set([int(j) - 1 for j in O_captor_test[i][1:]])
O_captor_test_df = pd.Series(O_captor_test_dict.values(), O_captor_test_dict.keys())

N_LMNgly_train_df = pd.read_csv('../../data/LMNglyPred/Train_Data_NGlycositeAtlas.csv', usecols=["label","PID","Position"])
N_LMNgly_test_df = pd.read_csv('../../data/LMNglyPred/Test_Data_NGlycositeAtlas.csv', usecols=["label","PID","Position"])
# skip weird proteins at the top of the file
N_taherzadeh_train_df = pd.read_csv('../../data/N_taherzadeh/Datasets.csv', usecols=["Protein name","Position"], skiprows=range(1, 12))
N_taherzadeh_train_df["Protein name"] = N_taherzadeh_train_df["Protein name"].apply(lambda x: x.strip("'"))

In [8]:
O_captor_train_df = O_captor_train_series.to_frame()
O_captor_train_df.columns = ["Position"]
O_captor_train_df["PID"] = O_captor_train_df.index
O_captor_train_df["label"] = 2
O_captor_train_df.reset_index(drop=True, inplace=True)
O_captor_train_df = O_captor_train_df.explode("Position", ignore_index=True)

In [10]:
N_taherzadeh_train_df["label"] = 1
N_taherzadeh_train_df.columns = ["PID", "Position", "label"]

Unnamed: 0,PID,Position,label
0,'A0A5B9',69,1
1,'A0AV02',221,1
2,'A1A5B4',641,1
3,'A1A5B4',652,1
4,'A1A5B4',674,1
...,...,...,...
12577,'P02768',549,1
12578,'P02768',558,1
12579,'P02768',560,1
12580,'P02768',569,1


### Clean Data

In [30]:
N_merged_train_df = pd.concat([N_LMNgly_train_df, N_taherzadeh_train_df, O_captor_train_df], ignore_index=True)
N_merged_train_df.drop_duplicates(inplace=True, keep="first")
N_merged_train_df

Unnamed: 0,PID,Position,label
0,Q13425,167,1
1,Q9NZR2,1921,1
2,O43663,489,1
3,Q495M3,181,1
4,Q9BXP8,1408,1
...,...,...,...
43548,Q03001.4,7238,2
43549,P31639.1,4,2
43550,Q9BT09.1,267,2
43551,Q9BT09.1,270,2


In [31]:
# drop the proteins that are in the either of the two test set
N_merged_train_df = N_merged_train_df[~N_merged_train_df.index.isin(N_LMNgly_test_df.index)]
N_merged_train_df = N_merged_train_df[~N_merged_train_df.index.isin(O_captor_test_df.index)]
N_merged_train_df["PID"] = N_merged_train_df["PID"].astype(str)

In [32]:
N_merged_train_df[N_merged_train_df["PID"] == "P15514.2"]

Unnamed: 0,PID,Position,label
36976,P15514.2,40,2
36977,P15514.2,51,2
36978,P15514.2,52,2
36979,P15514.2,168,2


In [14]:
from Bio import Entrez
from Bio.Seq import Seq
import concurrent.futures

import requests

def get_protein_sequences(protein_ids):
    sequences = {}
    
    # Separate UniProt and NCBI IDs
    uniprot_ids = [id for id in protein_ids if not id.startswith('NP_')]
    ncbi_ids = [id for id in protein_ids if id.startswith('NP_')]
    
    # Fetch sequences for UniProt IDs
    if uniprot_ids:
        uniprot_sequences = fetch_uniprot_sequences(uniprot_ids)  # Fetch UniProt sequences
        sequences.update(uniprot_sequences)
    
    # Fetch sequences for NCBI IDs
    if ncbi_ids:
        ncbi_sequences = fetch_ncbi_sequences(ncbi_ids)  # Fetch NCBI sequences
        sequences.update(ncbi_sequences)
    
    return sequences

def fetch_uniprot_sequences(uniprot_ids):
    sequences = {}
    
    for uniprot_id in uniprot_ids:
        
        # Make a request to UniProt for the FASTA sequence
        url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
        response = requests.get(url)
        
        if response.ok:
            sequences[uniprot_id] = [''.join(response.text.split('\n')[1:])]
    
    return sequences

def fetch_ncbi_sequences(ncbi_ids):
    Entrez.email = 'd.hasler@tum.de'  # Set your email address here
    sequences = {}
    
    def fetch_sequence(ncbi_id):
        handle = Entrez.efetch(db='protein', id=ncbi_id, rettype='fasta', retmode='text')
        record = handle.read()
        handle.close()
        sequences[ncbi_id] = [record.split('\n', 1)[1].replace('\n', '')]
    
    # Fetch sequences using concurrent futures
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(fetch_sequence, ncbi_ids)
    
    return sequences

In [21]:
# this will take around 90 minutes
"""all_PID = set((N_merged_train_df["PID"].tolist() + 
               N_LMNgly_test_df["PID"].tolist() + 
               O_captor_test_df.index.tolist()))
sequences = get_protein_sequences(all_PID)"""

In [22]:
"""Fasta(sequences=sequences).write_fasta("../../data/glyco/glyco_all.fasta", overwrite=True)"""

In [78]:
len(sequences.get_headers())

11569

In [46]:
sequences = Fasta("../../data/glyco/glyco_all.fasta")

In [80]:
merged_train_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
for i in set(N_merged_train_df["PID"]):
    try:
        seq = sequences[i][0]
    except KeyError:
        print(f"Skipping {i}")
        continue
    position = N_merged_train_df[N_merged_train_df["PID"] == i]["Position"].tolist()
    true_labels = N_merged_train_df[N_merged_train_df["PID"] == i]["label"].tolist()
    labels = np.zeros(len(seq)) + 5
    for idx, pos in enumerate(position):
        try:
            labels[pos] = true_labels[idx]
        except IndexError:
            print(f"Skipping {i} because of IndexError")
            continue
    merged_train_dict[i] = [seq, ["".join([label_dict[l] for l in labels.tolist()])]]

Skipping NP_001985.2
Skipping NP_006334.2
Skipping NP_001868.2
Skipping NP_075051.4
Skipping NP_000177.2
Skipping NP_001070666.1
Skipping NP_006745.1
Skipping NP_001258867.1
Skipping Q8WXI7.3 because of IndexError
Skipping Q8WXI7.3 because of IndexError
Skipping Q8WXI7.3 because of IndexError
Skipping NP_653247.1
Skipping A6NF01.2 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipping P16112.3 because of IndexError
Skipp

In [100]:
shit_ids = []
for item in merged_train_dict.items():
    if len(set(list(item[1][1][0]))) == 1:
        shit_ids.append(item[0])
for id in shit_ids:
    merged_train_dict.pop(id)

In [104]:
Fasta(sequences=merged_train_dict).write_fasta("../../data/glyco/train_seq_labels.o")

In [107]:
O_captor_test_df = O_captor_test_df.to_frame()
O_captor_test_df.columns = ["Position"]
O_captor_test_df["PID"] = O_captor_test_df.index
O_captor_test_df["label"] = 2
O_captor_test_df.reset_index(drop=True, inplace=True)
O_captor_test_df = O_captor_test_df.explode("Position", ignore_index=True)

In [108]:

merged_O_test_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
for i in set(O_captor_test_df["PID"]):
    try:
        seq = sequences[i][0]
    except KeyError:
        print(f"Skipping {i}")
        continue
    position = O_captor_test_df[O_captor_test_df["PID"] == i]["Position"].tolist()
    true_labels = O_captor_test_df[O_captor_test_df["PID"] == i]["label"].tolist()
    labels = np.zeros(len(seq)) + 5
    for idx, pos in enumerate(position):
        try:
            labels[pos] = true_labels[idx]
        except IndexError:
            print(f"Skipping {i} because of IndexError")
            continue
    merged_O_test_dict[i] = [seq, ["".join([label_dict[l] for l in labels.tolist()])]]

Skipping O00339.4 because of IndexError
Skipping O00339.4 because of IndexError
Skipping O00339.4 because of IndexError
Skipping O00339.4 because of IndexError
Skipping O00339.4 because of IndexError
Skipping O75096.4 because of IndexError
Skipping O75096.4 because of IndexError
Skipping O75096.4 because of IndexError
Skipping O75096.4 because of IndexError
Skipping O75096.4 because of IndexError
Skipping O75096.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError
Skipping Q14766.4 because of IndexError


In [112]:
shit_ids = []
for item in merged_O_test_dict.items():
    if len(set(list(item[1][1][0]))) == 1:
        shit_ids.append(item[0])
for id in shit_ids:
    merged_O_test_dict.pop(id)

In [117]:
len(set(O_captor_test_df["PID"]))

1326

In [118]:
len(merged_O_test_dict)

1289

In [115]:
Fasta(sequences=merged_O_test_dict).write_fasta("../../data/glyco/O_test_seq_labels.o")

In [122]:

merged_N_test_dict = {}
label_dict = {0: "T", 1: "N", 2: "O", 5: "X"}
for i in set(N_LMNgly_test_df["PID"]):
    try:
        seq = sequences[i][0]
    except KeyError:
        print(f"Skipping {i}")
        continue
    position = N_LMNgly_test_df[N_LMNgly_test_df["PID"] == i]["Position"].tolist()
    true_labels = N_LMNgly_test_df[N_LMNgly_test_df["PID"] == i]["label"].tolist()
    labels = np.zeros(len(seq)) + 5
    for idx, pos in enumerate(position):
        try:
            labels[pos] = true_labels[idx]
        except IndexError:
            print(f"Skipping {i} because of IndexError")
            continue
    merged_N_test_dict[i] = [seq, ["".join([label_dict[l] for l in labels.tolist()])]]
shit_ids = []
for item in merged_N_test_dict.items():
    if len(set(list(item[1][1][0]))) == 1:
        shit_ids.append(item[0])
for id in shit_ids:
    merged_N_test_dict.pop(id)

In [123]:
len(set(N_LMNgly_test_df["PID"]))

955

In [124]:
len(merged_N_test_dict)

955

In [None]:
Fasta(sequences=merged_N_test_dict).write_fasta("../../data/glyco/N_test_seq_labels.o")