In [1]:
import pandas as pd
from Bio import ExPASy
from Bio import SwissProt
import re
import numpy as np
from fasta import Fasta

In [2]:
# position of glyco site is one based -> will be converted to zero based
def fetch_protein_sequence_pdb(uniprot_id):
    handle = ExPASy.get_sprot_raw(uniprot_id)
    record = SwissProt.read(handle)
    return record.sequence

- load data
- merge train 
- remove test from train 
- get sequences 
    - for O-linked dataset: site positions are part of the fasta header -> one header per protein is enough 
    - for the rest: group the entries by PID and accumulate glyco site positions 
- map sites to sequences
- write to fasta file 
    - one fasta for training and one for RR, containing the PID and sequences 

### Load Data

In [3]:
import os
import os
os.getcwd()

'/home/d/PycharmProjects/protein_properties/src/data'

In [4]:
O_captor_train = []
with open("../../data/O_captor/Ptrain.fasta", "r") as f:
    for line in f.readlines():
        if line.startswith(">"):
            O_captor_train.append(line.strip("\t").strip("\s").strip("\s")[4:])
rgx = re.compile('[%s]' % "\s\t|")
O_captor_train_dict = {}
for i in range(len(O_captor_train)):
    O_captor_train[i] = rgx.sub("", O_captor_train[i]).split("#")
    O_captor_train_dict[O_captor_train[i][0]] = set([int(j) - 1 for j in O_captor_train[i][1:]])
O_captor_train_df = pd.Series(O_captor_train_dict.values(), O_captor_train_dict.keys())

O_captor_test = []
with open("../../data/O_captor/Ptrain.fasta", "r") as f:
    for line in f.readlines():
        if line.startswith(">"):
            O_captor_test.append(line.strip("\t").strip("\s").strip("\s")[4:])
O_captor_test_dict = {}
for i in range(len(O_captor_test)):
    O_captor_test[i] = rgx.sub("", O_captor_test[i]).split("#")
    O_captor_test_dict[O_captor_test[i][0]] = set([int(j) - 1 for j in O_captor_test[i][1:]])
O_captor_test_df = pd.Series(O_captor_test_dict.values(), O_captor_test_dict.keys())

N_LMNgly_train = pd.read_csv('../../data/LMNglyPred/df_train_data_without_independent_test_and_protein.csv', usecols=["label","PID","Position"])
N_LMNgly_test = pd.read_csv('../../data/LMNglyPred/df_independent_test_again_done_that_has_unique_protein_and_unique_sequences.csv', usecols=["label","PID","Position"])
N_taherzadeh_train = pd.read_csv('../../data/N_taherzadeh/Datasets.csv')

In [5]:
N_LMNgly_train_df = N_LMNgly_train.groupby("PID").apply(lambda x: set([i - 1 for i in x["Position"].tolist()]))
N_LMNgly_test_df = N_LMNgly_test.groupby("PID").apply(lambda x: set([i - 1 for i in x["Position"].tolist()]))
N_taherzadeh_train_df = N_taherzadeh_train.groupby("Protein name").apply(lambda x: set([i - 1 for i in x["Position"].tolist()]))
N_taherzadeh_train_df.index = [str(i.replace("'", "")) for i in N_taherzadeh_train_df.index]

In [6]:
O_captor_train_df.index

Index(['Q92954.3', 'P49589.3', 'Q9HBR0.2', 'O94985.1', 'Q8IYE1.2', 'P52823.1',
       'Q6ZRP7.3', 'Q685J3.2', 'P02786.2', 'P15514.2',
       ...
       'Q9NVR5.2', 'O43508.1', 'P01308.1', 'Q96SB4.2', 'Q9Y561.1', 'Q8N158.1',
       'Q03001.4', 'P31639.1', 'Q9BT09.1', 'Q9NT22.2'],
      dtype='object', length=1326)

### Clean Data

In [89]:
N_merged_train_df = pd.concat([N_LMNgly_train_df, N_taherzadeh_train_df])
dubs = N_merged_train_df[N_merged_train_df.index.duplicated()].groupby(level=0).apply(lambda x: set.union(*x))
N_merged_train_df = N_merged_train_df[N_merged_train_df.index.duplicated(keep=False)]
N_merged_train_df = pd.concat([N_merged_train_df, dubs]).apply(lambda x: set.union(*x))

TypeError: descriptor 'union' for 'set' objects doesn't apply to a 'int' object

In [86]:
# drop the proteins that are in the either of the two test set
N_merged_train_df = N_merged_train_df[~N_merged_train_df.index.isin(N_LMNgly_test_df.index)]
N_merged_train_df = N_merged_train_df[~N_merged_train_df.index.isin(O_captor_test_df.index)]

In [29]:
from Bio import Entrez
from Bio.Seq import Seq
import concurrent.futures

import requests

def get_protein_sequences(protein_ids):
    sequences = {}
    
    # Separate UniProt and NCBI IDs
    uniprot_ids = [id for id in protein_ids if not id.startswith('NP_')]
    ncbi_ids = [id for id in protein_ids if id.startswith('NP_')]
    
    # Fetch sequences for UniProt IDs
    if uniprot_ids:
        uniprot_sequences = fetch_uniprot_sequences(uniprot_ids)  # Fetch UniProt sequences
        sequences.update(uniprot_sequences)
    
    # Fetch sequences for NCBI IDs
    if ncbi_ids:
        ncbi_sequences = fetch_ncbi_sequences(ncbi_ids)  # Fetch NCBI sequences
        sequences.update(ncbi_sequences)
    
    return sequences

def fetch_uniprot_sequences(uniprot_ids):
    sequences = {}
    
    for uniprot_id in uniprot_ids:
        
        # Make a request to UniProt for the FASTA sequence
        url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
        response = requests.get(url)
        
        if response.ok:
            sequences[uniprot_id] = [''.join(response.text.split('\n')[1:])]
    
    return sequences

def fetch_ncbi_sequences(ncbi_ids):
    Entrez.email = 'your_email@example.com'  # Set your email address here
    sequences = {}
    
    def fetch_sequence(ncbi_id):
        handle = Entrez.efetch(db='protein', id=ncbi_id, rettype='fasta', retmode='text')
        record = handle.read()
        handle.close()
        sequences[ncbi_id] = [record.split('\n', 1)[1].replace('\n', '')]
    
    # Fetch sequences using concurrent futures
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(fetch_sequence, ncbi_ids)
    
    return sequences

In [30]:
all_PID = set((N_merged_train_df.index.tolist() + N_LMNgly_test_df.index.tolist() + O_captor_test_df.index.tolist() + O_captor_train_df.index.tolist()))
sequences = get_protein_sequences(all_PID)

In [36]:
Fasta(sequences=sequences).write_fasta("../../data/glyco/glyco_all.fasta", overwrite=True)

In [82]:
sequences = Fasta("../../data/glyco/glyco_all.fasta")

In [88]:
merged_train_dict = {}
for i in N_merged_train_df.index:
    seq = sequences[i]
    labels = np.zeros(len(seq))
    try:
        labels[list(N_merged_train_df[i])] = 1
    except:
        print(i)
    merged_train_dict[i] = [seq, labels]

for i in O_captor_train_df.index:
    seq = sequences[i]
    if i in merged_train_dict:
        merged_train_dict[i][list(O_captor_train_df[i])] = 2
    else:
        labels = np.zeros(len(seq))
        labels[list(O_captor_train_df[i])] = 2
        merged_train_dict[i] = [seq, labels]
train_fasta = Fasta(sequence=merged_train_dict)

A1A5B4
A8K7I4
O00462
O00533
O00624
O14594
O14672
O14917
O15031
O15321
O15393
O15455
O15460
O43184
O43291
O43451
O43570
O43827
O60486
O60602
O75015
O75054
O75094
O75197
O75460
O75503
O75556
O94813
O94856
O94923
O95196
O95274
O95393
O95477
O95490
O95970
O95980
O95998
O96005
O96014
P00736
P00742
P00750
P01133
P01857
P01871
P02788
P04216
P05106
P05231
P06127
P06729
P06731
P07911
P07949
P08069
P08519
P08648
P08922
P09326
P09619
P09758
P10153
P10586
P11362
P11464
P11717
P11912
P12109
P12111
P12830
P13284
P13473
P13688
P14210
P14679
P14784
P15289
P15391
P15813
P16066
P16144
P16234
P16278
P16473
P16671
P17181
P17927
P18564
P20036
P20061
P20138
P20701
P20702
P20742
P21439
P21757
P21810
P22303
P23229
P23515
P25101
P25391
P26006
P26012
P27701
P27930
P28827
P29017
P29320
P29965
P30530
P31785
P32004
P32238
P32942
P33527
P33681
P34969
P35052
P35354


KeyError: 'P35542'