In [1]:
import numpy as np
import pandas as pd
from glob import glob

In [3]:
def get_files():
    files = [''.join(i.split('.')[:-1]).replace('\\', '/') for i in glob('rawDataset2025/*.zip')]
    files = dict.fromkeys([i.split('/')[-1] for i in files])
    return files

files = get_files()

ptms = list(files.keys())
for ptm in ptms:
    files[ptm] = ptm

In [4]:
def construct_subsequence(protein: str, site: int, no_stream_aa: int = 10):
    # Treat PTM site as 0-based (so subtract 1 for convenience)
    start = max(0, site - no_stream_aa)
    end = min(len(protein), site + no_stream_aa + 1)

    subsequence: str = protein[start:end]

    if site < no_stream_aa:
        subsequence = ('-' * (no_stream_aa - site)) + subsequence 
    if site + no_stream_aa > len(protein):
        subsequence += ('-' * ((site + no_stream_aa + 1) - len(protein)))
    
    return subsequence

In [5]:
import requests

def fetch_prot_id(accession_number: str) -> str:
    got_response = True
    got_id = True
    while got_response:
        try:
            got_response = False
            response = requests.get(
                f'http://www.ebi.ac.uk/proteins/api/proteins/{accession_number}'
            )
            print(f"\tGot response for {accession_number}.")
        except:
            got_response = True
        try:
            _id = response.json()['id']
        except:
            got_response = True
            while got_response:
                try:
                    got_response = False
                    response = requests.get(
                        f'https://rest.uniprot.org/uniprotkb/{accession_number}',
                        headers={
                            'Accept': 'application/json'
                        }
                    )
                    print("\tGot response for", accession_number)
                except Exception as e:
                    got_response = True
            got_id = True
            try:
                response: dict = response.json()
                _id = response['uniProtkbId']
            except Exception as e:
                _id = ''
                got_id = False
                print("\t\tNo sequence found for", accession_number)

    return (got_id, _id)


def fetch_sequence(accession_number: str) -> tuple[bool, str]:
    got_response = True
    got_sequence = True
    while got_response:
        try:
            got_response = False
            response = requests.get(
                f'http://www.ebi.ac.uk/proteins/api/proteins/{accession_number}'
            )
            print(f"\tGot response for {accession_number}.")
        except:
            got_response = True
        try:
            sequence = response.json()['sequence']['sequence']
        except:
            got_response = True
            while got_response:
                try:
                    got_response = False
                    response = requests.get(
                        f'https://rest.uniprot.org/uniprotkb/{accession_number}',
                        headers={
                            'Accept': 'application/json'
                        }
                    )
                    print("\tGot response for", accession_number)
                except Exception as e:
                    got_response = True
            got_sequence = True
            try:
                response: dict = response.json()
                if 'inactiveReason' in response.keys():
                    if response['inactiveReason']['inactiveReasonType'] == 'DELETED':
                        print("\tWas deleted")
                        uniparc_id = response['extraAttributes']['uniParcId']
                        got_response = True
                        while got_response:
                            try:
                                got_response = False
                                response: dict = requests.get(
                                    f'https://rest.uniprot.org/uniparc/{uniparc_id}',
                                    headers={
                                        'Accept': 'application/json'
                                    }
                                )
                            except Exception as e:
                                got_response = True
                    elif response['inactiveReason']['inactiveReasonType'] == 'DEMERGED':    
                        print("\tWas merged")
                        accession_number = response['inactiveReason']['mergeDemergeTo'][0]
                        got_response = True
                        while got_response:
                            try:
                                got_response = False
                                response: dict = requests.get(
                                    f'https://rest.uniprot.org/uniprotkb/{accession_number}',
                                    headers={
                                        'Accept': 'application/json'
                                    }
                                )
                            except Exception as e:
                                got_response = True
                    sequence = response.json()['sequence']['value']
                else:
                    sequence = response['sequence']['value']
            except Exception as e:
                sequence = ''
                got_sequence = False
                print("\t\tNo sequence found for", accession_number)

    return (got_sequence, sequence)



# positions = invalid_sequence_df[invalid_sequence_df['Acc#'] == accession_number].ModSite.to_list()
# for position in positions:
#     if got_sequence:
#         subsequence = construct_subsequence(sequence, position-1)
#     else:
#         subsequence = ''
#     df.loc[(df['Acc#'] == accession_number) & (df['ModSite'] == position), 'Seq'] = subsequence                

In [6]:
def fill_sequences_of_ptm(ptm: str) -> None:
    df = pd.read_csv(
        'rawDataset2025/' + ptm,
        names=[
            'ProID',
            'Acc#',
            'ModSite',
            'PTM',
            'EvdId',
            'Seq'
        ],
        header=None,
        sep='\t'
    )
    nan_df = df[df['Seq'].isna()]
    if len(nan_df):
        print("NAN FOR", ptm, "FOUND - TOTAL", len(nan_df))
        for acc in nan_df['Acc#'].unique():
            got_sequence, sequence = fetch_sequence(acc)
            for position in nan_df[nan_df['Acc#'] == acc].ModSite.to_list():
                if got_sequence:
                    subsequence = construct_subsequence(sequence, position-1)
                else:
                    subsequence = ''
                df.loc[(df['Acc#'] == acc) & (df['ModSite'] == position), 'Seq'] = subsequence
    else:
        print("No NAN for", ptm)
    df.to_csv(f'rawDataset2025/{ptm}.csv', index=False)

In [7]:
from threading import Thread

class CustomThread(Thread):
    def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, verbose=None):
        # Initializing the Thread class
        super().__init__(group, target, name, args, kwargs)
        self._return = None

    # Overriding the Thread.run function
    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self):
        super().join()
        return self._return

In [18]:
import aiohttp
import asyncio

# Set a maximum number of concurrent requests
MAX_CONCURRENT_REQUESTS = 50  # Change this number as needed

# Semaphore to limit concurrent HTTP requests
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

async def fetch_sequence_thread(accession_number: str) -> tuple[str, bool, str]:
    real_acc = accession_number
    got_response = True
    got_sequence = True
    sequence = ''
    
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            while got_response:
                try:
                    got_response = False
                    async with session.get(f'http://www.ebi.ac.uk/proteins/api/proteins/{accession_number}') as response:
                        print(f"\tGot response for {accession_number}.")
                        if response.status == 200:
                            data = await response.json()
                            sequence = data['sequence']['sequence']
                            got_sequence = True
                        else:
                            got_sequence = False
                            print(f"\tError: {response.status} while fetching from EBI API")
                except Exception as e:
                    print(f"\tError fetching from EBI API: {e}")
                    got_response = True

            if not got_sequence:  # If sequence wasn't found in the first API, try the second one
                while got_response:
                    try:
                        got_response = False
                        async with session.get(f'https://rest.uniprot.org/uniprotkb/{accession_number}', headers={'Accept': 'application/json'}) as response:
                            print(f"\tGot response for {accession_number} from UniProt.")
                            if response.status == 200:
                                data = await response.json()
                                if 'inactiveReason' in data:
                                    if data['inactiveReason']['inactiveReasonType'] == 'DELETED':
                                        print("\tWas deleted")
                                        uniparc_id = data['extraAttributes']['uniParcId']
                                        # Try fetching UniParc data
                                        async with session.get(f'https://rest.uniprot.org/uniparc/{uniparc_id}', headers={'Accept': 'application/json'}) as uniparc_response:
                                            if uniparc_response.status == 200:
                                                uniparc_data = await uniparc_response.json()
                                                sequence = uniparc_data['sequence']['value']
                                                got_sequence = True
                                    elif data['inactiveReason']['inactiveReasonType'] == 'DEMERGED':
                                        print("\tWas merged")
                                        accession_number = data['inactiveReason']['mergeDemergeTo'][0]
                                        got_response = True
                                        continue  # If merged, retry with new accession number
                                else:
                                    sequence = data['sequence']['value']
                                    got_sequence = True
                            else:
                                print(f"\tError: {response.status} while fetching from UniProt")
                    except Exception as e:
                        print(f"\tError fetching from UniProt API: {e}")
                        got_response = True

    return real_acc, got_sequence, sequence

In [14]:
# thread = CustomThread(target=fetch_sequence, args=())
# thread.start()
# print(thread.join())  # This is where you get your return value.p

async def main(file):
    print("DOING FOR", file)
    df = pd.read_csv(
    'rawDataset2025/' + file,
        names=[
            'ProID',
            'Acc#',
            'ModSite',
            'PTM',
            'EvdId',
            'Seq'
        ],
        header=None,
        sep='\t'
    )
    df['FullSeq'] = ''
    responses = [fetch_sequence_thread(acc) for acc in df['Acc#'].unique()]
    results = await asyncio.gather(*responses)
    for result in results:
        df.loc[df['Acc#'] == result[0], 'FullSeq'] = result[2]
    df.to_csv('rawDataset2025/' + file + '_full.csv', index=False)
    print("DONE FOR", file)


In [19]:
it = iter(get_files())

In [20]:
await main(next(it))

DOING FOR Acetylation
	Got response for P18281.
	Got response for O23629.
	Got response for P19456.
	Got response for P46077.
	Got response for Q9XIE2.
	Got response for Q9SLG0.
	Got response for Q9T074.
	Got response for Q9LQQ4.
	Got response for O81270.
	Got response for Q9ZVI3.
	Got response for Q06611.
	Got response for Q9SJT1.
	Got response for Q9SZR1.
	Got response for Q56YA5.
	Got response for Q6NLC1.
	Got response for Q39196.
	Got response for Q08733.
	Got response for Q56Z59.
	Got response for Q56ZI2.
	Got response for P59169.
	Got response for Q9XII1.
	Got response for Q8VZ95.
	Got response for Q9LF97.
	Got response for P54150.
	Got response for P06525.
	Got response for Q9ZSD4.
	Got response for Q1WIQ6.
	Got response for Q9CAI7.
	Got response for O81108.
	Got response for Q9FVE6.
	Got response for P56780.
	Got response for Q9C5Z1.
	Got response for P42643.
	Got response for Q9SKZ1.
	Got response for Q08682.
	Got response for Q9SVC2.
	Got response for P83755.
	Got response fo

In [1]:
import psutil


In [13]:
import requests

def fetch_sequence_thread(accession_number: str) -> tuple[str, bool, str]:
    real_acc = accession_number
    got_response = True
    got_sequence = True
    sequence = ''
    
    while got_response:
        try:
            got_response = False
            response = requests.get(f'http://www.ebi.ac.uk/proteins/api/proteins/{accession_number}')
            print(f"\tGot response for {accession_number}.")
            if response.status_code == 200:
                data = response.json()
                sequence = data['sequence']['sequence']
                got_sequence = True
            else:
                got_sequence = False
                print(f"\tError: {response.status_code} while fetching from EBI API")
        except Exception as e:
            print(f"\tError fetching from EBI API: {e}")
            got_response = True

    if not got_sequence:  # If sequence wasn't found in the first API, try the second one
        while got_response:
            try:
                got_response = False
                response = requests.get(
                    f'https://rest.uniprot.org/uniprotkb/{accession_number}',
                    headers={'Accept': 'application/json'}
                )
                print(f"\tGot response for {accession_number} from UniProt.")
                if response.status_code == 200:
                    data = response.json()
                    if 'inactiveReason' in data:
                        if data['inactiveReason']['inactiveReasonType'] == 'DELETED':
                            print("\tWas deleted")
                            uniparc_id = data['extraAttributes']['uniParcId']
                            # Try fetching UniParc data
                            uniparc_response = requests.get(
                                f'https://rest.uniprot.org/uniparc/{uniparc_id}',
                                headers={'Accept': 'application/json'}
                            )
                            if uniparc_response.status_code == 200:
                                uniparc_data = uniparc_response.json()
                                sequence = uniparc_data['sequence']['value']
                                got_sequence = True
                        elif data['inactiveReason']['inactiveReasonType'] == 'DEMERGED':
                            print("\tWas merged")
                            accession_number = data['inactiveReason']['mergeDemergeTo'][0]
                            got_response = True
                            continue  # If merged, retry with new accession number
                    else:
                        sequence = data['sequence']['value']
                        got_sequence = True
                else:
                    print(f"\tError: {response.status_code} while fetching from UniProt")
            except Exception as e:
                print(f"\tError fetching from UniProt API: {e}")
                got_response = True

    return real_acc, got_sequence, sequence

# For running the synchronous operation serially
def run_serially():
    files = ['ADP-ribosylation'] # list(get_files().keys())
    for file in files:
        print("DOING FOR", file)
        df = pd.read_csv(
            'rawDataset2025/' + file,
            names=[
                'ProID',
                'Acc#',
                'ModSite',
                'PTM',
                'EvdId',
                'Seq'
            ],
            header=None,
            sep='\t'
        )
        df['FullSeq'] = ''
        for acc in df['Acc#'].unique():
            acc, got_sequence, sequence = fetch_sequence_thread(acc)
            df.loc[df['Acc#'] == acc, 'FullSeq'] = sequence
        df.to_csv('rawDataset2025/' + file + '_full.csv', index=False)
        print("DONE FOR", file)



In [14]:
run_serially()

DOING FOR ADP-ribosylation
	Got response for P61247.
	Got response for Q9NWY4.
	Got response for O00479.
	Got response for P05114.
	Got response for P05204.
	Got response for P06748.
	Got response for P07305.
	Got response for P09429.
	Got response for P09874.
	Got response for P10412.
	Got response for P16403.
	Got response for P17096.
	Got response for P35659.
	Got response for P58876.
	Got response for P61224.
	Got response for P68431.
	Got response for P84243.
	Got response for Q00839.
	Got response for Q71DI3.
	Got response for Q96EY4.
	Got response for Q99879.
	Got response for Q99880.
	Got response for Q9Y2S6.
	Got response for P0A7Z4.
	Got response for P0A850.
	Got response for P0A9A6.
	Got response for P0AE70.
	Got response for P20974.
	Got response for P22921.
	Got response for P43220.
	Got response for P48675.
	Got response for P59665.
	Got response for P68135.
	Got response for Q03250.
	Got response for Q8RXG3.
	Got response for Q9Z1M0.
	Got response for P61585.
	Got respon

In [16]:
df_serial = pd.read_csv('rawDataset2025/ADP-ribosylation_full_serial.csv')
df_parallel = pd.read_csv('rawDataset2025/ADP-ribosylation_full_parallel.csv')

In [18]:
df_serial.equals(df_parallel)

True