In [8]:
import requests
import pandas as pd


In [4]:
df = pd.read_csv('phosphorylationRaw/phosphorylation_edited.csv')

In [6]:
df = df.drop(columns=list(df.columns)[2:])

In [7]:
accession_list = df.iloc[:10]

In [11]:
from pprint import pprint

urls = [
    'proteins',
    'features',
    'variation',
    'antigen',
    'epitope',
    'mutagenesis'
]
url_to_choose = 1

for accession in accession_list.itertuples(False):
    response = requests.get(
        f'http://www.ebi.ac.uk/proteins/api/{urls[url_to_choose]}/{accession[1]}'
    )
    pprint(response.json()['features'])
    break

[{'begin': '1',
  'category': 'MOLECULE_PROCESSING',
  'description': 'Myosin heavy chain IB',
  'end': '1147',
  'ftId': 'PRO_0000123361',
  'molecule': '',
  'type': 'CHAIN'},
 {'begin': '9',
  'category': 'DOMAINS_AND_SITES',
  'description': 'Myosin motor',
  'end': '677',
  'evidences': [{'code': 'ECO:0000255',
                 'source': {'id': 'PRU00782',
                            'name': 'PROSITE-ProRule',
                            'url': 'https://prosite.expasy.org/unirule/PRU00782'}}],
  'molecule': '',
  'type': 'DOMAIN'},
 {'begin': '715',
  'category': 'DOMAINS_AND_SITES',
  'description': 'TH1',
  'end': '900',
  'evidences': [{'code': 'ECO:0000255',
                 'source': {'id': 'PRU01093',
                            'name': 'PROSITE-ProRule',
                            'url': 'https://prosite.expasy.org/unirule/PRU01093'}}],
  'molecule': '',
  'type': 'DOMAIN'},
 {'begin': '1090',
  'category': 'DOMAINS_AND_SITES',
  'description': 'SH3',
  'end': '1147',
  'e

In [None]:
CACHE: dict = {}

In [None]:
from glob import glob
import os
import numpy as np
import pandas as pd
import asyncio
import requests
import json


semaphore = asyncio.Semaphore(100)

# We're going to fetch all of the data from UniProt (for the benchmark dataset),
# first by using the benchmark dataset to parse over the raw ones and filter out,
# and then using the remaining raw information's accession numbers to fetch features.

def process(filename: str, folder: str):
    with open(filename, 'r') as f:
        content = f.readlines()
        proteins = content[::2]
        sequences = content[1::2]
    protein_identifiers = []
    position_of_modifications = []
    accession_numbers = []
    for i in range(len(proteins)):
        protein_identifer = '_'.join(proteins[i].split('_')[:-1])[1:]
        protein_identifiers.append(protein_identifer)
        position_of_modifications.append(
            int(proteins[i].split('_')[-1])
        )

        matching_rows = df['Protein Identifier'].to_numpy() == protein_identifer
        # Check if any matching rows exist
        if np.any(matching_rows):
            accession_numbers.append(
                df['Accession number (UniProt)'].to_numpy()[matching_rows][0]
            )
        else:
            # Handle the case where no matches are found (e.g., append None or some default value)
            accession_numbers.append(None)  # or whatever is appropriate for your case

    df_temp = pd.DataFrame(
        {
            'ProID': protein_identifiers,
            'AccessionNo': accession_numbers,
            'PositionOfModification': position_of_modifications,
            'Sequence': [sequence.strip() for sequence in sequences]
        }
    )
    filepath = "".join(filename.split('.')[:-1]).replace('\\', '/')
    df_temp.to_csv(f"{filepath}.csv", index=False)
    print("SAVED FOR", folder)

def gather_all_accessions():
    folders = next(os.walk('phosphorylationBy'))[1]
    for folder in folders:
        file = f"phosphorylationBy/{folder}/{folder}_pos.fasta"
        process(file, folder)
    print("\tALL POSITIVES DONE")
    
    for folder in folders:
        file = f"phosphorylationBy/{folder}/{folder}_neg.fasta"
        process(file, folder)
    print("\tALL NEGATIVES DONE")

def retrieve_response(row: pd.Series):
    # can't really send a null string in the URL to get information, so some crude error handling done
    accession = row['AccessionNo']

    if CACHE.get(accession, None):
        print("Entry already exists!")
        row['FeatureSet'] = CACHE.get(accession)
        return row

    # Then check if the accession number is NaN
    if isinstance(accession, float):
        row['FeatureSet'] = {}
        return row
    
    # If not even that, retrieve the response.
    retrieved = False
    session=requests.session()
    while not retrieved:
        try:
            response = session.get(
                f'http://www.ebi.ac.uk/proteins/api/proteins/{accession}'
            )
            retrieved = True
        except requests.Timeout or ConnectionResetError:
            session=requests.session()
            print("\tConnectionTimeout Error, resetting.")
            retrieved = False

    # in case there is an issue with decoding the JSON or extracting the features key
    print(f"Got response for {accession}. ", end='')
    try:
        row['FeatureSet'] = json.dumps(response.json()['features'])
        print("Success.")
    except:
        row['FeatureSet'] = {}
    CACHE[accession] = row['FeatureSet']
    return row

from copy import deepcopy

def annotate_all_accessions(folders: list):
    
    # to_process = deepcopy(folders)
    # for i in range(len(to_process)):
    #     if to_process[i] == '-': continue
    #     df_temp = pd.read_csv(f"phosphorylationBy/{to_process[i]}/{to_process[i]}_pos.csv")
    #     df_temp['FeatureSet'] = ""
    #     df_temp = df_temp.apply(retrieve_response, axis=1, args=(df_temp,))
    #     print("Done for folder", to_process[i])
    #     df_temp.to_csv(f"phosphorylationBy/{to_process[i]}/{to_process[i]}_pos_fs.csv", index=False)
        
    to_process = deepcopy(folders)
    for i in range(len(to_process)):
        if to_process[i] == '-': continue
        df_temp = pd.read_csv(f"phosphorylationBy/{to_process[i]}/{to_process[i]}_neg.csv")
        df_temp['FeatureSet'] = ""
        df_temp = df_temp.apply(retrieve_response, axis=1, args=(df_temp,))
        print("Done for folder", to_process[i])
        df_temp.to_csv(f"phosphorylationBy/{to_process[i]}/{to_process[i]}_neg_fs.csv", index=False)
    
    CACHE.clear()

def del_all_folders(folders: list[str]):
    for folder in folders:
        try:
            os.remove(f"phosphorylationBy/{folder}/{folder}_pos_fs.csv")
        except:
            pass
        try:
            os.remove(f"phosphorylationBy/{folder}/{folder}_neg_fs.csv")
        except:
            pass

def retrieve_all_empty_accessions(folders: list[str]) -> None:
    for folder in folders:
        df_pos = pd.read_csv(f"phosphorylationBy/{folder}/{folder}_pos_fs.csv")

In [13]:
folders = next(os.walk('phosphorylationBy'))[1]


['CDK', 'CK1']
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Entry already exists!
Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A087WVR4. Got response for A0A08

In [5]:
from glob import glob
import os
import numpy as np
import pandas as pd
import aiohttp
import asyncio
import requests
import json


CACHE_NEW = {}

def apply_redundant_accessions(row: pd.Series):
    protein_identifer = row['ProID'].split('_')[0]
    
    if CACHE_NEW.get(protein_identifer, None):
        print(f"\t{row['ProID']} already done.")
        row['AccessionNo'] = CACHE_NEW.get(protein_identifer)[0]
        row['FeatureSet'] = CACHE_NEW.get(protein_identifer)[1]
        return row

    if row['FeatureSet'] != '{}':
        return row
    
    try:
        response = requests.get(
            f'https://rest.uniprot.org/uniparc/search?query={protein_identifer}'
        )
        response = response.json()['results'][0]['uniParcCrossReferences'][0]['id']
    except Exception as e:
        print("Could not get Accession for", protein_identifer, e)
        response = np.nan
    
    if isinstance(response, float):
        return row
    
    row['AccessionNo'] = response

    try:
        response = requests.get(
            f"http://www.ebi.ac.uk/proteins/api/proteins/{row['AccessionNo']}"
        )
        response = json.dumps(response.json()['features'])
    except Exception as e:
        print("Could not get Feature for", protein_identifer, e)
        response = '{}'
    
    row['FeatureSet'] = response

    CACHE_NEW[protein_identifer] = (
        row['AccessionNo'],
        row['FeatureSet']
    )
    return row


def second_round_of_accessions(folders: list[str]) -> None:
    for folder in folders:
        df_pos = pd.read_csv(f"phosphorylationBy/{folder}/{folder}_pos_fs.csv")
        df_pos = df_pos.apply(apply_redundant_accessions, axis=1)
        df_pos = df_pos[df_pos['FeatureSet'] != '{}'] # Had to drop some sequences - no amount of searching would get me to the point where I get the sequence + it's a minute fraction of the dataset being lost (< 1%)
        df_pos.to_csv(f"phosphorylationBy/{folder}/{folder}_pos_final.csv")
        print("DONE FOR", folder)
    print("DONE ALL POS")
    for folder in folders:
        df_neg = pd.read_csv(f"phosphorylationBy/{folder}/{folder}_neg_fs.csv")
        df_neg = df_neg.apply(apply_redundant_accessions, axis=1)
        df_neg = df_neg[df_neg['FeatureSet'] != '{}']
        df_neg.to_csv(f"phosphorylationBy/{folder}/{folder}_neg_final.csv")
        print("DONE FOR", folder)
    print("DONE ALL NEG")


In [15]:
def adjust_indices(folders: list[str]) -> None:
    for folder in folders:
        df_pos = pd.read_csv(f"phosphorylationBy/{folder}/{folder}_pos_final.csv")
        df_pos = df_pos.drop(columns=['Unnamed: 0'])
        df_pos.to_csv(f"phosphorylationBy/{folder}/{folder}_pos_final.csv", index=False)
        print("DONE FOR", folder)
    print("DONE ALL POS")
    for folder in folders:
        df_neg = pd.read_csv(f"phosphorylationBy/{folder}/{folder}_neg_final.csv")
        df_neg = df_neg.drop(columns=['Unnamed: 0'])
        df_neg.to_csv(f"phosphorylationBy/{folder}/{folder}_neg_final.csv", index=False)
        print("DONE FOR", folder)
    print("DONE ALL NEG")

In [16]:
folders = next(os.walk('phosphorylationBy'))[1]
adjust_indices(folders)

DONE FOR AKT
DONE FOR CAMK1
DONE FOR CAMK2
DONE FOR CAMKL
DONE FOR CDK
DONE FOR CK1
DONE FOR CK2
DONE FOR DAPK
DONE FOR DMPK
DONE FOR DYRK
DONE FOR GRK
DONE FOR GSK
DONE FOR MAPK
DONE FOR MAPKAPK
DONE FOR MLCK
DONE FOR NDR
DONE FOR PDK1
DONE FOR PKA
DONE FOR PKC
DONE FOR PKD
DONE FOR PKG
DONE FOR PKN
DONE FOR RAD53
DONE FOR RSK
DONE FOR SGK
DONE ALL POS
DONE FOR AKT
DONE FOR CAMK1
DONE FOR CAMK2
DONE FOR CAMKL
DONE FOR CDK
DONE FOR CK1
DONE FOR CK2
DONE FOR DAPK
DONE FOR DMPK
DONE FOR DYRK
DONE FOR GRK
DONE FOR GSK
DONE FOR MAPK
DONE FOR MAPKAPK
DONE FOR MLCK
DONE FOR NDR
DONE FOR PDK1
DONE FOR PKA
DONE FOR PKC
DONE FOR PKD
DONE FOR PKG
DONE FOR PKN
DONE FOR RAD53
DONE FOR RSK
DONE FOR SGK
DONE ALL NEG


In [3]:
import requests
from pprint import pprint

accession_number = "O14746"

response = requests.get(
    f'http://www.ebi.ac.uk/proteins/api/proteins/{accession_number}'
)
if response.ok:
    pprint(response.json())

{'accession': 'O14746',
 'comments': [{'text': [{'evidences': [{'code': 'ECO:0000269',
                                        'source': {'alternativeUrl': 'https://europepmc.org/abstract/MED/14963003',
                                                   'id': '14963003',
                                                   'name': 'PubMed',
                                                   'url': 'http://www.ncbi.nlm.nih.gov/pubmed/14963003'}},
                                       {'code': 'ECO:0000269',
                                        'source': {'alternativeUrl': 'https://europepmc.org/abstract/MED/15082768',
                                                   'id': '15082768',
                                                   'name': 'PubMed',
                                                   'url': 'http://www.ncbi.nlm.nih.gov/pubmed/15082768'}},
                                       {'code': 'ECO:0000269',
                                        'source': {'alternativeUr

In [4]:
response = requests.get(
    f'https://rest.uniprot.org/uniprotkb/{accession_number}',
    headers={
        'Accept': 'application/json'
    }
)

if response.ok:
    pprint(response.json())

{'annotationScore': 5.0,
 'comments': [{'commentType': 'FUNCTION',
               'texts': [{'evidences': [{'evidenceCode': 'ECO:0000269',
                                         'id': '14963003',
                                         'source': 'PubMed'},
                                        {'evidenceCode': 'ECO:0000269',
                                         'id': '15082768',
                                         'source': 'PubMed'},
                                        {'evidenceCode': 'ECO:0000269',
                                         'id': '15857955',
                                         'source': 'PubMed'},
                                        {'evidenceCode': 'ECO:0000269',
                                         'id': '17026956',
                                         'source': 'PubMed'},
                                        {'evidenceCode': 'ECO:0000269',
                                         'id': '17264120',
                              

In [10]:
import pandas as pd

df = pd.read_csv('database_df_altered.csv')

Unnamed: 0,Protein Identifier,Accession Number,PTMs
17044,CYC_ABUTH,P00059,"[[1, 'Acetylation', '5131735'], [80, 'Methylat..."
53035,GLYCO_ABLVH,Q8JTH0,"[[338, 'N-linked Glycosylation', 'UniProtKB CA..."
53213,GLYCO_ABLVB,Q9QSP1,"[[338, 'N-linked Glycosylation', 'UniProtKB CA..."
243008,MVP_ABMVW,P21946,"[[221, 'Phosphorylation', '19464722'], [223, '..."
