In [13]:
import coreapi
import pandas as pd
import requests

pd.set_option('display.max_rows', None)

In [55]:
def get_uniprot(factor: str):
    response = requests.get(f'https://rest.uniprot.org/uniprotkb/search?query=gene:{factor}+AND+organism_id:9606+AND+reviewed:true', headers={'Accept' : 'application/json'})
    response.raise_for_status()
    return response.json()

def get_entry(accession: str):
    response = requests.get(f'https://www.ebi.ac.uk/ebisearch/ws/rest/uniprot/entry/{accession}/', headers={'Accept' : 'application/json'})
    response.raise_for_status()
    return response.json()

In [3]:
client = coreapi.Client()
schema = client.get("https://jaspar.genereg.net/api/v1/docs")

In [4]:
with open("data/tf_list.txt") as tf_file:
    tf_list = [line.strip() for line in tf_file.readlines() if line.strip()]

In [None]:
tf_list[:5]

In [None]:
def fetch_tf(name: str):
    action = ["matrix", "list"]
    params = { "search" : name}
    return client.action(schema, action, params=params)

In [None]:
tf_search = { name : fetch_tf(name)["count"] > 0 for name in tf_list }

In [None]:
sum(tf_search.values())/len(tf_list)

In [None]:
hits = pd.DataFrame([[key, value] for key, value in tf_search.items()], columns=["name", "found"])
print(hits[hits["found"] == False])

In [22]:
get_entry('P04150')

{'entries': [{'acc': 'P04150', 'id': 'GCR_HUMAN', 'source': 'uniprot'}]}

In [56]:
gcr_results = get_uniprot("NR3C1")

In [61]:
uniprot_lookup = {name : get_uniprot(name)["results"] for name in tf_list}

In [67]:
more_than_one_hit = {name : result for name, result in uniprot_lookup.items() if len(result) > 1}

In [71]:
# more_than_one_hit['BACH1'][1]
def pick_primary(name, results):
    return [result for result in results if result["uniProtkbId"] == f'{name}_HUMAN']

In [72]:
{name : len(pick_primary(name, result)) for name, result in more_than_one_hit.items()}

{'BACH1': 1,
 'CEBPZ': 1,
 'CUX1': 1,
 'ERG': 1,
 'HBP1': 1,
 'HES1': 1,
 'HOXB13': 0,
 'IRF1': 1,
 'MCM2': 1,
 'MED1': 1,
 'NRF1': 1,
 'PAF1': 1,
 'RLF': 1,
 'SMAD5': 1,
 'SP1': 1,
 'STAG1': 1,
 'TAF15': 0,
 'TCF3': 0,
 'WT1': 1,
 'ZHX1': 1,
 'ZNF516': 0,
 'ZNF561': 0}

In [76]:
more_than_one_hit["HOXB13"][1]

{'entryType': 'UniProtKB reviewed (Swiss-Prot)',
 'primaryAccession': 'D3DTV9',
 'uniProtkbId': 'PRAC2_HUMAN',
 'entryAudit': {'firstPublicDate': '2014-11-26',
  'lastAnnotationUpdateDate': '2022-08-03',
  'lastSequenceUpdateDate': '2010-03-23',
  'entryVersion': 51,
  'sequenceVersion': 1},
 'annotationScore': 2.0,
 'organism': {'scientificName': 'Homo sapiens',
  'commonName': 'Human',
  'taxonId': 9606,
  'lineage': ['Eukaryota',
   'Metazoa',
   'Chordata',
   'Craniata',
   'Vertebrata',
   'Euteleostomi',
   'Mammalia',
   'Eutheria',
   'Euarchontoglires',
   'Primates',
   'Haplorrhini',
   'Catarrhini',
   'Hominidae',
   'Homo']},
 'proteinExistence': '5: Uncertain',
 'proteinDescription': {'recommendedName': {'fullName': {'evidences': [{'evidenceCode': 'ECO:0000305'}],
    'value': 'Putative protein PRAC2'}},
  'alternativeNames': [{'fullName': {'evidences': [{'evidenceCode': 'ECO:0000303',
       'source': 'PubMed',
       'id': '12746837'}],
     'value': 'Prostate, rectum