# TEP parser

TEP () are pulled from SGC website from here: `https://www.thesgc.org/tep`

In [103]:
import requests
from bs4 import BeautifulSoup, UnicodeDammit
import pandas as pd

def get_gene(row):
    genes = row.findAll('td')[0].text
    return genes.split('/')

def get_gene_name(row):
    return row.findAll('td')[1].text


def get_url(row):
    gene_cell = row.findAll('td')[0]
    tep_url = gene_cell.find('a').get('href')
    
    if not tep_url.startswith('http'):
        tep_url = 'https://www.thesgc.org' + tep_url
        
    return tep_url


def get_therapeutic_area(row):
    therapeutic_area = row.findAll('td')[2].text
    return therapeutic_area


def hgnc_alias_lookup(gene_label):
    
    endpoints = ['alias_name','alias_symbol','prev_symbol',"symbol","name"]
    
    # Initialize values:
    gene_name, gene_id = None, None
    
    for endpoint in endpoints:  
        url = f'http://rest.genenames.org/fetch/{endpoint}/{gene_label}'
        print(url)
        headers = {'Accept': 'application/json'} 
        r = requests.get(url, headers=headers)
        data = r.json()
    
        if data['response']['numFound'] == 0:
            continue

        elif data['response']['numFound'] == 1:
            docs = data['response']['docs'][0]
            print(f'Gene symbol found in HGNC {endpoint}')
            gene_name, gene_id =  docs['symbol'], docs['ensembl_gene_id']
            break
        else:
            docs = data['response']['docs'][0]
            print(f'Gene symbol found in HGNC {endpoint}')
            print(f'Warning! Multiple genes found for {gene_name}')
            gene_name, gene_id =  docs['symbol'], docs['ensembl_gene_id']
            break

    if gene_id is None:
        print(f'Gene not found in HGNC: {gene_label}')
    return gene_id, gene_name

def gene_name_lookup(gene_name):
    ensembl_url = f'http://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_name}?content-type=application/json'
    r = requests.get(ensembl_url)
    
    data = r.json()
    
    if 'error' in data:
        print(f'Gene not found in Ensembl: {gene_name}')
        return hgnc_alias_lookup(gene_name)
    
    # Parse data:
    return data['id'], data['display_name']
    
    
    
url = 'https://www.thesgc.org/tep'

response = requests.get(url)

html = response.text
uhtml = UnicodeDammit(html)

soup = BeautifulSoup(uhtml.unicode_markup, features="html.parser")

TEP_table = soup.findAll('table')[-1]

TEP_raw_data = []

for row in TEP_table.find('tbody').findAll('tr'):
    TEP_raw_data.append({
        "genes": get_gene(row),
        "TEP_url": get_url(row),
        "disease": get_therapeutic_area(row),
        "description": get_gene_name(row)
    })
    
TEP_raw_df = pd.DataFrame(TEP_raw_data)
TEP_raw_df = TEP_raw_df.explode('genes')

gene_ids = TEP_raw_df.genes.apply(gene_name_lookup)
gene_df = pd.DataFrame(gene_ids.tolist(), columns=['gene_id','gene_symbol'])
gene_df.head()

Gene not found in Ensembl: Moesin
http://rest.genenames.org/fetch/alias_name/Moesin
Gene symbol found in HGNC alias_name
Gene not found in Ensembl: hFBG-C
http://rest.genenames.org/fetch/alias_name/hFBG-C
http://rest.genenames.org/fetch/alias_symbol/hFBG-C
http://rest.genenames.org/fetch/prev_symbol/hFBG-C
http://rest.genenames.org/fetch/symbol/hFBG-C
http://rest.genenames.org/fetch/name/hFBG-C
Gene not found in HGNC: None
Gene not found in Ensembl: TASK1
http://rest.genenames.org/fetch/alias_name/TASK1
http://rest.genenames.org/fetch/alias_symbol/TASK1
Gene symbol found in HGNC alias_symbol
Gene not found in Ensembl: TMEM16K
http://rest.genenames.org/fetch/alias_name/TMEM16K
http://rest.genenames.org/fetch/alias_symbol/TMEM16K
http://rest.genenames.org/fetch/prev_symbol/TMEM16K
Gene symbol found in HGNC prev_symbol
Gene not found in Ensembl: PfBDP4
http://rest.genenames.org/fetch/alias_name/PfBDP4
http://rest.genenames.org/fetch/alias_symbol/PfBDP4
http://rest.genenames.org/fetch/prev

Unnamed: 0,gene_id,gene_symbol
0,ENSG00000124067,SLC12A4
1,ENSG00000140199,SLC12A6
2,ENSG00000082397,EPB41L3
3,ENSG00000168918,INPP5D
4,ENSG00000186575,NF2


In [104]:
TEP_raw_df.reset_index(drop=True, inplace=True)
gene_df.reset_index(drop=True, inplace=True)

final_df = pd.concat( [gene_df, TEP_raw_df], axis=1) 
final_df


Unnamed: 0,gene_id,gene_symbol,genes,TEP_url,disease,description
0,ENSG00000124067,SLC12A4,SLC12A4,https://www.thesgc.org/tep/slc12a4slc12a6,"Sickle cell disease (SCD), Neurological",Potassium/Chloride Co-transporter 1 and 3 (KCC...
1,ENSG00000140199,SLC12A6,SLC12A6,https://www.thesgc.org/tep/slc12a4slc12a6,"Sickle cell disease (SCD), Neurological",Potassium/Chloride Co-transporter 1 and 3 (KCC...
2,ENSG00000082397,EPB41L3,EPB41L3,https://www.thesgc.org/tep/epb41l3,Neurodegeneration,EPB41L3
3,ENSG00000168918,INPP5D,INPP5D,https://www.thesgc.org/tep/inpp5d,Neurological Disorders,INPP5D (SHIP1)
4,ENSG00000186575,NF2,Moesin,https://www.thesgc.org/tep/moesin,Alzheimer’s disease,MSN (Moesin)
5,,,hFBG-C,https://www.thesgc.org/tep/hfbg-c,Inflammatory diseases,Fibrinogen-like globe domain of human Tenascin...
6,ENSG00000164181,ELOVL7,ELOVL7,https://www.thesgc.org/tep/elovl7,Metabolic diseases,Elongation of very long chain fatty acids prot...
7,ENSG00000181192,DHTKD1,DHTKD1,https://www.thesgc.org/tep/dhtkd1,Metabolic diseases,Dehydrogenase E1 and transketolase domain-cont...
8,ENSG00000152457,DCLRE1C,DCLRE1C,https://www.thesgc.org/tep/dclre1c,Oncology,"\nArtemis (DCLRE1C, SNM1C)\n"
9,ENSG00000164458,TBXT,TBXT,https://www.thesgc.org/tep/tbxt,Cancer,\nHuman T-box transcription factor T (Brachyur...


In [47]:
data

{'responseHeader': {'status': 0, 'QTime': 0},
 'response': {'numFound': 1,
  'start': 0,
  'docs': [{'hgnc_id': 'HGNC:7773',
    'symbol': 'NF2',
    'name': 'neurofibromin 2',
    'status': 'Approved',
    'locus_type': 'gene with protein product',
    'prev_name': ['neurofibromin 2 (bilateral acoustic neuroma)',
     'neurofibromin 2 (merlin)'],
    'alias_symbol': ['merlin', 'ACN', 'SCH', 'BANF', 'merlin-1'],
    'alias_name': ['moesin-ezrin-radixin like',
     'schwannomin',
     'bilateral acoustic neurofibromatosis'],
    'location': '22q12.2',
    'date_approved_reserved': '1992-01-01T00:00:00Z',
    'date_modified': '2021-01-09T00:00:00Z',
    'date_name_changed': '2016-07-04T00:00:00Z',
    'ena': ['L11353'],
    'entrez_id': '4771',
    'mgd_id': ['MGI:97307'],
    'cosmic': 'NF2',
    'orphanet': 123774,
    'pubmed_id': [10591208],
    'refseq_accession': ['NM_000268'],
    'gene_group': ['A-kinase anchoring proteins', 'FERM domain containing'],
    'vega_id': 'OTTHUMG00000

In [102]:
url = 'https://www.thesgc.org/tep'

response = requests.get(url)

html = response.text
uhtml = UnicodeDammit(html)

soup = BeautifulSoup(uhtml.unicode_markup, features="html.parser")

TEP_table = soup.findAll('table')[-1]

TEP_raw_data = []

for row in TEP_table.find('tbody').findAll('tr'):
    TEP_raw_data.append({
        "genes": get_gene(row),
        "TEP_url": get_url(row),
        "disease": get_therapeutic_area(row),
        "description": get_gene_name(row)
    })

<tr>
<td><a href="/tep/CDK12">CDK12</a></td>
<td>Human Cyclin-Dependent Kinase 12 (CDK12), Kinase Domain</td>
<td>Oncology</td>
<td>Version 7</td>
<td>2016</td>
</tr>

In [112]:
URL = get_url(row)
URL= 'https://www.thesgc.org/tep/galtgalk1'

In [113]:
r = requests.get(URL)
html = r.text
uhtml = UnicodeDammit(html)

soup = BeautifulSoup(uhtml.unicode_markup, features="html.parser")

In [114]:
for a in soup.findAll('a'):
    url = a.get('href')
    try:
        if 'uniprot' in url:
            print(url)
    except TypeError:
        continue

http://www.uniprot.org/uniprot/P07902
http://www.uniprot.org/uniprot/P51570


In [116]:
data = [ 
            {'ENSG00000094631': {
                'id': 'ENSG00000094631',
                'symbol': 'HDAC6',
                'link': 'https://www.thesgc.org/tep/hdac6'
            }},
            {'ENSG00000120733': {
                'id': 'ENSG00000120733',
                'symbol': 'KDM3B',
                'link': 'https://www.thesgc.org/tep/kdm3b'
            }},
            {'ENSG00000186280': {
                'id': 'ENSG00000186280',
                'symbol': 'KDM4D',
                'link': 'https://www.thesgc.org/tep/kdm4d'
            }},
            {'ENSG00000146247': {
                'id': 'ENSG00000146247',
                'symbol': 'PHIP',
                'link': 'https://www.thesgc.org/tep/phip'
            }}]

In [122]:
{row['genes']:{'urls': row['TEP_url']} for i, row in TEP_raw_df.iterrows()}

{'SLC12A4': {'urls': 'https://www.thesgc.org/tep/slc12a4slc12a6'},
 'SLC12A6': {'urls': 'https://www.thesgc.org/tep/slc12a4slc12a6'},
 'EPB41L3': {'urls': 'https://www.thesgc.org/tep/epb41l3'},
 'INPP5D': {'urls': 'https://www.thesgc.org/tep/inpp5d'},
 'Moesin': {'urls': 'https://www.thesgc.org/tep/moesin'},
 'hFBG-C': {'urls': 'https://www.thesgc.org/tep/hfbg-c'},
 'ELOVL7': {'urls': 'https://www.thesgc.org/tep/elovl7'},
 'DHTKD1': {'urls': 'https://www.thesgc.org/tep/dhtkd1'},
 'DCLRE1C': {'urls': 'https://www.thesgc.org/tep/dclre1c'},
 'TBXT': {'urls': 'https://www.thesgc.org/tep/tbxt'},
 'STAG1': {'urls': 'https://www.thesgc.org/tep/stag1'},
 'MTHFR': {'urls': 'https://www.thesgc.org/tep/mthfr'},
 'ALAS2': {'urls': 'https://www.thesgc.org/tep/alas2'},
 'GALT': {'urls': 'https://www.thesgc.org/tep/galt'},
 'GALK1': {'urls': 'https://www.thesgc.org/tep/galt'},
 'KALRN': {'urls': 'https://www.thesgc.org/tep/kalrnrac1'},
 'RAC1': {'urls': 'https://www.thesgc.org/tep/kalrnrac1'},
 'KEAP

In [121]:
for i, row in TEP_raw_df.iterrows():
    print(row)

genes                                                    SLC12A4
TEP_url                https://www.thesgc.org/tep/slc12a4slc12a6
disease                  Sickle cell disease (SCD), Neurological
description    Potassium/Chloride Co-transporter 1 and 3 (KCC...
Name: 0, dtype: object
genes                                                    SLC12A6
TEP_url                https://www.thesgc.org/tep/slc12a4slc12a6
disease                  Sickle cell disease (SCD), Neurological
description    Potassium/Chloride Co-transporter 1 and 3 (KCC...
Name: 1, dtype: object
genes                                     EPB41L3
TEP_url        https://www.thesgc.org/tep/epb41l3
disease                         Neurodegeneration
description                               EPB41L3
Name: 2, dtype: object
genes                                     INPP5D
TEP_url        https://www.thesgc.org/tep/inpp5d
disease                   Neurological Disorders
description                       INPP5D (SHIP1)
Name: 3, dtype:

In [1]:
import pandas as pd

In [8]:
dfs = pd.read_html('https://www.thesgc.org/tep')
tep_df = (
    dfs[1]
    .assign(
        genes = lambda df: df.Gene.str.split('/')
    )
    .explode('genes')
    .drop(['Gene', 'Vers'])
)
tep_df.head()

Unnamed: 0,Gene,Description,Therapeutic Area,Version,Date,genes
0,SLC12A4/SLC12A6,Potassium/Chloride Co-transporter 1 and 3 (KCC...,"Sickle cell disease (SCD), Neurological",Version 1,2020,SLC12A4
0,SLC12A4/SLC12A6,Potassium/Chloride Co-transporter 1 and 3 (KCC...,"Sickle cell disease (SCD), Neurological",Version 1,2020,SLC12A6
1,EPB41L3,EPB41L3,Neurodegeneration,Version 1,2020,EPB41L3
2,INPP5D,INPP5D (SHIP1),Neurological Disorders,Version 1,2020,INPP5D
3,Moesin,MSN (Moesin),Alzheimer’s disease,Version 1,2020,Moesin


In [27]:
import requests
from bs4 import BeautifulSoup, UnicodeDammit
import pandas as pd
import logging


def retrieve_tep_list():

    def get_description(row):
        return row.findAll('td')[1].text.strip()

    def get_url(row):
        gene_cell = row.findAll('td')[0]
        tep_url = gene_cell.find('a').get('href')
        
        if not tep_url.startswith('http'):
            tep_url = 'https://www.thesgc.org' + tep_url
            
        return tep_url
        
    def get_therapeutic_area(row):
        therapeutic_area = row.findAll('td')[2].text
        return therapeutic_area
    
    def get_genes(row):
        genes = row.findAll('td')[0].text
        return genes.split('/')


    url = 'https://www.thesgc.org/tep'

    response = requests.get(url)

    html = response.text
    uhtml = UnicodeDammit(html)

    soup = BeautifulSoup(uhtml.unicode_markup, features="html.parser")

    TEP_table = soup.findAll('table')[-1]

    TEP_raw_data = []

    for row in TEP_table.find('tbody').findAll('tr'):
        TEP_raw_data.append({
            "genes": get_genes(row),
            "TEP_url": get_url(row),
            "disease": get_therapeutic_area(row),
            "description": get_description(row)
        })
        
    return (
        pd.DataFrame(TEP_raw_data)
        .explode('genes')
        .rename(columns={'genes': 'targetFromSource'})
    )


tep_list = retrieve_tep_list()
tep_list.to_json('tep-2021-09-07.json.gz', compression='infer', lines=True, orient='records')

In [29]:
%%bash 

gzcat tep-2021-09-07.json.gz | head -n1 | jq

{
  "targetFromSource": "SLC12A4",
  "TEP_url": "https://www.thesgc.org/tep/slc12a4slc12a6",
  "disease": "Sickle cell disease (SCD), Neurological",
  "description": "Potassium/Chloride Co-transporter 1 and 3 (KCC1/KCC3; SLC12A4/SLC12A6)"
}


In [41]:
tep_list.targetFromSource.duplicated().any()

False

In [40]:
import json

def id_lookup(ensembl_ids):

    headers={ "Content-Type" : "application/json", "Accept" : "application/json"}
    r = requests.post('http://rest.ensembl.org/lookup/id', headers=headers, data=json.dumps({ "ids" : ensembl_ids }))

    try:
        decoded = r.json()
    except:
        print(r.content)

    # Parse response:
    parsed = []
    for gene_id, data in decoded.items():
        parsed.append({
            'gene_id': gene_id,
            'symbol': data['display_name']
            })

    return pd.DataFrame(parsed)


def uniprot_lookup(uniprot_id):
    url = f'http://rest.ensembl.org/xrefs/symbol/homo_sapiens/{uniprot_id}?content-type=application/json'
    r = requests.get(url)
    data = r.json()

    # Parse gene id:
    for item in data:
        if item['type']:
            return item['id']

    # If gene id is not found:
    logging.info(f'Failed to retrieve Ensembl id for: {uniprot_id}')
    return None


# tep_list = ( 
#     tep_list
#     .assign(ensembl_id = lambda df: df.targetFromSource.apply(uniprot_lookup))
# )

tep_list.loc[tep_list.ensembl_id.notna()].ensembl_id.duplicated()

0     False
0     False
1     False
2     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
12    False
13    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
35    False
36    False
37    False
Name: ensembl_id, dtype: bool