## Must download this tsv: https://www.proteinatlas.org/download/proteinatlas.tsv.zip

In [35]:
!pip install xmltodict
!pip install bs4

[33mYou are using pip version 9.0.1, however version 9.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting bs4
  Downloading bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K    100% |################################| 92kB 3.2MB/s ta 0:00:01
[?25hBuilding wheels for collected packages: bs4
  Running setup.py bdist_wheel for bs4 ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/84/67/d4/9e09d9d5adede2ee1c7b7e8775ba3fbb04d07c4f946f0e4f11
Successfully built bs4
Installing collected packages: beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.6.0 bs4-0.0.1
[33mYou are using pip version 9.0.1, however version 9.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [32]:
import pandas as pd
import requests
import shutil
import urllib.parse
import urllib.request
import pprint
import os

import pdb

def load_table(protein_atlas_tsv_file='./data/proteinatlas.tsv'):
    df = pd.read_table(protein_atlas_tsv_file)
    cols_to_keep = ['Gene', 'Antibody', 'Ensembl']
    new_cols_map = dict(zip(cols_to_keep, ['gene', 'antibodies', 'ensembl_id']))
    df = df.loc[:, cols_to_keep]
    df = df.rename(columns=new_cols_map)
    df = df.dropna()
    return df

def antibody_ids_for(ensembl_id, df):
    aids = df[df.ensembl_id==ensembl_id].antibodies
    aids = aids.values[0].split(',')
    munged_aids = []
    for aid in aids:
        aid = aid.strip()
        if aid.startswith('CAB'):
            aid = aid.split('CAB')[1]
        elif aid.startswith('HPA'):
            aid = aid.split('HPA')[1]
        else:
            raise ValueError('Unrecognized antibody type: {}'.format(aid))
        aid = aid.lstrip('0')
        munged_aids.append(aid)
        
    return munged_aids
    
def get_xml_for(ensembl_id):
    url = 'https://www.proteinatlas.org/{}.xml'.format(ensembl_id)
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    xml_bin = response.read()
    xml_lines = xml_bin.decode("utf-8").split('\n')
    return xml_lines

def get_image_urls(xml_lines):
    res = []
    for line in xml_lines:
        if '<imageUrl>' in line:
            strs = line.split('<imageUrl>')
            r = strs[1].split('</imageUrl>')[0]
            if 'blue' in r:
                res.append(r)
    return res          

def get_urls_for_antibody_id(urls, a_id):
    a_id_str = str(a_id)
    res = [u.split(a_id_str+'/')[-1] for u in urls if a_id_str in u]
    return res

def channels():
    return ["blue", "green", "red", "yellow"]

def make_url(aid, plate, well, field, ch):
    base_url = "https://www.proteinatlas.org/images/{}/{}_{}_{}_{}.jpg"
    url = base_url.format(aid, plate, well, field, ch)
    return url

def download_img(url, path, debug=False):
    if debug:
        print(url)
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(path, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
    else:
        r.raise_for_status()

def get_plates_fields_wells_for_antibody_id(a_ids, urls):
    d = dict()
    for a in a_ids:
        d[a] = []
        urls_for_antibody = get_urls_for_antibody_id(urls, a)
        for u in urls_for_antibody:
            d_sub=dict()
            res = u.split('_')
            d_sub['plate'] = res[0]
            d_sub['well'] = res[1]
            d_sub['field'] = res[2]
            d[a].append(d_sub)
    return d

def get_plate_fields_wells_from_url(url):
    d_sub=dict()
    
    _, url = os.path.split(url)
    
    res = url.split('_')
    d_sub['plate'] = res[0]
    d_sub['well'] = res[1]
    d_sub['field'] = res[2]
    
    return d_sub


def image_url_to_channel_urls(image_url):
    #image format is:
    #<some_url>/<n1>_<n2>_<n3>_<channel>.jpg
    img_channels = channels()
    
    url_root, url_name = os.path.split(image_url)
    
    underscore_positions = [pos for pos, char in enumerate(url_name) if char == '_']
    url_name_base = url_name[0:underscore_positions[2]+1]
    
    base_url = '{}/{}{}.jpg'
    
    channel_urls = [base_url.format(url_root, url_name_base, channel) for channel in img_channels]
    
    return channel_urls
        
    

In [2]:
df = load_table(protein_atlas_tsv_file='./data/proteinatlas.tsv')
df.head(10)

Unnamed: 0,gene,antibodies,ensembl_id
0,TSPAN6,HPA004109,ENSG00000000003
1,TNMD,"HPA034961, HPA055634",ENSG00000000005
2,DPM1,HPA051818,ENSG00000000419
3,SCYL3,HPA005624,ENSG00000000457
4,C1orf112,"HPA023778, HPA024451",ENSG00000000460
5,FGR,HPA002024,ENSG00000000938
6,CFH,"CAB016385, CAB016769, HPA038922, HPA049176, HP...",ENSG00000000971
7,FUCA2,"HPA031659, HPA031660, HPA031661",ENSG00000001036
8,GCLC,"CAB009569, HPA036359, HPA036360",ENSG00000001084
9,NFYA,"CAB009250, HPA050779",ENSG00000001167


In [3]:
e_id = "ENSG00000134057" #ensemble id for CCNB1
a_ids = antibody_ids_for(e_id, df) #antibody IDs for this gene
print(a_ids)

['115', '3804', '30741', '61448']


In [10]:
# infer plates, wells, and fields from protein atlas for all antibodies
xml = get_xml_for(e_id)
urls = get_image_urls(xml)
d = get_plates_fields_wells_for_antibody_id(a_ids, urls)

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(d)

{   '115': [   {'field': '1', 'plate': '672', 'well': 'E2'},
               {'field': '2', 'plate': '672', 'well': 'E2'},
               {'field': '1', 'plate': '663', 'well': 'E2'},
               {'field': '5', 'plate': '663', 'well': 'E2'},
               {'field': '1', 'plate': '646', 'well': 'E2'},
               {'field': '2', 'plate': '646', 'well': 'E2'}],
    '30741': [   {'field': '1', 'plate': '800', 'well': 'A6'},
                 {'field': '2', 'plate': '800', 'well': 'A6'},
                 {'field': '1', 'plate': '845', 'well': 'A6'},
                 {'field': '3', 'plate': '845', 'well': 'A6'},
                 {'field': '1', 'plate': '793', 'well': 'A6'},
                 {'field': '2', 'plate': '793', 'well': 'A6'},
                 {'field': '1', 'plate': '686', 'well': 'F11'},
                 {'field': '2', 'plate': '686', 'well': 'F11'},
                 {'field': '2', 'plate': 'si28', 'well': 'H2'},
                 {'field': '3', 'plate': 'si28', 'well': 'H2'},

In [9]:
# urls

# url = 'http://v18.proteinatlas.org/images/30741/si28_H2_2_red.jpg'
# r = requests.get(url)
# if(r.status_code==200):
#     print(url)

# xml


['<?xml version="1.0" encoding="UTF-8"?>',
 '<proteinAtlas xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://v18.proteinatlas.org/download/proteinatlas.xsd" schemaVersion="2.5">',
 '\t<entry version="18" url="http://v18.proteinatlas.org/ENSG00000134057">',
 '\t\t<name>CCNB1</name>',
 '\t\t<synonym>CCNB</synonym>',
 '\t\t<identifier id="ENSG00000134057" db="Ensembl" version="88.38">',
 '\t\t\t<xref id="P14635" db="Uniprot/SWISSPROT"/>',
 '\t\t</identifier>',
 '\t\t<proteinClasses>',
 '\t\t\t<proteinClass source="TCDB" id="Ja" parent_id="" name="Transporters"/>',
 '\t\t\t<proteinClass source="TCDB" id="Jb" parent_id="Ja" name="Transporter channels and pores"/>',
 '\t\t\t<proteinClass source="SPOCTOPUS" id="Mi" parent_id="" name="SPOCTOPUS predicted membrane proteins"/>',
 '\t\t\t<proteinClass source="HPA" id="Za" parent_id="" name="Predicted intracellular proteins"/>',
 '\t\t\t<proteinClass source="Plasma Proteome Database" id="Pp" parent_id="" name="Plasma

## Print out downloadable urls for this gene

In [None]:
# look for single channel images for gene: ENSG00000134057 = CCNB1
for a_id in a_ids:
    for pwf in d[a_id]:
        for ch in ['red', 'blue', 'green', 'yellow']:
            url = make_url(a_id, pwf['plate'], pwf['well'], pwf['field'], ch=ch)
            r = requests.get(url)
            if(r.status_code==200):
                print(url)

In [19]:
def ensg2dataframe(e_id):
        # infer plates, wells, and fields from protein atlas for all antibodies
    xml = get_xml_for(e_id)
    urls = get_image_urls(xml)
    d = get_plates_fields_wells_for_antibody_id(a_ids, urls)

    pdb.set_trace()

    

In [20]:
ensg2dataframe()

--Return--
> <ipython-input-19-95e8cc81349d>(7)ensg2dataframe()->None
-> pdb.set_trace()
(Pdb) urls
['http://v18.proteinatlas.org/images/115/672_E2_1_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/115/672_E2_2_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/115/663_E2_1_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/115/663_E2_5_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/115/646_E2_1_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/115/646_E2_2_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/30741/800_A6_1_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/30741/800_A6_2_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/30741/845_A6_1_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/30741/845_A6_3_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/30741/793_A6_1_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/30741/793_A6_2_blue_red_green.jpg', 'http://v18.proteinatlas.org/images/30741/686_F

BdbQuit: 

In [5]:
import xmltodict



url = 'https://www.proteinatlas.org/{}.xml'.format(e_id)
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
xml_bin = response.read()

xml_dict = xmltodict.parse(xml_bin)

In [31]:
for antibody in xml_dict['proteinAtlas']['entry']['antibody']:
    
    antibody_id = antibody['@id']
    
    for if_data in antibody['cellExpression']['subAssay']['data']:
        
        
        cell_line = if_data['cellLine']
        location = if_data['location']['#text']
        
        for image in if_data['assayImage']['image']:
            
            image_url = image['imageUrl']
            
            channel_urls = image_url_to_channel_urls(image_url)
            pdb.set_trace()
            
#     antibody.id


> <ipython-input-31-3091c1c6dccf>(11)<module>()
-> for image in if_data['assayImage']['image']:
(Pdb) get_plate_field_well_from_url(image_url)
{'plate': 'http://v18.proteinatlas.org/images/115/672', 'well': 'E2', 'field': '1'}
(Pdb) exit


BdbQuit: 

In [135]:
antibody_id = 'CAB000115'

In [37]:

import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

ModuleNotFoundError: No module named 'libraries'

In [43]:
e_id = 'ENSG00000134057'

url = 'https://www.proteinatlas.org/{}/antibody'.format(e_id)

req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
html_bin = response.read()
soup = BeautifulSoup(html_bin, 'html.parser')

In [132]:
souplets = soup.find_all('th', attrs={'class':'head last roundtop'})

antibody_ids = list()

for souplet in souplets:
    antibody_ids += [re.findall('>Antibody [A-Za-z0-9]*<', str(souplet))[0][10:-1]]


# import re
# re.findall('map name=\"[A-Za-z0-9]*"', str(souplet[1]))

In [140]:
import numpy as np

id_index = np.where([id == antibody_id for id in antibody_ids])[0][0]

2

In [112]:
import re
re.findall('map name=\"[A-Za-z0-9]*"', str(souplet[1]))

['map name="ENSP00000256442"',
 'map name="ENSP00000424588"',
 'map name="ENSP00000423387"',
 'map name="ENSP00000426092"']

In [145]:
import numpy as np

def antibody_and_html_to_ensp(antibody_id, antibody_html_bin):
    antibody_html_str = str(antibody_html_bin)
    
    soup = BeautifulSoup(antibody_html_bin, 'html.parser')
    
    souplets = soup.find_all('th', attrs={'class':'head last roundtop'})

    antibody_ids = list()

    for souplet in souplets:
        antibody_ids += [re.findall('>Antibody [A-Za-z0-9]*<', str(souplet))[0][10:-1]]

    id_index = np.where([id == antibody_id for id in antibody_ids])[0][0]
    
    start_ind = antibody_html_str.find('Matching transcripts')
    end_ind = antibody_html_str.find('<th class="sub_head"', start_ind)
    
    split = antibody_html_str[start_ind:end_ind].split('<td class="" style="">')[id_index+1]
    
    ensp_soup = BeautifulSoup(split, 'html.parser')
    
    souplets = ensp_soup.find_all('a', attrs={'rel': 'nofollow noopener'})
    
    protein_ids = list()
    for souplet in souplets:
        protein_ids += [re.findall('ENSP[A-Za-z0-9]* ', str(souplet))[0][:-1]]
        
    return protein_ids
    
    

In [148]:
antibody_ids = 'HPA030741', 'HPA061448', 'CAB000115', 'CAB003804'

for antibody_id in antibody_ids:
    ensp_list = antibody_and_html_to_ensp(antibody_id, html_bin)
    print(ensp_list)



['ENSP00000256442', 'ENSP00000424588', 'ENSP00000423387', 'ENSP00000426092']
['ENSP00000256442', 'ENSP00000424588', 'ENSP00000423387', 'ENSP00000426092']
[]
[]


In [149]:
e_id = 'ENSG00000126602'

url = 'https://www.proteinatlas.org/{}/antibody'.format(e_id)

req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
html_bin = response.read()

antibody_ids = 'HPA041082', 'HPA044227'

for antibody_id in antibody_ids:
    ensp_list = antibody_and_html_to_ensp(antibody_id, html_bin)
    print(ensp_list)

['ENSP00000246957', 'ENSP00000442070', 'ENSP00000458166']
['ENSP00000246957']
