## Prerequisite: download and unzip this TSV before running the notebook: https://www.proteinatlas.org/download/proteinatlas.tsv.zip

In [1]:
import pandas as pd
import requests
import shutil
import urllib.parse
import urllib.request
import pprint

def load_table(protein_atlas_tsv_file='/Users/dganguli/Downloads/proteinatlas.tsv'):
    """Read the Protein Atlas TSV and return a trimmed, renamed table.

    Only the ``Gene``, ``Antibody`` and ``Ensembl`` columns are kept; they are
    renamed to ``gene``, ``antibodies`` and ``ensembl_id`` respectively.  Rows
    with a missing value in any of those three columns are dropped.  The
    original row index is preserved (it is not reset after dropping rows).

    Args:
        protein_atlas_tsv_file: Path (or anything ``pandas.read_table``
            accepts) of the tab-separated Protein Atlas export.

    Returns:
        A ``pandas.DataFrame`` with columns ``gene``, ``antibodies`` and
        ``ensembl_id``.
    """
    # Source column name -> column name used by the rest of this notebook.
    renamed = {'Gene': 'gene', 'Antibody': 'antibodies', 'Ensembl': 'ensembl_id'}
    full_table = pd.read_table(protein_atlas_tsv_file)
    selected = full_table.loc[:, list(renamed)]
    return selected.rename(columns=renamed).dropna()

def antibody_ids_for(ensembl_id, df):
    """Return the numeric antibody ids listed for one Ensembl gene id.

    The ``antibodies`` cell of the first row of ``df`` whose ``ensembl_id``
    equals ``ensembl_id`` is split on commas.  Each entry must start with
    ``CAB`` or ``HPA``; that prefix and any leading zeros are removed
    (e.g. ``'HPA030741'`` becomes ``'30741'``).

    Args:
        ensembl_id: Ensembl gene id to look up.
        df: DataFrame with ``ensembl_id`` and ``antibodies`` columns.

    Returns:
        List of antibody id strings without prefix or leading zeros.

    Raises:
        IndexError: If no row of ``df`` has the requested ``ensembl_id``.
        ValueError: If an antibody entry starts with neither ``CAB`` nor
            ``HPA``.
    """
    matches = df.loc[df.ensembl_id == ensembl_id, 'antibodies']
    if matches.empty:
        # Same exception type as indexing an empty result, but with a message
        # that says which id was missing.
        raise IndexError(
            'No antibodies found for Ensembl id: {}'.format(ensembl_id))

    known_prefixes = ('CAB', 'HPA')
    munged_aids = []
    for aid in matches.values[0].split(','):
        aid = aid.strip()
        prefix = next((p for p in known_prefixes if aid.startswith(p)), None)
        if prefix is None:
            raise ValueError('Unrecognized antibody type: {}'.format(aid))
        # Slice the prefix off instead of splitting on it, so an id is never
        # truncated if the prefix text happens to occur again later.
        munged_aids.append(aid[len(prefix):].lstrip('0'))

    return munged_aids
    
def get_xml_for(ensembl_id):
    """Download the Protein Atlas XML page for a gene and return its lines.

    Fetches ``https://www.proteinatlas.org/<ensembl_id>.xml``, decodes the
    body as UTF-8 and splits it on ``'\\n'``.

    Args:
        ensembl_id: Ensembl gene id used to build the URL.

    Returns:
        List of text lines of the XML document.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` on request failure.
    """
    url = 'https://www.proteinatlas.org/{}.xml'.format(ensembl_id)
    req = urllib.request.Request(url)
    # The context manager closes the HTTP response (and its socket) even if
    # read() raises, instead of leaving it to garbage collection.
    with urllib.request.urlopen(req) as response:
        xml_bin = response.read()
    xml_lines = xml_bin.decode("utf-8").split('\n')
    return xml_lines

def get_image_urls(xml_lines):
    """Collect the ``<imageUrl>`` values that mention ``blue``.

    For every line containing ``<imageUrl>``, the text between the first
    ``<imageUrl>`` and the following ``</imageUrl>`` (or the next
    ``<imageUrl>`` / end of line, whichever comes first) is extracted.  Only
    extracted values containing the substring ``'blue'`` are returned.

    Args:
        xml_lines: Iterable of text lines from the XML document.

    Returns:
        List of matching URL strings, in input order.
    """
    open_tag, close_tag = '<imageUrl>', '</imageUrl>'
    extracted = (
        line.split(open_tag)[1].split(close_tag)[0]
        for line in xml_lines
        if open_tag in line
    )
    return [image_url for image_url in extracted if 'blue' in image_url]

def get_urls_for_antibody_id(urls, a_id):
    """Return the file-name part of every URL that belongs to one antibody.

    A URL belongs to the antibody when it contains the path segment
    ``/<a_id>/``; the returned value is whatever follows the last such
    segment (e.g. ``'672_E2_1_blue_red_green.jpg'``).

    Matching the full ``/<a_id>/`` segment, rather than the bare id, avoids
    false positives such as id ``115`` matching ``/images/1115/`` or a plate
    number in ``/images/30741/115_A6_1_...``.

    Args:
        urls: Iterable of image URL strings.
        a_id: Antibody id (string or int) without prefix or leading zeros.

    Returns:
        List of URL tails, in input order.
    """
    segment = '/{}/'.format(a_id)
    return [u.split(segment)[-1] for u in urls if segment in u]

def channels():
    """Return the image channel names as a fresh list on every call."""
    names = ("blue", "green", "red", "yellow")
    return list(names)

def make_url(aid, plate, well, field, ch):
    """Build the Protein Atlas URL of a single-channel image.

    Args:
        aid: Antibody id used as the directory name.
        plate: Plate identifier.
        well: Well identifier.
        field: Field identifier.
        ch: Channel name used as the file-name suffix.

    Returns:
        URL of the form
        ``https://www.proteinatlas.org/images/<aid>/<plate>_<well>_<field>_<ch>.jpg``.
    """
    return "https://www.proteinatlas.org/images/{}/{}_{}_{}_{}.jpg".format(
        aid, plate, well, field, ch
    )

def download_img(url, path, debug=False):
    """Stream the resource at ``url`` into the file ``path``.

    The file is written only when the server answers with status 200.  For
    any other status ``raise_for_status()`` is called, which raises for HTTP
    error statuses; other non-200 statuses leave no file and raise nothing.

    Args:
        url: URL to download.
        path: Destination file path; opened in binary write mode.
        debug: When true, print ``url`` before downloading.

    Raises:
        requests.HTTPError: From ``raise_for_status()`` on an error status.
    """
    if debug:
        print(url)
    # With stream=True the connection stays checked out until the response is
    # closed; the context manager releases it on every path, including errors.
    with requests.get(url, stream=True) as r:
        if r.status_code == 200:
            with open(path, 'wb') as f:
                # Let urllib3 undo any Content-Encoding (gzip/deflate) so the
                # bytes written are the decoded payload.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        else:
            r.raise_for_status()

def get_plates_fields_wells_for_antibody_id(a_ids, urls):
    """Map each antibody id to the plate/well/field of its image URLs.

    For every id in ``a_ids`` the URL tails returned by
    ``get_urls_for_antibody_id`` are split on ``'_'``; the first three pieces
    are stored as ``plate``, ``well`` and ``field``.

    Args:
        a_ids: Iterable of antibody ids.
        urls: Iterable of image URL strings.

    Returns:
        Dict mapping each antibody id to a list of
        ``{'plate': ..., 'well': ..., 'field': ...}`` dicts (an empty list
        when no URL matches that id).
    """
    by_antibody = {}
    for antibody in a_ids:
        entries = []
        for tail in get_urls_for_antibody_id(urls, antibody):
            pieces = tail.split('_')
            entries.append({
                'plate': pieces[0],
                'well': pieces[1],
                'field': pieces[2],
            })
        by_antibody[antibody] = entries
    return by_antibody

In [2]:
# Load the trimmed Protein Atlas table (gene, antibodies, ensembl_id) from the
# locally downloaded TSV.
df = load_table(protein_atlas_tsv_file='/Users/dganguli/Downloads/proteinatlas.tsv')
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,gene,antibodies,ensembl_id
0,TSPAN6,HPA004109,ENSG00000000003
1,TNMD,"HPA034961, HPA055634",ENSG00000000005
2,DPM1,HPA051818,ENSG00000000419
3,SCYL3,HPA005624,ENSG00000000457
4,C1orf112,"HPA023778, HPA024451",ENSG00000000460
5,FGR,HPA002024,ENSG00000000938
6,CFH,"CAB016385, CAB016769, HPA038922, HPA049176, HP...",ENSG00000000971
7,FUCA2,"HPA031659, HPA031660, HPA031661",ENSG00000001036
8,GCLC,"CAB009569, HPA036359, HPA036360",ENSG00000001084
9,NFYA,"CAB009250, HPA050779",ENSG00000001167


In [3]:
e_id = "ENSG00000134057"  # Ensembl ID for CCNB1
a_ids = antibody_ids_for(e_id, df)  # antibody IDs for this gene (prefix and leading zeros removed)
print(a_ids)

['115', '3804', '30741', '61448']


In [7]:
# Infer plates, wells, and fields from the Protein Atlas XML for all antibodies
# of the gene: download the XML, keep the <imageUrl> entries that mention
# 'blue', then split each matching file name into plate / well / field.
xml = get_xml_for(e_id)
urls = get_image_urls(xml)
d = get_plates_fields_wells_for_antibody_id(a_ids, urls)

# Pretty-print the {antibody id: [{'plate', 'well', 'field'}, ...]} mapping.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(d)

{   '115': [   {'field': '1', 'plate': '672', 'well': 'E2'},
               {'field': '2', 'plate': '672', 'well': 'E2'},
               {'field': '1', 'plate': '663', 'well': 'E2'},
               {'field': '5', 'plate': '663', 'well': 'E2'},
               {'field': '1', 'plate': '646', 'well': 'E2'},
               {'field': '2', 'plate': '646', 'well': 'E2'}],
    '30741': [   {'field': '1', 'plate': '800', 'well': 'A6'},
                 {'field': '2', 'plate': '800', 'well': 'A6'},
                 {'field': '1', 'plate': '845', 'well': 'A6'},
                 {'field': '3', 'plate': '845', 'well': 'A6'},
                 {'field': '1', 'plate': '793', 'well': 'A6'},
                 {'field': '2', 'plate': '793', 'well': 'A6'},
                 {'field': '1', 'plate': '686', 'well': 'F11'},
                 {'field': '2', 'plate': '686', 'well': 'F11'},
                 {'field': '2', 'plate': 'si28', 'well': 'H2'},
                 {'field': '3', 'plate': 'si28', 'well': 'H2'},

## Print the downloadable image URLs for this gene

In [10]:
# Look for single-channel images for gene ENSG00000134057 (CCNB1).
# For every antibody and every (plate, well, field) found above, build the
# URL of each colour channel and print it only if the server answers 200.
for a_id in a_ids:
    for pwf in d[a_id]:
        for ch in ['red', 'blue', 'green', 'yellow']:
            url = make_url(a_id, pwf['plate'], pwf['well'], pwf['field'], ch=ch)
            # NOTE(review): this issues a full GET per candidate URL just to
            # test for existence.
            r = requests.get(url)
            if(r.status_code==200):
                print(url)

https://www.proteinatlas.org/images/115/672_E2_1_red.jpg
https://www.proteinatlas.org/images/115/672_E2_1_blue.jpg
https://www.proteinatlas.org/images/115/672_E2_1_green.jpg
https://www.proteinatlas.org/images/115/672_E2_1_yellow.jpg
https://www.proteinatlas.org/images/115/672_E2_2_red.jpg
https://www.proteinatlas.org/images/115/672_E2_2_blue.jpg
https://www.proteinatlas.org/images/115/672_E2_2_green.jpg
https://www.proteinatlas.org/images/115/672_E2_2_yellow.jpg
https://www.proteinatlas.org/images/115/663_E2_1_red.jpg
https://www.proteinatlas.org/images/115/663_E2_1_blue.jpg
https://www.proteinatlas.org/images/115/663_E2_1_green.jpg
https://www.proteinatlas.org/images/115/663_E2_1_yellow.jpg
https://www.proteinatlas.org/images/115/663_E2_5_red.jpg
https://www.proteinatlas.org/images/115/663_E2_5_blue.jpg
https://www.proteinatlas.org/images/115/663_E2_5_green.jpg
https://www.proteinatlas.org/images/115/663_E2_5_yellow.jpg
https://www.proteinatlas.org/images/115/646_E2_1_red.jpg
https:/