In [52]:
import pandas as pd
import requests
import shutil
import urllib.parse
import urllib.request

def load_table(protein_atlas_tsv_file='/Users/dganguli/Downloads/proteinatlas.tsv'):
    """Read the Protein Atlas TSV and keep only the gene/antibody/Ensembl columns.

    Parameters
    ----------
    protein_atlas_tsv_file : str
        Path of the tab-separated file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene``, ``antibodies`` and ``ensembl_id`` (renamed from
        ``Gene``, ``Antibody`` and ``Ensembl``). Rows with a missing value in
        any of the three columns are dropped; the original index is kept.
    """
    # Source column -> name used by the rest of the notebook (order matters).
    renamed = {'Gene': 'gene', 'Antibody': 'antibodies', 'Ensembl': 'ensembl_id'}
    raw = pd.read_table(protein_atlas_tsv_file)
    selected = raw.loc[:, list(renamed.keys())]
    return selected.rename(columns=renamed).dropna()

def antibody_ids_for(ensembl_id, df):
    """Return the numeric antibody IDs listed for one Ensembl gene ID.

    The ``antibodies`` cell of the first row whose ``ensembl_id`` matches is
    split on commas; each entry has its ``CAB``/``HPA`` prefix and any leading
    zeros removed (``'HPA004109'`` -> ``'4109'``).

    Parameters
    ----------
    ensembl_id : str
        Value to look up in ``df.ensembl_id``.
    df : pandas.DataFrame
        Table with ``ensembl_id`` and ``antibodies`` columns.

    Returns
    -------
    list of str
        Stripped antibody IDs, in the order they appear in the table.

    Raises
    ------
    IndexError
        If ``ensembl_id`` does not occur in ``df``.
    ValueError
        If an entry starts with neither ``CAB`` nor ``HPA``.
    """
    matches = df[df.ensembl_id == ensembl_id].antibodies
    if len(matches) == 0:
        # Same exception type an empty ``.values[0]`` lookup raises, but with
        # a message that names the ID that could not be found.
        raise IndexError('No antibodies found for Ensembl ID: {}'.format(ensembl_id))

    munged_aids = []
    for aid in matches.values[0].split(','):
        aid = aid.strip()
        if not aid.startswith(('CAB', 'HPA')):
            raise ValueError('Unrecognized antibody type: {}'.format(aid))
        # Both prefixes are three characters long; slicing by position keeps
        # the whole remainder even if the prefix text recurs later in the ID.
        munged_aids.append(aid[3:].lstrip('0'))

    return munged_aids

def plates(max_plates=2000):
    """Return the plate numbers to try: ``0`` up to ``max_plates - 1``.

    Parameters
    ----------
    max_plates : int
        Exclusive upper bound of the returned range.

    Returns
    -------
    range
    """
    first_plate = 0
    return range(first_plate, max_plates)

def wells():
    """Yield the 96 well labels formed from rows A-H and columns 1-12.

    Labels are produced column by column, i.e. ``A1, B1, ..., H1, A2, ...``
    and finally ``H12``.

    Returns
    -------
    generator of str
    """
    row_letters = 'ABCDEFGH'
    col_numbers = range(1, 13)
    # Column is the outer loop, so the row letter changes fastest.
    return (letter + str(number) for number in col_numbers for letter in row_letters)

def fields(max_fields=10):
    """Return the field numbers to try: ``0`` up to ``max_fields - 1``.

    Parameters
    ----------
    max_fields : int
        Exclusive upper bound of the returned range.

    Returns
    -------
    range
    """
    first_field = 0
    return range(first_field, max_fields)
    
def channels():
    """Return the image channel names used in image file names.

    Returns
    -------
    list of str
        A new list on every call: blue, green, red, yellow.
    """
    channel_names = ["blue", "green", "red", "yellow"]
    return channel_names

def make_url(aid, plate, well, field, ch):
    """Build the proteinatlas.org image URL for one antibody/plate/well/field/channel.

    Parameters
    ----------
    aid : antibody ID, used as the directory component of the URL.
    plate, well, field, ch : joined with underscores to form the ``.jpg`` file name.

    Returns
    -------
    str
    """
    template = "https://www.proteinatlas.org/images/{}/{}_{}_{}_{}.jpg"
    parts = (aid, plate, well, field, ch)
    return template.format(*parts)

def download_img(url, path, debug=False, timeout=None):
    """Stream the resource at ``url`` into the file at ``path``.

    Parameters
    ----------
    url : str
        Address to fetch.
    path : str
        Destination file; opened in binary write mode only when the server
        answers with status 200.
    debug : bool
        When true, print ``url`` before requesting it.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. ``None`` (the default) means no
        timeout, which is what the function did before the parameter existed.

    Raises
    ------
    requests.HTTPError
        Via ``raise_for_status()`` when the response status is an error.
        Any other non-200 status writes nothing and returns silently.
    """
    if debug:
        print(url)
    # With stream=True the connection stays open until the response is closed;
    # the context manager guarantees that happens even if the copy fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code == 200:
            # Ask the raw stream to undo any transfer encoding (e.g. gzip)
            # before the bytes are written out.
            r.raw.decode_content = True
            with open(path, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        else:
            r.raise_for_status()
        
def get_xml_for(ensembl_id):
    """Download ``https://www.proteinatlas.org/<ensembl_id>.xml`` as a list of lines.

    Parameters
    ----------
    ensembl_id : str
        Identifier placed in the URL path; it is percent-encoded first so
        that unexpected characters cannot alter the request path.

    Returns
    -------
    list of str
        The response body decoded as UTF-8 and split on ``'\\n'``.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    """
    safe_id = urllib.parse.quote(str(ensembl_id), safe='')
    url = 'https://www.proteinatlas.org/{}.xml'.format(safe_id)
    req = urllib.request.Request(url)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        xml_bin = response.read()
    return xml_bin.decode("utf-8").split('\n')

def get_image_urls(xml_lines):
    """Collect the ``<imageUrl>`` values that contain the text ``blue``.

    Every ``<imageUrl>`` occurrence on a line is examined, so XML that puts
    several elements on one line is handled as well as one element per line.

    Parameters
    ----------
    xml_lines : iterable of str
        Lines of XML text.

    Returns
    -------
    list of str
        Matching URLs in document order.
    """
    open_tag = '<imageUrl>'
    close_tag = '</imageUrl>'
    res = []
    for line in xml_lines:
        # Segment 0 is the text before the first opening tag, never a URL.
        for segment in line.split(open_tag)[1:]:
            candidate = segment.split(close_tag)[0]
            if 'blue' in candidate:
                res.append(candidate)
    return res

In [53]:
# Load the trimmed Protein Atlas table (gene, antibodies, ensembl_id) from the local TSV file.
df = load_table(protein_atlas_tsv_file='/Users/dganguli/Downloads/proteinatlas.tsv')
# Display the first rows as a quick check of the renamed columns.
df.head()

Unnamed: 0,gene,antibodies,ensembl_id
0,TSPAN6,HPA004109,ENSG00000000003
1,TNMD,"HPA034961, HPA055634",ENSG00000000005
2,DPM1,HPA051818,ENSG00000000419
3,SCYL3,HPA005624,ENSG00000000457
4,C1orf112,"HPA023778, HPA024451",ENSG00000000460


In [54]:
e_id = "ENSG00000134057" # Ensembl ID for CCNB1
a_ids = antibody_ids_for(e_id, df) # antibody IDs for this gene, with the CAB/HPA prefix and leading zeros removed
print(a_ids)

['115', '3804', '30741', '61448']


In [55]:
# Image URL paths: fetch the XML page for the gene and keep the <imageUrl>
# entries that contain "blue".
xml = get_xml_for(e_id)
urls = get_image_urls(xml)

# Print one URL per line.
for u in urls:
    print(u)

http://v18.proteinatlas.org/images/115/672_E2_1_blue_red_green.jpg
http://v18.proteinatlas.org/images/115/672_E2_2_blue_red_green.jpg
http://v18.proteinatlas.org/images/115/663_E2_1_blue_red_green.jpg
http://v18.proteinatlas.org/images/115/663_E2_5_blue_red_green.jpg
http://v18.proteinatlas.org/images/115/646_E2_1_blue_red_green.jpg
http://v18.proteinatlas.org/images/115/646_E2_2_blue_red_green.jpg
http://v18.proteinatlas.org/images/30741/800_A6_1_blue_red_green.jpg
http://v18.proteinatlas.org/images/30741/800_A6_2_blue_red_green.jpg
http://v18.proteinatlas.org/images/30741/845_A6_1_blue_red_green.jpg
http://v18.proteinatlas.org/images/30741/845_A6_3_blue_red_green.jpg
http://v18.proteinatlas.org/images/30741/793_A6_1_blue_red_green.jpg
http://v18.proteinatlas.org/images/30741/793_A6_2_blue_red_green.jpg
http://v18.proteinatlas.org/images/30741/686_F11_1_blue_red_green.jpg
http://v18.proteinatlas.org/images/30741/686_F11_2_blue_red_green.jpg
http://v18.proteinatlas.org/images/30741/si2

In [None]:
# Look for downloadable images by brute force: build every candidate URL for
# each antibody and print the ones the server answers with status 200.
#
# This issues up to 2000 plates x 96 wells x 10 fields x 4 channels requests
# per antibody, so a single Session is shared to reuse the HTTP connection
# instead of opening a new one for every URL.
# NOTE(review): a HEAD request would avoid downloading each image body, but
# whether the server answers HEAD the same way as GET is unverified here.
with requests.Session() as session:
    for a_id in a_ids:
        for plate in plates(max_plates=2000):
            for well in wells():  # 96-well plate, A1 -> H12
                for field in fields(max_fields=10):
                    # channels() is the notebook's single list of channel
                    # names; within one field the probe order is therefore
                    # blue, green, red, yellow.
                    for ch in channels():
                        url = make_url(a_id, plate, well, field, ch=ch)
                        r = session.get(url)
                        if r.status_code == 200:
                            print(url)