In [1]:
import os
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

In [2]:
def get_image_urls(ensembl_input_url):
    """Return the image URLs containing 'green' from a Protein Atlas XML document.

    The document at ``ensembl_input_url`` is downloaded and parsed, and the
    text of every ``imageUrl`` element found under ``proteinAtlas`` ->
    ``cellExpression`` is collected when it contains the substring ``'green'``.

    Args:
        ensembl_input_url: URL of the XML document to fetch.

    Returns:
        list[str]: The matching URL strings, in the order they are encountered.

    Raises:
        urllib.error.URLError: If the document cannot be retrieved.
        UnicodeDecodeError: If the payload is not valid UTF-8.
        xml.etree.ElementTree.ParseError: If the payload is not well-formed XML.
    """
    req = urllib.request.Request(ensembl_input_url)
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the payload has been read.
    with urllib.request.urlopen(req) as response:
        payload = response.read()
    doc = ET.fromstring(payload.decode("utf-8"))

    urls = []
    for proteinAtlas in doc.iter('proteinAtlas'):
        for cellExpression in proteinAtlas.iter('cellExpression'):
            for imageUrl in cellExpression.iter('imageUrl'):
                # An empty <imageUrl/> element has text None; skip it instead
                # of failing on the substring test.
                if imageUrl.text and 'green' in imageUrl.text:
                    urls.append(imageUrl.text)
    return urls

In [3]:
def get_single_channel_urls(multi_channel_url):
    """Expand a multi-channel image URL into one URL per channel.

    The final path component is expected to have the form
    ``<plate>_<well>_<field>_<channel>[_<channel>...].<ext>``. For every
    channel a URL is produced that keeps the same parent, plate, well, field
    and extension but names only that single channel.

    Args:
        multi_channel_url: URL (or path) of the multi-channel image.

    Returns:
        list[str]: One URL per channel, in the order the channels appear in
        the file name. Empty when the name carries no channel fields.

    Raises:
        ValueError: If the file name has fewer than three underscore-separated
            fields before the extension.
    """
    # URLs always use '/' as separator, so split on it explicitly rather than
    # through os.path, whose separator depends on the host platform.
    parent, slash, basename = multi_channel_url.rpartition('/')

    # Only the last dot separates the extension; dots elsewhere in the name
    # belong to the stem.
    stem, dot, ext = basename.rpartition('.')
    if not dot:
        # rpartition puts everything in the last slot when there is no dot.
        stem, ext = basename, ''

    fields = stem.split('_')
    if len(fields) < 3:
        raise ValueError(
            "expected '<plate>_<well>_<field>[_<channel>...]' in file name, got %r"
            % basename
        )
    prefix = parent + slash + '_'.join(fields[:3])
    suffix = dot + ext
    return [prefix + '_' + channel + suffix for channel in fields[3:]]

In [4]:
# Download and parse the XML document for a single entry; `doc` is the parsed
# root element used by the following cells.
ensembl_input_url = 'https://www.proteinatlas.org/ENSG00000204209.xml'
req = urllib.request.Request(ensembl_input_url)
# Use the response as a context manager so the connection is closed once the
# payload has been read.
with urllib.request.urlopen(req) as response:
    xml = response.read()
doc = ET.fromstring(xml.decode("utf-8"))

In [5]:
# Build one record per image URL containing 'green': the id of the enclosing
# antibody, the text of the <cellLine> child of the enclosing <data> element,
# and the per-channel URLs derived from the multi-channel image URL.
urls = []
for proteinAtlas in doc.iter('proteinAtlas'):
    for antibody in proteinAtlas.iter('antibody'):
        for cellExpression in antibody.iter('cellExpression'):
            for subAssay in cellExpression.iter('subAssay'):
                for data in subAssay.iter('data'):
                    # The cell line is the same for every image of this <data>
                    # element, so look it up once. find() returns the first
                    # direct child with that tag, or None when there is none.
                    cell_line = data.find('cellLine')
                    cell_line_name = cell_line.text if cell_line is not None else None
                    for imageUrl in data.iter('imageUrl'):
                        # Empty <imageUrl/> elements have text None; skip them.
                        if imageUrl.text and 'green' in imageUrl.text:
                            urls.append({'antibody': antibody.attrib['id'],
                                         'cellLine': cell_line_name,
                                         'imageUrls': get_single_channel_urls(imageUrl.text)})

In [6]:
# Display the records collected above: one dict per image with the keys
# 'antibody', 'cellLine' and 'imageUrls'.
urls

[{'antibody': 'HPA008797',
  'cellLine': 'A-431',
  'imageUrls': ['http://v18.proteinatlas.org/images/8797/175_G3_1_blue.jpg',
   'http://v18.proteinatlas.org/images/8797/175_G3_1_red.jpg',
   'http://v18.proteinatlas.org/images/8797/175_G3_1_green.jpg']},
 {'antibody': 'HPA008797',
  'cellLine': 'A-431',
  'imageUrls': ['http://v18.proteinatlas.org/images/8797/175_G3_2_blue.jpg',
   'http://v18.proteinatlas.org/images/8797/175_G3_2_red.jpg',
   'http://v18.proteinatlas.org/images/8797/175_G3_2_green.jpg']},
 {'antibody': 'HPA008797',
  'cellLine': 'U-2 OS',
  'imageUrls': ['http://v18.proteinatlas.org/images/8797/176_G3_1_blue.jpg',
   'http://v18.proteinatlas.org/images/8797/176_G3_1_red.jpg',
   'http://v18.proteinatlas.org/images/8797/176_G3_1_green.jpg']},
 {'antibody': 'HPA008797',
  'cellLine': 'U-2 OS',
  'imageUrls': ['http://v18.proteinatlas.org/images/8797/176_G3_2_blue.jpg',
   'http://v18.proteinatlas.org/images/8797/176_G3_2_red.jpg',
   'http://v18.proteinatlas.org/image