In [1]:
import os
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

In [2]:
def get_single_channel_urls(multi_channel_url):
    """Expand a multi-channel image URL into per-channel and segmentation URLs.

    The last path component of ``multi_channel_url`` is expected to have the
    form ``<plate>_<well>_<field>_<channel>[_<channel>...].<ext>``.

    Parameters
    ----------
    multi_channel_url : str
        URL (or '/'-separated path) of the multi-channel image.

    Returns
    -------
    dict
        ``{'singleChannelUrls': {channel: url, ...}, 'segmentationUrl': url}``.
        Each single-channel URL keeps the original extension; the
        segmentation URL always uses the ``png`` extension.

    Raises
    ------
    ValueError
        If the file name has no extension or fewer than three
        underscore-separated fields.
    """
    # URLs always use '/', so split on it explicitly rather than through
    # os.path, whose separator depends on the platform running the notebook.
    parent, slash, basename = multi_channel_url.rpartition('/')
    # Split on the LAST dot only, so dots inside the name do not break parsing.
    stem, dot, ext = basename.rpartition('.')
    if not dot:
        raise ValueError(
            'image URL has no file extension: {!r}'.format(multi_channel_url))

    fields = stem.split('_')
    if len(fields) < 3:
        raise ValueError(
            'expected <plate>_<well>_<field>[_<channel>...] in {!r}'.format(basename))
    plate, well, field = fields[:3]
    channels = fields[3:]

    prefix = parent + slash

    def build_url(suffix, extension):
        # Rebuild a sibling URL that shares the plate/well/field prefix.
        return prefix + '_'.join([plate, well, field, suffix]) + '.' + extension

    single_channel_urls = {channel: build_url(channel, ext) for channel in channels}
    return {'singleChannelUrls': single_channel_urls,
            'segmentationUrl': build_url('segmentation', 'png')}

In [3]:
def make_xml_tree_from_url(ensembl_input_url):
    """Download an XML document and parse it into an ElementTree element.

    Parameters
    ----------
    ensembl_input_url : str
        URL of the XML document to fetch.

    Returns
    -------
    xml.etree.ElementTree.Element
        Root element of the parsed document.

    Raises
    ------
    urllib.error.URLError
        If the URL cannot be opened.
    xml.etree.ElementTree.ParseError
        If the payload is not well-formed XML.
    """
    req = urllib.request.Request(ensembl_input_url)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        payload = response.read()
    # Hand the raw bytes to the parser so it honours the encoding declared in
    # the XML prolog instead of assuming the document is UTF-8.
    return ET.fromstring(payload)

In [4]:
def get_urls_and_info_from_xml(xml_tree):
    """Collect image URL records from a parsed XML tree.

    Walks every ``proteinAtlas`` -> ``antibody`` -> ``data`` -> ``imageUrl``
    element chain and emits one record for each ``imageUrl`` whose text
    contains ``'green'``.

    Parameters
    ----------
    xml_tree : xml.etree.ElementTree.Element
        Root element of the parsed document.

    Returns
    -------
    list of dict
        One dict per matching image with keys:
        ``'antibody'`` (the ``id`` attribute of the enclosing antibody element),
        ``'cellLine'`` (text of the ``cellLine`` child of the ``data`` element
        when there is exactly one such child, otherwise ``None``) and
        ``'imageUrls'`` (the result of ``get_single_channel_urls``).

    Raises
    ------
    KeyError
        If an antibody element with a matching image has no ``id`` attribute.
    """
    urls = []
    for protein_atlas in xml_tree.iter('proteinAtlas'):
        for antibody in protein_atlas.iter('antibody'):
            for data in antibody.iter('data'):
                # The cell line belongs to the data element, so resolve it
                # once instead of once per image.
                cell_lines = data.findall('cellLine')
                cell_line = cell_lines[0].text if len(cell_lines) == 1 else None
                for image_url in data.iter('imageUrl'):
                    url_text = image_url.text
                    # An empty <imageUrl/> has text None; skip it rather than
                    # failing on the substring test.
                    if not url_text or 'green' not in url_text:
                        continue
                    urls.append({'antibody': antibody.attrib['id'],
                                 'cellLine': cell_line,
                                 'imageUrls': get_single_channel_urls(url_text)})
    return urls

In [5]:
# Fetch the proteinatlas.org XML record for entry ENSG00000204209 and pull
# out the image URL records it lists.
PROTEIN_ATLAS_XML_URL = 'https://www.proteinatlas.org/ENSG00000204209.xml'
doc = make_xml_tree_from_url(PROTEIN_ATLAS_XML_URL)
urls = get_urls_and_info_from_xml(doc)

In [6]:
# Show the extracted records as the cell output.
urls

[{'antibody': 'HPA008797',
  'cellLine': 'A-431',
  'imageUrls': {'segmentationUrl': 'http://v18.proteinatlas.org/images/8797/175_G3_1_segmentation.png',
   'singleChannelUrls': {'blue': 'http://v18.proteinatlas.org/images/8797/175_G3_1_blue.jpg',
    'green': 'http://v18.proteinatlas.org/images/8797/175_G3_1_green.jpg',
    'red': 'http://v18.proteinatlas.org/images/8797/175_G3_1_red.jpg'}}},
 {'antibody': 'HPA008797',
  'cellLine': 'A-431',
  'imageUrls': {'segmentationUrl': 'http://v18.proteinatlas.org/images/8797/175_G3_2_segmentation.png',
   'singleChannelUrls': {'blue': 'http://v18.proteinatlas.org/images/8797/175_G3_2_blue.jpg',
    'green': 'http://v18.proteinatlas.org/images/8797/175_G3_2_green.jpg',
    'red': 'http://v18.proteinatlas.org/images/8797/175_G3_2_red.jpg'}}},
 {'antibody': 'HPA008797',
  'cellLine': 'U-2 OS',
  'imageUrls': {'segmentationUrl': 'http://v18.proteinatlas.org/images/8797/176_G3_1_segmentation.png',
   'singleChannelUrls': {'blue': 'http://v18.protei