# Demo: scraping Protein Atlas XML, saving single-channel images, and building a DataFrame of metadata and local image paths

In [1]:
import os
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
def get_single_channel_urls(multi_channel_url, channel_map={'blue':'nucleus', 'red':'microtubules', 'green':'antibody'}):
    """Derive per-channel and segmentation URLs from a multi-channel image URL.

    The file name is expected to look like
    ``<plate>_<well>_<field>_<channel>[_<channel>...].<ext>``.

    Parameters
    ----------
    multi_channel_url : str
        URL of the combined image, e.g. ``.../si31_G12_11_blue_red_green.jpg``.
    channel_map : dict
        Maps each channel token found in the file name to the key used in the
        returned ``singleChannelUrls`` dict. The default is only read, never
        mutated.

    Returns
    -------
    dict
        ``{'singleChannelUrls': {mapped_name: url, ...}, 'segmentationUrl': url}``.
        The segmentation URL always uses a ``.png`` extension.

    Raises
    ------
    ValueError
        If the file name has fewer than three ``_``-separated parts.
    KeyError
        If a channel token in the file name is missing from ``channel_map``.
    """
    # URLs always use '/', so split on it directly: os.path.join would insert
    # backslashes on Windows. Re-using `sep` rebuilds the prefix exactly.
    parent, sep, bname = multi_channel_url.rpartition('/')
    # splitext only strips the final extension, so extra dots in the name are fine.
    name, ext = os.path.splitext(bname)

    parts = name.split('_')
    if len(parts) < 3:
        raise ValueError(
            'expected a file name of the form plate_well_field[_channel...], '
            'got {!r}'.format(bname))
    prefix, channels = parts[:3], parts[3:]

    single_channel_urls = {
        channel_map[channel]: parent + sep + '_'.join(prefix + [channel]) + ext
        for channel in channels
    }
    seg_url = parent + sep + '_'.join(prefix + ['segmentation']) + '.png'
    return {'singleChannelUrls':single_channel_urls, 'segmentationUrl':seg_url}

In [3]:
def make_xml_tree_from_url(ensembl_input_url):
    """Fetch an XML document from a URL and return its parsed root element.

    Parameters
    ----------
    ensembl_input_url : str
        URL of the XML document to download.

    Returns
    -------
    xml.etree.ElementTree.Element
        Root element of the parsed document.

    Raises
    ------
    urllib.error.URLError
        If the URL cannot be opened.
    xml.etree.ElementTree.ParseError
        If the payload is not well-formed XML.
    """
    req = urllib.request.Request(ensembl_input_url)
    # Close the connection deterministically instead of leaking it.
    with urllib.request.urlopen(req) as response:
        xml = response.read()
    # Hand the raw bytes to the parser so it honours the encoding declared in
    # the XML prolog rather than assuming UTF-8.
    return ET.fromstring(xml)

In [4]:
def filter_element(element, tag):
    """Return the only direct child of ``element`` whose tag equals ``tag``.

    Returns None when ``element`` is None, when no direct child matches, or
    when more than one direct child matches. Accepting None lets calls be
    chained (``filter_element(filter_element(root, 'a'), 'b')``) without
    raising when an intermediate element is missing.
    """
    if element is None:
        return None
    matches = [child for child in element if child.tag == tag]
    return matches[0] if len(matches) == 1 else None

In [5]:
def get_urls_and_info_from_xml(xml_tree):
    """Collect one metadata record per 'green' image URL found in an XML tree.

    Walks ``proteinAtlas`` -> ``antibody`` -> ``data`` -> ``image`` ->
    ``imageUrl`` elements and, for every ``imageUrl`` whose text contains
    ``'green'``, emits a dict with the keys ``antibody``, ``EnsemblID``,
    ``proteinID``, ``antigenSequence``, ``cellLine`` and ``imageUrls``.
    Metadata that is absent from the tree is reported as ``np.nan``.

    Parameters
    ----------
    xml_tree : xml.etree.ElementTree.Element
        Parsed XML root to search.

    Returns
    -------
    list of dict
        One record per matching image URL, in document order.
    """
    def _attr_or_nan(element, key):
        # Missing element or missing attribute both fall back to NaN.
        value = element.attrib.get(key) if element is not None else None
        return np.nan if value is None else value

    def _text_or_nan(element):
        # Missing element or empty text both fall back to NaN.
        value = element.text if element is not None else None
        return np.nan if value is None else value

    urls = []
    for proteinAtlas in xml_tree.iter('proteinAtlas'):
        # These lookups depend only on the enclosing elements, so resolve them
        # once per level instead of once per image. Each step is guarded so a
        # missing intermediate element yields NaN rather than an exception.
        entry = filter_element(proteinAtlas, 'entry')
        identifier = filter_element(entry, 'identifier') if entry is not None else None
        xref = filter_element(identifier, 'xref') if identifier is not None else None
        for antibody in proteinAtlas.iter('antibody'):
            antigenSequence = filter_element(antibody, 'antigenSequence')
            for data in antibody.iter('data'):
                cellLine = filter_element(data, 'cellLine')
                for image in data.iter('image'):
                    for imageUrl in image.iter('imageUrl'):
                        # Skip empty URL elements and images without a green channel.
                        if not imageUrl.text or 'green' not in imageUrl.text:
                            continue
                        channels = [x for x in image if x.tag == 'channel']
                        channel_info = {channel.attrib['color']:channel.text for channel in channels}
                        urls.append({
                            'antibody': _attr_or_nan(antibody, 'id'),
                            'EnsemblID': _attr_or_nan(identifier, 'id'),
                            'proteinID': _attr_or_nan(xref, 'id'),
                            'antigenSequence': _text_or_nan(antigenSequence),
                            'cellLine': _text_or_nan(cellLine),
                            'imageUrls': get_single_channel_urls(imageUrl.text, channel_map=channel_info),
                        })
    return urls

In [6]:
def make_local_img_path(url):
    """Return the part of ``url`` after its last ``'images/'`` marker.

    If the marker does not occur, the whole ``url`` is returned unchanged.
    """
    _head, _marker, relative_path = url.rpartition('images/')
    return relative_path

def get_img_and_write_local_and_return_path(url, parent_dir):
    """Download ``url`` under ``parent_dir`` unless it is already there.

    Parameters
    ----------
    url : str
        Image URL to fetch.
    parent_dir : str
        Local directory under which the image is stored.

    Returns
    -------
    str
        The image path relative to ``parent_dir`` (as produced by
        ``make_local_img_path``), whether or not a download happened.
    """
    local_img_path = make_local_img_path(url)
    fpath = os.path.join(parent_dir, local_img_path)
    if not os.path.isfile(fpath):
        req = urllib.request.Request(url)
        # Close the connection deterministically instead of leaking it.
        with urllib.request.urlopen(req) as response:
            img = response.read()
        # dirname is '' when fpath has no directory part, and makedirs('') raises.
        target_dir = os.path.dirname(fpath)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        # Write to a temporary name and rename, so an interrupted write never
        # leaves a truncated file that a later run would treat as cached.
        tmp_path = fpath + '.part'
        with open(tmp_path, 'wb') as handler:
            handler.write(img)
        os.replace(tmp_path, fpath)
    return local_img_path

In [7]:
# XML documents to scrape, one per Ensembl gene ID.
xml_urls = [
    'https://www.proteinatlas.org/ENSG00000204209.xml',
    'https://www.proteinatlas.org/ENSG00000134057.xml',
    'https://www.proteinatlas.org/ENSG00000126602.xml',
]

# Flat list of per-image records gathered across all documents.
all_urls_and_info = []

for xml_url in tqdm_notebook(xml_urls):
    doc = make_xml_tree_from_url(xml_url)
    urls_and_info = get_urls_and_info_from_xml(doc)
    all_urls_and_info.extend(urls_and_info)




In [8]:
# Display the last scraped record as a quick check of the record structure.
all_urls_and_info[-1]

{'EnsemblID': 'ENSG00000126602',
 'antibody': 'HPA044227',
 'antigenSequence': 'RTTAQLGPRRNPAWSLQAGRLFSTQTAEDKEEPLHSIISSTESVQGSTSKHEFQAETKKLLDIVARSLYSEKEVFIREL',
 'cellLine': 'U-2 OS',
 'imageUrls': {'segmentationUrl': 'http://v18.proteinatlas.org/images/44227/si31_G12_11_segmentation.png',
  'singleChannelUrls': {'antibody': 'http://v18.proteinatlas.org/images/44227/si31_G12_11_green.jpg',
   'microtubules': 'http://v18.proteinatlas.org/images/44227/si31_G12_11_red.jpg',
   'nucleus': 'http://v18.proteinatlas.org/images/44227/si31_G12_11_blue.jpg'}},
 'proteinID': 'Q12931'}

In [9]:
# Download every channel image and record its local path next to the metadata.
# Rows are collected in a list and the DataFrame is built once at the end:
# DataFrame.append copied the whole frame on every call and was removed in
# pandas 2.0.
image_dir = 'test_images'
records = []

for u in tqdm_notebook(all_urls_and_info):

    channel_urls = u['imageUrls']['singleChannelUrls']
    antibody_path_local = get_img_and_write_local_and_return_path(channel_urls['antibody'], image_dir)
    microtubule_path_local = get_img_and_write_local_and_return_path(channel_urls['microtubules'], image_dir)
    nuclear_path_local = get_img_and_write_local_and_return_path(channel_urls['nucleus'], image_dir)
    segmentation_path_local = get_img_and_write_local_and_return_path(u['imageUrls']['segmentationUrl'], image_dir)

    records.append({'EnsemblID':u['EnsemblID'],
                    'proteinID':u['proteinID'],
                    'antibodyName':u['antibody'],
                    'antigenSequence':u['antigenSequence'],
                    'cellLine':u['cellLine'],
                    'antibodyChannel':antibody_path_local,
                    'microtubuleChannel':microtubule_path_local,
                    'nuclearChannel':nuclear_path_local,
                    'segmentationChannel':segmentation_path_local})

# Passing `columns` fixes the column order and keeps the columns present even
# when no records were scraped.
df = pd.DataFrame(records,
                  columns=['EnsemblID',  'proteinID', 'antibodyName', 'antigenSequence', 'cellLine',
                           'antibodyChannel', 'microtubuleChannel', 'nuclearChannel', 'segmentationChannel'])




In [10]:
# Save the metadata table to 'test.csv' without writing the row index.
df.to_csv('test.csv', index=False)

In [12]:
# Show five randomly chosen rows as a spot check (no random_state is passed,
# so the selection differs between runs).
df.sample(5)

Unnamed: 0,EnsemblID,proteinID,antibodyName,antigenSequence,cellLine,antibodyChannel,microtubuleChannel,nuclearChannel,segmentationChannel
24,ENSG00000134057,P14635,CAB000115,,U-251 MG,115/646_E2_1_green.jpg,115/646_E2_1_red.jpg,115/646_E2_1_blue.jpg,115/646_E2_1_segmentation.png
6,ENSG00000204209,Q9UER7,HPA008797,DEEEEAAAGKDGDKSPMSSLQISNEKNLEPGKQISRSSGEQQNKGR...,HeLa,8797/fp11_E5_1_green.jpg,8797/fp11_E5_1_red.jpg,8797/fp11_E5_1_blue.jpg,8797/fp11_E5_1_segmentation.png
4,ENSG00000204209,Q9UER7,HPA008797,DEEEEAAAGKDGDKSPMSSLQISNEKNLEPGKQISRSSGEQQNKGR...,U-251 MG,8797/174_G3_1_green.jpg,8797/174_G3_1_red.jpg,8797/174_G3_1_blue.jpg,8797/174_G3_1_segmentation.png
46,ENSG00000134057,P14635,HPA061448,TRNSKINAENKAKINMAGAKRVPTAPAATSKPGLRPRTALGDIGNK...,HeLa,61448/fp7_G4_1_green.jpg,61448/fp7_G4_1_red.jpg,61448/fp7_G4_1_blue.jpg,61448/fp7_G4_1_segmentation.png
15,ENSG00000204209,Q9UER7,HPA065779,ARGSSSSGGKKCYKLENEKLFEEFLELCKMQTADHPEVVPFLYNRQ...,U-2 OS,65779/1235_D6_2_green.jpg,65779/1235_D6_2_red.jpg,65779/1235_D6_2_blue.jpg,65779/1235_D6_2_segmentation.png
