# Demo: scraping Human Protein Atlas XML and saving single-channel images plus a DataFrame of metadata and image locations

In [27]:
!pip install xmltodict
!pip install bs4

Collecting xmltodict
  Downloading xmltodict-0.11.0-py2.py3-none-any.whl
Installing collected packages: xmltodict
Successfully installed xmltodict-0.11.0
[33mYou are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting bs4
  Downloading bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K    100% |################################| 92kB 3.5MB/s 
[?25hBuilding wheels for collected packages: bs4
  Running setup.py bdist_wheel for bs4 ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/84/67/d4/9e09d9d5adede2ee1c7b7e8775ba3fbb04d07c4f946f0e4f11
Successfully built bs4
Installing collected packages: beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.6.0 bs4-0.0.1
[33mYou are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' comma

In [1]:
import os
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

from bs4 import BeautifulSoup
import re

import pdb

In [2]:
def get_single_channel_urls(multi_channel_url, channel_map={'blue':'nucleus', 'red':'microtubules', 'green':'antibody'}):
    '''Derive single-channel and segmentation image URLs from a multi-channel image URL.

    The last path component of the URL is expected to have the form
    ``<plate>_<well>_<field>_<channel>[_<channel>...].<ext>``.

    Args:
        multi_channel_url: URL of the combined (multi-channel) image.
        channel_map: maps each channel token found in the file name to the key
            used in the returned 'singleChannelUrls' dict. It is only read,
            never modified.

    Returns:
        A dict with two entries:
        'singleChannelUrls' -- {channel_map[token]: url} with one URL per
        channel token, keeping the original extension;
        'segmentationUrl' -- URL of ``<plate>_<well>_<field>_segmentation.png``
        in the same directory.

    Raises:
        ValueError: if the file name has no extension or fewer than three
            underscore-separated fields.
        KeyError: if a channel token in the file name is missing from channel_map.
    '''
    # URLs always use '/' as the separator. os.path follows the host platform
    # and would produce backslashes on Windows, so use posixpath explicitly.
    import posixpath

    parname, bname = posixpath.split(multi_channel_url)
    # Split on the last dot only, so a dot inside the name is not mistaken for
    # the extension separator.
    name, ext = bname.rsplit('.', 1)
    fields = name.split('_')
    plate, well, field = fields[:3]
    channels = fields[3:]

    single_channel_urls = {}
    for channel in channels:
        sc_bname = '{}.{}'.format('_'.join([plate, well, field, channel]), ext)
        single_channel_urls[channel_map[channel]] = posixpath.join(parname, sc_bname)

    seg_bname = '{}.png'.format('_'.join([plate, well, field, 'segmentation']))
    seg_url = posixpath.join(parname, seg_bname)
    return {'singleChannelUrls':single_channel_urls, 'segmentationUrl':seg_url}

In [3]:
def make_xml_tree_from_url(ensembl_input_url):
    '''Download an XML document and parse it into an ElementTree element.

    Args:
        ensembl_input_url: URL of the XML document to fetch.

    Returns:
        The root element produced by ``ET.fromstring`` on the downloaded body,
        decoded as UTF-8.

    Raises:
        urllib.error.URLError: if the request fails.
        UnicodeDecodeError: if the body is not valid UTF-8.
        xml.etree.ElementTree.ParseError: if the body is not well-formed XML.
    '''
    req = urllib.request.Request(ensembl_input_url)
    # Use the response as a context manager so the connection is released as
    # soon as the body has been read, instead of waiting for garbage collection.
    with urllib.request.urlopen(req) as response:
        xml = response.read()
    return ET.fromstring(xml.decode("utf-8"))

In [4]:
def filter_element(element, tag):
    '''Return the unique direct child of `element` whose tag equals `tag`.

    Returns None when there is no such child or when more than one child
    carries that tag.
    '''
    matches = []
    for node in list(element):
        if node.tag == tag:
            matches.append(node)
    # Ambiguous (several matches) and missing (no match) are both reported as None.
    if len(matches) != 1:
        return None
    return matches[0]

In [5]:
import numpy as np


def ensg_to_antibody_html(ensg_id):
    '''Gets the html corresponding to an antibody that targets the ensg_id

    Fetches ``https://www.proteinatlas.org/<ensg_id>/antibody`` and returns the
    raw response body exactly as read from the connection (not decoded).

    Raises:
        urllib.error.URLError: if the request fails.
    '''
    url = 'https://www.proteinatlas.org/{}/antibody'.format(ensg_id)

    req = urllib.request.Request(url)
    # Use the response as a context manager so the connection is closed once
    # the body has been read, rather than being left open until collected.
    with urllib.request.urlopen(req) as response:
        html_bin = response.read()

    return html_bin
#     soup = BeautifulSoup(html_bin, 'html.parser')

def antibody_and_html_to_ensp(antibody_id, antibody_html_bin):
    '''Gets the ensp_ids that are targeted by the antibody_id

    Scrapes the antibody page markup in two steps:
      1. reads the antibody ids from the ``th`` cells with class
         'head last roundtop' and finds the position of `antibody_id` among them;
      2. cuts out the text between 'Matching transcripts' and the next
         ``<th class="sub_head"``, splits it on ``<td class="" style="">`` and
         takes the cell at that position, then collects the ENSP ids from its
         ``a`` tags with rel 'nofollow noopener'.

    Raises IndexError if `antibody_id` is not among the ids found in the
    header cells, or if any of the expected patterns is missing.
    '''
    # str() of the page is only used for substring searching and slicing below.
    # NOTE(review): when the input is bytes this is its repr (b'...' with
    # escapes), not decoded text -- confirm this is intended.
    page_text = str(antibody_html_bin)

    header_cells = BeautifulSoup(antibody_html_bin, 'html.parser').find_all(
        'th', attrs={'class':'head last roundtop'})

    # '>Antibody XYZ<' -> 'XYZ': drop the 10-character '>Antibody ' prefix and the trailing '<'.
    antibody_ids = [
        re.findall('>Antibody [A-Za-z0-9]*<', str(cell))[0][10:-1]
        for cell in header_cells
    ]

    is_requested = [candidate == antibody_id for candidate in antibody_ids]
    id_index = np.where(is_requested)[0][0]

    start_ind = page_text.find('Matching transcripts')
    end_ind = page_text.find('<th class="sub_head"', start_ind)
    transcript_section = page_text[start_ind:end_ind]

    # Element 0 of the split is the text before the first cell, hence the +1.
    antibody_cell = transcript_section.split('<td class="" style="">')[id_index+1]

    transcript_links = BeautifulSoup(antibody_cell, 'html.parser').find_all(
        'a', attrs={'rel': 'nofollow noopener'})

    # The pattern includes the space after the id; [:-1] strips it again.
    return [
        re.findall('ENSP[A-Za-z0-9]* ', str(link))[0][:-1]
        for link in transcript_links
    ]

In [6]:
def get_urls_and_info_from_xml(xml_tree):
    '''Collect one metadata record per 'green' image URL found in the XML tree.

    For every 'proteinAtlas' element the gene id is read from its
    entry/identifier child and the matching antibody page is downloaded once
    with ensg_to_antibody_html. Then, for each antibody -> data -> image ->
    imageUrl whose text contains 'green', a dict is appended with the keys
    'antibody', 'ENSG', 'ENSP', 'proteinName', 'antigenSequence', 'cellLine'
    and 'imageUrls'. Missing values are stored as np.nan.

    Returns:
        list of dicts, in document order.
    '''

    def _id_or_nan(node):
        # The 'id' attribute of node, or NaN when the node or the attribute value is None.
        if node is None or node.attrib['id'] is None:
            return np.nan
        return node.attrib['id']

    def _text_or_nan(node):
        # The text of node, or NaN when the node or its text is None.
        if node is None or node.text is None:
            return np.nan
        return node.text

    records = []
    for atlas_node in xml_tree.iter('proteinAtlas'):

        entry_node = filter_element(atlas_node, 'entry')
        identifier = filter_element(entry_node, 'identifier')
        ensg_id = identifier.attrib['id']
        # One page download per gene; it is reused for every antibody below.
        antibody_html_bin = ensg_to_antibody_html(ensg_id)

        for antibody in atlas_node.iter('antibody'):

            ensp_ids = antibody_and_html_to_ensp(antibody.attrib['id'], antibody_html_bin)

            for data in antibody.iter('data'):
                for image in data.iter('image'):
                    for image_url in image.iter('imageUrl'):
                        if 'green' not in image_url.text:
                            continue

                        # Map each channel colour of this image to the channel's text.
                        channel_info = {}
                        for child in list(image):
                            if child.tag == 'channel':
                                channel_info[child.attrib['color']] = child.text

                        antigen_sequence = filter_element(antibody, 'antigenSequence')
                        cell_line = filter_element(data, 'cellLine')
                        xref = filter_element(identifier, 'xref')

                        records.append({
                            'antibody': _id_or_nan(antibody),
                            'ENSG': _id_or_nan(identifier),
                            'ENSP': ensp_ids,
                            'proteinName': _id_or_nan(xref),
                            'antigenSequence': _text_or_nan(antigen_sequence),
                            'cellLine': _text_or_nan(cell_line),
                            'imageUrls': (np.nan
                                          if (image_url is None or image_url.text is None)
                                          else get_single_channel_urls(image_url.text, channel_map=channel_info)),
                        })
    return records

In [7]:
def make_local_img_path(url):
    '''Return the part of `url` after its last 'images/' marker.

    When the marker does not occur, the url is returned unchanged.
    '''
    _, _, tail = url.rpartition('images/')
    return tail

def get_img_and_write_local_and_return_path(url, parent_dir):
    '''Download `url` under `parent_dir` unless it is already there.

    The file is stored at ``parent_dir/<make_local_img_path(url)>``. An existing
    file at that location is treated as a finished download and is not fetched
    again.

    Args:
        url: image URL to download.
        parent_dir: directory under which the image is stored.

    Returns:
        The image path relative to `parent_dir` (not the full path).

    Raises:
        urllib.error.URLError: if the download fails.
        OSError: if the directory or file cannot be written.
    '''
    local_img_path = make_local_img_path(url)
    fpath = os.path.join(parent_dir, local_img_path)
    if not os.path.isfile(fpath):
        req = urllib.request.Request(url)
        # Close the connection as soon as the body has been read.
        with urllib.request.urlopen(req) as response:
            img = response.read()

        # os.makedirs('') raises, so only create a directory when there is one.
        target_dir = os.path.dirname(fpath)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # Write to a temporary name and rename at the end. An interrupted write
        # then never leaves a truncated file at fpath, which the isfile() check
        # above would otherwise accept as a complete download on the next run.
        tmp_path = fpath + '.part'
        with open(tmp_path, 'wb') as handler:
            handler.write(img)
        os.replace(tmp_path, fpath)
    return local_img_path

In [8]:
def ensg_to_info(ensg_id):
    '''Fetch the XML for `ensg_id` from proteinatlas.org and return its image records.

    Downloads ``https://www.proteinatlas.org/<ensg_id>.xml`` with
    make_xml_tree_from_url and passes the parsed tree to
    get_urls_and_info_from_xml, whose result is returned unchanged.
    '''
    xml_tree = make_xml_tree_from_url('https://www.proteinatlas.org/{}.xml'.format(ensg_id))
    return get_urls_and_info_from_xml(xml_tree)
#     all_urls_and_info += [*urls_and_info]
    

In [11]:
# Genes to download, identified by their Ensembl gene ids.
ensg_ids = ['ENSG00000204209',
           'ENSG00000134057',
           'ENSG00000126602']

# One output directory per gene; '{}' is filled with the gene id.
save_parent = './data/hpa/{}/'

# Column order of each per-gene info.csv.
info_columns = ['ENSG',  'ENSP', 'proteinName', 'antibodyName', 'antigenSequence', 'cellLine',
                'antibodyChannel', 'microtubuleChannel', 'nuclearChannel', 'segmentationChannel']


for ensg_id in tqdm_notebook(ensg_ids):

    save_dir = save_parent.format(ensg_id)
    save_file = os.path.join(save_dir, 'info.csv')

    # info.csv is written only after all images of a gene have been fetched,
    # so its presence means the gene is done and can be skipped.
    if os.path.exists(save_file):
        continue

    os.makedirs(save_dir, exist_ok=True)

    urls_and_info = ensg_to_info(ensg_id)

    # Collect plain dicts and build the DataFrame once at the end.
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas >= 2.0.
    rows = []
    for u in tqdm_notebook(urls_and_info):

        channel_urls = u['imageUrls']['singleChannelUrls']

        antibody_path_local = get_img_and_write_local_and_return_path(channel_urls['antibody'], save_dir)
        microtubule_path_local = get_img_and_write_local_and_return_path(channel_urls['microtubules'], save_dir)
        nuclear_path_local = get_img_and_write_local_and_return_path(channel_urls['nucleus'], save_dir)
        segmentation_path_local = get_img_and_write_local_and_return_path(u['imageUrls']['segmentationUrl'], save_dir)

        rows.append({'ENSG':u['ENSG'],
                     'ENSP':u['ENSP'],
                     'proteinName':u['proteinName'],
                     'antibodyName':u['antibody'],
                     'antigenSequence':u['antigenSequence'],
                     'cellLine':u['cellLine'],
                     'antibodyChannel':antibody_path_local,
                     'microtubuleChannel':microtubule_path_local,
                     'nuclearChannel':nuclear_path_local,
                     'segmentationChannel':segmentation_path_local})

    # Passing columns keeps the header and column order even when rows is empty.
    df = pd.DataFrame(rows, columns=info_columns)
    df.to_csv(save_file)
    
# for xml_url in tqdm_notebook(xml_urls):
#     doc = make_xml_tree_from_url(xml_url)
#     urls_and_info = get_urls_and_info_from_xml(doc)
#     all_urls_and_info += [*urls_and_info]




Exception in thread Thread-4:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.6/site-packages/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/opt/conda/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration




