In [45]:
import numpy as np
from skimage.io import imread
import pandas as pd
from StringIO import StringIO
from dicom import read_file
class PyqaeContext(object):
    """
    The primary context for performing PYQAE functions
    """
    def __init__(self, cur_sc = None, faulty_io = 'FAIL', retry_att = 5, *args, **kwargs):
        """
        Create or initialize a new Pyqae Context
        
        Parameters
        ----------
        cur_sc : SparkContext
            An existing initialized SparkContext, if none a new one is initialized with the other parameters.
        faulty_io : String
            A string indicating what should happen if a file is missing (FAIL, RETRY, or return an EMPTY value)
        retry_att : Int
            The number of times a retry should be attempted (if faulty_io is in mode RETRY otherwise ignored)
        """
        assert faulty_io in ['FAIL', 'RETRY', 'EMPTY'], "Faulty IO must be in the list of FAIL, RETRY, or EMPTY"
        assert retry_att>0, "Retry attempt must be greater than 0"
        self.faulty_io = faulty_io
        self.retry_att = retry_att
        if cur_sc is None: 
            from pyspark import SparkContext
            self._cur_sc = SparkContext(*args, **kwargs)
        else:
            self._cur_sc = cur_sc
    
    @staticmethod
    def _wrapIOCalls(method, faulty_io, retry_att):
        """
        A general wrapper for IO calls which should be retried or returned empty 
        
        """
        assert faulty_io in ['FAIL', 'RETRY', 'EMPTY']
        assert retry_att > 0, "Retry attempts should be more than 0, {}".format(retry_att)
        if faulty_io == 'FAIL':
            return method
        else:
            def wrap_method(*args, **kwargs):
                if faulty_io == 'RETRY': max_iter = retry_att-1
                else: max_iter = 1
                
                for i in range(max_iter):
                    try:
                        return method(*args,**kwargs)
                    except:
                        if faulty_io == 'EMPTY': return None
                # if it still hasn't passed throw the error
                return method(*args,**kwargs)
            return wrap_method
    
    @staticmethod
    def readBinaryBlobAsImageArray(iblob):
        return imread(StringIO(iblob))
    
    @staticmethod
    def readBinaryBlobAsDicomArray(iblob):
        sio_blob = BytesIO(iblob)
        return read_file(sio_blob)
    
    @staticmethod
    def imageTableToDataFrame(imt_rdd):
        return imt_rdd.map(lambda x: dict(list(x[0].iteritems())+[('image_data',x[1].tolist())])).toDF()
    
    
    def readImageDirectory(self, path, parts = 100):
        """
        Read a directory of images
        
        Parameters
        ----------
        path : String
            A path with wildcards for the images files can be prefixed with (s3, s3a, or a shared directory)
        """
        read_fun = PyqaeContext._wrapIOCalls(PyqaeContext.readBinaryBlobAsImageArray, self.faulty_io, self.retry_att)
        return self._cur_sc.binaryFiles(path, parts).mapValues(read_fun)
    
    def readDicomDirectory(self, path, parts = 100):
        """
        Read a directory of dicom files
        
        Parameters
        ----------
        path : String
            A path with wildcards for the images files can be prefixed with (s3, s3a, or a shared directory)
        """
        read_fun = PyqaeContext._wrapIOCalls(PyqaeContext.readBinaryBlobAsDicomArray, self.faulty_io, self.retry_att)
        return self._cur_sc.binaryFiles(path, parts).mapValues(read_fun)
    
    def readImageTable(self, path, col_name, im_path_prefix = '', parts = 100, read_table_func = pd.read_csv):
        """
        Read a table from images from a csv file
        
        Parameters
        ----------
        path : String
            A path to the csv file
        col_name : String
            The name of the column containing the path to individual images
        im_path_prefix : String
            The prefix to append to the path in the text file so it is opened correctly (default empty)
        read_table_func: Function (String -> Pandas DataFrame)
            The function to read the table from a file-buffer object (default is the read_csv function)
        """
        c_file = self._cur_sc.wholeTextFiles(path,1)
        assert c_file.count()==1, "This function only support a single file at the moment"
        full_table_buffer = StringIO("\n".join(c_file.map(lambda x: x[1]).collect()))
        image_table = read_table_func(full_table_buffer)
        image_paths = [os.path.join(im_path_prefix,cpath) for cpath in image_table[col_name]]
        
        rawimg_rdd = self._cur_sc.binaryFiles(",".join(image_paths),parts)
        read_fun = PyqaeContext._wrapIOCalls(PyqaeContext.readBinaryBlobAsImageArray, self.faulty_io, self.retry_att)
        img_rdd = rawimg_rdd.mapValues(read_fun)
        # add the file prefix so the keys come up in the map operation
        image_paths = ['file:{}'.format(cpath) if cpath.find(':')<0 else cpath for cpath in image_paths]
        image_list = dict(zip(image_paths,image_table.T.to_dict().values()))
        
        return img_rdd.map(lambda x: (image_list[x[0]],x[1]))
    

In [2]:
pq_context = PyqaeContext(sc)
im_files = pq_context.readImageDirectory('/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/openi_images/*.png')
im_files.mapValues(lambda x: x.shape).first()

(u'file:/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/openi_images/00000-CXR1005.png',
 (420, 512, 3))

In [44]:
from glob import glob
cf_file = glob('/Users/mader/Dropbox/4Quant/Projects/TumorSegmentation/10092825/0013_t1_tse_tra_+c/*.dcm')[0]
with open(cf_file,'r') as ifile:
    sdata = StringIO(ifile.read())

read_file(BytesIO(sdata.readlines()[0]))

(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'M', 'NORM', 'DIS2D', 'FM', 'FIL']
(0008, 0012) Instance Creation Date              DA: '20140606'
(0008, 0013) Instance Creation Time              TM: '114502.640000'
(0008, 0016) SOP Class UID                       UI: MR Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.3.12.2.1107.5.2.32.35424.2014060611450240186479552
(0008, 0020) Study Date                          DA: '20140606'
(0008, 0021) Series Date                         DA: '20140606'
(0008, 0022) Acquisition Date                    DA: '20140606'
(0008, 0023) Content Date                        DA: '20140606'
(0008, 0030) Study Time                          TM: '111723.734000'
(0008, 0031) Series Time                         TM: '114502.562000'
(0008, 0032) Acquisition Time                    TM: '114327.532500'
(0008, 0033) Content Time                        TM: '114502.640000'
(0008, 0050) Accession Number                  

In [38]:
from io import BytesIO
sc.binaryFiles('/Users/mader/Dropbox/4Quant/Projects/TumorSegmentation/10092825/0002_t2_blade_tra/*.dcm').mapValues(lambda x: read_file(BytesIO(x))).first()

TypeError: Dataset contents must be DataElement instances.
To set a data_element value use data_element.value=val

In [46]:
pq_context3 = PyqaeContext(sc)
dcm_files = pq_context3.readDicomDirectory('/Users/mader/Dropbox/4Quant/Projects/TumorSegmentation/10092825/0002_t2_blade_tra/*.dcm')
dcm_files.take(2)

TypeError: Dataset contents must be DataElement instances.
To set a data_element value use data_element.value=val

In [12]:
pq_context2 = PyqaeContext(sc)
dim_files = pq_context2.readImageTable('/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/openi_db_path.csv',
                         'local_path',
                         im_path_prefix = '/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/')
dim_files.mapValues(lambda x: x.shape).first()

({'Unnamed: 0': 0,
  'abstract': '<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>',
  'caption': 'Chest, 2 views, frontal and lateral',
  'image_id': 'F1',
  'local_path': 'openi_images/00000-CXR1005.png',
  'major': 'normal',
  'minor': nan,
  'problem': 'normal',
  'row': 0,
  'uid': 'CXR1005',
  'url': '/imgs/512/203/1005/CXR1005_IM-0006-1001.png'},
 (420, 512, 3))

In [107]:
dim_files.count()

7619

In [125]:
d_table = pq_context2.imageTableToDataFrame(dim_files)
d_table

DataFrame[Unnamed: 0: bigint, abstract: string, caption: string, image_data: array<array<array<bigint>>>, image_id: string, local_path: string, major: string, minor: double, problem: string, row: bigint, uid: string, url: string]

In [49]:
for cpath, c_record in zip(test_table.head()['local_path'],test_table.head().to_records()):
    print(cpath,c_record)

('openi_images/00000-CXR1005.png', (0, 0, '<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>', 'Chest, 2 views, frontal and lateral', 'F1', 'normal', nan, 'normal', 'CXR1005', '/imgs/512/203/1005/CXR1005_IM-0006-1001.png', 0, 'openi_images/00000-CXR1005.png'))
('openi_images/00001-CXR1005.png', (1, 1, '<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>', 'Chest, 2 views, frontal and lateral', 'F2', 'normal', nan, 'normal', 'CXR1005', '/imgs/512/203/1005/CXR1005_IM-0006-3003.png', 1, 'openi_images/00001-CXR1005.png'))
('openi_images/00002-CXR1002.png', (2, 2, '<p><b>Indication: </b>History of chest pain</p><p><

In [23]:
?read_file

In [54]:
?sc.binaryFiles

In [55]:
?sc.binaryRecords

In [127]:
import requests
?requests.get

In [135]:
a=np.zeros((3,3,3))
import urllib
urllib.parse

In [None]:
import requests
import json
import numpy as np
try:
    import urlparse
except: # for python 3
    from urllib import parse as urlparse
class DRESTAccess(object):
    """
    A distributed access to a REST interface
    """
    
    def __init__(self, base_url, fetch_path, def_args, verbose = False):
        self.base_url = base_url
        self.fetch_path = fetch_path
        self.def_args = def_args
        self.verbose = verbose
        
    @staticmethod
    def jsonRequest(req_url, args):
        response = requests.get(req_url, args)
        if response.ok:
            return json.loads(response.content)
        raise ValueError("{} could not be processed correctly".format(req_url),args)
    
    @staticmethod
    def bufferRequest(req_url, args):
        response = requests.get(req_url, args)
        if response.ok:
            return StringIO(response.content)
        raise ValueError("{} could not be processed correctly".format(req_url),args)
    
    def pull_results(self, **args):
        full_url = urlparse.urljoin(self.base_url,self.fetch_path)
        new_param = dict(self.def_args + list(args.iteritems()))
        print(full_url, new_param)
        return DRESTAccess.jsonRequest(full_url, new_param)
    
    def parallel_pull(self, sc, arg_list, parts = 10):
        return sc.parallelize(arg_list).map(lambda x: self.pull_results(**x))
    

class OpenIDB(DRESTAccess):
    def __init__(self, step_count = 50):
        self.step_count = step_count
        DRESTAccess.__init__(self,
                         base_url = "https://openi.nlm.nih.gov", 
                        fetch_path = "retrieve.php",
                        def_args = [])
    
    def db_query(self, sc, **args):
        base_args = list(args.iteritems())
        test_query = self.pull_results(**dict(base_args + [('m',1), ('n',1)]))
        m_range = np.arange(1,test_query['total'],self.step_count)
        n_range = np.append(m_range[1:],test_query['total'])
        qry_rdd = self.parallel_pull(sc, [dict(base_args + [('m',m), ('n',n)]) for m,n in zip(m_range,n_range)])
        return qry_rdd.flatMap(lambda x: x['list'])
    
    @staticmethod
    def format_entry(ie):
        return {
            'uid': ie['uid'],
            'major': ";".join(ie['MeSH']['major']), 
               'minor': ";".join(ie['MeSH']['minor']), 
               'problem': ie['Problems'],
              'abstract':ie['abstract'],
               'caption':ie['image']['caption'],
               'image_id':ie['image']['id'],
            'url': ie['imgLarge']
              }
    
    def get_collection(self,sc, coll='cxr', it='xg', lic='byncnd', **args):
        """
        Fetch an entire collection of images as a dataframe
        """
        study_results = self.db_query(sc, **dict(list(args.iteritems())+
                                                 [('coll',coll), ('it',it), ('lic', lic)]))
        return study_results.map(OpenIDB.format_entry).toDF()

In [203]:
odb = OpenIDB()
#odb.pull_results(m=1, n=1, coll='cxr', it='xg', lic='byncnd')

In [197]:
all_results = odb.db_query(sc, coll='cxr', it='xg', lic='byncnd')

('https://openi.nlm.nih.gov/retrieve.php', {'coll': 'cxr', 'n': 1, 'm': 1, 'it': 'xg', 'lic': 'byncnd'})


In [198]:
all_results.first()

{u'MeSH': {u'major': [u'normal'], u'minor': []},
 u'Outcome': [{u'#text': u'', u'@score': u'-0.102'}],
 u'Problems': u'normal',
 u'abstract': u'<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>',
 u'affiliate': u'Indiana University',
 u'authors': u'Kohli MD, Rosenman M',
 u'ccLicense': u'byncnd',
 u'detailedQueryURL': u'retrieve.php?img=CXR1005_IM-0006-1001&query=&it=xg&coll=cxr&lic=byncnd&req=4',
 u'docSource': u'CXR',
 u'fulltext_html_url': u'',
 u'getArticleFigures': u'retrieve.php?uid=CXR1005&req=5',
 u'image': {u'caption': u'Chest, 2 views, frontal and lateral',
  u'id': u'F1',
  u'mention': u'',
  u'modalityMajor': u'x'},
 u'imgGrid150': u'/imgs/150/203/1005/CXR1005_IM-0006-1001.png',
 u'imgLarge': u'/imgs/512/203/1005/CXR1005_IM-0006-1001.png',
 u'imgThumb': u'/imgs/100/203/1005/CXR1005_IM-

In [204]:
nw_results = odb.get_collection(sc) 

('https://openi.nlm.nih.gov/retrieve.php', {'coll': 'cxr', 'n': 1, 'm': 1, 'it': 'xg', 'lic': 'byncnd'})


In [205]:
nw_results.first()

Row(abstract=u'<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>', caption=u'Chest, 2 views, frontal and lateral', image_id=u'F1', major=u'normal', minor=u'', problem=u'normal', uid=u'CXR1005', url=u'/imgs/512/203/1005/CXR1005_IM-0006-1001.png')

In [206]:
nw_results.head()

Row(abstract=u'<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>', caption=u'Chest, 2 views, frontal and lateral', image_id=u'F1', major=u'normal', minor=u'', problem=u'normal', uid=u'CXR1005', url=u'/imgs/512/203/1005/CXR1005_IM-0006-1001.png')

In [221]:
nw_results.registerTempTable("LungStudy")

In [None]:
sqlContext.sql("Hey")