In [104]:
import os
from io import BytesIO
from StringIO import StringIO

import numpy as np
import pandas as pd
from skimage.io import imread
class PyqaeContext(object):
    """
    The primary context for performing PYQAE functions.

    Holds a SparkContext together with a policy (``faulty_io`` / ``retry_att``)
    describing how image-reading failures are handled, and offers helpers that
    load images into RDDs.
    """
    def __init__(self, cur_sc = None, faulty_io = 'FAIL', retry_att = 5, *args, **kwargs):
        """
        Create or initialize a new Pyqae Context

        Parameters
        ----------
        cur_sc : SparkContext
            An existing initialized SparkContext, if none a new one is initialized with the other parameters.
        faulty_io : String
            A string indicating what should happen if a file is missing (FAIL, RETRY, or return an EMPTY value)
        retry_att : Int
            The number of times a retry should be attempted (if faulty_io is in mode RETRY otherwise ignored)
        *args, **kwargs
            Forwarded unchanged to the SparkContext constructor when ``cur_sc`` is None.

        Raises
        ------
        AssertionError
            If ``faulty_io`` is not one of the supported modes or ``retry_att`` is not positive.
        """
        assert faulty_io in ['FAIL', 'RETRY', 'EMPTY'], "Faulty IO must be in the list of FAIL, RETRY, or EMPTY"
        assert retry_att>0, "Retry attempt must be greater than 0"
        self.faulty_io = faulty_io
        self.retry_att = retry_att
        if cur_sc is None:
            # Imported lazily so the class can be used with an injected context
            # without importing pyspark at definition time.
            from pyspark import SparkContext
            self._cur_sc = SparkContext(*args, **kwargs)
        else:
            self._cur_sc = cur_sc

    @staticmethod
    def _wrapIOCalls(method, faulty_io, retry_att):
        """
        A general wrapper for IO calls which should be retried or returned empty

        Parameters
        ----------
        method : callable
            The IO function to protect.
        faulty_io : String
            'FAIL'  -> ``method`` is returned untouched, errors propagate.
            'RETRY' -> ``method`` is attempted at most ``retry_att`` times in total;
                       the error of the last attempt propagates.
            'EMPTY' -> ``method`` is attempted once and None is returned on error.
        retry_att : Int
            Total number of attempts in RETRY mode (must be positive).

        Returns
        -------
        callable
            Either ``method`` itself (FAIL) or a wrapper with the same call signature.
        """
        assert faulty_io in ['FAIL', 'RETRY', 'EMPTY']
        assert retry_att > 0, "Retry attempts should be more than 0, {}".format(retry_att)
        if faulty_io == 'FAIL':
            return method

        if faulty_io == 'EMPTY':
            def wrap_method(*args, **kwargs):
                try:
                    return method(*args, **kwargs)
                except Exception:
                    # Only ordinary errors are converted to an empty value;
                    # KeyboardInterrupt / SystemExit must still stop the job.
                    return None
            return wrap_method

        def wrap_method(*args, **kwargs):
            # retry_att - 1 guarded attempts, then one unguarded attempt so the
            # final failure reaches the caller with its original traceback.
            for _ in range(retry_att - 1):
                try:
                    return method(*args, **kwargs)
                except Exception:
                    continue
            return method(*args, **kwargs)
        return wrap_method

    @staticmethod
    def readBinaryBlobAsImageArray(iblob):
        """
        Decode the raw bytes of a single image file with ``imread``.

        Parameters
        ----------
        iblob : bytes
            The complete binary content of one image file.

        Returns
        -------
        Whatever ``imread`` returns for the decoded image.
        """
        # BytesIO is the binary-safe in-memory file on both Python 2 and 3;
        # a text StringIO rejects byte strings on Python 3.
        return imread(BytesIO(iblob))


    def readImageDirectory(self, path, parts = 100):
        """
        Read a directory of images

        Parameters
        ----------
        path : String
            A path with wildcards for the images files can be prefixed with (s3, s3a, or a shared directory)
        parts : Int
            Passed as the second argument of ``SparkContext.binaryFiles``
            (presumably the minimum number of partitions; confirm against the Spark docs).

        Returns
        -------
        RDD of (file path, decoded image) pairs; the image is None for unreadable
        files when the context is in EMPTY mode.
        """
        read_fun = PyqaeContext._wrapIOCalls(PyqaeContext.readBinaryBlobAsImageArray, self.faulty_io, self.retry_att)
        return self._cur_sc.binaryFiles(path, parts).mapValues(read_fun)

    def readImageTable(self, path, col_name, im_path_prefix = '', parts = 100, read_table_func = pd.read_csv):
        """
        Read a table from images from a csv file

        Parameters
        ----------
        path : String
            A path to the csv file
        col_name : String
            The name of the column containing the path to individual images
        im_path_prefix : String
            The prefix to append to the path in the text file so it is opened correctly (default empty)
        parts : Int
            Passed as the second argument of ``SparkContext.binaryFiles`` when loading the images.
        read_table_func: Function (String -> Pandas DataFrame)
            The function to read the table from a file-buffer object (default is the read_csv function)

        Returns
        -------
        RDD of (row record as dict, decoded image) pairs.

        Raises
        ------
        AssertionError
            If ``path`` does not resolve to exactly one file.
        """
        c_file = self._cur_sc.wholeTextFiles(path,1)
        assert c_file.count()==1, "This function only support a single file at the moment"
        full_table_buffer = StringIO("\n".join(c_file.map(lambda x: x[1]).collect()))
        image_table = read_table_func(full_table_buffer)

        # Build a real list (not a lazy map object): it is consumed twice below,
        # once for the lookup table and once for the binaryFiles path string.
        image_paths = []
        for cpath in image_table[col_name]:
            full_path = os.path.join(im_path_prefix, cpath)
            # Paths without a scheme are given an explicit 'file:' prefix so they
            # match the keys that binaryFiles reports for local files.
            if ':' not in full_path:
                full_path = 'file:{}'.format(full_path)
            image_paths.append(full_path)

        # 'records' yields one dict per row in row order, so the pairing with
        # image_paths does not depend on dict ordering or on a unique index.
        image_dict = dict(zip(image_paths, image_table.to_dict('records')))

        rawimg_rdd = self._cur_sc.binaryFiles(",".join(image_paths),parts)
        read_fun = PyqaeContext._wrapIOCalls(PyqaeContext.readBinaryBlobAsImageArray, self.faulty_io, self.retry_att)
        img_rdd = rawimg_rdd.mapValues(read_fun)
        return img_rdd.map(lambda x: (image_dict[x[0]],x[1]))

In [105]:
# Smoke test: load every PNG matching the glob and show the first (path, shape) pair.
# `sc` must already exist in the kernel; it is not defined in the cells shown here.
openi_png_glob = '/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/openi_images/*.png'
pq_context = PyqaeContext(sc)
im_files = pq_context.readImageDirectory(openi_png_glob)
im_files.mapValues(lambda img: img.shape).first()

(u'file:/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/openi_images/00000-CXR1005.png',
 (420, 512, 3))

(u'file:/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/openi_images/00000-CXR1005.png',
 (420, 512, 3))

In [106]:
# Smoke test: load the images listed in the 'local_path' column of the CSV and
# show the first (row record, image shape) pair.
openi_root = '/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/'
openi_table_csv = '/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/openi_db_path.csv'
pq_context2 = PyqaeContext(sc)
dim_files = pq_context2.readImageTable(openi_table_csv, 'local_path', im_path_prefix = openi_root)
dim_files.mapValues(lambda img: img.shape).first()

({'Unnamed: 0': 0,
  'abstract': '<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>',
  'caption': 'Chest, 2 views, frontal and lateral',
  'image_id': 'F1',
  'local_path': 'openi_images/00000-CXR1005.png',
  'major': 'normal',
  'minor': nan,
  'problem': 'normal',
  'row': 0,
  'uid': 'CXR1005',
  'url': '/imgs/512/203/1005/CXR1005_IM-0006-1001.png'},
 (420, 512, 3))

In [107]:
# Number of (row record, image) pairs in the table-driven RDD built above.
dim_files.count()

7619

In [49]:
# Exploration: pair each image path in the first rows with its full record tuple.
# NOTE(review): `test_table` is not defined in the cells shown here; it must come from earlier kernel state.
head_rows = test_table.head()
path_record_pairs = zip(head_rows['local_path'], head_rows.to_records())
for cpath, c_record in path_record_pairs:
    print(cpath,c_record)

('openi_images/00000-CXR1005.png', (0, 0, '<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>', 'Chest, 2 views, frontal and lateral', 'F1', 'normal', nan, 'normal', 'CXR1005', '/imgs/512/203/1005/CXR1005_IM-0006-1001.png', 0, 'openi_images/00000-CXR1005.png'))
('openi_images/00001-CXR1005.png', (1, 1, '<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>', 'Chest, 2 views, frontal and lateral', 'F2', 'normal', nan, 'normal', 'CXR1005', '/imgs/512/203/1005/CXR1005_IM-0006-3003.png', 1, 'openi_images/00001-CXR1005.png'))
('openi_images/00002-CXR1002.png', (2, 2, '<p><b>Indication: </b>History of chest pain</p><p><

In [60]:
# Exploration: check that binaryFiles accepts a comma-separated list of explicit paths
# by loading the first few images from the table and showing the first (path, bytes) pair.
sample_paths = [os.path.join('/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/',i) for i in test_table.head()['local_path']]
sc.binaryFiles(",".join(sample_paths)).first()

(u'file:/Users/mader/Dropbox/4Quant/Projects/PACScrawlertools/openi_images/00000-CXR1005.png',
 '\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\x00\x00\x00\x01\xa4\x08\x02\x00\x00\x00\xc8\x7f\n\xc8\x00\x01\x00\x00IDATx\x9c\xcc\xfd\xdb\xb2%Kv\x95\x8d\xba{\xc4\x18\xf3\x9c\x99kU\x95\xaaT\xa8$q00a\x18\x06f\xdc\xf2\x10\xbc\r/\xc3\x0bp\xcd\x13\xc0=\xb7\x18&\x89_\x07T\x92PU\xad\xb5r\x9e\xe7\x88p\xdf\x17\xad\xda\xb7Z\xc4\x98Y\xd2\xde\x1b\xfd\x10\x17i#\xc7\x8c\x11\xe1\x87\xde[?w\xaf\xff\xe4\x9f\xfc\x93\xd6Z\xadu\x8cQ\xbep\xf5\xde\xf5\xd71Fk\x8d\xef\xf3\'\xeb\xba\xbe\xfb\xdb/=v\x8c\xc13K)\xb5V}\xee\xbd\xef\xee\xe16>\xe7\xcf\xf3~\x8dS\xffj\xa8\xd34\xb5\xd6\xf2\xb1\xbd\xf7Zk\xef]\x03^\xd7\xb5\xd6:\xcf\xf3\xe5\xe5\xe5<\xcf\x87\xc3a\x9eg\x8dGkRk\xd5st\x9b\xfe\xab?5_\xeb\xba\xea\x83f\x91??\x1c\x0e\xd34\x8d1\xf4\xaf\xc6\xac5\xacqiF\xf9p=S\x7f\xe2\xd5\x9a\x05_\xe6z\xf2CM\x90w\xe5g\x96\xeex<N\xd3\xf4\xfa\xfa\xfa\xfc\xfc\xdcZ\xd3O\xf4\x84\xab\xab\xabO\x9f>\xcd\xf3|qq1M\x93&\xae\xb1\xe9\xbfz&s\xd7\xf7\x1

In [54]:
# IPython help lookup for sc.binaryFiles (interactive exploration only; not needed for the analysis).
?sc.binaryFiles

In [55]:
# IPython help lookup for sc.binaryRecords (interactive exploration only; not needed for the analysis).
?sc.binaryRecords

In [100]:
# Exploration: view the first rows as one dict per row (column name -> value).
head_by_index = test_table.head().transpose().to_dict()
list(head_by_index.values())

[{'Unnamed: 0': 0,
  'abstract': '<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>',
  'caption': 'Chest, 2 views, frontal and lateral',
  'image_id': 'F1',
  'local_path': 'openi_images/00000-CXR1005.png',
  'major': 'normal',
  'minor': nan,
  'problem': 'normal',
  'row': 0,
  'uid': 'CXR1005',
  'url': '/imgs/512/203/1005/CXR1005_IM-0006-1001.png'},
 {'Unnamed: 0': 1,
  'abstract': '<p><b>Comparison: </b>None.</p><p><b>Indication: </b>Pruritic.</p><p><b>Findings: </b>Cardiac and mediastinal contours are within normal limits. The lungs are clear. Bony structures are intact.</p><p><b>Impression: </b>No acute findings.</p>',
  'caption': 'Chest, 2 views, frontal and lateral',
  'image_id': 'F2',
  'local_path': 'openi_images/00001-CXR1005.png',
  'major': 'normal',
  'minor': nan,
  'problem': '