In [1]:
import concurrent.futures, ctypes, distutils, distutils.sysconfig
import numpy, os, psycopg2, subprocess, sys, tempfile
import time, traceback
import re, shutil, urllib2

In [2]:
# States plus DC.  (Puerto Rico not included)
# Maps lowercase USPS abbreviation -> two-digit FIPS state code (as a string).
# FIPS codes follow the alphabetical order of state *names*, so the tail runs
# Washington 53, West Virginia 54, Wisconsin 55, Wyoming 56.
state_codes = {
    'al': '01', 'ak': '02', 'az': '04', 'ar': '05', 'ca': '06',
    'co': '08', 'ct': '09', 'de': '10', 'dc': '11', 'fl': '12',
    'ga': '13', 'hi': '15', 'id': '16', 'il': '17', 'in': '18',
    'ia': '19', 'ks': '20', 'ky': '21', 'la': '22', 'me': '23',
    'md': '24', 'ma': '25', 'mi': '26', 'mn': '27', 'ms': '28',
    'mo': '29', 'mt': '30', 'ne': '31', 'nv': '32', 'nh': '33',
    'nj': '34', 'nm': '35', 'ny': '36', 'nc': '37', 'nd': '38',
    'oh': '39', 'ok': '40', 'or': '41', 'pa': '42', 'ri': '44',
    'sc': '45', 'sd': '46', 'tn': '47', 'tx': '48', 'ut': '49',
    'vt': '50', 'va': '51', 'wa': '53', 'wv': '54', 'wi': '55',
    'wy': '56'
}

# Sorted FIPS codes and sorted abbreviations (the two lists are NOT index-aligned)
state_ids = sorted(state_codes.values())
state_names = sorted(state_codes.keys())
assert len(state_names) == 51 # 50 states plus DC
assert len(set(state_ids)) == 51 # every state has its own FIPS code

In [3]:
def subprocess_check(*args, **kwargs):
    """Run a command to completion and return its stderr followed by its stdout.

    Positional and keyword arguments are forwarded to subprocess.Popen.  When
    the only positional argument is a single string (byte or unicode) it is
    treated as a shell command line and run with shell=True.

    Returns the captured stderr text concatenated with the captured stdout
    text (a newline is inserted between them when both are non-empty and
    stderr does not already end with one).

    Raises Exception, carrying the return code and both output streams, when
    the command exits with a non-zero status.
    """
    # type('') / type(u'') covers str and unicode on Python 2, str on Python 3
    if len(args) == 1 and isinstance(args[0], (type(''), type(u''))):
        kwargs['shell'] = True
    p = subprocess.Popen(*args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         **kwargs)
    (out, err) = p.communicate()
    ret = p.returncode
    # Pipes yield bytes on Python 3; normalize to the native str type so the
    # newline handling below works on either interpreter (no-op on Python 2)
    if not isinstance(out, str):
        out = out.decode('utf-8', 'replace')
    if not isinstance(err, str):
        err = err.decode('utf-8', 'replace')
    if ret != 0:
        if out and not out.endswith('\n'):
            out += '\n'
        if err and not err.endswith('\n'):
            err += '\n'
        raise Exception(
            ('Call to subprocess_check failed with return code {ret}\n'
             'Standard error:\n{err}'
             'Standard out:\n{out}').format(ret=ret, err=err, out=out))
    if err and out and not err.endswith('\n'):
        err += '\n'
    return err + out

In [4]:
# Database used by psql() and query_psql() when no explicit database is passed
default_psql_database = 'census2010'

def set_default_psql_database(db):
    """Change the module-wide default database name and report the new value.

    db -- database name to use for subsequent psql() / query_psql() calls that
          do not pass database=...
    """
    global default_psql_database
    default_psql_database = db
    # sys.stdout.write (rather than a print statement) matches the other helpers
    sys.stdout.write('set default_psql_database to %s\n' % default_psql_database)

def psql(cmds, database=None):
    """Feed a string of commands to the psql command-line client and echo the result.

    cmds     -- text sent to psql on standard input
    database -- database name passed to psql -d; falls back to
                default_psql_database when empty or None

    The commands, the elapsed wall-clock time and psql's combined stdout and
    stderr are written to sys.stdout.  Nothing is returned and psql's exit
    status is not inspected.
    """
    if not database:
        database = default_psql_database
    started = time.time()
    proc = subprocess.Popen(['psql', '-d', database],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    # Echo the commands first, then push them through psql's stdin while
    # gathering everything it writes to stdout and stderr
    sys.stdout.write('%s\n' % cmds.strip())
    psql_stdout, psql_stderr = proc.communicate(cmds)
    proc.wait()
    elapsed = time.time() - started
    report = (psql_stdout + '\n' + psql_stderr).strip()
    sys.stdout.write('Finished execution in %g secs:\n%s\n' % (elapsed, report))
    
    
def query_psql(query, quiet=False, database=None):
    """Execute a query through psycopg2 and return all result rows.

    query    -- SQL text; it must produce a result set because fetchall()
                is always called
    quiet    -- when false (the default) a timing/row-count line is written
                to sys.stdout
    database -- database name; falls back to default_psql_database

    Returns the list of rows from cursor.fetchall().

    The connection is opened per call and closed before returning without
    ever being committed, so this is only suitable for read-only queries.
    """
    database = database or default_psql_database
    conn = psycopg2.connect(dbname=database, host='/var/run/postgresql')
    try:
        before = time.time()
        cur = conn.cursor()
        try:
            cur.execute(query)
            rows = cur.fetchall()
        finally:
            cur.close()
        elapsed = time.time() - before
    finally:
        # Release the server connection even when the query fails, instead of
        # leaving it open until garbage collection
        conn.close()
    if not quiet:
        sys.stdout.write('Execution of %s\ntook %g seconds and returned %d rows\n' % (query.strip(), elapsed, len(rows)))
    return rows

In [5]:
def get_column_names(table_name, pattern='%'):
    """Return the column names of table_name that match a SQL LIKE pattern.

    The names are looked up in information_schema.columns; the default
    pattern '%' matches every column.  Both arguments are interpolated
    directly into the SQL text.
    """
    sql = ("SELECT column_name "
           "FROM information_schema.columns "
           "WHERE table_name='{table_name}' "
           "AND column_name LIKE '{pattern}'").format(table_name=table_name,
                                                      pattern=pattern)
    names = []
    for row in query_psql(sql):
        names.append(row[0])
    return names

#get_column_names('sf1_2000_p1', pattern='p001%')

# convert e.g. p35e to p035e
def canonicalize_census_table_name(name):
    """Zero-pad the numeric part of a census table name to three digits.

    name -- lowercase letters, then digits, then at most one trailing
            lowercase letter (e.g. 'p1', 'p35e')

    Returns the name with its number formatted as %03d, e.g. 'p035e'.
    Raises ValueError when name does not have that shape.
    """
    match = re.match(r'([a-z]+)(\d+)([a-z])?$', name)
    if match is None:
        # Fail with a readable message instead of AttributeError on None
        raise ValueError('Unrecognized census table name %r' % (name,))
    (prefix, num, suffix) = match.groups()
    # The optional suffix group is None when absent
    return '%s%03d%s' % (prefix, int(num), suffix or '')

def get_census_column_names_from_view(table_name):
    """Return the columns of table_name that belong to its census table.

    The census table name is taken from the text after the last underscore
    of table_name, canonicalized, and used as a column-name prefix.
    """
    last_component = table_name.rsplit('_', 1)[-1]
    column_prefix = canonicalize_census_table_name(last_component)
    like_pattern = column_prefix + '%'
    return get_column_names(table_name, pattern=like_pattern)
    
#print get_census_column_names_from_view('sf1_2000_p35e')
#print get_census_column_names_from_view('sf1_p1')


def get_view_names(pattern='%'):
    """Return the names of views whose name matches a SQL LIKE pattern.

    Names come from INFORMATION_SCHEMA.views; the default pattern '%'
    matches every view.
    """
    sql = ("SELECT table_name "
           "FROM INFORMATION_SCHEMA.views "
           "WHERE table_name LIKE '{pattern}'").format(pattern=pattern)
    view_names = []
    for row in query_psql(sql):
        view_names.append(row[0])
    return view_names

def get_table_names(pattern='%'):
    """Return the names of base tables in the public schema matching a LIKE pattern.

    Names come from INFORMATION_SCHEMA.tables; the default pattern '%'
    matches every table.
    """
    sql = ("SELECT table_name "
           "FROM INFORMATION_SCHEMA.tables "
           "WHERE table_schema='public' "
           "AND table_type='BASE TABLE' "
           "AND table_name LIKE '{pattern}'").format(pattern=pattern)
    table_names = []
    for row in query_psql(sql):
        table_names.append(row[0])
    return table_names

#view_names = get_view_names('sf1_2000_%')

def psql_table_exists(table_name):
    """Return True when a one-row SELECT against table_name succeeds.

    Any ordinary error raised by the probe query (missing table, connection
    failure, ...) is reported as False.  table_name is interpolated directly
    into the SQL text.
    """
    try:
        query_psql('SELECT * FROM {table_name} LIMIT 1'.format(table_name=table_name))
        return True
    except Exception:
        # Not a bare except: KeyboardInterrupt / SystemExit must still propagate
        return False

In [6]:
def download_file(url, filename):
    """Download url into filename unless filename already exists.

    url      -- anything urllib2.urlopen accepts
    filename -- destination path; missing parent directories are created

    The payload is first written to filename + '.tmp' and then renamed, so a
    partially written file never appears under the final name.  Progress
    messages are written to sys.stdout.  Nothing is returned.
    """
    if os.path.exists(filename):
        sys.stdout.write('%s already downloaded\n' % filename)
        return
    # dirname is '' for a bare filename, and os.makedirs('') raises
    dirname = os.path.dirname(filename)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    sys.stdout.write('Downloading %s to %s\n' % (url, filename))
    conn = urllib2.urlopen(url)
    try:
        # Apparently partial reads were seen from www2.census.gov (7/2017), so
        # keep reading until the response reports EOF rather than trusting a
        # single read() to return the whole body
        chunks = []
        while True:
            chunk = conn.read()
            if not chunk:
                break
            chunks.append(chunk)
    finally:
        conn.close()
    data = b''.join(chunks)
    tmp_filename = filename + '.tmp'
    with open(tmp_filename, 'wb') as tmp_file:
        tmp_file.write(data)
    os.rename(tmp_filename, filename)
    sys.stdout.write('Done, wrote %d bytes to %s\n' % (len(data), filename))

def unzip_file(filename):
    """Extract a zip archive into a directory named after it, once.

    filename -- path to the archive; the extraction directory is filename
                with its extension removed

    If that directory already exists nothing is extracted.  Otherwise the
    archive is unpacked with the external `unzip` tool into a '.tmp' sibling
    directory, which is renamed into place only after unzip succeeds.

    Returns the extraction directory path.
    """
    exdir = os.path.splitext(filename)[0]
    if os.path.exists(exdir):
        sys.stdout.write('%s already unzipped\n' % (filename))
    else:
        tmpdir = exdir + '.tmp'
        # Discard leftovers of an earlier interrupted attempt (errors ignored)
        shutil.rmtree(tmpdir, True)
        sys.stdout.write('Unzipping %s into %s\n' % (filename, tmpdir))
        subprocess_check(['unzip', filename, '-d', tmpdir])
        os.rename(tmpdir, exdir)
        sys.stdout.write('Success, created %s\n' % (exdir))
    return exdir
        
def gunzip_file(filename):
    """Decompress a gzip file next to itself, once.

    filename -- path to the compressed file; the destination is filename with
                its last extension removed

    If the destination already exists nothing is done.  Otherwise the external
    `gunzip -c` tool writes into a '.tmp' sibling file, which is renamed into
    place only after gunzip succeeds (subprocess.CalledProcessError is raised
    otherwise).

    Returns the destination path, mirroring unzip_file.
    """
    dest = os.path.splitext(filename)[0]
    if os.path.exists(dest):
        sys.stdout.write('%s already unzipped\n' % (filename))
    else:
        tmp = dest + '.tmp'
        sys.stdout.write('gunzipping %s\n' % (filename))
        # Pass an argument list and redirect stdout ourselves instead of
        # building a shell string, so quotes or spaces in the filename cannot
        # break (or inject into) the command line
        with open(tmp, 'wb') as tmp_file:
            subprocess.check_call(['gunzip', '-c', '--', filename], stdout=tmp_file)
        os.rename(tmp, dest)
        sys.stdout.write('Success, created %s\n' % (dest))
    return dest

In [7]:
# Raises worker exceptions in shutdown

class SimpleThreadPoolExecutor(concurrent.futures.ThreadPoolExecutor):
    """ThreadPoolExecutor that remembers its futures and checks them on shutdown.

    Every future returned by submit() is recorded.  shutdown() waits for all of
    them, logs each worker exception to stderr, and raises a single Exception
    at the end if any job failed; otherwise it returns the list of results in
    completion order.
    """

    def __init__(self, max_workers):
        super(SimpleThreadPoolExecutor, self).__init__(max_workers=max_workers)
        # Every future handed out by submit(), in submission order
        self.futures = []

    def submit(self, fn, *args, **kwargs):
        """Schedule fn(*args, **kwargs), record the future and return it."""
        future = super(SimpleThreadPoolExecutor, self).submit(fn, *args, **kwargs)
        self.futures.append(future)
        return future

    def get_futures(self):
        """Return the list of futures submitted so far (the live list, not a copy)."""
        return self.futures

    def shutdown(self, wait=True, **kwargs):
        """Wait for all submitted jobs, then shut the pool down.

        wait and any further keyword arguments are forwarded to the base class
        shutdown().  Accepting them keeps the override compatible with the
        base class, whose context-manager exit calls shutdown(wait=True).

        With wait true (the default) returns the list of job results in
        completion order, or raises Exception if one or more jobs raised.
        With wait false the pool is shut down without collecting results and
        None is returned.
        """
        if not wait:
            return super(SimpleThreadPoolExecutor, self).shutdown(wait=False, **kwargs)
        exception_count = 0
        results = []
        for completed in concurrent.futures.as_completed(self.futures):
            try:
                results.append(completed.result())
            except Exception:
                exception_count += 1
                sys.stderr.write(
                    'Exception caught in SimpleThreadPoolExecutor.shutdown.  Continuing until all are finished.\n' +
                    'Exception follows:\n' +
                    traceback.format_exc())
        super(SimpleThreadPoolExecutor, self).shutdown(wait=True, **kwargs)
        if exception_count:
            raise Exception('SimpleThreadPoolExecutor failed: %d of %d raised exception' % (exception_count, len(self.futures)))
        sys.stdout.write('SimpleThreadPoolExecutor succeeded: all %d jobs completed\n' % (len(self.futures)))
        return results
        

class SimpleProcessPoolExecutor(concurrent.futures.ProcessPoolExecutor):
    """ProcessPoolExecutor that remembers its futures and checks them on shutdown.

    Every future returned by submit() is recorded.  shutdown() waits for all of
    them, logs each worker exception to stderr, and raises a single Exception
    at the end if any job failed; otherwise it returns the list of results in
    completion order.
    """

    def __init__(self, max_workers):
        super(SimpleProcessPoolExecutor, self).__init__(max_workers=max_workers)
        # Every future handed out by submit(), in submission order
        self.futures = []

    def submit(self, fn, *args, **kwargs):
        """Schedule fn(*args, **kwargs), record the future and return it."""
        future = super(SimpleProcessPoolExecutor, self).submit(fn, *args, **kwargs)
        self.futures.append(future)
        return future

    def get_futures(self):
        """Return the list of futures submitted so far (the live list, not a copy)."""
        return self.futures

    def shutdown(self, wait=True, **kwargs):
        """Wait for all submitted jobs, then shut the pool down.

        wait and any further keyword arguments are forwarded to the base class
        shutdown().  Accepting them keeps the override compatible with the
        base class, whose context-manager exit calls shutdown(wait=True).

        With wait true (the default) returns the list of job results in
        completion order, or raises Exception if one or more jobs raised.
        With wait false the pool is shut down without collecting results and
        None is returned.
        """
        if not wait:
            return super(SimpleProcessPoolExecutor, self).shutdown(wait=False, **kwargs)
        exception_count = 0
        results = []
        for completed in concurrent.futures.as_completed(self.futures):
            try:
                results.append(completed.result())
            except Exception:
                exception_count += 1
                sys.stderr.write(
                    'Exception caught in SimpleProcessPoolExecutor.shutdown.  Continuing until all are finished.\n' +
                    'Exception follows:\n' +
                    traceback.format_exc())
        super(SimpleProcessPoolExecutor, self).shutdown(wait=True, **kwargs)
        if exception_count:
            raise Exception('SimpleProcessPoolExecutor failed: %d of %d raised exception' % (exception_count, len(self.futures)))
        sys.stdout.write('SimpleProcessPoolExecutor succeeded: all %d jobs completed\n' % (len(self.futures)))
        return results


In [33]:
def numpy_atomic_save(dest_filename, array):
    """Save array in .npy format to dest_filename via a temp file and rename.

    dest_filename -- final path, used exactly as given (no extension is
                     appended); missing parent directories are created
    array         -- anything numpy.save accepts

    The data is written to a temporary file in the destination directory and
    renamed over dest_filename only after the write has finished.  If saving
    or renaming fails, the temporary file is removed and the error re-raised.
    """
    dest_dir = os.path.dirname(os.path.abspath(dest_filename))
    if not os.path.isdir(dest_dir):
        try:
            os.makedirs(dest_dir)
        except OSError:
            # Tolerate another writer creating the directory concurrently;
            # any other failure is a real error
            if not os.path.isdir(dest_dir):
                raise
    # Same directory as the destination so the rename stays on one filesystem
    tmp_file = tempfile.NamedTemporaryFile(dir=dest_dir, delete=False)
    try:
        try:
            numpy.save(tmp_file, array)
        finally:
            tmp_file.close()
        os.rename(tmp_file.name, dest_filename)
    except BaseException:
        # Do not leave an orphaned temp file behind
        if os.path.exists(tmp_file.name):
            os.unlink(tmp_file.name)
        raise
    
    #nbytes = os.stat(dest_filename).st_size
    #sys.stdout.write('Wrote {nbytes} bytes to {dest_filename}\n'.format(**locals()))
    
    #nbytes = os.stat(dest_filename).st_size
    #sys.stdout.write('Wrote {nbytes} bytes to {dest_filename}\n'.format(**locals()))

# this gives mutable references, never copies
def to_ctype_reference(x):
    """Wrap a Python buffer as a ctypes object sharing the same memory.

    Supported inputs (matched on exact type):
      bytearray                 -> ctypes array of c_ubyte over the same buffer
      numpy.ndarray             -> result of numpy.ctypeslib.as_ctypes(x)
      numpy.memmap of float32   -> POINTER(c_float) to the mapped data
      non-empty list of the above (all with the same ctypes element type)
                                -> freshly built ctypes array of pointers, one
                                   per list element

    Raises ValueError for an empty list and Exception for any other type
    (including a memmap whose dtype is not float32).
    """
    if type(x)==bytearray:
        return (ctypes.c_ubyte * len(x)).from_buffer(x)
    if type(x)==numpy.ndarray:
        return numpy.ctypeslib.as_ctypes(x)
    # numpy.memmap is the public name of the class; numpy.core is a private
    # namespace that newer NumPy releases deprecate
    if type(x)==numpy.memmap and x.dtype == numpy.float32:
        return x.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    if type(x) is list:
        if not x:
            # No first column to take the element type from
            raise ValueError('Cannot build a ctypes pointer array from an empty list')
        # ASSUMES LIST OF ARRAYS;  construct a small disposable ctypes array of pointers
        # pointing to the underlying arrays
        colptrs = [to_ctype_reference(c) for c in x]
        # Make sure all columns have same type
        coltype = colptrs[0]._type_
        for colptr in colptrs[1:]:
            assert(colptr._type_ == coltype)
        return (ctypes.POINTER(coltype) * len(x))(*colptrs)
    raise Exception('Unknown type %s in to_ctype' % type(x))

def compile_and_load(src):
    """Compile C source text with gcc into a shared object and load it via ctypes.

    src -- C source code as a string

    The source is written to a temporary .c file, compiled with include and
    library paths taken from distutils.sysconfig and linked against
    libpython2.7, and the resulting .so is loaded with
    ctypes.cdll.LoadLibrary.  Both temporary files are removed when the
    function returns.  The gcc command line is echoed to sys.stdout.

    Returns the loaded ctypes library object.  Compilation failures surface
    as the Exception raised by subprocess_check.
    """
    # python won't reload unless we change the name of the so; make
    # certain we always use a new name by incrementing sequence
    global compile_and_load_seq
    try:
        compile_and_load_seq += 1
    except NameError:
        # First call in this kernel: the counter does not exist yet
        compile_and_load_seq = 0
    so_suffix = '-%06d.so' % compile_and_load_seq
    with tempfile.NamedTemporaryFile(suffix='.c') as srcfile, tempfile.NamedTemporaryFile(suffix=so_suffix) as sofile:
        srcfile.write(src)
        # Make sure gcc sees the full source on disk
        srcfile.flush()
        cmd = 'gcc -pthread -shared -rdynamic -fno-strict-aliasing'
        cmd += ' -g -DNDEBUG -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC'

        cmd += ' -I' + distutils.sysconfig.get_python_inc()

        cmd += ' -L' + distutils.sysconfig.get_python_lib()

        # libpython seems to be up 2 directories on Anaconda on hal15
        if distutils.sysconfig.get_python_lib().endswith('site-packages'):
            cmd += ' -L' + distutils.sysconfig.get_python_lib() + '/../..'

        cmd += ' -lpython2.7'

        srcfile_name = srcfile.name
        sofile_name = sofile.name
        cmd += ' {srcfile_name} -o {sofile_name}'.format(srcfile_name=srcfile_name,
                                                        sofile_name=sofile_name)
        sys.stdout.write(cmd + '\n')
        subprocess_check(cmd)
        return ctypes.cdll.LoadLibrary(os.path.abspath(sofile_name))

In [None]:
# Manual smoke test for to_ctype_reference on a read-only float32 memmap.
# Disabled by default; set the flag to True and re-run this cell to try it.
test_to_ctype_reference = False

if test_to_ctype_reference:
    # Write three float32 values to a scratch file in the working directory,
    x = numpy.array([11.0, 22.0, 33.0], dtype=numpy.float32)
    x.tofile('tmp-test.bin')
    # then map the file back read-only and read the first value through ctypes
    # (expected to print 11.0)
    x = numpy.memmap('tmp-test.bin', dtype=numpy.float32, mode='r')
    print to_ctype_reference(x)[0]
    # Remove the scratch file
    os.unlink('tmp-test.bin')

In [34]:
# Manual end-to-end test for compile_and_load and to_ctype_reference.
# Disabled by default; set the flag to True and re-run this cell to try it.
test_compile_and_load = False

if test_compile_and_load:
    # Compile a handful of small C functions, one per argument shape that
    # to_ctype_reference is exercised with below
    foo = compile_and_load("""
int    halve_int   (int x   ) { return x/2; }
double halve_double(double x) { return x/2; }

void halve_bytes_in_place(unsigned char *array, unsigned int len) {
    for (unsigned i = 0; i < len; i++) array[i] /= 2;
}

void halve_doubles_in_place(double *array, unsigned int len) {
    for (unsigned i = 0; i < len; i++) array[i] /= 2;
}

void halve_floats_in_place(float *array, unsigned int len) {
    for (unsigned i = 0; i < len; i++) array[i] /= 2;
}

void multiply_float2d_by_row(float *array, unsigned int nrows, unsigned int ncols) {
    for (unsigned int r = 0; r < nrows; r++) {
        for (unsigned int c = 0; c < ncols; c++) {
            array[r * ncols + c] *= r;
        }
    }
}

int test_2d(float **array) {
    return (int)array[1][2];
}

""")

    # Plain int in, int out: no ctypes declarations are set for this call
    print foo.halve_int(5)

    # Set argument types and return values if not int
    foo.halve_double.argtypes = [ctypes.c_double]
    foo.halve_double.restype = ctypes.c_double
    print foo.halve_double(5.0)
    
    # bytearray: the C function modifies x in place
    x = bytearray([1,2,3,4,5])
    foo.halve_bytes_in_place(to_ctype_reference(x), len(x))
    print [a for a in x]

    # numpy array with the default dtype, passed to the double* function
    x = numpy.array([1.0,2.0,3.0,4.0,5.0])
    foo.halve_doubles_in_place(to_ctype_reference(x), len(x))
    print x

    # float32 numpy array, passed to the float* function
    x = numpy.array([1.0,2.0,3.0,4.0,5.0], dtype=numpy.float32)
    foo.halve_floats_in_place(to_ctype_reference(x), len(x))
    print x

    # 2-D float32 array; the C side indexes it as array[r * ncols + c]
    x = numpy.array([(1.0,2.0),(3.0,4.0),(5.0,6.0)], dtype=numpy.float32)
    foo.multiply_float2d_by_row(to_ctype_reference(x), x.shape[0], x.shape[1])
    print x
    
    # List of float32 arrays, passed as float**; test_2d returns array[1][2]
    # truncated to int
    x = [numpy.array([11.0, 22.0, 33.0], dtype=numpy.float32),
         numpy.array([111.0, 222.0, 333.0], dtype=numpy.float32)]
    print foo.test_2d(to_ctype_reference(x))

In [None]:
def stopwatch(name):
    """Generator-based timer meant to be driven by a one-pass for loop.

    Yields None exactly once; when the loop asks for the next item, the time
    elapsed since the generator started running is written to sys.stdout as
    '<name> took <seconds> seconds'.
    """
    began = time.time()
    yield None
    elapsed = time.time() - began
    sys.stdout.write('%s took %.1f seconds\n' % (name, elapsed))
    
#for _ in stopwatch('Sleeping for half a second'):
#    time.sleep(0.5)

In [None]:
class Stopwatch:
    """Context manager that reports how long its with-block took.

    On exit, '<name> took <seconds> seconds' is written to sys.stdout and the
    elapsed time in seconds is stored on the instance as `elapsed`.
    Exceptions raised inside the block are not suppressed.
    """
    def __init__(self, name):
        self.name = name
    def __enter__(self):
        self.start = time.time()
        # Return the instance so `with Stopwatch(...) as sw` binds something useful
        return self
    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.elapsed = time.time() - self.start
        sys.stdout.write('%s took %.1f seconds\n' % (self.name, self.elapsed))

#with Stopwatch('Sleeping for half a second') as _:
#    time.sleep(0.5)