In [None]:
import affine, concurrent, cStringIO, glob, IPython, json, os, PIL
import rasterio, shapely, shutil, subprocess, sys, thread, time, traceback

#!conda install -y -c conda-forge geopandas
import numpy as np
import pandas as pd
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame

#!conda install -y rasterio
import rasterio
import rasterio.features

%matplotlib inline

In [2]:
class SimpleProcessPoolExecutor(concurrent.futures.ProcessPoolExecutor):
    """Process pool that tracks every submitted future and reports failures at shutdown.

    Every future returned by ``submit`` is remembered, and ``shutdown`` waits
    for all of them, logs each failure to stderr, and raises a single summary
    exception if any job failed.

    NOTE: the base class is looked up as ``concurrent.futures.ProcessPoolExecutor``,
    so the ``concurrent.futures`` submodule must already be loaded; a bare
    ``import concurrent`` does not load it by itself.
    """

    def __init__(self, max_workers):
        """Create the pool with ``max_workers`` worker processes."""
        super(SimpleProcessPoolExecutor, self).__init__(max_workers=max_workers)
        # Every future handed out by submit(), in submission order.
        self.futures = []

    def submit(self, fn, *args, **kwargs):
        """Schedule ``fn(*args, **kwargs)`` and remember the returned future."""
        future = super(SimpleProcessPoolExecutor, self).submit(fn, *args, **kwargs)
        self.futures.append(future)
        return future

    def get_futures(self):
        """Return the list of all futures submitted so far (the live list, not a copy)."""
        return self.futures

    def shutdown(self, wait=True):
        """Wait for all submitted jobs, then shut the pool down.

        Args:
            wait: Forwarded to the base class ``shutdown``. It is accepted so
                that the inherited context-manager protocol, which calls
                ``shutdown(wait=True)``, works. All tracked futures are always
                awaited before the pool is shut down, regardless of this flag.

        Returns:
            List of job results in completion order (not submission order).

        Raises:
            Exception: If one or more jobs raised. Each individual traceback
                is written to stderr as it is encountered; the pool is shut
                down before the summary exception is raised.
        """
        exception_count = 0
        results = []
        for completed in concurrent.futures.as_completed(self.futures):
            try:
                results.append(completed.result())
            except Exception:
                # Keep draining the remaining futures so one bad job does not
                # hide the outcome of the others.
                exception_count += 1
                sys.stderr.write(
                    'Exception caught in SimpleProcessPoolExecutor.shutdown.  Continuing until all are finished.\n' +
                    'Exception follows:\n' +
                    traceback.format_exc())
        super(SimpleProcessPoolExecutor, self).shutdown(wait=wait)
        if exception_count:
            raise Exception('SimpleProcessPoolExecutor failed: %d of %d raised exception' % (exception_count, len(self.futures)))
        sys.stdout.write('SimpleProcessPoolExecutor succeeded: all %d jobs completed\n' % (len(self.futures)))
        return results

def subprocess_check(*args, **kwargs):
    """Run a command, capture its output, and raise if it exits non-zero.

    Positional and keyword arguments are forwarded to ``subprocess.Popen``.
    When the only positional argument is a string (rather than an argument
    list), the command is run through the shell (``shell=True`` is forced).
    stdout and stderr are always captured through pipes.

    Returns:
        Captured stderr followed by captured stdout, as text. If both are
        non-empty, a newline is inserted after stderr when it lacks one.

    Raises:
        Exception: If the command exits with a non-zero return code. The
            message contains the return code and both captured streams.
    """
    try:
        string_types = basestring  # Python 2: matches both str and unicode
    except NameError:
        string_types = str  # Python 3
    if len(args) == 1 and isinstance(args[0], string_types):
        kwargs['shell'] = True
    p = subprocess.Popen(*args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         **kwargs)
    (out, err) = p.communicate()
    # communicate() already waited for the process; no second wait() needed.
    ret = p.returncode

    # Where pipes yield bytes rather than str, decode so the newline handling
    # and message formatting below operate on text. No-op when already str.
    if not isinstance(out, str):
        out = out.decode('utf-8', 'replace')
    if not isinstance(err, str):
        err = err.decode('utf-8', 'replace')

    if ret != 0:
        if out and not out.endswith('\n'):
            out += '\n'
        if err and not err.endswith('\n'):
            err += '\n'
        raise Exception(
            ('Call to subprocess_check failed with return code {ret}\n'
             'Standard error:\n{err}'
             'Standard out:\n{out}').format(ret=ret, err=err, out=out))
    # Keep stderr and stdout on separate lines when both are present.
    if err and out and not err.endswith('\n'):
        err += '\n'
    return err + out

def unzip_file(filename):
    """Extract a zip archive into a directory named after it, once.

    The target directory is ``filename`` with its extension stripped. If that
    path already exists, nothing is extracted. Otherwise the archive is
    unpacked into ``<target>.tmp`` (any stale copy is removed first) and that
    directory is renamed to the target only after ``unzip`` succeeds.

    Args:
        filename: Path to the zip archive.

    Returns:
        Path of the extraction directory.
    """
    exdir, _extension = os.path.splitext(filename)

    # Guard clause: a previous run already produced the final directory.
    if os.path.exists(exdir):
        sys.stdout.write('%s already unzipped\n' % (filename))
        return exdir

    tmpdir = exdir + '.tmp'
    # Discard leftovers from an interrupted earlier attempt; errors are ignored.
    shutil.rmtree(tmpdir, ignore_errors=True)
    sys.stdout.write('Unzipping %s into %s\n' % (filename, tmpdir))
    unzip_command = ['unzip', filename, '-d', tmpdir]
    subprocess_check(unzip_command)
    os.rename(tmpdir, exdir)
    sys.stdout.write('Success, created %s\n' % exdir)
    return exdir

In [None]:
# Unzip every archive matching redlist/*.zip, using up to 10 worker processes.
exe = SimpleProcessPoolExecutor(max_workers=10)

for zipfile in glob.glob('redlist/*.zip'):
    # unzip_file skips archives whose extraction directory already exists.
    exe.submit(unzip_file, zipfile)

# shutdown() waits for every submitted job and raises if any of them failed.
exe.shutdown()
# Presumably here so the cell displays no result value.
None

In [3]:
def create_species_geojson(key, dest):
    """Write the merged range geometry of one species to a GeoJSON file.

    Reads the module-level globals ``species`` (key -> list of row indices)
    and ``cat`` (the frame whose ``geometry`` column is indexed by those row
    indices); both must be set before this function is called.

    When the species has more than one geometry, they are merged with
    ``shapely.ops.unary_union``. If the union fails, a message is written to
    stdout and no file is created (best effort; the error is not re-raised).

    The JSON is written to a temporary file unique to this thread and process
    and then renamed to ``dest``, so ``dest`` never holds a partial file.

    Args:
        key: Species key to look up in ``species``.
        dest: Path of the GeoJSON file to create.
    """
    # A bare `import shapely` does not guarantee these submodules are loaded.
    import shapely.geometry
    import shapely.ops

    geoms = [cat.geometry[i] for i in species[key]]
    if len(geoms) == 1:
        geom = geoms[0]
    else:
        try:
            geom = shapely.ops.unary_union(geoms)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # still propagate; union failures remain non-fatal.
            sys.stdout.write('%s failed to perform union\n' % key)
            return
    tmp = dest + '.tmp.%d.%d' % (thread.get_ident(), os.getpid())
    data = json.dumps(shapely.geometry.mapping(geom))
    # Close the file before renaming so the data is fully written out.
    with open(tmp, 'w') as out_file:
        out_file.write(data)
    os.rename(tmp, dest)
    sys.stdout.write('Created %s with %d bytes\n' % (dest, len(data)))

for shapefile in glob.glob('redlist/*/*.shp'):
    if os.path.exists(shapefile + '.done'):
        print '%s already done, skipping' % shapefile
        continue
    print 'About to read from %s' % shapefile
    %time cat = gpd.read_file(shapefile);
    print 'Read %d records from %s' % (len(cat), shapefile)
    #cat = cat.query('presence<=2')
    #print '%d after removing presence>2' % len(cat)
    
    species = {}
    skipped = 0

    for i in range(cat.shape[0]):
        if cat.presence[i] > 2:
            skipped += 1
            continue
        key = cat['class'][i] + '_' + cat.order_[i] + '_' + cat.family[i] + '_' + cat.binomial[i].replace(' ', '_')
        if key in species:
            species[key].append(i)
        else:
            species[key] = [i]

    print '%d unique species' % len(species.keys())
    print 'Skipped %d records with presence > 2' % skipped

    #!mkdir -p species_geojsons
    
    exe = SimpleProcessPoolExecutor(max_workers=10)

    for key in sorted(species.keys()):
        dest = 'species_geojsons/%s.geojson' % key
        if os.path.exists(dest):
            sys.stdout.write('%s already exists, skipping\n' % dest)
        else:
            exe.submit(create_species_geojson, key, dest)
        
    exe.shutdown()
    open(shapefile + '.done', 'w')
    
None

redlist/SEAGRASSES/SEAGRASSES.shp already done, skipping
redlist/ANGELFISH/ANGELFISH.shp already done, skipping
redlist/FW_PLANTS/FW_PLANTS.shp already done, skipping
redlist/PUFFERFISH/PUFFERFISH.shp already done, skipping
redlist/BUTTERFLYFISH/BUTTERFLYFISH.shp already done, skipping
redlist/WRASSE/WRASSE.shp already done, skipping
redlist/TUNAS_BILLFISHES/TUNAS_BILLFISHES.shp already done, skipping
redlist/CORALS/CORALS_PART_3.shp already done, skipping
redlist/CORALS/CORALS_PART_2.shp already done, skipping
redlist/CORALS/CORALS_PART_1.shp already done, skipping
redlist/GROUPERS/GROUPERS.shp already done, skipping
redlist/HAGFISH/HAGFISH.shp already done, skipping
redlist/COMBTOOTHBLENNIES/COMBTOOTHBLENNIES.shp already done, skipping
redlist/AMPHIBIANS/AMPHIBIANS.shp already done, skipping
redlist/CONUS/CONUS.shp already done, skipping
redlist/REPTILES/REPTILES.shp already done, skipping
redlist/FW_ODONATA/FW_ODONATA.shp already done, skipping
redlist/SEACUCUMBERS/SEACUCUMBERS.shp 