In [216]:
import concurrent, errno, glob, json, os, shutil, subprocess, sys, time, traceback

In [230]:
class SimpleProcessPoolExecutor(concurrent.futures.ProcessPoolExecutor):
    def __init__(self, max_workers):
        super(SimpleProcessPoolExecutor, self).__init__(max_workers=max_workers)
        self.futures = []
        
    def submit(self, fn, *args, **kwargs):
        future = super(SimpleProcessPoolExecutor, self).submit(fn, *args, **kwargs)
        self.futures.append(future)
        return future
    
    def get_futures(self):
        return self.futures

    def shutdown(self):
        exception_count = 0
        results = []
        for completed in concurrent.futures.as_completed(self.futures):
            try:
                results.append(completed.result())
            except Exception as e:
                exception_count += 1
                sys.stderr.write(
                    'Exception caught in SimpleProcessPoolExecutor.shutdown.  Continuing until all are finished.\n' +
                    'Exception follows:\n' +
                    traceback.format_exc())
        super(SimpleProcessPoolExecutor, self).shutdown()
        if exception_count:
            raise Exception('SimpleProcessPoolExecutor failed: %d of %d raised exception' % 
                            (exception_count, len(self.futures)))
        print 'SimpleProcessPoolExecutor succeeded: all %d jobs completed' % (len(self.futures))
        return results


def subprocess_check(*args, **kwargs):
    if len(args) == 1 and type(args[0]) == str:
        kwargs['shell'] = True
    p = subprocess.Popen(*args,  
                         stdout=subprocess.PIPE, 
                         stderr=subprocess.PIPE,
                         **kwargs)
    (out, err) = p.communicate()
    ret = p.wait()
    if ret != 0:
        if out and out[-1] != '\n':
            out += '\n'
        if err and err[-1] != '\n':
            err += '\n'
        raise Exception(
            ('Call to subprocess_check failed with return code {ret}\n'
             'Standard error:\n{err}'
             'Standard out:\n{out}').format(**locals()))
    if err != '' and out != '' and err[-1] != '\n':
        err += '\n'
    return err + out

def unzip_file(filename):
    exdir = os.path.splitext(filename)[0]
    if os.path.exists(exdir):
        sys.stdout.write('%s already unzipped\n' % (filename))
    else:
        tmpdir = exdir + '.tmp'
        shutil.rmtree(tmpdir, True)
        sys.stdout.write('Unzipping %s into %s\n' % (filename, tmpdir))
        subprocess_check(['unzip', filename, '-d', tmpdir])
        os.rename(tmpdir, exdir)
        sys.stdout.write('Success, created %s\n' % exdir)
    return exdir

def shp_to_geojson(filename, geojson_dir):
    basename = os.path.basename(filename)    
    f = os.path.splitext(basename)[0]
    geojson_file = os.path.join(geojson_dir, f + '.geojson')
    cmd = "ogr2ogr -f GeoJSON -t_srs crs:84 %s %s" % (geojson_file, filename)
    !$cmd
    open(filename + '.done', 'w')
    

def properties_to_filepath(properties):
    path = []
    path.append(properties["kingdom"])
    path.append(properties["phylum"])
    path.append(properties["class"])
    path.append(properties["order_"])
    path.append(properties["family"])
    genus, species = properties["binomial"].split(" ")
    path.append(genus)
    return os.path.join(os.path.join(*path), species + '.geojson')
    
def split_geojson_by_species(filename, out_dir):    
    with open(filename) as f:
        json_data = json.load(f)
        for feature in json_data["features"]:
            filepath = os.path.join(SPECIES_DATA_DIR, properties_to_filepath(feature["properties"]))
            try:
                os.makedirs(os.path.dirname(filepath))
            except OSError as e:
                if e.errno != errno.EEXIST:
                    print "ERROR: Creating %s failed" % os.path.dirname(filepath)
                    #raise  # Reraise if failed for reasons other than existing already            
            json_out = {
                "type": "FeatureCollection",
                "features": [feature]
            }
            if os.path.exists(filepath):
                print "WARNING: Overwriting %s" % filepath
            with open(filepath, "w") as o_f:
                json.dump(feature, o_f)
    open(filename + '.done', 'w')

def make_shapefile(filepath, test=False):
    species = os.path.basename(filepath).split('.')[0]
    dirname = os.path.dirname(filepath)
    outdir = os.path.join(dirname,species)
    shp_filename = os.path.join(outdir, "%s.shp" % species)    
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    cmd = 'ogr2ogr -f "ESRI Shapefile" %s %s' % (shp_filename, filepath)
    if test:
        print cmd
    else:
        !$cmd
    
def rasterize_shapefile(filepath, test=False):
    species = os.path.basename(filepath).split('.')[0]
    dirname = os.path.dirname(filepath)
    outdir = os.path.join(dirname,species)
    shp_filename = os.path.join(outdir, "%s.shp" % species)    
    tif_filename = os.path.join(outdir, "%s.tif" % species)
    cmd = "gdal_rasterize -burn 255 -burn 255 -burn 255 -te -180 -90 180 90 -ts 46080 23040 -ot Byte %s %s" % (shp_filename, tif_filename)
    if test:
        print cmd
    else:
        !$cmd

def make_png(filepath, test=False):
    species = os.path.basename(filepath).split('.')[0]
    dirname = os.path.dirname(filepath)
    outdir = os.path.join(dirname,species)
    tif_filename = os.path.join(outdir, "%s.tif" % species)
    png_filename = os.path.join(outdir, "%s.png" % species)
    cmd = "gdal_translate -of PNG %s %s" % (tif_filename, png_filename)
    if test:
        print cmd
    else:
        !$cmd

def make_tiles(filepath, test=False):
    species = os.path.basename(filepath).split('.')[0]
    dirname = os.path.dirname(filepath)
    outdir = os.path.join(dirname,species)
    tilesdir = os.path.join(outdir,"tiles")
    png_filename = os.path.join(outdir, "%s.png" % species)
    if not os.path.exists(tilesdir):
        os.mkdir(tilesdir)
    cmd = "gdal2tiles.py -s EPSG:4326 -z 0-8 %s %s" % (png_filename, tilesdir)
    if test:
        print cmd
    else:
        !$cmd
    
def compress_tif(filepath, test=False):
    species = os.path.basename(filepath).split('.')[0]
    dirname = os.path.dirname(filepath)
    outdir = os.path.join(dirname,species)
    tif_filename = os.path.join(outdir, "%s.tif" % species)
    ctif_filename = os.path.join(outdir, "%s_JPEG.tif" % species)
    cmd = "gdal_translate -co COMPRESS=JPEG -co TILED=YES %s %s" % (tif_filename, ctif_filename)
    if test:
        print cmd
    else:
        !$cmd
    cmd = "mv %s %s" % (ctif_filename, tif_filename)
    if test:
        print cmd
    else:
        !$cmd
    
def do_it(filepath, test=False):
    print "Preparing to rasterize %s " % filepath
    start = time.time()
    print "Making to shapefile for %s " % filepath
    make_shapefile(geojsonfile,test)
    print "Rasterizing shapefile for %s " % filepath
    rasterize_shapefile(filepath,test)
    print "Making png for %s " % filepath
    make_png(filepath,test)
    print "Making tiles for %s " % filepath
    make_tiles(filepath,test)
    print "Compressing tif for %s " % filepath
    compress_tif(filepath,test)
    print 'Finished making shapefile in %.1f seconds\n' % (time.time() - start)
    

In [54]:
RAW_DATA_DIR = "data/raw/"
SPECIES_DATA_DIR = "data/species"

In [23]:
# 1) Unzip shapefiles files 
exe = SimpleProcessPoolExecutor(max_workers=10)

for zipfile in glob.glob('%s/*.zip' % RAW_DATA_DIR):
    exe.submit(unzip_file, zipfile)
exe.shutdown()
None

Unzipping data/raw/ANGELFISH.zip into data/raw/ANGELFISH.tmp
Success, created data/raw/ANGELFISH
SimpleProcessPoolExecutor succeeded: all 1 jobs completed


In [44]:
# 2) Convert Shapefiles into GeoJSON files
for shapefile in glob.glob('%s/*/*.shp' % RAW_DATA_DIR):
    if os.path.exists(shapefile + '.done'):
        print '%s already done, skipping' % shapefile
        continue
    print 'About to read from %s' % shapefile
    shp_to_geojson(shapefile, RAW_DATA_DIR)

data/raw/ANGELFISH/ANGELFISH.shp already done, skipping
data/raw/IBAsGlobal_2017_01/IbaMapGlobal_PNT.shp already done, skipping
data/raw/IBAsGlobal_2017_01/IbaMapGlobal_POL.shp already done, skipping


In [144]:
# 3) Split GeoJSON files into species level GeoJSON files
for geojsonfile in glob.glob('%s/*.geojson' % RAW_DATA_DIR):
    if os.path.exists(geojsonfile + '.done'):
        print '%s already done, skipping' % geojsonfile
        continue
    print 'About to read from %s' % geojsonfile
    split_geojson_by_species(geojsonfile, SPECIES_DATA_DIR)

data/raw/ANGELFISH.geojson already done, skipping


In [223]:
# 4) Rasterize each geojson file
for geojsonfile in glob.glob('%s/*/*/*/*/*/*/*.geojson' % SPECIES_DATA_DIR):
    #do_it(geojsondfile)
    #a) Make a shapefile
    #b) Rasterize shapefile
    #c) Make a PNG
    #d) Make tiles
    #e) Compress the GeoTIF

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Apolemichthys/arcuatus.geojson 
Finished in 0.3 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Apolemichthys/griffisi.geojson 
Finished in 0.3 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Apolemichthys/guezei.geojson 
Finished in 0.2 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Apolemichthys/kingi.geojson 
Finished in 0.2 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Apolemichthys/trimaculatus.geojson 
Finished in 2.9 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Apolemichthys/xanthopunctatus.geojson 
Finished in 0.2 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/P

Finished in 0.4 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Genicanthus/caudovittatus.geojson 
Finished in 0.4 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Genicanthus/lamarck.geojson 
Finished in 1.9 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Genicanthus/melanospilos.geojson 
Finished in 1.6 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Genicanthus/personatus.geojson 
Finished in 0.2 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Genicanthus/semicinctus.geojson 
Finished in 0.2 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Genicanthus/semifasciatus.geojson 
Finished in 0.3 seconds

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACT

In [155]:
SPECIES_DATA_DIR

'data/species'

In [225]:
f = "data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Pygoplites/diacanthus.geojson"


In [231]:
do_it(f,True)

Preparing to rasterize data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Pygoplites/diacanthus.geojson 
Making to shapefile for data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Pygoplites/diacanthus.geojson 
ogr2ogr -f "ESRI Shapefile" data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Pygoplites/diacanthus/diacanthus.shp data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Pygoplites/diacanthus.geojson
Rasterizing shapefile for data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Pygoplites/diacanthus.geojson 
gdal_rasterize -burn 255 -burn 255 -burn 255 -te -180 -90 180 90 -ts 46080 23040 -ot Byte data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Pygoplites/diacanthus/diacanthus.shp data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORMES/POMACANTHIDAE/Pygoplites/diacanthus/diacanthus.tif
Making png for data/species/ANIMALIA/CHORDATA/ACTINOPTERYGII/PERCIFORME

In [206]:
os.path.join("a", "c", "d")

'a/c/d'