### Pipeline optimization 
* Objective:
 * Build a stack of numpy arrays depicting the cost distance away from biogas sources
 
* Requirements - Packages
 * NumPy, Pandas, Scikit-Image, GDAL (osgeo), GeoPandas, Shapely

* Requirements - Data:
 * Excel workbook (.xlsx) of biogas sources, including coordinates and biogas production potential (MMBtu)
 * Cost surface for building pipelines
 
* Output: 
 * Stacked NumPy array (.npy), plus GeoTIFF exports of the cost-distance surfaces and a CSV of the source sites

In [None]:
#Import packages
import numpy as np
import pandas as pd
from skimage import graph
from osgeo import gdal, osr
import geopandas as gpd
from shapely.geometry import Point
from shapely import wkt

In [None]:
#Read in cost surface
cost_surface_path = '../data/MIT_Surface_Subset/MIT_CostSurface_500m.img'
ds = gdal.Open(cost_surface_path)
#gdal.Open returns None (rather than raising) when exceptions are not enabled,
# so fail here with a clear message instead of an AttributeError further down
if ds is None:
    raise RuntimeError("GDAL could not open cost surface: {}".format(cost_surface_path))
#Get the raster projection (reused to reproject the points and to tag the outputs)
ds_prj = ds.GetProjection()
#Get raster attributes.
# NOTE: GDAL's geotransform origin is the UPPER-left corner of the upper-left pixel;
# the names llx/lly are kept only because later cells refer to them.
llx, x_size, x_angle, lly, y_angle, y_size = ds.GetGeoTransform()
#Extract Band1 as the cost array, scaled down by 100
arrCost = np.asarray(ds.GetRasterBand(1).ReadAsArray()) / 100
#Set zero costs to high costs
arrCost[arrCost == 0] = 10
#Print info
print("Upper left coordinate = ({0:.8f},{1:.8f})".format(llx,lly))
print("Pixel size is {0:.8f}(x), {1:.8f}(y)".format(x_size,y_size))
print("Values range from {} to {}".format(arrCost.min(),arrCost.max()))

In [None]:
#Load the biogas source worksheet (example: Duplin Co), keep four columns
# selected by position, and order the rows from highest to lowest methane yield
dfBG = pd.read_excel(
    '../data/DuplinCountySwineFarmEconomics.xlsx',
    sheet_name='Duplin County Swine Farm Master',
)
dfBG = dfBG.iloc[:, [11, 12, -6, -1]]
dfBG = dfBG.sort_values(by='Total Potential Methane Yield (scf/h)', ascending=False)

In [None]:
#Wrap the table in a GeoDataFrame so the points can be reprojected.
# The second selected column supplies x and the first supplies y
# (presumably longitude/latitude, given the EPSG:4326 tag below -- confirm against the sheet).
geom = [
    Point(x_val, y_val)
    for x_val, y_val in zip(dfBG.iloc[:, 1], dfBG.iloc[:, 0])
]
gdfBG = gpd.GeoDataFrame(dfBG, geometry=geom)
#NOTE(review): the {'init': ...} form is deprecated in newer pyproj/geopandas;
# check the installed versions before switching to 'EPSG:4326'.
gdfBG.crs = {'init':'epsg:4326'}

In [None]:
#Transform the source points into the cost raster's projection (this can be slow)
gdf2 = gdfBG.to_crs(crs=ds_prj)

In [None]:
#Iterate through all records and create a cost distance raster
cd_arrays = []   #one cost-distance array per source, in gdf2 row order
dx = {}          #the same arrays keyed by the gdf2 index label

#MCP sampling is given per array axis, i.e. (rows, cols) = (y, x), and is a
# distance, so drop the sign of the (typically negative) y pixel size
cell_sampling = (abs(y_size), abs(x_size))

for i,r in gdf2.iterrows():
    print(".",end='')
    x = r.geometry.x
    y = r.geometry.y
    #Fourth selected column, taken by position; used below both as a
    # multiplier on the cost surface and as a constant added to the result
    z = r.iloc[3]

    #Get array index from the projected coordinates. The geotransform origin
    # is a pixel corner, so the containing cell is the floor of the offset
    # (rounding would shift the start by half a pixel).
    xOffset = int(np.floor((x - llx)/x_size))
    yOffset = int(np.floor((y - lly)/y_size))

    #Create the minimum cost path (MCP) surface
    lc_graph = graph.MCP_Geometric(arrCost * z, sampling=cell_sampling)
    #Compute cost distances away from the source cell (row, col)
    cd_array = lc_graph.find_costs(starts=[(yOffset, xOffset)])[0]
    cd_array = cd_array + z
    cd_arrays.append(cd_array)
    dx[i]=cd_array

In [None]:
#Export min of all rasters to geotiff

#Build the stack here so this cell does not rely on a later cell having run
arrStack = np.stack(cd_arrays)

#Reduce to minimum costs (cell-wise minimum across all sources)
arrMin = np.amin(arrStack,axis=0)

#Create the data source object
bands,height,width = arrStack.shape
drv = gdal.GetDriverByName("GTiff")
dsOut = drv.Create('../scratch/min_cost_surface_mx.tif',width,height,1,gdal.GDT_Float32)

#Cap values at 3000 (this only clamps; no nodata value is registered on the band)
arrMin[arrMin > 3000] = 3000

#Write to the data source object, copying georeferencing from the cost raster
dsOut.GetRasterBand(1).WriteArray(arrMin)
dsOut.SetGeoTransform(ds.GetGeoTransform())
dsOut.SetProjection(ds_prj)
dsOut.FlushCache()
#Drop the reference so GDAL closes the output file
dsOut = None

In [None]:
#Export lowest cost farm to geotiff

#Take the first cost-distance raster (first row of gdf2) and cap it at 3000.
# np.minimum returns a new array, so the stored rasters are not modified
# (clamping a view of the stack in place would alter the data saved later).
arrFirst = np.minimum(cd_arrays[0], 3000)

height,width = arrFirst.shape
drv = gdal.GetDriverByName("GTiff")
dsOut = drv.Create('../scratch/lowest_cost.tif',width,height,1,gdal.GDT_Float32)

#Write the band and copy georeferencing from the cost raster
dsOut.GetRasterBand(1).WriteArray(arrFirst)
dsOut.SetGeoTransform(ds.GetGeoTransform())
dsOut.SetProjection(ds_prj)
dsOut.FlushCache()
#Drop the reference so GDAL closes the output file
dsOut = None

In [None]:
#Show the minimum-cost values at cells whose scaled cost exceeds 0.3
arrMin[np.greater(arrCost, 0.3)]

In [None]:
#Write the source table to CSV, labelling the index column 'OID'
dfBG.to_csv(path_or_buf='../scratch/duplin_sites.csv', index_label='OID')

In [None]:
#Combine the per-source cost-distance rasters into one array
# (one source per position along axis 0) and save it to disk
arrStack = np.stack(cd_arrays, axis=0)
np.save(file='../data/DuplinStack500m.npy', arr=arrStack)