# Build topology
This script reads in the individual least cost routes linking each biogas source to the nearest pipeline and merges them into a topologically correct network. This is done by splitting each route where two routes meet. 

In [1]:
#Import packages
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point, LineString
from shapely.ops import split

### Read in the routes feature class & tidy the data
The routes feature class includes the least cost path between each biogas source and its connection to the pipeline framework. Here we read it in and tidy it up for analysis. 

In [5]:
#Read in routes feature class as shapefile
#  (forward slashes keep the relative path usable on Windows, macOS and Linux,
#   and match the '../data/processed/...' output paths used later in this notebook)
gdfRoutes = gpd.read_file('../data/processed/Routes.shp')
gdfRoutes.head()

Unnamed: 0,Fac_ID,Waste,Biogas,TYPE,geometry
0,291,490225.577124,13726320.0,Transmission,"LINESTRING (1618626.822 -297167.070, 1618626.8..."
1,154,301328.821333,8437207.0,Transmission,"LINESTRING (1582814.308 -311794.717, 1582814.3..."
2,1930,266616.481879,7465261.0,Transmission,"LINESTRING (1623166.436 -225037.641, 1623166.4..."
3,292,243401.613396,6815245.0,Transmission,"LINESTRING (1617113.617 -299689.078, 1617113.6..."
4,1925,226652.386212,6346267.0,Transmission,"LINESTRING (1737161.199 -158961.030, 1737665.6..."


## Split the route features into topologically correct segments

#### Derive geodataframes of the route start points and end points
Start points are used to link each route's biogas potential to the final output and end points are used to split existing route features.

In [6]:
#Clone the routes table and swap each route line for its first vertex
#  (all attribute columns are carried over unchanged)
gdfStart = gdfRoutes.copy()
gdfStart['geometry'] = gdfRoutes['geometry'].apply(lambda route: Point(route.coords[0]))
gdfStart.head()

Unnamed: 0,Fac_ID,Waste,Biogas,TYPE,geometry
0,291,490225.577124,13726320.0,Transmission,POINT (1618626.822 -297167.070)
1,154,301328.821333,8437207.0,Transmission,POINT (1582814.308 -311794.717)
2,1930,266616.481879,7465261.0,Transmission,POINT (1623166.436 -225037.641)
3,292,243401.613396,6815245.0,Transmission,POINT (1617113.617 -299689.078)
4,1925,226652.386212,6346267.0,Transmission,POINT (1737161.199 -158961.030)


In [7]:
#Clone the routes table and swap each route line for its last vertex (end point)
#  (all attribute columns are carried over unchanged)
gdfEnd = gdfRoutes.copy()
gdfEnd['geometry'] = gdfRoutes['geometry'].apply(lambda route: Point(route.coords[-1]))
gdfEnd.head()

Unnamed: 0,Fac_ID,Waste,Biogas,TYPE,geometry
0,291,490225.577124,13726320.0,Transmission,POINT (1623166.436 -285565.833)
1,154,301328.821333,8437207.0,Transmission,POINT (1579787.898 -295653.866)
2,1930,266616.481879,7465261.0,Transmission,POINT (1627706.051 -230586.058)
3,292,243401.613396,6815245.0,Transmission,POINT (1618626.822 -297167.070)
4,1925,226652.386212,6346267.0,Transmission,POINT (1738674.404 -158456.629)


#### Split route features where new routes enter them
1. Combine all endpoint point features into a single multipoint feature
2. Split the LineString geometries with this multipoint feature, resulting in Geometry Collection features stored in the geodataframe's geometry series
3. Iterate through each feature in the above result, splitting its geometry collection back into individual LineString features, and adding each to a growing list.
4. Reconstruct a new geodataframe of all the route segments from the split list, adding a new edge ID attribute

In [8]:
#Combine endpoints into a single multipoint object
#  (this single geometry is what every route is split against in the next cell)
#  NOTE(review): newer GeoPandas releases deprecate `unary_union` in favour of
#   `union_all()` - confirm against the installed version before changing.
ends = gdfEnd.geometry.unary_union

In [9]:
#Split every route at the combined end points; each result is a geometry
#  collection of the route's pieces - takes a bit of time
theSplits = gdfRoutes.geometry.apply(lambda route: split(route, ends))

In [10]:
#Empty accumulators, filled in parallel with one entry per split segment
links, geom = [], []                  #Originating route ID / LineString of each segment
points_top, points_bottom = [], []    #First / last coordinate of each segment

In [12]:
#Walk each route and unpack its split result into one list entry per segment
for route_idx, route in gdfRoutes.iterrows():
    route_label = str(route['Fac_ID'])     #Facility ID of the originating route, as text
    #theSplits shares the routes' index, so the route's label looks up its pieces
    for segment in theSplits[route_idx].geoms:
        links.append(route_label)                 #Originating route for this piece
        geom.append(segment)                      #The LineString piece itself
        points_top.append(segment.coords[0])      #Its first coordinate
        points_bottom.append(segment.coords[-1])  #Its last coordinate

In [13]:
#Assemble the segment table: one row per split LineString, tagged with the
#  ID of the route it came from, in the same CRS as the input routes
gdfSegments = gpd.GeoDataFrame(
    data=pd.DataFrame({'route_id': links}),
    geometry=geom,
    crs=gdfRoutes.crs,
)

#The row index, cast to text, doubles as a unique segment ID
gdfSegments['edge_ID'] = gdfSegments.index.astype(str)

### Add attributes to each segment, including downstream segment ID

With the segments created, we now need to assign attributes to each edge. These consist of its upstream node ID, its downstream node ID, and the amount of biogas introduced at its upstream node. 

This process is a bit tricky and is done by:
* Creating a feature class of each segment's starting vertex, linked to the segment's `edge_ID`. This geodataframe (`gdfFirstPoints` in the code below) holds the "downstream" nodes, as its points are labeled with the edge that falls **downstream** of them. 
* Creating a second feature class of each segment's ending vertex, tagged with the segment's `edge_ID`. This geodataframe (`gdfLastPoints` in the code below) holds the "upstream" nodes, as its points are labeled with the edge that falls **upstream** of them. 
* The two point datasets are spatially joined, resulting in a dataset of vertices (`gdfNodes`) where each includes the label of its upstream and downstream edge ID. 

##### Create geodataframes from "upstream" and "downstream" nodes
Here the "downstream" nodes are actually the first point in each segment, and the "upstream" nodes are the last. This seems backwards, but these are both intermediate datasets used to determine "from-to" pairs, done by spatially joining them. When joined, the "from" node carries the attribute of the upstream segment and the "to" node carries the downstream one...

In [14]:
#Construct a gdf of segment start points;
#  'downstream_id' is the ID of the segment that begins at the point,
#  i.e. the segment into which the node flows
gdfFirstPoints = gdfSegments.copy()
gdfFirstPoints['geometry'] = gdfFirstPoints['geometry'].apply(lambda seg: Point(seg.coords[0]))
gdfFirstPoints['downstream_id'] = gdfFirstPoints.index.astype(str)

In [15]:
#Construct a gdf of segment end points;
#  'upstream_id' is the ID of the segment that ends at the point,
#  i.e. the segment flowing into the node
gdfLastPoints = gdfSegments.copy()
gdfLastPoints['geometry'] = gdfLastPoints['geometry'].apply(lambda seg: Point(seg.coords[-1]))
gdfLastPoints['upstream_id'] = gdfLastPoints.index.astype(str)

##### Spatially join the downstream and upstream points
Spatially joining the two datasets results in a single point feature class with each point including the edge IDs of the upstream and downstream segments, respectively.

In [16]:
#Left-join the segment start points onto the segment end points, then discard
#  the join index and the duplicated route/edge columns so that only the
#  geometry, 'upstream_id' and 'downstream_id' remain
gdfNodes = gpd.sjoin(left_df=gdfLastPoints, right_df=gdfFirstPoints, how='left').drop(
    columns=['route_id_left', 'edge_ID_left', 'index_right', 'route_id_right', 'edge_ID_right'])
gdfNodes.head(1)

Unnamed: 0,geometry,upstream_id,downstream_id
0,POINT (1619131.223 -296158.267),0,1


In [17]:
#Show info on the resulting dataset
#  (the non-null counts show how many nodes found no downstream segment)
gdfNodes.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2818 entries, 0 to 2817
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   geometry       2818 non-null   geometry
 1   upstream_id    2818 non-null   object  
 2   downstream_id  2355 non-null   object  
dtypes: geometry(1), object(2)
memory usage: 88.1+ KB


The above reveals a number of records missing `downstream_id` values. These are the terminal segments, i.e. the ones connecting routes to existing NG pipeline infrastructure. 
> This is because we joined the "first" points of each segment to the "last" ones (left join above), leaving a number of `downstream_id` values as NaN (those segment end points that didn't intersect another segment's start point); these are the terminal nodes in each route grouping (i.e. where the pipes connect to the existing pipeline network). 

We relabel these terminal nodes with the upstream segment ID followed by a "T". 

In [18]:
#Update null values with upstream IDs, appended with a "T".
#  The filled column is assigned back rather than calling
#  fillna(inplace=True) on the selected column: an in-place fill on a column
#  selection is chained assignment and is not guaranteed to modify gdfNodes
#  (it does not under pandas Copy-on-Write).
gdfNodes['downstream_id'] = gdfNodes['downstream_id'].fillna(gdfNodes['upstream_id'] + "T")
#Preview a random sample of the relabelled nodes
gdfNodes.sample(10)

Unnamed: 0,geometry,upstream_id,downstream_id
2151,POINT (1640316.091 -258328.147),2151,1098
399,POINT (1659987.753 -214445.207),399,399T
1998,POINT (1610556.396 -261354.556),1998,1569
2704,POINT (1654439.336 -300193.480),2704,2129
2193,POINT (1643846.902 -272451.392),2193,2121
1105,POINT (1650404.123 -298175.874),1105,1106
2322,POINT (1575752.685 -276991.006),2322,1449
1878,POINT (1637077.760 -293424.339),1878,1878T
352,POINT (1589371.528 -263876.564),352,353
1719,POINT (1558098.629 -341554.412),1719,1719T


In [None]:
#Write the nodes to a file
#  NOTE(review): ESRI Shapefile field names are limited to 10 characters, so
#   'upstream_id' and 'downstream_id' are likely truncated on write - confirm
#   the field names if this file is read back elsewhere.
gdfNodes.to_file('../scratch/nodes.shp')

#### Transfer node information to route segment features.
The nodes geodataframe above includes points occurring at the end vertices of each pipeline segment (again, because we joined the `gdfFirstPoints` features *to* the `gdfLastPoints` features, keeping all the `gdfLastPoints` features). Each of these points knows the edge IDs of the segments upstream and downstream of it (or that it is a terminal node). 

What we want in the next step is to link this information to each route segment feature such that each feature knows its node ID (taken from its upstream_id) and the node ID immediately downstream of it. This will allow us to construct a graph from all the segments. 

We also need to link each segment with the amount of biogas potential it introduces into the system. 

##### Joining node attribute data to each segment feature
The first step is done via attribute join, i.e., by merging the `downstream_id` attribute in the gdfNodes dataframe to the segment geodataframe using the `upstream_id` as the joining feature to link with the gdfSegments's `edge_ID` attribute.

In [19]:
#Attach each segment's downstream ID by matching its edge_ID against the
#  nodes' upstream_id; the nodes' join key is then discarded because it
#  merely repeats edge_ID
gdfSegments_ids = gdfSegments.merge(
    gdfNodes[['upstream_id', 'downstream_id']],
    how='left',
    left_on='edge_ID',
    right_on='upstream_id',
).drop(columns='upstream_id')
#Show the table
gdfSegments_ids.sample(5)

Unnamed: 0,route_id,geometry,edge_ID,downstream_id
627,325,"LINESTRING (1621148.830 -276486.604, 1620644.4...",627,628
2688,1181,"LINESTRING (1652128.315 -114753.737, 1651917.3...",2688,2688T
1802,1575,"LINESTRING (1594919.946 -270938.187, 1594415.5...",1802,1803
1284,1083,"LINESTRING (1654073.748 -265322.487, 1654037.9...",1284,1284T
481,361,"LINESTRING (1635776.476 -297167.070, 1636280.8...",481,482


Now each segment feature knows its node/edge ID and the node ID of the segment immediately downstream!

##### Attaching biogas potential data back to each segment
Next, we need to join the biogas potential linked with each segment. The biogas potential is stored in the `gdfStart` geodataframe constructed by taking the first point in each original biogas route. Here, the `Biogas` attribute is what we want. 

In [20]:
#Review the route start points, which carry the Waste and Biogas attributes
gdfStart.head()

Unnamed: 0,Fac_ID,Waste,Biogas,TYPE,geometry
0,291,490225.577124,13726320.0,Transmission,POINT (1618626.822 -297167.070)
1,154,301328.821333,8437207.0,Transmission,POINT (1582814.308 -311794.717)
2,1930,266616.481879,7465261.0,Transmission,POINT (1623166.436 -225037.641)
3,292,243401.613396,6815245.0,Transmission,POINT (1617113.617 -299689.078)
4,1925,226652.386212,6346267.0,Transmission,POINT (1737161.199 -158961.030)


As this dataframe has no attribute that would let us join the data to our segment features, we'll need to use a spatial join to link the biogas potential to our segments. 

To do this, we'll spatially join the `gdfStart` features (which contain biogas potential information ) to the `gdfFirstPoints` features (which contain the node/edge ID information):

In [21]:
#Review the segment start points, which carry each segment's edge_ID
gdfFirstPoints.head()

Unnamed: 0,route_id,geometry,edge_ID,downstream_id
0,291,POINT (1618626.822 -297167.070),0,0
1,291,POINT (1619131.223 -296158.267),1,1
2,291,POINT (1621148.830 -293674.376),2,2
3,291,POINT (1621148.830 -293131.857),3,3
4,291,POINT (1621653.231 -288087.841),4,4


In [22]:
#Match segment start points to route start points (i.e. the biogas source
#  locations); the inner join keeps only the start points that have a match
gdfBiogasLookup = gpd.sjoin(
    left_df=gdfFirstPoints[['geometry', 'edge_ID']],   #Only the geom and edge_ID cols are joined
    right_df=gdfStart,
    how='inner',
).drop(columns='index_right')
gdfBiogasLookup.head()

Unnamed: 0,geometry,edge_ID,Fac_ID,Waste,Biogas,TYPE
0,POINT (1618626.822 -297167.070),0,291,490225.577124,13726320.0,Transmission
6,POINT (1582814.308 -311794.717),6,154,301328.821333,8437207.0,Transmission
19,POINT (1623166.436 -225037.641),19,1930,266616.481879,7465261.0,Transmission
22,POINT (1617113.617 -299689.078),22,292,243401.613396,6815245.0,Transmission
24,POINT (1737161.199 -158961.030),24,1925,226652.386212,6346267.0,Transmission


This gives us a table that we can now merge to our segments dataframe. Not all segments will have biogas data, so we need to set null values to zero (which requires fixing the datatype).

In [23]:
#Review the segments data (with the joined downstream_id attribute)
gdfSegments_ids.head()

Unnamed: 0,route_id,geometry,edge_ID,downstream_id
0,291,"LINESTRING (1618626.822 -297167.070, 1618626.8...",0,1
1,291,"LINESTRING (1619131.223 -296158.267, 1619635.6...",1,2
2,291,"LINESTRING (1621148.830 -293674.376, 1621148.8...",2,3
3,291,"LINESTRING (1621148.830 -293131.857, 1621148.8...",3,4
4,291,"LINESTRING (1621653.231 -288087.841, 1622157.6...",4,5


In [42]:
#Merge the biogas potential to the segment features, using edge_ID as the common field
#  (left join: segments with no match in the lookup get NaN in the
#   Fac_ID, Waste, Biogas and TYPE columns)
gdfSegments_biogas = gdfSegments_ids.merge(gdfBiogasLookup[['edge_ID','Fac_ID','Waste','Biogas','TYPE']],
                                           on='edge_ID',how='left')
#Examine the output
gdfSegments_biogas.head()

Unnamed: 0,route_id,geometry,edge_ID,downstream_id,Fac_ID,Waste,Biogas,TYPE
0,291,"LINESTRING (1618626.822 -297167.070, 1618626.8...",0,1,291.0,490225.577124,13726320.0,Transmission
1,291,"LINESTRING (1619131.223 -296158.267, 1619635.6...",1,2,,,,
2,291,"LINESTRING (1621148.830 -293674.376, 1621148.8...",2,3,,,,
3,291,"LINESTRING (1621148.830 -293131.857, 1621148.8...",3,4,,,,
4,291,"LINESTRING (1621653.231 -288087.841, 1622157.6...",4,5,,,,


A number of these records will have missing values for the `Fac_ID`, `Waste`, and `Biogas` columns; these are locations that don't correspond to biogas source locations (i.e., downstream junctions). So, we need to set those values appropriately. We will also use this opportunity to flag those records that correspond to biogas sources.

In [43]:
#Boolean masks: segments with no matching facility (null Fac_ID) and,
#  as the complement, segments that did match a biogas source
non_site_mask = gdfSegments_biogas['Fac_ID'].isna()
site_mask = ~non_site_mask

In [44]:
#Fill the facility attributes of non-source segments:
#  Fac_ID gets the placeholder -1, Waste and Biogas get 0
gdfSegments_biogas.loc[non_site_mask,'Fac_ID'] = -1
gdfSegments_biogas.loc[non_site_mask,'Waste'] = 0
gdfSegments_biogas.loc[non_site_mask,'Biogas'] = 0

#Set the type of records with facility values to "Source"
#  (this assignment also creates the 'Type' column; rows not set stay NaN)
gdfSegments_biogas.loc[site_mask,'Type'] = 'Source'

#Set the type of records with a T in the downstream ID to 'Output';
#  this runs after the "Source" step, so a segment that is both a source and
#  terminal ends up typed "Output"
gdfSegments_biogas.loc[gdfSegments_biogas['downstream_id'].str.contains('T'),'Type'] = 'Output'

#Set the type of all other records to "Route".
#  The filled column is assigned back rather than calling
#  fillna(inplace=True) on the selected column: an in-place fill on a column
#  selection is chained assignment and is not guaranteed to modify the
#  dataframe (it does not under pandas Copy-on-Write).
gdfSegments_biogas['Type'] = gdfSegments_biogas['Type'].fillna("Route")

#Reveal the output
gdfSegments_biogas.sample(5)

Unnamed: 0,route_id,geometry,edge_ID,downstream_id,Fac_ID,Waste,Biogas,TYPE,Type
369,27,"LINESTRING (1738674.404 -187207.520, 1739178.8...",369,316,-1,0.0,0.0,,Route
2480,1177,"LINESTRING (1656456.942 -123652.918, 1655952.5...",2480,300,1177,4225.722455,118320.228734,Transmission,Source
1680,811,"LINESTRING (1710427.914 -96415.231, 1709923.51...",1680,170,811,13406.708488,375387.837667,Transmission,Source
2620,566,"LINESTRING (1639811.689 -268416.179, 1640316.0...",2620,1673,566,2811.479196,78721.417496,Transmission,Source
46,1312,"LINESTRING (1629219.255 -320369.544, 1629723.6...",46,47,-1,0.0,0.0,,Route


In [31]:
#List the distinct values present in the 'Type' column
gdfSegments_biogas.Type.unique()

array(['Source', 'Route', 'Output'], dtype=object)

In [51]:
#Fix TYPE field
def setConnection(pipeType,sourceType):
    """Return the pipeline type for 'Output' segments and 'NA' for all others.

    pipeType is passed through unchanged (whatever its value) when sourceType
    equals 'Output'; any other sourceType yields the string 'NA'.
    """
    return pipeType if sourceType == 'Output' else 'NA'

#Derive 'Connection' row by row from the pipeline TYPE and the segment Type,
#  then remove the TYPE column from the dataframe
gdfSegments_biogas['Connection'] = gdfSegments_biogas.apply(
    lambda seg: setConnection(seg['TYPE'], seg['Type']), axis=1)
gdfSegments_biogas.drop(columns=['TYPE'], inplace=True)

In [52]:
#Preview a random sample of the finished segment table
gdfSegments_biogas.sample(10)

Unnamed: 0,route_id,geometry,edge_ID,downstream_id,Fac_ID,Waste,Biogas,Type,Connection
2594,1839,"LINESTRING (1595928.749 -270938.187, 1595424.3...",2594,1801,1839,3023.606568,84660.98,Source,
2062,1215,"LINESTRING (1677641.810 -292627.456, 1677137.4...",2062,1648,1215,9311.955664,260734.8,Source,
950,781,"LINESTRING (1643846.902 -185189.914, 1643846.9...",950,950T,-1,0.0,0.0,Output,
1495,1145,"LINESTRING (1471341.553 -258328.147, 1471341.5...",1495,1496,-1,0.0,0.0,Route,
496,309,"LINESTRING (1615600.412 -283043.825, 1616104.8...",496,497,309,40566.935566,1135874.0,Source,
1957,712,"LINESTRING (1628714.854 -292123.054, 1629219.2...",1957,1958,712,9911.239939,277514.7,Source,
2524,716,"LINESTRING (1630228.059 -292627.456, 1629723.6...",2524,1958,716,4115.904229,115245.3,Source,
1065,801,"LINESTRING (1630228.059 -164509.448, 1629723.6...",1065,1065T,801,22127.4194,619567.7,Output,Transmission
2650,977,"LINESTRING (1586345.119 -257319.343, 1586849.5...",2650,2651,977,2532.864141,70920.2,Source,
1298,1933,"LINESTRING (1620644.428 -247735.713, 1620140.0...",1298,1298T,1933,18439.516166,516306.5,Output,Transmission


In [53]:
#Write out shapefile
#  NOTE(review): ESRI Shapefile field names are limited to 10 characters, so
#   'downstream_id' is likely truncated on write - confirm the field names
#   expected by whatever reads BasePipelines.shp.
gdfSegments_biogas.to_file('../data/processed/BasePipelines.shp')

In [54]:
#Write out edge list
#  (one row per segment: its edge ID, its downstream ID, and the Waste and
#   Biogas values; the dataframe index is not written)
gdfSegments_biogas[['edge_ID','downstream_id','Waste','Biogas']].to_csv('../data/processed/BaseEdgeList.csv',index=False)