# Build topology
This script reads in the individual least cost routes linking each biogas source to the nearest pipeline and merges them into a topologically correct network. This is done by splitting each route where two routes meet. 

In [1]:
#Import packages
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point, LineString
from shapely.ops import split

### Read in the routes feature class & tidy the data
The routes feature class includes the least cost path between each biogas source and its connection to the pipeline framework. Here we read it in and tidy it up for analysis. 

In [2]:
#Read in routes feature class as shapefile
#  The path uses forward slashes so it resolves on every operating system
#  (a backslash-separated path is only understood on Windows) and matches the
#  style of the output paths used at the end of this notebook.
gdfRoutes = gpd.read_file('../data/processed/Routes.shp')
gdfRoutes.head()

Unnamed: 0,Fac_ID,Waste,Biogas,TYPE,geometry
0,291,490225.577124,13726320.0,Transmission,"LINESTRING (1618626.822 -297167.070, 1618626.8..."
1,154,301328.821333,8437207.0,Transmission,"LINESTRING (1582814.308 -311794.717, 1582814.3..."
2,1930,266616.481879,7465261.0,Transmission,"LINESTRING (1623166.436 -225037.641, 1623166.4..."
3,292,243401.613396,6815245.0,Transmission,"LINESTRING (1617113.617 -299689.078, 1617113.6..."
4,1925,226652.386212,6346267.0,Transmission,"LINESTRING (1737161.199 -158961.030, 1737665.6..."


## Split the route features into topologically correct segments

#### Derive geodataframes of the route start points and end points
Start points are used to link each route's biogas potential to the final output and end points are used to split existing route features.

In [3]:
#Build a point layer of route origins: duplicate the routes table, then swap
#  each LineString for a Point at its first vertex
gdfStart = gdfRoutes.copy(deep=True)
gdfStart['geometry'] = gdfStart.geometry.apply(lambda route: Point(route.coords[0]))
gdfStart.head()

Unnamed: 0,Fac_ID,Waste,Biogas,TYPE,geometry
0,291,490225.577124,13726320.0,Transmission,POINT (1618626.822 -297167.070)
1,154,301328.821333,8437207.0,Transmission,POINT (1582814.308 -311794.717)
2,1930,266616.481879,7465261.0,Transmission,POINT (1623166.436 -225037.641)
3,292,243401.613396,6815245.0,Transmission,POINT (1617113.617 -299689.078)
4,1925,226652.386212,6346267.0,Transmission,POINT (1737161.199 -158961.030)


In [4]:
#Build a point layer of route end points: duplicate the routes table, then swap
#  each LineString for a Point at its last vertex
gdfEnd = gdfRoutes.copy(deep=True)
gdfEnd['geometry'] = gdfEnd.geometry.apply(lambda route: Point(route.coords[-1]))
gdfEnd.head()

Unnamed: 0,Fac_ID,Waste,Biogas,TYPE,geometry
0,291,490225.577124,13726320.0,Transmission,POINT (1623166.436 -285565.833)
1,154,301328.821333,8437207.0,Transmission,POINT (1579787.898 -295653.866)
2,1930,266616.481879,7465261.0,Transmission,POINT (1627706.051 -230586.058)
3,292,243401.613396,6815245.0,Transmission,POINT (1618626.822 -297167.070)
4,1925,226652.386212,6346267.0,Transmission,POINT (1738674.404 -158456.629)


#### Split route features where new routes enter them
1. Combine all endpoint point features into a single multipoint feature
2. Split the LineString geometries with this multipoint feature, resulting in Geometry Collection features stored in the geodataframe's geometry series
3. Iterate through each feature in the above result, splitting its geometry collection back into individual LineString features, and adding each to a growing list.
4. Reconstruct a new geodataframe of all the route segments from the split list, adding a new edge ID attribute

In [5]:
#Merge every route end point into one geometry; this is the "splitter" that
#  the routes are cut with in the next step
#  NOTE(review): newer GeoPandas releases appear to deprecate `unary_union` in
#  favour of `union_all()` - confirm against the installed version
ends = gdfEnd['geometry'].unary_union

In [6]:
#Cut every route wherever it touches one of the end points - takes a bit of time
#  Each entry of the result is a geometry collection holding that route's pieces
theSplits = gdfRoutes['geometry'].apply(lambda route: split(route, ends))

In [40]:
#Create the (empty) parallel lists that are filled one entry per route segment:
#  links - the Fac_ID of the route each segment came from
#  geom  - the LineString objects extracted from each route's geometry collection
#  types - the type of existing pipeline into which the gas eventually flows
links, geom, types = [], [], []

In [41]:
#Flatten the split routes into three parallel lists, one entry per segment.
#  The lists are (re)created here so that re-running this cell on its own
#  cannot append every segment a second time.
links, geom, types = [], [], []

#Walk the routes and their split geometry collections side by side; theSplits
#  was derived from gdfRoutes, so both are in the same row order
for fac_id, pipe_type, pieces in zip(gdfRoutes['Fac_ID'], gdfRoutes['TYPE'], theSplits):
    #Iterate through split segments in the geometry collection
    for line in pieces.geoms:
        links.append(str(fac_id))     #Fac_ID of the originating route line
        types.append(str(pipe_type))  #Pipeline type of the originating route
        geom.append(line)             #The segment's LineString geometry

In [47]:
#Assemble the segment attributes first, then attach the geometries and the
#  coordinate reference system of the original routes
segmentAttrs = pd.DataFrame({'route_id': links, 'TYPE': types})
gdfSegments = gpd.GeoDataFrame(segmentAttrs, geometry=geom, crs=gdfRoutes.crs)

#The row index doubles as a unique segment ID, stored as text
gdfSegments['edge_ID'] = gdfSegments.index.astype(str)

### Add attributes to each segment, including downstream segment ID

With the segments created, we now need to assign attributes to each edge. These consist of its upstream node ID, its downstream node ID, and the amount of biogas introduced at its upstream node. 

This process is a bit tricky and is done by:
* Creating a feature class of each segment's starting vertex, linked to the segment's `edge_ID`. These are conceptually the "downstream nodes" (the geodataframe is named `gdfFirstPoints` in the code), as each point is labeled with the edge that falls **downstream** of it. 
* Creating a second feature class of each segment's ending vertex, tagged with the segment's `edge_ID`. These are conceptually the "upstream nodes" (named `gdfLastPoints` in the code), as each point is labeled with the edge that falls **upstream** of it. 
* The two point datasets are spatially joined, resulting in a dataset of vertices (`gdfNodes`) where each includes the `edge_ID` of both its upstream and its downstream edge. 

##### Create geodataframes from "upstream" and "downstream" nodes
Here the "downstream" nodes are actually the first point in each segment, and the "upstream" nodes are the last. This seems backwards, but these are both intermediate datasets used to determine "from-to" pairs, done by spatially joining them. When joined, the "from" node carries the attribute of the upstream segment and the "to" node carries the downstream one...

In [49]:
#Construct a gdf of segment start points (the first vertex of every segment);
#  'downstream_id' is the ID of the segment that begins at the point, i.e. the
#  segment lying downstream of that node
gdfFirstPoints = gdfSegments.copy(deep=True)
gdfFirstPoints['geometry'] = gdfFirstPoints.geometry.apply(lambda seg: Point(seg.coords[0]))
gdfFirstPoints['downstream_id'] = gdfFirstPoints.index.astype(str)

In [50]:
#Construct a gdf of segment end points (the last vertex of every segment);
#  'upstream_id' is the ID of the segment that finishes at the point, i.e. the
#  segment lying upstream of that node
gdfLastPoints = gdfSegments.copy(deep=True)
gdfLastPoints['geometry'] = gdfLastPoints.geometry.apply(lambda seg: Point(seg.coords[-1]))
gdfLastPoints['upstream_id'] = gdfLastPoints.index.astype(str)

##### Spatially join the downstream and upstream points
Spatially joining the two datasets results in a single point feature class with each point including the edge IDs of the upstream and downstream segments, respectively.

In [53]:
#Match each segment end point with the segment start point at the same
#  location (left join: every end point is kept, matched or not), then keep
#  only the ID pair and the point geometry
nodeColumns = ['upstream_id','downstream_id','geometry']
gdfNodes = gpd.sjoin(gdfLastPoints, gdfFirstPoints, how='left').loc[:, nodeColumns]
#Examine output
gdfNodes.head(3)

Unnamed: 0,upstream_id,downstream_id,geometry
0,0,1,POINT (1619131.223 -296158.267)
1,1,2,POINT (1621148.830 -293636.259)
2,2,3,POINT (1621148.830 -293131.857)


In [54]:
#Show info on the resulting dataset
#  (compare the non-null counts of 'upstream_id' and 'downstream_id' to see
#  how many end points found no downstream segment in the left join)
gdfNodes.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2801 entries, 0 to 2800
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   upstream_id    2801 non-null   object  
 1   downstream_id  2366 non-null   object  
 2   geometry       2801 non-null   geometry
dtypes: geometry(1), object(2)
memory usage: 87.5+ KB


The above reveals a number of records missing `downstream_id` values. These are the terminal segments, i.e. the ones connecting routes to existing NG pipeline infrastructure. 
> This is because we joined the "first" points of each segment to the "last" ones (left join above), leaving `downstream_id` as NaN for every segment end point that didn't intersect another segment's start point; these are the terminal nodes in each route grouping (i.e. where the pipes connect to the existing pipeline network). 

We relabel these terminal nodes with the upstream segment ID followed by a "T". 

In [57]:
#Update null values with upstream IDs, appended with a "T"
#  The filled column is assigned back to the dataframe instead of calling
#  fillna(..., inplace=True) on the selected column: that chained in-place form
#  only modifies a temporary object once pandas Copy-on-Write is active, which
#  would leave the nulls in place.
gdfNodes['downstream_id'] = gdfNodes['downstream_id'].fillna(gdfNodes['upstream_id'] + "T")
#Examine the relabeled output
gdfNodes.tail()

Unnamed: 0,upstream_id,downstream_id,geometry
2796,2796,1049,POINT (1596937.553 -299184.677)
2797,2797,425,POINT (1630732.460 -242187.295)
2798,2798,1403,POINT (1605512.380 -295653.866)
2799,2799,905,POINT (1629219.255 -254797.335)
2800,2800,2800T,POINT (1630228.059 -287583.440)


#### Transfer node information to route segment features.
The nodes geodataframe above includes points occurring at the end vertices of each pipeline segment (again, because we joined the `gdfFirstPoints` features *to* the `gdfLastPoints` features, keeping all the `gdfLastPoints` features). Each of these points knows the edge_IDs of the segments upstream and downstream of it (or that it is a terminal node). 

What we want in the next step is to link this information to each route segment feature such that each feature knows its node ID (taken from its upstream_id) and the node ID immediately downstream of it. This will allow us to construct a graph from all the segments. 

We also need to link each segment with the amount of biogas potential it introduces into the system. 

##### Joining node attribute data to each segment feature
The first step is done via attribute join, i.e., by merging the `downstream_id` attribute in the gdfNodes dataframe to the segment geodataframe using the `upstream_id` as the joining feature to link with the gdfSegments's `edge_ID` attribute.

In [58]:
#Attach each segment's downstream ID by matching the segment's edge_ID against
#  the node table's upstream_id; the helper key 'upstream_id' is then dropped
#  because it duplicates edge_ID
nodeIds = gdfNodes[['upstream_id','downstream_id']]
gdfSegments_ids = (gdfSegments
                   .merge(nodeIds, how='left', left_on='edge_ID', right_on='upstream_id')
                   .drop(columns='upstream_id'))
#Show the table
gdfSegments_ids.tail(5)

Unnamed: 0,route_id,TYPE,geometry,edge_ID,downstream_id
2796,1770,Transmission,"LINESTRING (1597162.785 -299334.101, 1596937.5...",2796,1049
2797,2040,Transmission,"LINESTRING (1630732.460 -241682.894, 1630732.4...",2797,425
2798,1794,Transmission,"LINESTRING (1606521.183 -297167.070, 1606521.1...",2798,1403
2799,2041,Transmission,"LINESTRING (1629219.255 -255301.737, 1629219.2...",2799,905
2800,299,Transmission,"LINESTRING (1630267.006 -287818.560, 1630228.0...",2800,2800T


Now each segment feature knows its node/edge ID and the node ID of the segment immediately downstream!

##### Attaching waste & biogas potential data back to each segment
Next, we need to join the biogas potential linked with each segment. The biogas potential is stored in the `gdfStart` geodataframe constructed by taking the first point in each original biogas route. Here, the `Biogas` attribute (along with `Waste` and `Fac_ID`) is what we want. 

In [59]:
#Preview the route start points, which carry each route's Fac_ID, Waste, and Biogas values
gdfStart.head()

Unnamed: 0,Fac_ID,Waste,Biogas,TYPE,geometry
0,291,490225.577124,13726320.0,Transmission,POINT (1618626.822 -297167.070)
1,154,301328.821333,8437207.0,Transmission,POINT (1582814.308 -311794.717)
2,1930,266616.481879,7465261.0,Transmission,POINT (1623166.436 -225037.641)
3,292,243401.613396,6815245.0,Transmission,POINT (1617113.617 -299689.078)
4,1925,226652.386212,6346267.0,Transmission,POINT (1737161.199 -158961.030)


As this dataframe has no attribute that would let us join the data to our segment features, we'll need to use a spatial join to link the biogas potential to our segments. 

To do this, we'll spatially join the `gdfStart` features (which contain biogas potential information ) to the `gdfFirstPoints` features (which contain the node/edge ID information):

In [60]:
#Preview the segment start points and the edge_ID stored with each one
gdfFirstPoints.head()

Unnamed: 0,route_id,TYPE,geometry,edge_ID,downstream_id
0,291,Transmission,POINT (1618626.822 -297167.070),0,0
1,291,Transmission,POINT (1619131.223 -296158.267),1,1
2,291,Transmission,POINT (1621148.830 -293636.259),2,2
3,291,Transmission,POINT (1621148.830 -293131.857),3,3
4,291,Transmission,POINT (1621653.231 -288087.841),4,4


In [61]:
#Build a lookup from edge_ID to biogas attributes: keep only the segment start
#  points that coincide with a route start point (inner join), i.e. the biogas
#  source locations, and discard the join's bookkeeping column
firstPointKeys = gdfFirstPoints[['geometry','edge_ID']]  #Join only the geom and edge_ID cols
gdfBiogasLookup = gpd.sjoin(firstPointKeys, gdfStart, how='inner').drop(columns='index_right')
gdfBiogasLookup.head()

Unnamed: 0,geometry,edge_ID,Fac_ID,Waste,Biogas,TYPE
0,POINT (1618626.822 -297167.070),0,291,490225.577124,13726320.0,Transmission
6,POINT (1582814.308 -311794.717),6,154,301328.821333,8437207.0,Transmission
19,POINT (1623166.436 -225037.641),19,1930,266616.481879,7465261.0,Transmission
22,POINT (1617113.617 -299689.078),22,292,243401.613396,6815245.0,Transmission
24,POINT (1737161.199 -158961.030),24,1925,226652.386212,6346267.0,Transmission


This gives us a table that we can now merge to our segments dataframe. Not all segments will have biogas data, so we need to set null values to zero (which requires fixing the datatype).

In [62]:
#Review the segments data (with joined attributes)
#  'edge_ID' is the key used to merge the biogas lookup in the next cell
gdfSegments_ids.head()

Unnamed: 0,route_id,TYPE,geometry,edge_ID,downstream_id
0,291,Transmission,"LINESTRING (1618626.822 -297167.070, 1618626.8...",0,1
1,291,Transmission,"LINESTRING (1619131.223 -296158.267, 1619635.6...",1,2
2,291,Transmission,"LINESTRING (1621148.830 -293636.259, 1621148.8...",2,3
3,291,Transmission,"LINESTRING (1621148.830 -293131.857, 1621148.8...",3,4
4,291,Transmission,"LINESTRING (1621653.231 -288087.841, 1622157.6...",4,5


In [66]:
#Merge the biogas potential onto the segment features, using edge_ID as the
#  common field. Both tables carry a TYPE column, so the merge yields TYPE_x
#  (from the segments) and TYPE_y (from the lookup).
biogasAttrs = gdfBiogasLookup[['edge_ID','Fac_ID','Waste','Biogas','TYPE']]
gdfSegments_biogas = gdfSegments_ids.merge(biogasAttrs, how='left', on='edge_ID')
# Drop unneeded columns
gdfSegments_biogas = gdfSegments_biogas.drop(columns=['route_id','TYPE_y'])
# Rename TYPE_x column
gdfSegments_biogas = gdfSegments_biogas.rename(columns={'TYPE_x':'output_type'})
#Examine the output
gdfSegments_biogas.head()

Unnamed: 0,output_type,geometry,edge_ID,downstream_id,Fac_ID,Waste,Biogas
0,Transmission,"LINESTRING (1618626.822 -297167.070, 1618626.8...",0,1,291.0,490225.577124,13726320.0
1,Transmission,"LINESTRING (1619131.223 -296158.267, 1619635.6...",1,2,,,
2,Transmission,"LINESTRING (1621148.830 -293636.259, 1621148.8...",2,3,,,
3,Transmission,"LINESTRING (1621148.830 -293131.857, 1621148.8...",3,4,,,
4,Transmission,"LINESTRING (1621653.231 -288087.841, 1622157.6...",4,5,,,


A number of these records will have missing values for the `Fac_ID`, `Waste`, and `Biogas` columns; these are segments that don't start at a biogas source location (i.e., they start at downstream junctions). So, we need to set those values appropriately. We will also use this opportunity to flag those records that correspond to biogas sources.

In [39]:
#Flag the segments that received no Fac_ID from the merge (not biogas sources),
#  and build the complementary flag for the segments that did
non_site_mask = gdfSegments_biogas['Fac_ID'].isna()
site_mask = ~non_site_mask

In [67]:
#Set null values in the Fac_ID, Waste, and Biogas columns
#  Fac_ID is numeric after the merge, so it is converted to object dtype before
#  the 'NA' text label is written; assigning a string straight into a float
#  column relies on silent upcasting, which recent pandas versions deprecate.
gdfSegments_biogas['Fac_ID'] = gdfSegments_biogas['Fac_ID'].astype(object)
gdfSegments_biogas.loc[non_site_mask,'Fac_ID'] = 'NA'
gdfSegments_biogas.loc[non_site_mask,'Waste'] = 0
gdfSegments_biogas.loc[non_site_mask,'Biogas'] = 0

#Classify every segment, starting from the default type "Route".
#  Writing the default first replaces a chained fillna(..., inplace=True) on the
#  column, which does not update the dataframe under pandas Copy-on-Write.
gdfSegments_biogas['segment_type'] = 'Route'

#Set the type of records with values to "Source"
gdfSegments_biogas.loc[site_mask,'segment_type'] = 'Source'

#Set the type of records with a T in the downstream ID to 'Output'
#  (applied last, so a source segment that is also terminal is labeled 'Output')
gdfSegments_biogas.loc[gdfSegments_biogas['downstream_id'].str.contains('T'),'segment_type'] = 'Output'

#Reveal the output (fixed seed so the same rows are shown on every run)
gdfSegments_biogas.sample(5, random_state=0)

Unnamed: 0,output_type,geometry,edge_ID,downstream_id,Fac_ID,Waste,Biogas,segment_type
2076,Transmission,"LINESTRING (1623670.838 -308263.906, 1623670.8...",2076,1855,625,9219.758083,258153.22633,Source
670,Transmission,"LINESTRING (1611565.199 -249753.319, 1611565.1...",670,671,2018,31856.402525,891979.270694,Source
947,Transmission,"LINESTRING (1653430.532 -258328.147, 1652926.1...",947,948,1073,24893.346825,697013.71109,Source
2473,Transmission,"LINESTRING (1603494.773 -261354.556, 1603999.1...",2473,1424,1991,4179.225832,117018.323305,Source
1951,Transmission,"LINESTRING (1611060.798 -323395.954, 1611565.1...",1951,1519,1886,9911.239939,277514.718304,Source


In [69]:
#Ensure that all columns now have data
#  (every column's non-null count should equal the number of entries)
gdfSegments_biogas.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2801 entries, 0 to 2800
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   output_type    2801 non-null   object  
 1   geometry       2801 non-null   geometry
 2   edge_ID        2801 non-null   object  
 3   downstream_id  2801 non-null   object  
 4   Fac_ID         2801 non-null   object  
 5   Waste          2801 non-null   float64 
 6   Biogas         2801 non-null   float64 
 7   segment_type   2801 non-null   object  
dtypes: float64(2), geometry(1), object(5)
memory usage: 196.9+ KB


In [73]:
#Spot-check a random selection of segments; the fixed seed makes the displayed
#  rows reproducible when the notebook is re-run from a fresh kernel
gdfSegments_biogas.sample(10, random_state=0)

Unnamed: 0,output_type,geometry,edge_ID,downstream_id,Fac_ID,Waste,Biogas,segment_type
1249,Transmission,"LINESTRING (1586345.119 -397038.588, 1585840.7...",1249,795,197.0,18808.30649,526632.581712,Source
454,Transmission,"LINESTRING (1485464.798 -285565.833, 1485969.1...",454,454T,,0.0,0.0,Output
804,Transmission,"LINESTRING (1640316.091 -252275.327, 1639811.6...",804,805,,0.0,0.0,Route
2256,Transmission,"LINESTRING (1678650.613 -260345.753, 1678146.2...",2256,977,1052.0,6078.873938,170208.470262,Source
424,Transmission,"LINESTRING (1634263.271 -242187.295, 1633758.8...",424,425,,0.0,0.0,Route
967,Transmission,"LINESTRING (1575248.284 -293636.259, 1575752.6...",967,256,1835.0,24586.021555,688408.603546,Source
1518,Transmission,"LINESTRING (1612069.601 -323395.954, 1611565.1...",1518,1519,1769.0,14123.005119,395444.143341,Source
2056,Transmission,"LINESTRING (1610051.994 -235630.074, 1610556.3...",2056,2057,1999.0,9311.955664,260734.758593,Source
2236,Transmission,"LINESTRING (1614591.609 -281026.219, 1614087.2...",2236,901,,0.0,0.0,Route
736,Distribution,"LINESTRING (1529852.139 -355677.657, 1529852.1...",736,737,1428.0,30425.201675,851905.646888,Source


In [74]:
#Write out shapefile
#  NOTE(review): shapefile attribute names are limited to 10 characters, so
#  'downstream_id', 'segment_type' and 'output_type' are presumably truncated in
#  the written file - confirm what names downstream readers expect
gdfSegments_biogas.to_file('../data/processed/BasePipelines.shp')

In [None]:
#Write out edge list: one row per segment giving its own ID, the ID it drains
#  to, and the waste/biogas entering at it (geometry and row index are left out)
edgeListColumns = ['edge_ID','downstream_id','Waste','Biogas']
gdfSegments_biogas[edgeListColumns].to_csv('../data/processed/BaseEdgeList.csv', index=False)