# Build topology
This script reads in the individual least cost routes linking each biogas source to the nearest pipeline and merges them into a topologically correct network. This is done by splitting each route where two routes meet. 

In [None]:
#Import packages
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point, LineString
from shapely.ops import split

### Read in the routes feature class & tidy the data
The routes feature class includes the least cost path between each biogas source and its connection to the pipeline framework. Here we read it in and tidy it up for analysis. 

In [None]:
#Read in the routes feature class (shapefile).
# Forward slashes keep this relative path valid on Windows, macOS and Linux,
# and match the style of the output paths used later in this notebook
gdfRoutes = gpd.read_file('../data/processed/Routes.shp')

#Tidy the data: drop the facility attribute columns that are not used below
drop_cols = ['Facility N', 'Address', 'City', 'County Nam',
             'Zip', 'Latitude', 'Longitude', 'Regulated',
             'Biogas Pot','Allowable']
gdfRoutes = gdfRoutes.drop(columns=drop_cols)

#Rename the remaining columns
gdfRoutes = gdfRoutes.rename(columns={'index':'SiteNo',
                                      'Biogas P_1':'Biogas',
                                      'Total Wast':'Waste'})

#Set the SiteNo column to str datatype (it later becomes each segment's route_id)
gdfRoutes['SiteNo'] = gdfRoutes['SiteNo'].astype(str)

#Examine the result
gdfRoutes.head()

### Derive geodataframes of the route start points and end points
Start points are used to link each route's biogas potential to the final output and end points are used to split existing route features.

In [None]:
#Derive the route start points: one Point at the first vertex of each route line
start_pts = gdfRoutes['geometry'].apply(lambda route: Point(route.coords[0]))

#Duplicate the routes table (attributes included) and swap in the point geometry
gdfStart = gdfRoutes.copy()
gdfStart['geometry'] = start_pts
gdfStart.head()

In [None]:
#Derive the route end points: one Point at the last vertex of each route line
end_pts = gdfRoutes['geometry'].apply(lambda route: Point(route.coords[-1]))

#Duplicate the routes table (attributes included) and swap in the point geometry
gdfEnd = gdfRoutes.copy()
gdfEnd['geometry'] = end_pts
gdfEnd.head()

#### Split route features where new routes enter them
1. Combine all endpoint point features into a single multipoint feature
2. Split the LineString geometries with this multipoint feature, resulting in Geometry Collection features stored in the geodataframe's geometry series
3. Iterate through each feature in the above result, splitting its geometry collection back into individual LineString features, and adding each to a growing list.
4. Reconstruct a new geodataframe of all the route segments from the split list, adding a new edge ID attribute

In [None]:
#Combine endpoints into a single multipoint object; this is the geometry
# the routes are split against in the next cell
# NOTE(review): newer geopandas releases deprecate `unary_union` in favor of
# `union_all()` - confirm against the installed version
ends = gdfEnd.geometry.unary_union

In [None]:
#Create a geoseries of split routes (geometry collections) - takes a bit of time
# Each route line is passed to shapely's split() together with the combined
# end points, so a route is cut wherever another route's end point falls on it
theSplits = gdfRoutes.geometry.apply(lambda x: split(x,ends))

In [None]:
#Start four independent, empty accumulators that are filled in parallel
# (one entry per split segment):
#  links         - SiteNo of the route each segment came from
#  points_top    - first coordinate of each segment
#  points_bottom - last coordinate of each segment
#  geom          - the segment LineString itself
links, points_top, points_bottom, geom = [], [], [], []

In [None]:
#Walk each route alongside its split result and record every piece
for site_no, pieces in zip(gdfRoutes['SiteNo'], theSplits):
    route_label = str(site_no)              #ID of the originating route line
    #Unpack the geometry collection into its individual LineStrings
    for segment in pieces.geoms:
        xy = segment.coords
        links.append(route_label)           #Which route this segment belongs to
        geom.append(segment)                #The segment geometry itself
        points_top.append(xy[0])            #First vertex of the segment
        points_bottom.append(xy[-1])        #Last vertex of the segment

#### Create the nodes geodataframe from the end points extracted above

#Pool the first and last vertices of every segment; the set union drops duplicates
points_all = list(set(points_top) | set(points_bottom))

#Turn each coordinate tuple into a shapely Point
points_all_geom = list(map(Point, points_all))

#Build the nodes geodataframe in the same coordinate system as the routes
gdf_points = gpd.GeoDataFrame(geometry=points_all_geom, crs=gdfRoutes.crs)

#Use the row index as the node identifier
gdf_points['NodeID'] = gdf_points.index
gdf_points.head()

In [None]:
#Assemble the segments geodataframe: one row per split segment, carrying the
# ID of its originating route, in the same coordinate system as the routes
gdfSegments = gpd.GeoDataFrame({'route_id': links},
                               geometry=geom,
                               crs=gdfRoutes.crs)

#The row index, stored as text, serves as the unique segment (edge) ID
gdfSegments['edge_ID'] = gdfSegments.index.astype(str)

With the segments created, we now need to assign attributes to each edge. These consist of its upstream node ID, its downstream node ID, and the amount of biogas introduced at its upstream node.

This process is a bit tricky and is done by:
* Creating a feature class of each segment's starting vertex, linked to the segment's `edge_ID`. This geodataframe is labeled `gdfFirstPoints`; its `downstream_id` holds the ID of the edge that falls **downstream** of each point.
* Creating a second feature class of each segment's ending vertex, tagged with the segment's `edge_ID`. This geodataframe is labeled `gdfLastPoints`; its `upstream_id` holds the ID of the edge that falls **upstream** of each point.
* The two point datasets are spatially joined, resulting in a dataset of vertices (`gdfNodes`) where each includes the IDs of its upstream and downstream edges.

##### Create geodataframes from "upstream" and "downstream" nodes
Here the "downstream" nodes are actually the first point in each segment, and the "upstream" nodes are the last. This seems backwards, but these are both intermediate datasets used to determine "from-to" pairs, done by spatially joining them. When joined, the "from" node carries the attribute of the upstream segment and the "to" node carries the downstream one...

In [None]:
#Build a gdf holding the FIRST vertex of every segment.
#  Each point is tagged ('downstream_id') with the ID of the segment that starts
#  there, i.e. the segment lying downstream of that point
first_vertices = gdfSegments['geometry'].apply(lambda seg: Point(seg.coords[0]))
gdfFirstPoints = gdfSegments.copy()
gdfFirstPoints['geometry'] = first_vertices
gdfFirstPoints['downstream_id'] = gdfFirstPoints.index.astype(str)

In [None]:
#Build a gdf holding the LAST vertex of every segment.
#  Each point is tagged ('upstream_id') with the ID of the segment that ends
#  there, i.e. the segment lying upstream of that point
last_vertices = gdfSegments['geometry'].apply(lambda seg: Point(seg.coords[-1]))
gdfLastPoints = gdfSegments.copy()
gdfLastPoints['geometry'] = last_vertices
gdfLastPoints['upstream_id'] = gdfLastPoints.index.astype(str)

##### Spatially join the downstream and upstream points
Spatially joining the two datasets results in a single point feature class with each point including the edge IDs of the upstream and downstream segments, respectively.

In [None]:
#Left spatial join: keep every segment end point and attach the attributes of
# any segment start point found at the same location
gdfNodes = gpd.sjoin(gdfLastPoints, gdfFirstPoints, how='left')

#Discard the join index and the suffixed route/edge columns, leaving only the
# upstream_id / downstream_id pair alongside the geometry
join_leftovers = ['route_id_left','edge_ID_left','index_right','route_id_right','edge_ID_right']
gdfNodes = gdfNodes.drop(columns=join_leftovers)
gdfNodes.head(1)

In [None]:
#Show info on the resulting dataset; the non-null count reported for
# `downstream_id` reveals how many end points found no downstream segment
gdfNodes.info()

The above reveals a number of records missing `downstream_id` values. These are the terminal segments, i.e. the ones connecting routes to existing NG pipeline infrastructure. 
> This is because we joined the "first" points of each segment to the "last" ones (left join above), leaving a number of `downstream_id` values as NaN (those segment end points that didn't intersect another segment's start point); these are the terminal nodes in each route grouping (i.e. where the pipes connect to the existing pipeline network).

We relabel these terminal nodes with the upstream segment ID followed by a "T". 

In [None]:
#Update null downstream IDs with the upstream ID, appended with a "T" (terminal).
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# a column selection is a chained operation that pandas warns about and that
# leaves gdfNodes unchanged when copy-on-write is active
gdfNodes['downstream_id'] = gdfNodes['downstream_id'].fillna(gdfNodes['upstream_id'] + "T")

#Show a random sample of the relabeled nodes
gdfNodes.sample(10)

In [None]:
#Write the nodes (end points with their upstream/downstream edge IDs) to a
# shapefile in the scratch folder
gdfNodes.to_file('../scratch/nodes.shp')

#### Transfer node information to route segment features.
The nodes geodataframe above includes points occurring at the end vertices of each pipeline segment (again, because we joined the `gdfFirstPoints` features *to* the `gdfLastPoints` features, keeping all the `gdfLastPoints` features). Each of these points is aware of the edge_IDs of the segments upstream and downstream of it (or that it is a terminal node).

What we want in the next step is to link this information to each route segment feature such that each feature knows its node ID (taken from its upstream_id) and the node ID immediately downstream of it. This will allow us to construct a graph from all the segments.

We also need to link each segment with the amount of biogas potential it introduces into the system. 

##### Joining node attribute data to each segment feature
The first step is done via an attribute join, i.e., by merging the `downstream_id` attribute of the `gdfNodes` dataframe into the segment geodataframe, matching the nodes' `upstream_id` to the `edge_ID` attribute of `gdfSegments`.

In [None]:
#Attach each segment's downstream ID: look the segment's edge_ID up among the
# nodes' upstream IDs (left join keeps every segment)
node_links = gdfNodes[['upstream_id','downstream_id']]
gdfSegments_ids = (
    gdfSegments
    .merge(node_links, how='left', left_on='edge_ID', right_on='upstream_id')
    .drop(columns='upstream_id')   #redundant with edge_ID once joined
)

#Show a random sample of the table
gdfSegments_ids.sample(10)

Now each segment feature knows its node/edge ID and the node ID of the segment immediately downstream!

##### Attaching biogas potential data back to each segment
Next, we need to join the Biogas Potential linked with each segment. The biogas potential is stored in the `gdfStart` geodataframe constructed by taking the first point in each original biogas route. Here, the `Biogas` attribute (renamed from `Biogas P_1` when the routes were tidied) is what we want.

In [None]:
#Preview the route start points and the attributes they carry (SiteNo, Waste, Biogas)
gdfStart.head()

As this dataframe has no attribute that would let us join the data to our segment features, we'll need a spatial join to link the biogas potential to our segments.

To do this, we'll spatially join the `gdfStart` features (which contain biogas potential information) to the `gdfFirstPoints` features (which contain the node/edge ID information):

In [None]:
#Preview the segment start points and the edge IDs they carry
gdfFirstPoints.head()

In [None]:
#Build an edge_ID -> biogas source lookup.
# Only the geometry and edge_ID of the segment start points take part in the join
first_point_ids = gdfFirstPoints[['geometry','edge_ID']]

#The inner join keeps just the segment start points that coincide with a route
# start point (i.e. a biogas source location) and brings over its attributes
source_matches = gpd.sjoin(first_point_ids, gdfStart, how='inner')
gdfBiogasLookup = source_matches.drop(columns='index_right')
gdfBiogasLookup.head()

This gives us a table that we can now merge to our segments dataframe. Not all segments will have biogas data, so we need to set null values to zero (which requires fixing the datatype).

In [None]:
#Review the segments data (edge_ID and downstream_id per segment) before merging
gdfSegments_ids.head()

In [None]:
#Bring the biogas source attributes onto the segments, matching on edge_ID;
# the left join keeps segments that do not start at a biogas source
biogas_cols = gdfBiogasLookup[['edge_ID','SiteNo','Waste','Biogas']]
gdfSegments_biogas = gdfSegments_ids.merge(biogas_cols, how='left', on='edge_ID')

#Segments with no matching source have NaN biogas; zero out their waste and
# biogas values (the same Biogas-based mask drives both columns)
no_biogas = gdfSegments_biogas['Biogas'].isna()
gdfSegments_biogas.loc[no_biogas, 'Waste'] = 0
gdfSegments_biogas.loc[no_biogas, 'Biogas'] = 0

#Examine the output
gdfSegments_biogas.head()

A number of these records will have missing values for the `SiteNo`, `Waste`, and `Biogas` columns; these are locations that don't correspond to biogas source locations (i.e., downstream junctions). The cell above sets the missing `Waste` and `Biogas` values to zero; next, we identify which segments actually start at a biogas source.

In [None]:
#Union all route starting points into a single multipoint geometry
# NOTE(review): `all_start_points` is not referenced by any later cell visible
# here - confirm whether it is still needed
all_start_points = gdfStart.geometry.unary_union

In [None]:
#Preview the route start points again before joining them to the segment starts
gdfStart.head()

In [None]:
#Reduce every segment to a Point at its first vertex, keeping all attributes
segment_start_pts = gdfSegments_biogas.geometry.apply(lambda seg: Point(seg.coords[0]))
gdfSegmentStarts = gdfSegments_biogas.copy()
gdfSegmentStarts['geometry'] = segment_start_pts

#Left-join the route start points onto those segment starts and preview the result
gpd.sjoin(gdfSegmentStarts, gdfStart, how='left').head()

In [None]:
#Match each route start point (biogas source) to the segment that begins there.
# sjoin needs both a left and a right layer; the segment start points are used
# as the right-hand layer, limited to the edge_ID so each source shows the
# segment it feeds
# NOTE(review): confirm this is the join that was intended for this cell
gpd.sjoin(left_df=gdfStart,
          right_df=gdfSegmentStarts[['edge_ID','geometry']],
          how='left').head()

In [None]:
#Add segment attributes: Biogas site | Junction | Terminal
# The 'Node' column has to be created before it can be displayed, so each
# segment is classified here by comparing vertex coordinates:
#  Biogas site - the segment starts at a route start point (a biogas source)
#  Terminal    - the segment's end point is not the start of any other segment
#  Junction    - everything else
# NOTE(review): a segment that both starts at a source and ends at a terminal
# is labeled 'Biogas site' - confirm this precedence
source_xy = {pt.coords[0] for pt in gdfStart.geometry}
segment_start_xy = {seg.coords[0] for seg in gdfSegments.geometry}

node_types = []
for seg in gdfSegments.geometry:
    if seg.coords[0] in source_xy:
        node_types.append('Biogas site')
    elif seg.coords[-1] not in segment_start_xy:
        node_types.append('Terminal')
    else:
        node_types.append('Junction')

gdfSegments['Node'] = node_types

#Show how many segments fall in each class
gdfSegments['Node'].value_counts()

In [None]:
#Write out shapefile
# In this dataframe the waste and biogas columns are named 'Waste' and 'Biogas'
# (renamed when the routes were tidied), so they are selected by those names
# and relabeled to the output field names on export
# NOTE(review): the shapefile format limits field names to 10 characters, so
# 'downstream_id' and 'BG_potential' will be shortened by the writer
shp_fields = {'Waste':'Total Wast', 'Biogas':'BG_potential'}
(gdfSegments_biogas[['edge_ID','downstream_id','route_id','Waste','Biogas','geometry']]
    .rename(columns=shp_fields)
    .to_file('../data/processed/BasePipeline.shp'))

In [None]:
#Write out edge list
# In this dataframe the waste and biogas columns are named 'Waste' and 'Biogas'
# (renamed when the routes were tidied), so they are selected by those names
# and relabeled to the output column names on export
csv_fields = {'Waste':'Total Wast', 'Biogas':'Biogas P_1'}
(gdfSegments_biogas[['edge_ID','downstream_id','Waste','Biogas']]
    .rename(columns=csv_fields)
    .to_csv('../data/processed/BaseEdgeList.csv',index=False))