# Create complete network
- get all layers created so far (IRIS, centroids, roads for cars and bikes, public transport layer(s))

- connect centroids to the carbike network: 
    - combine centroids and carbike nodes into one nodes_network dataframe
    - put the combined nodes and the carbike edges into snkit
    - create a base_network with them, create a linked network
    - go back to separate carbike and centroid dataframes as needed



In [1]:
import networkx as nx
import os
os.environ['USE_PYGEOS'] = '1'
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import shapely
import snkit
import snkit.network
crs_fr = 2154 #4326 ?
plt.rcParams['figure.figsize'] = (10, 10)



In [2]:
# Check function
def check_missing_nodes(set11, set12, node_IDs):
    set11 = set(set11)
    set12 = set(set12)
    node_IDs = set(node_IDs)
    IDs_in_edges = set11.union(set12)
    missing = list(sorted(IDs_in_edges - node_IDs))
    added = list(sorted(node_IDs - IDs_in_edges))
    return(missing, added)

In [3]:
# --- From Anastassia
# import packages
%run -i packages.py

# CUSTOM FUNCTION

# define function that creates attribute dictionary for nodes and edges
# (for input to nx.add_edges_from/add_nodes_from)
def make_attr_dict(*args, **kwargs): 
    
    argCount = len(kwargs)
    
    if argCount > 0:
        attributes = {}
        for kwarg in kwargs:
            attributes[kwarg] = kwargs.get(kwarg, None)
        return attributes
    else:
        return None # (if no attributes are given)

## Get layers

### grid, boundary

In [4]:
# Get boundary of GPM (for plots)

GPM = gpd.read_file('data/raw/GPM_geometry/GPM.geojson').to_crs(crs_fr)

In [5]:
# Get grid and centroids
grid = pd.read_csv('data/processed/IRIS_GPM.csv').rename(columns = {'geometry' : 'cells'})

# turn into geodataframe and separate centroids (for plots)
for c in ['cells', 'centroid']:
    grid[c] = grid[c].apply(shapely.wkt.loads)
    
grid = gpd.GeoDataFrame(grid, crs = crs_fr, geometry= 'cells' )
centroids = grid.set_geometry('centroid').set_crs(crs_fr)


### roads 

In [6]:
# Get roads (car/bike)
edges_all = pd.read_csv("./data/processed/carbike_edges.csv").drop(columns = 'key')
nodes_all = pd.read_csv("./data/processed/carbike_nodes.csv")


In [7]:
# Part of Anastassia's pre-process for nodes
# Creating the column "nodetype" 
bike_nodes = set(edges_all.loc[edges_all['edgetype'] == 'bike']['u'])
nodes_all['nodetype'] = 0

for index, row in nodes_all.iterrows():
    if row["osmid"] in bike_nodes:
        nodes_all.loc[index, "nodetype"] = 'bike'
    else:
        nodes_all.loc[index, "nodetype"] = 'car'
        
# Turn into GeoDataFrame
nodes_all["geometry"] = nodes_all.apply(lambda x: shapely.wkt.loads(x.geometry), axis = 1)
nodes_all = gpd.GeoDataFrame(nodes_all, geometry = 'geometry', crs = 4326)


In [8]:
check_missing_nodes(edges_all['u'], edges_all['v'], nodes_all['osmid'])

([], [])

In [9]:
# Part of Anastassia's pre-process for edges 

# add edge ids (strings with "id1, id2" sorted (id1 < id2))
edges_all["edge_id"] = edges_all.apply(lambda x: str(sorted([x["u"], x["v"]])), axis = 1)

# MOD: removed oneway and length from the subset
edges_all_unique = edges_all
edges_all_unique = edges_all_unique.drop_duplicates(subset = ["osmid", "edge_id"], keep = "first", ignore_index = True)

# remove duplicates 
edges_all_unique = edges_all_unique.drop_duplicates(subset = ["u", "v", "osmid", "edge_id", "edgetype"], 
                          keep = "first",
                          ignore_index = True, 
                          inplace = False)

edges_all_unique_tokeep = edges_all_unique[edges_all_unique.duplicated("edge_id", keep = False) & (edges_all_unique["edgetype"]=="bike")].index
edges_all_unique_todrop = edges_all_unique[edges_all_unique.duplicated("edge_id", keep = False) & (edges_all_unique["edgetype"] == "car")].index

# MOD changed "multi" to "both"
# edges_all_unique.loc[edges_all_unique_tokeep, "edgetype"] = "both"
# edges_all_unique = edges_all_unique.drop(edges_all_unique_todrop)

# sort by "left" node (id1 < id2 - to control order of tuple keys in nx)
edges_all_unique["order"] = edges_all_unique.apply(lambda x: np.min([x["u"], x["v"]]), axis = 1)
edges_all_unique = edges_all_unique.sort_values(by = "order").reset_index(drop = True)
# MOD: used "x" and "y" insteads of "orgig" and "dest"
edges_all_unique["x"] = edges_all_unique.apply(lambda x: np.min([x["u"], x["v"]]), axis = 1)
edges_all_unique["y"] = edges_all_unique.apply(lambda x: np.max([x["u"], x["v"]]), axis = 1)
edges_all_unique = edges_all_unique.drop(columns = ["order", "u", "v"]) # instead of "u" and "v",
# we will use "origin" and "destination" where osmid(origin) < osmid (destination)!

In [10]:
check_missing_nodes(edges_all_unique['x'], edges_all_unique['y'], nodes_all['osmid'])

([], [])

In [11]:
# # Plot
# ax = plt.axes()
# edges_all.plot(ax=ax, linewidth = 0.3, alpha = 0.5)
# nodes_all.plot(ax=ax, markersize = 0.05, facecolor = 'red', alpha = 1)
# GPM.plot(ax=ax, facecolor = 'none', linewidth = 2)

## Connect carbike edges and centroids
A network is created with
- nodes = centroids and road intersections
- edges = roads

These edges and nodes are connected by creating an edge between each centroid and its nearest edge.

In [12]:
#--- Create dataframe with all carbike nodes and centroids

# Modify the centroids to be compatible with the carbike nodes
centroids_network = centroids
centroids_network = centroids_network.rename(columns = {'centroid':'geometry'}).set_geometry('geometry').to_crs(4326).drop(columns = ['NOM_COM',
                                      'CODE_IRIS',
                                      'NOM_IRIS',
                                      'cells',
                                      'osm_id'])
centroids_network['nodetype'] = 'bike'

# Create the nodes_network dataframe 
nodes_network = pd.concat([nodes_all,centroids_network])

In [13]:
#--- Creating the network 

# Nodes 
nodes = gpd.GeoDataFrame(nodes_network, geometry = "geometry")

# # Edges
edges_all_unique["geometry"] = edges_all_unique.apply(lambda x: shapely.wkt.loads(x.geometry), axis = 1)
edges_all_unique = gpd.GeoDataFrame(edges_all_unique, geometry = "geometry") 
edges = gpd.GeoDataFrame(edges_all_unique, geometry = 'geometry', crs = 4326)

base_network = snkit.Network(nodes, edges)

# # plot
# ax = plt.axes()
# base_network.edges.plot(ax=ax, linewidth = 1, alpha = 0.6)
# base_network.nodes.plot(ax=ax, facecolor = 'red', markersize = 3)

In [14]:
len(check_missing_nodes(base_network.edges['x'], base_network.edges['y'], base_network.nodes['osmid'])[1]) #2844 is good!
base_network.edges = base_network.edges.drop(columns = ['Unnamed: 0', 'level_0', 'index']) 
base_network.nodes = base_network.nodes.drop(columns = 'Unnamed: 0') 


In [15]:
%%time 
#--- Link centroids to their nearest edge (takes a while)

linked = snkit.network.link_nodes_to_nearest_edge(base_network) #wth is this warning?
# # plot
# ax = plt.axes()
# linked.edges.plot(ax=ax, linewidth = 1, alpha = 0.5)
# linked.nodes.plot(ax=ax, facecolor = 'red', markersize = 3, alpha = 1)



CPU times: user 16min 12s, sys: 7.99 s, total: 16min 20s
Wall time: 16min 31s


In [91]:
# create IDs for the new nodes and edges
with_id = snkit.network.add_topology(snkit.network.add_ids(linked))


In [92]:
len(check_missing_nodes(with_id.edges['x'], with_id.edges['y'], with_id.nodes['osmid'])[1])
# This should be 5554, as the centroids and new nodes (total: 5554) were not part of the original dataframe with x and y attributes 

5554

In [93]:
# Separate old nodes (centroids) from new ones (intersection between centroid and nearest edge)
new_carbike_nodes = with_id.nodes[with_id.nodes['nodetype'].isnull()]

# Separate centroids from the other nodes of this network
centroids_network = with_id.nodes[(with_id.nodes['x'].isnull()) 
                                  & (with_id.nodes['nodetype'].notnull())]

## Post-linking processing

### Nodes
- Make centroids compatible with carbike nodes
- Make the new nodes compatible with the carbike nodes
- Add the new nodes to the carbike nodes
- Process all carbike nodes (old and new)

In [94]:
#--- Process centroids 
centroids_network = centroids_network.drop(columns = 'osmid')

# Create the x and y columns
centroids_network = centroids_network.rename(columns = {'id':'osmid'})
centroids_network['x'] = centroids_network.geometry.apply(lambda p: p.x)
centroids_network['y'] = centroids_network.geometry.apply(lambda p: p.y)

# Add attributes 
centroids_network['osmid'] = centroids_network['osmid'].str[5:].astype(int)
centroids_network['centroid'] = True
centroids_network['RER'] = False

centroids_network.tail(1)

Unnamed: 0,y,x,geometry,nodetype,osmid,centroid,RER
117990,48.767193,2.276657,POINT (2.27666 48.76719),bike,117990,True,False


In [95]:
#--- Process new carbike nodes 
new_carbike_nodes = new_carbike_nodes.drop(columns = 'osmid')

# Create the x and y columns
new_carbike_nodes['x'] = new_carbike_nodes.geometry.apply(lambda p: p.x)
new_carbike_nodes['y'] = new_carbike_nodes.geometry.apply(lambda p: p.y)
new_carbike_nodes = new_carbike_nodes.rename(columns = {'id':'osmid'})

# Add attributes 
new_carbike_nodes['nodetype'] = 'bike'
new_carbike_nodes['osmid'] = new_carbike_nodes['osmid'].str[5:].astype(int)
new_carbike_nodes['centroid'] = False
new_carbike_nodes['RER'] = False

new_carbike_nodes.tail(1)

Unnamed: 0,y,x,geometry,nodetype,osmid,centroid,RER
120700,48.767032,2.27667,POINT (2.27667 48.76703),bike,120700,False,False


In [96]:
# QUICK CHECK: ARE ALL IDs REALLY DIFFERENT FROM THE OSMIDs? 
centroids_minID = int(centroids_network.sort_values(by = 'osmid').head(1).osmid)
centroids_maxID = int(centroids_network.sort_values(by = 'osmid').tail(1).osmid)
new_carbike_nodes_minID = int(new_carbike_nodes.sort_values(by = 'osmid').head(1).osmid)
new_carbike_nodes_maxID = int(new_carbike_nodes.sort_values(by = 'osmid').tail(1).osmid)
nodes_all_minID = int(nodes_all.sort_values(by = 'osmid').head(1).osmid)
nodes_all_maxID = int(nodes_all.sort_values(by = 'osmid').tail(1).osmid)

print(centroids_minID, 
      centroids_maxID, 
      new_carbike_nodes_minID, 
      new_carbike_nodes_maxID,
      nodes_all_minID, 
      nodes_all_maxID) 

115147 117990 117991 120700 122926 10810974432


In [98]:
#-- Process old carbike nodes
# Add attributes to the old carbike nodes
nodes_all['centroid'] = False
nodes_all['RER'] = False
nodes_all = nodes_all.drop(columns = 'Unnamed: 0')
nodes_all.head(1)

Unnamed: 0,osmid,y,x,geometry,nodetype,centroid,RER
0,122926,48.884082,2.463549,POINT (2.46355 48.88408),car,False,False


In [99]:
#--- Combine all nodes
nodes_carbike_complete = pd.concat([nodes_all, new_carbike_nodes, centroids_network]) 
nodes_carbike_complete = gpd.GeoDataFrame(nodes_carbike_complete, geometry = "geometry")


#--- Dealing with the nodes like we used to (Anastassia) 
# Sort values and drop duplicates 
nodes_carbike_complete = nodes_carbike_complete.sort_values(by = "osmid").reset_index(drop = True) # sort by osmid
# make attribute dictionary with type and geocoordinates for each node
# MOD: removed category_node = x.type 
# MOD: changed coord to geometry
# MOD-Stpehan : split geometry into lat and lon
# MOD: added centroid and RER as attributes 
nodes_carbike_complete["attr_dict"] = nodes_carbike_complete.apply(lambda x: make_attr_dict(lat = x.x,
                                                                  lon = x.y,
                                                                  nodetype = x.nodetype,
                                                                  centroid = x.centroid,
                                                                  RER = x.RER),
                                                                  axis = 1) 

nodes_carbike_complete["osmid"] = nodes_carbike_complete["osmid"].astype(int)
nodes_carbike_complete.head(1)


Unnamed: 0,osmid,y,x,geometry,nodetype,centroid,RER,attr_dict
0,115147,48.869834,2.37464,POINT (2.37464 48.86983),bike,True,False,"{'lat': 2.3746400096338, 'lon': 48.86983413155..."


In [100]:
# This should be 120700, or the max id of the centroids and new nodes together since they were never given x and y attributes 
print(max(check_missing_nodes(linked.edges['x'], linked.edges['y'], nodes_carbike_complete['osmid'])[1])) 

# This should be nan
print(min(check_missing_nodes(linked.edges['x'], linked.edges['y'], nodes_carbike_complete['osmid'])[0])) 

# This should be 5554, same reasoning as above
print(len(check_missing_nodes(linked.edges['x'], linked.edges['y'], nodes_carbike_complete['osmid'])[1])) 


120700
nan
5554


### Edges
- Make new edges compatible with the old ones
    - some new edges connect with a node from the carbike network using the ID given by linked_network instead of the original osmID -> change these IDs into their osmIDs AND update this info in the edges df
    - These IDs are already absent from the nodes_carbike_complete df since it is based on the old nodes_all df and the new_nodes and centroids dfs

In [101]:
#--- Make with_id.nodes and edges workable
n = 5
edges_with_id = with_id.edges
edges_with_id['from_id'] = edges_with_id['from_id'].str[5:].astype(int)
edges_with_id['to_id'] = edges_with_id['to_id'].str[5:].astype(int)
edges_with_id['id'] = edges_with_id['id'].str[5:].astype(int)

print(max(check_missing_nodes(edges_with_id['from_id'], edges_with_id['to_id'], nodes_carbike_complete['osmid'])[0]) )
# highest missing id in edges should be BELOW all of those in the nodes dataframe (i.e 115147)

print(min(check_missing_nodes(edges_with_id['from_id'], edges_with_id['to_id'], nodes_carbike_complete['osmid'])[1]) )
# lowest missing ID in nodes should be the lowest original osmid (i.e 122926), need to overwrite the nodes id in edges with their new value

115146
122926


list

In [102]:
#--- Replacing new IDs with osmIDs where necessary

# Create a dictionary of ID to osmID
oldnodes_in_newedges = with_id.nodes.loc[(with_id.nodes['osmid'].notnull())] # They have an non-null osmID (the new nodes don't have one)
oldnodes_in_newedges['osmid'] = oldnodes_in_newedges['osmid'].astype(int)
oldnodes_in_newedges['id'] = oldnodes_in_newedges['id'].str[5:].astype(int)
oldnodes_in_newedges.set_index('id',inplace=True)
id_osmid_dict = oldnodes_in_newedges.to_dict()['osmid']

# Replace IDs with osmIDs in the edges dataframe
edges_with_id['from_id'] = edges_with_id['from_id'].apply(lambda row: id_osmid_dict[row] if row in (id_osmid_dict.keys()) else row)
edges_with_id['to_id'] = edges_with_id['to_id'].apply(lambda row: id_osmid_dict[row] if row in (id_osmid_dict.keys()) else row)


#--- Fill in the edgetype for the new edges 
# Create a dictionary of nodeID to nodetype
nodes_type_df = nodes_carbike_complete
nodes_type_dict = nodes_type_df.reset_index()
nodes_type_dict = nodes_type_dict.set_index('osmid').to_dict()['nodetype']


#--- Find the correct column indexes to work with
index_edgetype = edges_with_id.columns.get_loc("edgetype")
index_fromID = edges_with_id.columns.get_loc("from_id")
index_toID = edges_with_id.columns.get_loc("to_id")
index_x = edges_with_id.columns.get_loc("x")
index_y = edges_with_id.columns.get_loc("y")
index_edge_id = edges_with_id.columns.get_loc("edge_id")


#--- Fill in the edgetype of edges_with_id where it is NaN
# With the nodetype of the from_id and to_id if it is the same
# With 'car' if it is different
for i in range(len(edges_with_id)):
    if pd.isnull(edges_with_id.iloc[i,index_edgetype]):
        if nodes_type_dict[edges_with_id.iloc[i,index_fromID]] == nodes_type_dict[edges_with_id.iloc[i,index_toID]]:
            edges_with_id.iloc[i,index_edgetype] = nodes_type_dict[edges_with_id.iloc[i,index_toID]]
        else:
            edges_with_id.iloc[i,index_edgetype] = 'car'


#--- Fill in the x and y columns where they are NaN
# With x = from_id and y = to_id
for i in range(len(edges_with_id)):
      if pd.isnull(edges_with_id.iloc[i,index_x]):
         edges_with_id.iloc[i,index_x] = edges_with_id.iloc[i,index_fromID]
         edges_with_id.iloc[i,index_y] = edges_with_id.iloc[i,index_toID]
edges_with_id['x'] = edges_with_id['x'].astype(int)
edges_with_id['y'] = edges_with_id['y'].astype(int)


# #--- Fill in the edge_id column where it is NaN with [from_id, to_id]
edges_with_id['edge_id'] = edges_with_id.apply(lambda row: str([row["x"], row["y"]]), axis = 1)
# Convert edges_with_id.edge_id from string to list of int
edges_with_id['edge_id'] = edges_with_id['edge_id'].apply(lambda cell:
                                      ''.join(c for c in cell if c not in "'[]").split(', '))
# TODO THIS IS STILL A LIST OF STRINGS, NOT A LIST OF INTS!!!


#--- Drop unnecessary columns
edges_with_id = edges_with_id.drop(['from_id','to_id', 'id'], axis = 1)



#--- Create an attr_dict for the edges 
edges_with_id["attr_dict"] = edges_with_id.apply(lambda x: make_attr_dict(edgetype = x.edgetype,
                                                    edge_id = x.edge_id,
                                                    coord = x.geometry,
                                                    intnodes = []), # intnodes attribute: for storing simplification info on interstitial nodes 
                             axis = 1)

# There shouldn't be any NaNs outside of osmid and length anymore
edges_with_id.isnull().sum()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


osmid        2844
edgetype        0
length       2844
geometry        0
edge_id         0
x               0
y               0
attr_dict       0
dtype: int64

In [104]:
print(check_missing_nodes(edges_with_id['x'], edges_with_id['y'], nodes_carbike_complete['osmid'])) # should be nothing! 
# First list empty:all nodes in nodes_carbike_complete are present in some edge of edges_with_id
# Second list empty:  all nodes in edges_with_id exist in nodes_carbike_complete

([], [])


## Creating the NetworkX Graph

In [105]:
nodes_carbike_complete.head(1)

Unnamed: 0,osmid,y,x,geometry,nodetype,centroid,RER,attr_dict
0,115147,48.869834,2.37464,POINT (2.37464 48.86983),bike,True,False,"{'lat': 2.3746400096338, 'lon': 48.86983413155..."


In [106]:
edges_with_id.head(1)

Unnamed: 0,osmid,edgetype,length,geometry,edge_id,x,y,attr_dict
0,"[332409156, 39750470, 39750471, 1105411815, 42...",car,1618.6,"LINESTRING (2.46355 48.88408, 2.46236 48.88415...","[122926, 318399738]",122926,318399738,"{'edgetype': 'car', 'edge_id': ['122926', '318..."


## What's next?

BUILDING THE SKELETAL NETWORK --April--
- finish cleaning up edges/nodes for carbike after linking them with the centroids
    - edges need
        - ~~to have info on the travel mode (car/bike/public transport)~~
        - weight = travel time for the relevant transport mode
        - ~~origin and destination node IDs~~
- Figure out how to export the whole thing to NetworkX
- create separate subnetworks for car and bike travel
- add restriction on how to travel (either only bike or only car, not both)
- add RER
- send visuals and stuff to Anastassia and Trivik
GOAL: I can pick two centroids and find the shortest path, which is all walk, all bike, or walk/bike to station then walk 


SOCIAL STUFF -- second midterm (May 22nd)--
- define needed data for POIs and people 
- find data for POIs and people
- add all of it to centroids
- find/define OD matrix
GOAL: I can pick one pop and one POI and find the shortest path

OPTIMISATION STUFF (June)
- find algorithm
- modify network to allow for optimisation
- do whatever it takes to get stuff to run
- first results!! 
