# Merge graph components

In this notebook, we build the graph that will be the basis of our journey planner algorithm.

## Setup PySpark
We start by loading the Spark session. Because some of our queries are heavy and this notebook should be run only once, we allow ourselves to take a bit more resources.

In [1]:
%load_ext sparkmagic.magics

In [2]:
import os
import warnings
import pandas as pd

# Silence UserWarning messages for the rest of the notebook
warnings.simplefilter(action='ignore', category=UserWarning)

# The user name comes from the RENKU_USERNAME environment variable
# (a KeyError is raised if it is not set).
username = os.environ['RENKU_USERNAME']
print(username)

cpittet


In [3]:
# URL of the server on which the remote Spark session is created
server = "http://iccluster029.iccluster.epfl.ch:8998"
from IPython import get_ipython
# Programmatic form of the `%%spark config` cell magic. The cell body is a JSON
# document; the doubled braces are str.format escapes for literal { and }, and
# {0} is replaced by the user name so the session is named "<username>-aces".
get_ipython().run_cell_magic('spark', line="config",
                             cell="""{{ "name":"{0}-aces", "executorMemory":"10G", "executorCores":8, "numExecutors":10 }}""".format(username))

In [4]:
# Programmatic form of the `%spark add` line magic: starts the session
# "<username>-aces" with the python language against the `server` URL.
get_ipython().run_line_magic(
    "spark", "add -s {0}-aces -l python -u {1} -k".format(username, server) 
)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
9453,application_1652960972356_5285,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


In [5]:
%%spark
from pyspark.sql.functions import *

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## The first part is to merge the preprocessed transport edges with the istdaten delay estimates to have delay information

In [6]:
%%spark
# Load the dataframe containing the stop delay estimations from HDFS
istdaten_delays = spark.read.format('orc').load('/group/aces/istdaten_delays.orc')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
%%spark
istdaten_delays.count() # Number of stop delay estimations (23898 in the recorded output of this cell)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

23898

In [8]:
%%spark
import pyspark.sql.functions as F

@F.udf
def extract_stop_id(x):
    """
    Extract the stop id from a full stop id.

    The stop id is the part of the full id located before the first '_'.
    :param x: the full stop id (None for null rows)
    :return: the stop id, or None when x is None
    """
    # Null values reach a Python UDF as None; propagate the null instead of
    # raising AttributeError, which would fail the whole Spark job.
    if x is None:
        return None
    return x.split('_', 1)[0]

# Load the transport edges, derive the bpuic of each edge's end node from its end_id,
# then left-join the delay estimates on (end_bpuic, transport, hour) so that every
# edge is kept even when no delay estimate exists for it
stop_times_zh_pairs = spark.read.orc("/group/aces/graph/edges_transport_final.orc")

delay_join_keys = ['end_bpuic', 'transport', 'hour']
stop_times_zh_delay = (
    stop_times_zh_pairs
    .withColumn('end_bpuic', extract_stop_id(col('end_id')))
    .join(istdaten_delays, on=delay_join_keys, how='left')
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
%%spark
# List the distinct transport types present in the joined edges
transport_types = stop_times_zh_delay.select('transport').distinct()
transport_types.collect()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(transport=u'Other'), Row(transport=u'Tram'), Row(transport=u'Bus'), Row(transport=u'Train')]

In [10]:
%%spark
# Fallback statistics used to impute missing delay values: for each (transport, hour)
# group, compute the mean, the standard deviation and the median of the mean_delay column
delay_stats_rows = (
    stop_times_zh_delay
    .groupBy(['transport', 'hour'])
    .agg(
        F.expr('mean(mean_delay)').alias('mean_delay'),
        F.expr('std(mean_delay)').alias('std_delay'),
        F.expr('percentile(mean_delay,array(0.5))')[0].alias('median_delay'),
    )
    .collect()
)

# Index the statistics by (transport, hour) in a plain dictionary
avg_delays_by_transport = {}
for stats_row in delay_stats_rows:
    stats = stats_row.asDict()
    avg_delays_by_transport[(stats['transport'], stats['hour'])] = {
        "mean_delay": stats['mean_delay'],
        "std_delay": stats['std_delay'],
        "median_delay": stats['median_delay'],
    }


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
%%spark
from pyspark.sql.functions import when
from pyspark.sql.types import *

# A bare @F.udf returns strings; declare a double so the imputed column stays numeric
@F.udf(returnType=DoubleType())
def impute_missing_mean(transport, hour):
    """
    Get the mean delay for the given (transport, hour) pair
    :param transport: the transport type for which we want the mean delay
    :param hour: the hour for which we want the mean delay
    :return: the corresponding mean delay as a float, or None if the group has none
    """
    value = avg_delays_by_transport[(transport, hour)]["mean_delay"]
    # Coerce to a Python float so the value always matches the declared double type
    return None if value is None else float(value)

# A bare @F.udf returns strings; declare a double so the imputed column stays numeric
@F.udf(returnType=DoubleType())
def impute_missing_std(transport, hour):
    """
    Get the std delay for the given (transport, hour) pair
    :param transport: the transport type for which we want the std delay
    :param hour: the hour for which we want the std delay
    :return: the corresponding std delay as a float, or None if the group has none
    """
    value = avg_delays_by_transport[(transport, hour)]["std_delay"]
    # Coerce to a Python float so the value always matches the declared double type
    return None if value is None else float(value)

# A bare @F.udf returns strings; declare a double so the imputed column stays numeric
@F.udf(returnType=DoubleType())
def impute_missing_median(transport, hour):
    """
    Get the median delay for the given (transport, hour) pair
    :param transport: the transport type for which we want the median delay
    :param hour: the hour for which we want the median delay
    :return: the corresponding median delay as a float, or None if the group has none
    """
    value = avg_delays_by_transport[(transport, hour)]["median_delay"]
    # Coerce to a Python float so the value always matches the declared double type
    return None if value is None else float(value)

# Fill each missing delay statistic with the value of the matching (transport, hour) group;
# existing values are kept untouched
delay_imputers = [
    ('mean_delay', impute_missing_mean),
    ('std_delay', impute_missing_std),
    ('median_delay', impute_missing_median),
]
for delay_col, imputer in delay_imputers:
    imputed = when(col(delay_col).isNotNull(), col(delay_col)).otherwise(imputer(col('transport'), col('hour')))
    stop_times_zh_delay = stop_times_zh_delay.withColumn(delay_col, imputed)
stop_times_zh_delay = stop_times_zh_delay.drop('avg_delay')

# The later union matches columns by POSITION, not by name, so fix the column order here
stop_times_zh_delay = stop_times_zh_delay.select(
    'start_id',
    'end_id',
    'mean_delay',
    'std_delay',
    'median_delay',
    'duration',
    'transport',
    'line_number',
    'is_trip',
    'waiting_time',
    'hour',
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
%%spark 
# Reload from HDFS the other graph components that will be merged with the transport edges
all_walking_edges = spark.read.format('orc').load('/group/aces/graph/all_walking_edges_final.orc')
all_same_station_edges = spark.read.format('orc').load('/group/aces/graph/all_same_station_edges_final.orc')
nodes_zurich = spark.read.format('orc').load("/group/aces/graph/nodes_final.orc")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
%%spark
# Get the avg delay by full stop id as a dictionary since Spark cannot do the join itself without crashing.
# The average is computed on the cluster before collecting: this yields exactly one value per id
# (instead of an arbitrary row when an id appears on several edges) and limits what is sent to the driver.
avg_delay_rows = (
    stop_times_zh_delay
    .select(col("end_id").alias("start_id"), col("mean_delay").cast("double").alias("mean_delay"))
    .groupBy("start_id")
    .agg(F.avg("mean_delay").alias("mean_delay"))
    .collect()
)
avg_delay_by_full_stop_id = [row.asDict() for row in avg_delay_rows]
dic_avg_delay_by_full_stop_id = {d['start_id']: d['mean_delay'] for d in avg_delay_by_full_stop_id}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
%%spark
# A bare @F.udf returns strings; declare a double so mean_delay stays numeric
@F.udf(returnType=DoubleType())
def impute_missing_mean_delay(stop_id, line_number):
    """
    Get the mean delay to attach to a same-station edge.
    :param stop_id: the stop id of the stop
    :param line_number: the line number of the stop or "-1" if it is a walking edge between two lines.
    :return:
        - the avg delay by full stop id if the line number is "-1" and the stop id is known
        - 0.0 otherwise
    """
    if line_number != "-1":
        return 0.0
    delay = dic_avg_delay_by_full_stop_id.get(stop_id, 0.0)
    # Stored delays may be None or non-float values; coerce so the result
    # always matches the declared double type
    return None if delay is None else float(delay)
# Set the mean_delay column of the same-station edges from their start stop and line number
all_same_station_edges_wt_delay = all_same_station_edges.withColumn(
    "mean_delay",
    impute_missing_mean_delay(col("start_id"), col("line_number")),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
%%spark

# Stack the three kinds of edges (the union is positional) and keep only edges with a mean delay
all_edges = (
    all_walking_edges
    .union(all_same_station_edges_wt_delay)
    .union(stop_times_zh_delay)
    .filter(col("mean_delay").isNotNull())
)

# Cast the delay statistics to float for the saving on HDFS
for delay_col in ('mean_delay', 'std_delay', 'median_delay'):
    all_edges = all_edges.withColumn(delay_col, col(delay_col).cast(FloatType()))

# Two edge weights for the shortest path: the scheduled one (waiting + travel time)
# and the same one shifted by the mean delay, floored at 0
all_edges = all_edges.withColumn("edge_weight", col('waiting_time') + col('duration'))
delayed_weight = col("edge_weight") + col("mean_delay")
all_edges = all_edges.withColumn("edge_weight_wt_mean_delay", when(delayed_weight > 0, delayed_weight).otherwise(0.0))
# NOTE(review): std_delay is overwritten with std_delay + mean_delay (1.0 when the
# delayed weight is not positive) - confirm that this sum is intended
all_edges = all_edges.withColumn("std_delay", when(delayed_weight > 0, col("std_delay") + col("mean_delay")).otherwise(1.0))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
%%spark
# Persist the merged edges on HDFS in ORC format, replacing any previous version
all_edges.write.format("orc").mode('overwrite').save("/group/aces/graph/all_edges_final.orc")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…