# Set up spark

In [1]:
%load_ext sparkmagic.magics

In [2]:
import os
from IPython import get_ipython

# set the application name as "<your_gaspar_id>-homework3"
username = os.environ['RENKU_USERNAME']
server = "http://iccluster029.iccluster.epfl.ch:8998"

get_ipython().run_cell_magic(
    'spark',
    line='config', 
    cell="""{{ "name": "{0}-final", "executorMemory": "4G", "executorCores": 4, "numExecutors": 10, "driverMemory": "4G"}}""".format(username)
)

In [3]:
get_ipython().run_line_magic(
    "spark", "add -s {0}-final -l python -u {1} -k".format(username, server)
)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
9059,application_1652960972356_4842,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


In [27]:
%%spark
import pyspark.sql.functions as functions
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
from datetime import datetime
from dateutil.parser import parse

delayTimeMax = 60
N = 30

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import ipywidgets as widgets
from ipywidgets import HBox, VBox
token = "pk.eyJ1IjoiY29jb251dG51dCIsImEiOiJjbDNscTZhbHowMmxtM2pwajl3Yjd1ejF0In0.PXbwkPmWYXrAhQsus3ypVA"

# Load data
## Stops
- stop_name
- stop_lat
- stop_lon
- stop_id

In [130]:
%%spark
stops = spark.read.option("header",True).csv('/user/sixu/work/stops_main.csv')
stops = stops.drop("_c0")
stops = stops.withColumnRenamed("main_id", "stop_id")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Daily actual data
- trip_id `FAHRT_BEZEICHNER`: identifies the trip
- failed `FAELLT_AUS_TF`: boolean, true if this trip failed (cancelled or not completed)
- arrival_schedule `ANKUNFTSZEIT`: arrival time at the stop according to schedule
- arrival_actual `AN_PROGNOSE`: actual arrival time
- departure_schedule `ABFAHRTSZEIT`: departure time at the stop according to schedule
- departure_actual `AB_PROGNOSE`: actual departure time
- not_stop `DURCHFAHRT_TF`: boolean, true if the transport does not stop there
- stop_name `HALTESTELLEN_NAME`: name of the stop
- stop_lat
- stop_lon
- stop_id

In [131]:
%%spark
actual_data_513 = spark.read.option("header",True).csv('/user/ymao/work/actual_data_513_filtered.csv')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Walk

In [138]:
%%spark
## Stops infomation
from pyspark.sql.types import IntegerType

Stops_info = spark.read.option("header",True).csv('/user/sixu/work/stops_main.csv')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Confidence

In [142]:
%%spark

# Calculate cases of trip on time
def is_arrive_on_time(arr_sch, arr_act):
    if arr_sch != '' and arr_act != '':
        #compare the difference of them
        arr_date1 = parse(arr_sch)
        arr_date2 = parse(arr_act)
        res_arr = (arr_date1 - arr_date2).total_seconds()
        if res_arr < -delayTimeMax :
            return 0
    return 1
is_arrive_on_time_udf = functions.udf(is_arrive_on_time,IntegerType())

def getDataOnDTrip(tripId):
    # filter trip
    sbb_trip = sbb.filter(sbb.trip_id == tripId)
    
    # join and get stop_id
    sbb_trip = sbb_trip.join(stops, sbb_trip.stop_name2==stops.stop_name, "inner")
    sbb_trip = sbb_trip.drop("stop_name2")
    sbb_trip = sbb_trip.withColumnRenamed("main_id", "stop_id")
    
    return sbb_trip

# implemented function
def getConfidenceByTripId(tripID):
    sbb_trip =  getDataOnDTrip(tripID)
    if(sbb_trip.count() == 0): return None
    sbb_trip = sbb_trip.withColumn("is_arrive_on_time", is_arrive_on_time_udf(sbb_trip.arrival_schedule, sbb_trip.arrival_actual))
    Trips_confidence = sbb_trip.groupBy("trip_id").agg({"is_arrive_on_time": "avg"}).first()
    return Trips_confidence['avg(is_arrive_on_time)']

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [143]:
%%spark
from pyspark.sql.functions import col

dfs = Stops_info.alias("s1").crossJoin(Stops_info.alias("s2"))\
        .select(col("s1.main_id").alias("s1_stop_id"), col("s1.stop_name").alias("s1_stop_name"),
                col("s1.stop_lat").alias("s1_stop_lat"), col("s1.stop_lon").alias("s1_stop_lon"),
                col("s2.main_id").alias("s2_stop_id"), col("s2.stop_name").alias("s2_stop_name"),
                col("s2.stop_lat").alias("s2_stop_lat"), col("s2.stop_lon").alias("s2_stop_lon"),)\
        .filter('s1.main_id != s2.main_id')

dfs=dfs.withColumn('s1_stop_lat',dfs['s1_stop_lat'].cast("float"))
dfs=dfs.withColumn('s1_stop_lon',dfs['s1_stop_lon'].cast("float"))
dfs=dfs.withColumn('s2_stop_lat',dfs['s2_stop_lat'].cast("float"))
dfs=dfs.withColumn('s2_stop_lon',dfs['s2_stop_lon'].cast("float"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [144]:
%%spark
from pyspark.sql.types import *
from math import radians, sin, cos, asin, sqrt

def geodesic(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371
    return c * r * 1000

geodesic_udf = functions.udf(geodesic,FloatType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [145]:
%%spark
dfs = dfs.withColumn('distance', geodesic_udf(dfs.s1_stop_lon, dfs.s1_stop_lat, dfs.s2_stop_lon, dfs.s2_stop_lat))
dfs = dfs.filter(dfs.distance < 500)
stops_walkable = dfs.select(col("s1_stop_id"), col("s2_stop_id"), col("distance"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Direct

In [146]:
%%spark
withinHour = 1

def caculate_time(h, time_string):
    pre_part = time_string[0 : 11]
    time_part = time_string[11 : ]
    
    time_rest = time_part[2:]
    t = int(time_part[0:2])-h
    time_res = str(t) + time_rest
    if t < 10 :
        time_res = '0' + time_res
    return pre_part + time_res

def direct_routes_withinHour(df, stop_id1, stop_id2, arrival_time, use_stop_id=False):
    '''
    Start from stop_id1
    End to stop_id2 before arrival_time
    '''
    if use_stop_id:
        ## 不同时间的同一路经trip id 不一样！
        # 所有过stop_id1的trips, 出发时间 < 规定到达时间
        trips_id1_df = df.filter((df['stop_id'] == stop_id1) & (df["departure_schedule"] < arrival_time)).select("trip_id", "arrival_schedule", "departure_schedule", "stop_id" ).distinct()
        #     trips_id1_df.orderBy("departure_time",ascending=False).show(15)
        # arrival_time 之前过stop_id2 的trips
        trips_id2_df = df.filter((df['stop_id'] == stop_id2) & (df["arrival_schedule"] < arrival_time) & (df["arrival_schedule"] > caculate_time(withinHour, arrival_time))).select("trip_id", "arrival_schedule","departure_schedule", "stop_id").distinct()
        trips_id2_df = trips_id2_df.withColumnRenamed("stop_id","stop_id2")
    else:
        trips_id1_df = df.filter((df['stop_id'] == stop_id1) & (df["departure_schedule"] < arrival_time)).select("trip_id", "arrival_schedule", "departure_schedule").distinct()
        trips_id2_df = df.filter((df['stop_id'] == stop_id2) & (df["arrival_schedule"] < arrival_time) & (df["arrival_schedule"] > caculate_time(withinHour, arrival_time))).select("trip_id", "arrival_schedule", "departure_schedule").distinct()
    
    trips_id2_df = trips_id2_df.withColumnRenamed("trip_id","trip_id2")
    trips_id2_df = trips_id2_df.withColumnRenamed("departure_schedule","departure_schedule2")
    trips_id2_df = trips_id2_df.withColumnRenamed("arrival_schedule","arrival_schedule2")
    

    direct_trips = trips_id1_df.join(trips_id2_df, trips_id1_df["trip_id"] == trips_id2_df["trip_id2"], "inner")
    # drop duplicate trip_id
    direct_trips = direct_trips.drop("trip_id2")
    
    # 把反向的删除了
    direct_trips = direct_trips.filter(direct_trips["departure_schedule"] < direct_trips["arrival_schedule2"])
#     direct_trips.orderBy("arrival_schedule", ascending=False).show(100, False)
    direct_trips = direct_trips.orderBy("arrival_schedule2", ascending=False)
    return direct_trips

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [147]:
%%spark

def filter_data_between(selected_data, tid, sid1, sid2):
#     trip_data = selected_data.filter(selected_data.trip_id == tid)
#     start_time = trip_data.filter(trip_data["stop_id"] == sid1).first().departure_schedule
#     end_time = trip_data.filter(trip_data["stop_id"] == sid2).first().arrival_schedule
#     trip_data = trip_data.filter(trip_data["departure_schedule"] >= start_time).filter(trip_data["arrival_schedule"] <= end_time)
#     return trip_data
    trip_data = selected_data.filter(selected_data.trip_id == tid)
    trip_data = trip_data.orderBy("arrival_schedule", "departure_schedule").withColumn("id",monotonically_increasing_id())
    start_id = trip_data.filter(trip_data["stop_id"] == sid1).first().id
    end_id = trip_data.filter(trip_data["stop_id"] == sid2).first().id
    trip_data = trip_data.filter(trip_data["id"] >= start_id).filter(trip_data["id"] <= end_id)
    trip_data = trip_data.drop("id")
    return trip_data

def get_schedule_from_direct_routes(data, trip_ids, stop_id1, stop_id2):
    trip_ids_list = trip_ids.select("trip_id").rdd.flatMap(lambda x: x).collect()
    # all valid trips
    selected_data = data.filter(data.trip_id.isin(trip_ids_list))\
                    .select("trip_id","stop_name","arrival_schedule","departure_schedule","stop_id","stop_lat","stop_lon")\
                    .orderBy("trip_id", "arrival_schedule")
    
    # empty dataframe to put result
    schema = StructType([
        StructField("trip_id",StringType(),True),
        StructField("stop_name",StringType(),True),
        StructField("arrival_schedule",StringType(),True),
        StructField("departure_schedule",StringType(),True),
        StructField("stop_id",StringType(),True),
        StructField("stop_lat",StringType(),True),
        StructField("stop_lon",StringType(),True),
        StructField("color",StringType(),True),
        StructField("schedule_id",StringType(),True)
    ])
    result_schedule = spark.createDataFrame([], schema)
    
    # for each group, keep stops between stop_id1 and stop_id2, calculate confidence
    cnt = 1
    for i in trip_ids_list:
        trip_data = filter_data_between(selected_data, i, stop_id1, stop_id2)
        # add distinct schedule_id
        schedule_id = "d_" + str(cnt)
        trip_data = trip_data.withColumn("color", lit('rgb(255, 0, 0)'))
        trip_data = trip_data.withColumn("schedule_id", lit(schedule_id))
        result_schedule = result_schedule.union(trip_data)
        cnt += 1
    
    return result_schedule

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# One transit 

In [148]:
%%spark
from pyspark.sql.functions import col
# def one_transit_helper(i):
#     print(i)
#     subpath1 = direct_routes(df, stop_id1, i, arrival_time).select("trip_id").distinct()
#     print("find subpath1!")
#     subpath1.show(False)
#     subpath2 = direct_routes(df, i, stop_id2, arrival_time).select("trip_id").distinct()
#     print("find subpath2!")
#     subpath2.show(False)
    

def one_transit(df, stop_id1, stop_id2, arrival_time):
    #找到所有满足条件的trip1的id和arrive_time
    trips_id1_df = df.filter((df['stop_id'] == stop_id1) & (df["departure_schedule"] < arrival_time)).select("trip_id", "arrival_schedule").distinct()
    #找出所有直达的trip2的id
    direct_trip_id = direct_routes_withinHour(df, stop_id1, stop_id2, arrival_time).select("trip_id").distinct()
    #trip1去除所有直达的trip1的id
    selected_trip_id = trips_id1_df.select("trip_id").subtract(direct_trip_id.select("trip_id")).distinct()
    #非直达的id转成list方便后面isin运算
    selected_list = selected_trip_id.rdd.flatMap(lambda x: x).collect()
    #找出非直达id的df的内容
    trips_id2 = df.filter(col("trip_id").isin(selected_list))

    stops_in_trip_id2 = trips_id2.select("stop_id").distinct()
#     stops_in_trip_id2.show()
    

    for i in stops_in_trip_id2:
        #找到每一个站名，然后分别计算两边，记住i是unicode，如果有错误，可以尝试转换成str(i)
        subpath1 = direct_routes_withinHour(df, stop_id1, i, arrival_time, True).select("*").distinct()
#         print("find subpath1!")
#         subpath1.orderBy("arrival_schedule2", ascending=False).show(100 ,False)
        subpath2 = direct_routes_withinHour(df, i, stop_id2, arrival_time, True).select("*").distinct()
#         print("find subpath2!")
#         subpath2.show(100, False)

    subpath2 = subpath2.withColumnRenamed("trip_id","trip_id_2")
    subpath2 = subpath2.withColumnRenamed("arrival_schedule","arrival_schedule_2")
    subpath2 = subpath2.withColumnRenamed("departure_schedule","departure_schedule_2")
    subpath2 = subpath2.withColumnRenamed("stop_id","stop_id_2")
    subpath2 = subpath2.withColumnRenamed("arrival_schedule2","arrival_schedule2_2")
    subpath2 = subpath2.withColumnRenamed("departure_schedule2","departure_schedule2_2")
    subpath2 = subpath2.withColumnRenamed("stop_id2","stop_id2_2")

    subpath_join = subpath1.crossJoin(subpath2)
    # subpath1 的到达站 = 中转站 = subpath2 的起始站
    
    subpath_no_walk = subpath_join.filter((subpath_join["stop_id2"] == subpath_join["stop_id_2"]) & (subpath_join["departure_schedule2"] <= subpath_join["arrival_schedule_2"]) & (subpath_join["trip_id"] != subpath_join["trip_id_2"]))
    ## Here for adding walkable stops!!!
    subpath_join_filter = subpath_join.filter((subpath_join["departure_schedule2"] <= subpath_join["arrival_schedule_2"]) & (subpath_join["trip_id"] != subpath_join["trip_id_2"]))
    condition = (stops_walkable.s1_stop_id == subpath_join_filter.stop_id2) & (stops_walkable.s2_stop_id == subpath_join_filter.stop_id_2)
    subpath_join_filter = subpath_join_filter.join(stops_walkable, condition, "inner").drop("s1_stop_id", "s2_stop_id", "distance")

    subpath_join_filter = subpath_join_filter.union(subpath_no_walk)

    path_one_transit = subpath_join_filter.select("trip_id", "stop_id", "departure_schedule", "stop_id2", "trip_id_2", "arrival_schedule2_2")
    path_one_transit = path_one_transit.orderBy("departure_schedule", ascending=False)
    
    return path_one_transit

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [149]:
%%spark

def get_schedule_from_one_transit(data, path_one_transit, stop_id1, stop_id2):
    trip_ids_list_1 = path_one_transit.select("trip_id").rdd.flatMap(lambda x: x).collect()
    trip_ids_list_2 = path_one_transit.select("trip_id_2").rdd.flatMap(lambda x: x).collect()
    trip_ids_list = trip_ids_list_1 + trip_ids_list_2
    
    # all valid trips
    selected_data = data.filter(data.trip_id.isin(trip_ids_list))\
                    .select("trip_id","stop_name","arrival_schedule","departure_schedule","stop_id","stop_lat","stop_lon")\
                    .orderBy("trip_id", "arrival_schedule")
    
    # empty dataframe to put result
    schema = StructType([
        StructField("trip_id",StringType(),True),
        StructField("stop_name",StringType(),True),
        StructField("arrival_schedule",StringType(),True),
        StructField("departure_schedule",StringType(),True),
        StructField("stop_id",StringType(),True),
        StructField("stop_lat",StringType(),True),
        StructField("stop_lon",StringType(),True),
        StructField("color",StringType(),True),
        StructField("schedule_id",StringType(),True)
    ])
    result_schedule = spark.createDataFrame([], schema)
    
    # for each group, keep stops between stop_id1 and stop_id2, calculate confidence
    cnt = 1
    for i in path_one_transit.collect():
        trip_id1 = i["trip_id"]
        trip_id2 = i["trip_id_2"]
        transit_stop = i["stop_id2"]
        
        # filter first half
        trip_data1 = filter_data_between(selected_data, trip_id1, stop_id1, transit_stop)
        # filter second half
        trip_data2 = filter_data_between(selected_data, trip_id2, transit_stop, stop_id2)
        
        # add distinct schedule_id
        schedule_id = "t1_" + str(cnt)
        trip_data1 = trip_data1.withColumn("color", lit('rgb(255, 0, 0)'))
        trip_data2 = trip_data2.withColumn("color", lit('rgb(0, 255, 0)'))
        trip_data = trip_data1.union(trip_data2)
        trip_data = trip_data.withColumn("schedule_id", lit(schedule_id))
        result_schedule = result_schedule.union(trip_data)
        cnt += 1
    
    return result_schedule

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Two transit

In [150]:
%%spark
def two_transit(df, stop_id1, stop_id2, arrival_time):
    #找到所有满足条件的trip1的id和arrive_time
    trips_id1_df = df.filter((df['stop_id'] == stop_id1) & (df["departure_schedule"] < arrival_time)).select("trip_id", "arrival_schedule").distinct()
    
    #找出所有直达的trip2的id
    direct_trip_id = direct_routes_withinHour(df, stop_id1, stop_id2, arrival_time).select("trip_id").distinct()
    #trip1去除所有直达的trip1的id
    selected_trip_id = trips_id1_df.select("trip_id").subtract(direct_trip_id.select("trip_id")).distinct()
    
    #找到所有一次中转能到的trip1的id
    one_transit_id = one_transit(df, stop_id1, stop_id2, arrival_time).select("trip_id").distinct()
    #trip1去除所有直达或一次中转的trip1的id
    selected_trip_id = selected_trip_id.select("trip_id").subtract(one_transit_id.select("trip_id")).distinct()
    
    #非直达的id转成list方便后面isin运算
    selected_list = selected_trip_id.rdd.flatMap(lambda x: x).collect()
    #找出非直达 非一次中转 id的df的内容
    trips_id2 = df.filter(col("trip_id").isin(selected_list))

    stops_in_trip_id2 = trips_id2.select("stop_id").distinct()
    
    # 然后到这些站后可以一次中转到达 stop_id2
    for i in stops_in_trip_id2:
        # 第一段trip
#         print("path1")
        subpath1 = direct_routes_withinHour(df, stop_id1, i, arrival_time, True).distinct()
#         subpath1.show()
        # 从第一段trip的stop 经过一次中转到 stop_id2
#         print("path2")
        subpath2 = one_transit(df, i, stop_id2, arrival_time)
#         subpath2.show()
       
    subpath1 = subpath1.withColumnRenamed("stop_id2","stop_id1")
    subpath2 = subpath2.withColumnRenamed("trip_id","trip_id_1")
    subpath2 = subpath2.withColumnRenamed("arrival_schedule2_2","arrival_schedule_3")
    subpath2 = subpath2.withColumnRenamed("stop_id2","stop_id3")
    subpath2 = subpath2.withColumnRenamed("stop_id","stop_id2")
    subpath2 = subpath2.withColumnRenamed("departure_schedule","departure_schedule3")
        
    subpath_join = subpath1.crossJoin(subpath2)
    
    subpath_join_filter = subpath_join.filter((subpath_join["stop_id1"] == subpath_join["stop_id2"]) & (subpath_join["arrival_schedule2"] <= subpath_join["departure_schedule3"]) & (subpath_join["stop_id3"] != subpath_join["stop_id"]))
    subpath_join_filter = subpath_join_filter.filter((subpath_join_filter["trip_id"] != subpath_join_filter["trip_id_1"]) & (subpath_join_filter["trip_id"] != subpath_join_filter["trip_id_2"]) & (subpath_join_filter["trip_id_2"] != subpath_join_filter["trip_id_1"]))
    path_two_transit = subpath_join_filter.select("trip_id", "departure_schedule", "arrival_schedule2", "stop_id1", "trip_id_1", "departure_schedule3", "stop_id3", "trip_id_2", "arrival_schedule_3")
    path_two_transit = path_two_transit.orderBy("departure_schedule", ascending=False)

    return path_two_transit

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [151]:
%%spark
def get_schedule_from_two_transit(data, path_two_transit, stop_id1, stop_id2):
    trip_ids_list_1 = path_two_transit.select("trip_id").rdd.flatMap(lambda x: x).collect()
    trip_ids_list_2 = path_two_transit.select("trip_id_1").rdd.flatMap(lambda x: x).collect()
    trip_ids_list_3 = path_two_transit.select("trip_id_2").rdd.flatMap(lambda x: x).collect()
    trip_ids_list = trip_ids_list_1 + trip_ids_list_2 + trip_ids_list_3
    
    # all valid trips
    selected_data = data.filter(data.trip_id.isin(trip_ids_list))\
                    .select("trip_id","stop_name","arrival_schedule","departure_schedule","stop_id","stop_lat","stop_lon")\
                    .orderBy("trip_id", "arrival_schedule")
    
    # empty dataframe to put result
    schema = StructType([
        StructField("trip_id",StringType(),True),
        StructField("stop_name",StringType(),True),
        StructField("arrival_schedule",StringType(),True),
        StructField("departure_schedule",StringType(),True),
        StructField("stop_id",StringType(),True),
        StructField("stop_lat",StringType(),True),
        StructField("stop_lon",StringType(),True),
        StructField("color",StringType(),True),
        StructField("schedule_id",StringType(),True)
    ])
    result_schedule = spark.createDataFrame([], schema)
    
    # for each group, keep stops between stop_id1 and stop_id2, calculate confidence
    cnt = 1
    for i in path_two_transit.collect():
        trip_id1 = i["trip_id"]
        trip_id2 = i["trip_id_1"]
        trip_id3 = i["trip_id_2"]
        transit_stop1 = i["stop_id1"]
        transit_stop2 = i["stop_id3"]
        
        # filter trip 1
        trip_data1 = filter_data_between(selected_data, trip_id1, stop_id1, transit_stop1)
        # filter trip 2
        trip_data2 = filter_data_between(selected_data, trip_id2, transit_stop1, transit_stop2)
        # filter trip 3
        trip_data3 = filter_data_between(selected_data, trip_id3, transit_stop2, stop_id2)
        
        # add distinct schedule_id
        schedule_id = "t2_" + str(cnt)
        trip_data1 = trip_data1.withColumn("color", lit('rgb(255, 0, 0)'))
        trip_data2 = trip_data2.withColumn("color", lit('rgb(0, 255, 0)'))
        trip_data3 = trip_data3.withColumn("color", lit('rgb(0, 0, 255)'))
        trip_data = trip_data1.union(trip_data2).union(trip_data3)
        trip_data = trip_data.withColumn("schedule_id", lit(schedule_id))
        result_schedule = result_schedule.union(trip_data)
        cnt += 1
    
    return result_schedule

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Planning

In [152]:
%%spark
def top_time(df, nb):
    df_order = df.orderBy("departure_schedule", ascending=False)
    df_order_list = df_order.select("departure_schedule").rdd.flatMap(lambda x: x).collect()
    if nb < len(df_order_list):
        index = nb - 1
    else:
        index = len(df_order_list) - 1
        
    return df_order_list[index]

def df_order(d1, d2, d3):
    '''
    d1 : direct routes
    d2 : one transit
    d3 : two transit
    '''
    d1d = d1.select("trip_id", "departure_schedule")
    d2d = d2.select("trip_id", "departure_schedule")
    d3d = d3.select("trip_id", "departure_schedule")
    df = d1d.union(d2d).union(d3d).orderBy("departure_schedule", ascending=False)

    time = top_time(df, 10)

    d1_order = d1.filter(d1["departure_schedule"] > time)
    d2_order = d2.filter(d2["departure_schedule"] > time)
    d3_order = d3.filter(d3["departure_schedule"] > time)
    return d1_order, d2_order, d3_order

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [153]:
%%spark
#  -o schedule
df = actual_data_513
start_id = "8503006"
end_id = "8503009"
arrival_time = "13.05.2019 19:00:00"

# get trips
direct_trips = direct_routes_withinHour(df, start_id, end_id, arrival_time)
one_transit_trips = one_transit(df, start_id, end_id, arrival_time)
two_transit_trips = two_transit(df, start_id, end_id, arrival_time)
print(direct_trips.count())
print(one_transit_trips.count())
print(two_transit_trips.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1
5
17

In [154]:
%%spark
d1, d2, d3 = df_order(direct_trips, one_transit_trips, two_transit_trips)
print(d1.count())
print(d2.count())
print(d3.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1
3
5

In [155]:
%%spark -o schedule
schedule1 = get_schedule_from_direct_routes(df, d1, start_id, end_id)
schedule2 = get_schedule_from_one_transit(df, d2, start_id, end_id)
schedule3 = get_schedule_from_two_transit(df, d3, start_id, end_id)

schedule = schedule1.union(schedule2).union(schedule3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Visualization

In [156]:
# prepare data
schedule_list = schedule["schedule_id"].unique().tolist()

# figure
layout = dict(hovermode='closest',
    margin=dict(l=0, t=0, r=0, b=0, pad=0),
    mapbox=dict(
        accesstoken=token,
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=47.378177,
            lon=8.540192
        ),
        pitch=0,
        zoom=10
    ))

data = go.Scattermapbox(
        lat=[],
        lon=[],
        mode='markers+lines',
        marker=go.scattermapbox.Marker(size=5),
        text=[]
    )

figure = go.FigureWidget(data=[data], layout=layout)
table = widgets.Output()

# update function
def f(a):    
    # select this schedule
    df = schedule[schedule['schedule_id'] == choose_schedule.value].reset_index(drop=True)
    df['arrival_schedule'] = df['arrival_schedule'].apply(lambda x: str(x).split()[1] if str(x)!="NaT" else str(x))
    df['departure_schedule'] = df['departure_schedule'].apply(lambda x: str(x).split()[1] if str(x)!="NaT" else str(x))
    
    # update table
    table_data = df.drop(['stop_id','stop_lat','stop_lon','schedule_id', 'color', 'trip_id'], axis=1)
    # output table
    table = display(table_data)
    
    # update figure
    with figure.batch_update():
        figure.data[0].lat = df["stop_lat"]
        figure.data[0].lon = df["stop_lon"]
        figure.data[0].text = df["stop_name"]
        figure.data[0].marker.color = df['color']
        
# widget
choose_schedule = widgets.Dropdown(options=schedule_list)
out = widgets.interactive_output(f, {'a': choose_schedule})

# display
VBox([choose_schedule, HBox([figure, out])])

VBox(children=(Dropdown(options=('d_1', 't1_1', 't1_2', 't1_3', 't2_1', 't2_2', 't2_3', 't2_4', 't2_5'), value…