# Choose stops locally

In [None]:
%%local
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import ipywidgets as widgets
from ipywidgets import HBox, VBox

# Load the stop table, discard the 'Unnamed: 0' column and expose 'main_id'
# under the name 'stop_id'.
stops = (
    pd.read_csv('stops_main.csv')
    .drop(['Unnamed: 0'], axis=1)
    .rename(columns={"main_id": "stop_id"})
)
# Stop names in ascending order.
stop_list = sorted(stops['stop_name'].tolist())

# Mapbox access token used by the map layout.
# Prefer the MAPBOX_ACCESS_TOKEN environment variable so the credential does not
# have to live in the notebook; the literal is kept only as a backward-compatible
# fallback and should be rotated/removed once the variable is configured.
token = os.environ.get(
    "MAPBOX_ACCESS_TOKEN",
    "pk.eyJ1IjoiY29jb251dG51dCIsImEiOiJjbDNscTZhbHowMmxtM2pwajl3Yjd1ejF0In0.PXbwkPmWYXrAhQsus3ypVA",
)

In [2]:
%%local
# figure
# Layout of the map: no margins, hover picks the closest point, and the view
# starts at zoom 10 with no rotation or tilt, centred on the coordinates below.
layout = {
    'hovermode': 'closest',
    'margin': {'l': 0, 't': 0, 'r': 0, 'b': 0, 'pad': 0},
    'mapbox': {
        'accesstoken': token,
        'bearing': 0,
        'center': go.layout.mapbox.Center(lat=47.378177, lon=8.540192),
        'pitch': 0,
        'zoom': 10,
    },
}

def _marker_trace(lat, lon, text, **marker_style):
    """Build a markers-only Scattermapbox trace with the given marker styling."""
    return go.Scattermapbox(
        lat=lat,
        lon=lon,
        mode='markers',
        marker=go.scattermapbox.Marker(**marker_style),
        text=text,
    )


# Trace 0: every stop, drawn as a small marker labelled with the stop name.
data_all = _marker_trace(
    stops['stop_lat'].tolist(),
    stops['stop_lon'].tolist(),
    stops['stop_name'].tolist(),
    size=5,
)

# Traces 1 and 2 start empty; they are larger red / green markers.
data_choose1 = _marker_trace([], [], [], size=10, color='red')
data_choose2 = _marker_trace([], [], [], size=10, color='green')

figure = go.FigureWidget(data=[data_all, data_choose1, data_choose2], layout=layout)

# update function
def _show_selected_stop(name, trace_index, label):
    """Highlight the stop called `name` on the map and print its id/coordinates.

    Parameters
    ----------
    name : str
        Value of the 'stop_name' column to look up in `stops`.
    trace_index : int
        Index into `figure.data` of the trace that should show the stop.
    label : str
        Prefix of the printed line ('Start' or 'End').

    Raises
    ------
    ValueError
        If no row of `stops` has this stop name.
    """
    matches = stops[stops['stop_name'] == name]
    if matches.empty:
        raise ValueError('Unknown stop name: {!r}'.format(name))

    # Take the first matching row explicitly. int()/float() on a whole Series is
    # deprecated in pandas and raises TypeError when several rows share the name.
    stop_id = int(matches['stop_id'].iloc[0])
    stop_lat = float(matches['stop_lat'].iloc[0])
    stop_lon = float(matches['stop_lon'].iloc[0])

    # Apply the three property changes as a single widget update.
    with figure.batch_update():
        trace = figure.data[trace_index]
        trace.lat = [stop_lat]
        trace.lon = [stop_lon]
        trace.text = [name]
    print('{}:{} ({},{})'.format(label, stop_id, stop_lat, stop_lon))


def f1(a):
    """Show the stop named `a` on trace 1 of the figure and print it as 'Start'."""
    _show_selected_stop(a, 1, 'Start')


def f2(a):
    """Show the stop named `a` on trace 2 of the figure and print it as 'End'."""
    _show_selected_stop(a, 2, 'End')
        
# widget
# Two dropdowns over the same list of stop names: one for the start, one for the end.
choose_stop1 = widgets.Dropdown(options=stop_list, description='Start:')
choose_stop2 = widgets.Dropdown(options=stop_list, description='End:')

# Bind each dropdown's value to its callback as argument `a`; anything the
# callback prints is captured in the returned output widget.
out1 = widgets.interactive_output(f1, dict(a=choose_stop1))
out2 = widgets.interactive_output(f2, dict(a=choose_stop2))

In [3]:
%%local
# display
# Map on top, the two dropdowns side by side below it, then the two output areas.
selectors = HBox([choose_stop1, choose_stop2])
VBox([figure, selectors, out1, out2])

VBox(children=(FigureWidget({
    'data': [{'lat': [47.351677, 47.423626, 47.451023, ..., 47.44446, 47.399006,…

# Send input to Spark

In [8]:
%%local
# Resolve the stop names currently selected in the two dropdowns to stop ids.
# `.iloc[0]` picks the first matching row explicitly: calling int() on the
# filtered Series is deprecated in pandas and raises TypeError when more than
# one row carries the same stop name.
start_id = int(stops.loc[stops['stop_name'] == choose_stop1.value, 'stop_id'].iloc[0])
end_id = int(stops.loc[stops['stop_name'] == choose_stop2.value, 'stop_id'].iloc[0])

# Single-column frame: row 0 holds the start stop id, row 1 the end stop id.
d = {'data': [start_id, end_id]}
input_df = pd.DataFrame(data=d)

In [9]:
%%send_to_spark -i input_df -t df -n input_df

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'input_df' as 'input_df' to Spark kernel

In [31]:
# Pull the rows of input_df back as a list and read the 'data' field of the
# first two: row 0 is taken as the start stop id, row 1 as the end stop id.
input_list = input_df.collect()
start_id, end_id = input_list[0].data, input_list[1].data

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

8576253

# Process on Spark

In [32]:
# Read the CSV with its first line interpreted as the column names.
actual_data_513 = spark.read.csv('/user/sixu/work/actual_data_513.csv', header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------+------+----------------+-------------------+------------------+-------------------+--------+---------+---------+--------+-------+
|       trip_id|failed|arrival_schedule|     arrival_actual|departure_schedule|   departure_actual|not_stop|stop_name| stop_lat|stop_lon|stop_id|
+--------------+------+----------------+-------------------+------------------+-------------------+--------+---------+---------+--------+-------+
|  85:11:10:001| false|13.05.2019 21:50|13.05.2019 21:53:09|              null|               null|   false|Zürich HB|47.378178|8.540212|8503000|
|85:11:1007:001| false|13.05.2019 06:23|13.05.2019 06:22:53|              null|               null|   false|Zürich HB|47.378178|8.540212|8503000|
|85:11:1009:001| false|13.05.2019 07:23|13.05.2019 07:25:04|              null|               null|   false|Zürich HB|47.378178|8.540212|8503000|
|85:11:1011:001| false|13.05.2019 08:23|13.05.2019 08:23:24|              null|               null|   false|Zürich HB|47.378

In [33]:
def direct_routes(df, stop_id1, stop_id2, arrival_time):
    '''
    Find direct trips that start from stop_id1 and reach stop_id2 before arrival_time.

    Parameters
    ----------
    df : Spark DataFrame with at least the columns trip_id, stop_id,
        departure_schedule and arrival_schedule; the schedule columns are
        strings formatted as "dd.MM.yyyy HH:mm".
    stop_id1 : id of the departure stop (compared against df['stop_id']).
    stop_id2 : id of the arrival stop (compared against df['stop_id']).
    arrival_time : str, latest arrival, "dd.MM.yyyy HH:mm:ss" or "dd.MM.yyyy HH:mm".

    Returns
    -------
    Spark DataFrame with columns trip_id, departure_schedule, arrival_schedule
    (schedule values are the original strings). As a side effect, up to 100 of
    these rows are printed, latest arrival first.

    Raises
    ------
    ValueError
        If arrival_time matches neither accepted format.
    '''
    from datetime import datetime
    from pyspark.sql import functions as F

    schedule_format = "dd.MM.yyyy HH:mm"

    # Normalise the deadline to one canonical string so it can be parsed by Spark
    # with a single pattern.
    deadline = None
    for python_format in ("%d.%m.%Y %H:%M:%S", "%d.%m.%Y %H:%M"):
        try:
            deadline = datetime.strptime(arrival_time, python_format)
            break
        except ValueError:
            continue
    if deadline is None:
        raise ValueError(
            "arrival_time must look like 'dd.MM.yyyy HH:mm[:ss]', got {!r}".format(arrival_time)
        )
    deadline_ts = F.to_timestamp(
        F.lit(deadline.strftime("%d.%m.%Y %H:%M:%S")), "dd.MM.yyyy HH:mm:ss"
    )

    # Times are compared as parsed timestamps: day-first strings do not sort
    # chronologically, so comparing the raw strings is only right within one day.

    ## The same route at different times has a different trip_id!
    # All trips passing stop_id1 whose departure time is before the required arrival time
    trips_id1_df = (
        df.filter(df['stop_id'] == stop_id1)
        .select("trip_id", "departure_schedule")
        .distinct()
        .withColumn("departure_ts", F.to_timestamp("departure_schedule", schedule_format))
        .filter(F.col("departure_ts") < deadline_ts)
    )
    # Trips passing stop_id2 before arrival_time
    trips_id2_df = (
        df.filter(df['stop_id'] == stop_id2)
        .select(F.col("trip_id").alias("trip_id2"), "arrival_schedule")
        .distinct()
        .withColumn("arrival_ts", F.to_timestamp("arrival_schedule", schedule_format))
        .filter(F.col("arrival_ts") < deadline_ts)
    )

    # Keep trips that serve both stops; column names are unique on both sides,
    # so the join condition can refer to them by name.
    direct_trips = trips_id1_df.join(
        trips_id2_df, F.col("trip_id") == F.col("trip_id2"), "inner"
    )

    # Remove trips running in the reverse direction (stop_id2 served before stop_id1)
    direct_trips = direct_trips.filter(F.col("departure_ts") < F.col("arrival_ts"))

    # Show latest arrivals first, exposing only the original string columns
    output_columns = ["trip_id", "departure_schedule", "arrival_schedule"]
    direct_trips.orderBy(F.col("arrival_ts").desc()).select(*output_columns).show(100, False)

    # Drop the duplicate trip id and the helper timestamp columns
    return direct_trips.select(*output_columns)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
# Direct trips from stop 8572603 to stop 8582104 for the deadline 13.05.2019 18:00:00.
temp = direct_routes(
    actual_data_513,
    stop_id1="8572603",
    stop_id2="8582104",
    arrival_time="13.05.2019 18:00:00",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------------+------------------+----------------+
|trip_id              |departure_schedule|arrival_schedule|
+---------------------+------------------+----------------+
|85:801:189656-32200-1|13.05.2019 17:27  |13.05.2019 17:34|
|85:801:189431-32200-1|13.05.2019 16:57  |13.05.2019 17:04|
|85:801:189051-32200-1|13.05.2019 16:26  |13.05.2019 16:32|
|85:801:189635-32200-1|13.05.2019 15:56  |13.05.2019 16:02|
|85:801:189322-32200-1|13.05.2019 15:26  |13.05.2019 15:32|
|85:801:189490-32200-1|13.05.2019 14:56  |13.05.2019 15:02|
|85:801:189138-32200-1|13.05.2019 14:26  |13.05.2019 14:32|
|85:801:189426-32200-1|13.05.2019 13:56  |13.05.2019 14:02|
|85:801:189087-32200-1|13.05.2019 13:26  |13.05.2019 13:32|
|85:801:189548-32200-1|13.05.2019 12:56  |13.05.2019 13:02|
|85:801:189317-32200-1|13.05.2019 12:26  |13.05.2019 12:32|
|85:801:189485-32200-1|13.05.2019 11:56  |13.05.2019 12:02|
|85:801:189133-32200-1|13.05.2019 11:26  |13.05.2019 11:32|
|85:801:189363-32200-1|13.05.2019 10:56 

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# `temp` (the direct_routes result) has the columns trip_id, departure_schedule
# and arrival_schedule; it has no 'name' or 'age' column, so plotting those fails.
# NOTE(review): the intended chart is not stated anywhere; travel time per
# departure is plotted here as the numeric quantity derivable from these columns.
trips_pd = temp.toPandas()

# Schedule values are strings such as "13.05.2019 17:27".
schedule_format = "%d.%m.%Y %H:%M"
departure = pd.to_datetime(trips_pd["departure_schedule"], format=schedule_format)
arrival = pd.to_datetime(trips_pd["arrival_schedule"], format=schedule_format)
trips_pd["travel_minutes"] = (arrival - departure).dt.total_seconds() / 60

ax = (
    trips_pd.assign(departure_ts=departure)
    .sort_values("departure_ts")
    .plot.bar(x="departure_schedule", y="travel_minutes", legend=False)
)
ax.set(
    xlabel="Scheduled departure",
    ylabel="Travel time (minutes)",
    title="Direct trips: scheduled travel time",
)