In [85]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from google.transit import gtfs_realtime_pb2
from google.protobuf.json_format import Parse, MessageToJson
import json
import psycopg2
import time
from django.contrib.gis.geos import Point, fromstr, GEOSGeometry
from datetime import datetime, timedelta
import geopandas as gpd
import rtree
import vaex
import vaex.ml
from joblib import delayed, Parallel, load, parallel_backend
from util import direction_angle, apply_dow, get_dt, get_conn, vaex_mjoin
# from haversine import haversine, Unit

# Project root: one level above the notebook's working directory.
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# because other cells may depend on it.
dir = os.path.join(Path.cwd(), '../')
outdir = os.path.join(dir, 'output')

# Raw GTFS-R input archive.
gtfs_records_zip = os.path.join(dir, "data", "GtfsRRecords.zip")

# GTFS-R artefacts written under the output directory.
gtfs_csv_zip = os.path.join(outdir, "gtfsr_csv.zip")
gtfs_final_csv_path = os.path.join(outdir, "gtfsr.csv")
gtfs_final_hdf5_path = os.path.join(outdir, "gtfsr.csv.hdf5")
gtfs_processed_path = os.path.join(outdir, "gtfsr_processed.hdf5")
gtfsr_processing_temp = os.path.join(outdir, "processing_temp.hdf5")
gtfsr_arrival_means = os.path.join(outdir, "gtfsr_arrival_means.hdf5")

# Stop-time table and SCATS model locations.
stop_time_data_path = os.path.join(outdir, 'stop_time_data.hdf5')
scats_model_path = os.path.join(outdir, "scats_model.json")

# Columns of a realtime record; the first five are used as the
# de-duplication key elsewhere in the notebook.
entity_cols = [
    "trip_id", "start_date", "start_time", "stop_sequence",
    "departure", "arrival", "timestamp", "stop_id",
]


In [7]:
# get the stop time, stop and trip data for each trip
def get_stop_time_df(trip_id, conn):
    """Fetch the scheduled stop times of one trip and derive per-stop columns.

    Args:
        trip_id: Trip id to look up; matched against ``trip.trip_id``.
        conn: Zero-argument callable whose return value is handed to
            ``geopandas.read_postgis`` as the database connection.

    Returns:
        A plain ``pandas.DataFrame`` ordered by ``stop_sequence`` holding the
        queried columns (without ``geom``) plus ``lat``, ``lon``,
        ``direction_angle``, ``shape_dist_between``, ``trip_id`` and
        ``start_time``.

    Note:
        A trip id with no matching rows is not handled here; the first/last
        stop lookups below will fail on an empty result.
    """
    # The trip id is bound as a query parameter instead of being formatted into
    # the SQL text, so ids containing quotes cannot break or alter the statement.
    # NOTE(review): %(name)s is the pyformat placeholder style used by psycopg2 —
    # confirm that `conn()` returns a psycopg2-compatible connection.
    query = """
    select
        stop_time.arrival_time, stop_time.departure_time,
        stop_time.stop_sequence, stop_time.shape_dist_traveled,
        stop.stop_id, stop.point as geom,
        trip.direction, route.route_id
    from stop_time
    join stop on stop.id = stop_time.stop_id
    join trip on trip.id = stop_time.trip_id
    join route on trip.route_id = route.id
    where trip.trip_id = %(trip_id)s
    group by stop_time.id, stop.id, trip.id, route.id
    order by stop_sequence
    ;
    """.lstrip()

    # str() mirrors the string conversion the previous str.format() call applied.
    gdf = gpd.read_postgis(query, conn(), params={"trip_id": str(trip_id)})

    # convert the times to human readable format, !IMPORTANT! utcfromtimestamp returns the correct version
    for time_col in ("arrival_time", "departure_time"):
        gdf[time_col] = gdf[time_col].apply(lambda d: datetime.utcfromtimestamp(d).strftime("%H:%M:%S"))

    # convert the geom to lat lon (vectorised access on the geometry column)
    gdf["lat"] = gdf["geom"].y
    gdf["lon"] = gdf["geom"].x

    # find the direction angle of the trip from its first and last stop
    first_stop, last_stop = gdf.iloc[0], gdf.iloc[-1]
    gdf["direction_angle"] = direction_angle(first_stop.lon, first_stop.lat, last_stop.lon, last_stop.lat)

    # shape distance travelled between each stop and the one before it
    gdf["shape_dist_between"] = gdf.shape_dist_traveled - gdf.shape_dist_traveled.shift()

    gdf["trip_id"] = trip_id  # set the trip id, no need to fetch from db
    gdf["start_time"] = gdf["arrival_time"].iloc[0]  # set the start time to the first instance of arrival time
    gdf = gdf.fillna(0)  # the shifted difference is NA on the first row; fillna(0) covers the whole frame

    # return a new pandas df dropping the geom column
    return pd.DataFrame(gdf.drop(columns="geom"))
# Try the function on a single trip id and display the resulting frame.
get_stop_time_df('11777.2.60-13-b12-1.22.I', get_conn)

Unnamed: 0,arrival_time,departure_time,stop_sequence,shape_dist_traveled,stop_id,direction,route_id,lat,lon,direction_angle,shape_dist_between,trip_id,start_time
0,14:00:00,14:00:00,1,0.00,8230DB007229,1,60-13-b12-1,53.330089,-6.451083,63.095496,0.00,11777.2.60-13-b12-1.22.I,14:00:00
1,14:01:58,14:01:58,2,1096.06,8230DB004617,1,60-13-b12-1,53.326666,-6.439452,63.095496,1096.06,11777.2.60-13-b12-1.22.I,14:00:00
2,14:02:18,14:02:18,3,1282.17,8230DB004555,1,60-13-b12-1,53.325029,-6.439407,63.095496,186.11,11777.2.60-13-b12-1.22.I,14:00:00
3,14:03:37,14:03:37,4,2016.78,8230DB004576,1,60-13-b12-1,53.319363,-6.434300,63.095496,734.61,11777.2.60-13-b12-1.22.I,14:00:00
4,14:04:38,14:04:38,5,2583.07,8230DB003418,1,60-13-b12-1,53.320033,-6.427175,63.095496,566.29,11777.2.60-13-b12-1.22.I,14:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,15:23:01,15:23:01,86,29755.68,8220DB006238,1,60-13-b12-1,53.401192,-6.266724,63.095496,749.19,11777.2.60-13-b12-1.22.I,14:00:00
86,15:23:33,15:23:33,87,30031.51,8220DB006182,1,60-13-b12-1,53.402251,-6.265357,63.095496,275.83,11777.2.60-13-b12-1.22.I,14:00:00
87,15:23:59,15:23:59,88,30254.35,8220DB000322,1,60-13-b12-1,53.404255,-6.265351,63.095496,222.84,11777.2.60-13-b12-1.22.I,14:00:00
88,15:25:56,15:25:56,89,31264.42,8240DB000323,1,60-13-b12-1,53.412930,-6.265449,63.095496,1010.07,11777.2.60-13-b12-1.22.I,14:00:00


In [24]:
# df = pd.read_csv(gtfs_final_csv_path)  # read csv

# # dropping duplicates
# df = df.drop_duplicates(subset=entity_cols[:5])

# vaex.from_pandas(df).export_hdf5(gtfsr_processing_temp)
# del [df]

In [8]:
# Build a vaex frame of per-trip stop data by querying each trip in parallel
# (the frame is exported to hdf5 in a later cell).
print("*** adding stop data ***")
df = vaex.open(gtfs_final_hdf5_path)  # opens the HDF5 file at gtfs_final_hdf5_path (not a CSV)

trip_list = df["trip_id"].unique()
# NOTE(review): only the first 10 trip ids are fetched here ([:10]) — confirm
# whether this cap is intentional before relying on the joined output.
delayed_funcs = [delayed(get_stop_time_df)(t_id, get_conn) for t_id in trip_list[:10]]
parallel_pool = Parallel(n_jobs=8)

# One pandas DataFrame per trip.
res = parallel_pool(delayed_funcs)

# Concatenate the per-trip frames and convert to vaex; the last line displays it.
stop_time_trip_df = vaex.from_pandas(pd.concat(res))
stop_time_trip_df


*** adding stop data ***


#,arrival_time,departure_time,stop_sequence,shape_dist_traveled,stop_id,direction,route_id,lat,lon,direction_angle,shape_dist_between,trip_id,start_time
0,14:00:00,14:00:00,1,0.0,8230DB007229,1,60-13-b12-1,53.3300889307077,-6.45108278609524,63.09549602776408,0.0,11777.2.60-13-b12-1.22.I,14:00:00
1,14:01:58,14:01:58,2,1096.06,8230DB004617,1,60-13-b12-1,53.3266657523699,-6.43945187383431,63.09549602776408,1096.06,11777.2.60-13-b12-1.22.I,14:00:00
2,14:02:18,14:02:18,3,1282.17,8230DB004555,1,60-13-b12-1,53.3250293591832,-6.4394065107057,63.09549602776408,186.11000000000013,11777.2.60-13-b12-1.22.I,14:00:00
3,14:03:37,14:03:37,4,2016.78,8230DB004576,1,60-13-b12-1,53.3193629651012,-6.43429969561639,63.09549602776408,734.6099999999999,11777.2.60-13-b12-1.22.I,14:00:00
4,14:04:38,14:04:38,5,2583.07,8230DB003418,1,60-13-b12-1,53.3200332113251,-6.42717525248251,63.09549602776408,566.2900000000002,11777.2.60-13-b12-1.22.I,14:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,15:54:38,15:54:38,66,20877.32,8230DB004869,0,60-15-b12-1,53.2791145416749,-6.31875650653983,-126.28881488258797,419.4399999999987,11023.2.60-15-b12-1.32.O,14:45:00
808,15:56:23,15:56:23,67,21688.96,8230DB004870,0,60-15-b12-1,53.2766673916987,-6.33009910271138,-126.28881488258797,811.6399999999994,11023.2.60-15-b12-1.32.O,14:45:00
809,15:56:52,15:56:52,68,21825.05,8230DB003007,0,60-15-b12-1,53.2759174587528,-6.33110302091162,-126.28881488258797,136.09000000000015,11023.2.60-15-b12-1.32.O,14:45:00
810,15:58:25,15:58:25,69,22258.95,8230DB006283,0,60-15-b12-1,53.2723344985629,-6.32941334986016,-126.28881488258797,433.90000000000146,11023.2.60-15-b12-1.32.O,14:45:00


In [72]:
# Distinct trip_id values of `df`.
t_list = df.trip_id.unique()

In [78]:
# Positions in t_list holding a missing (None) trip id; identity comparison is
# the correct test for the None singleton.
[i for i, val in enumerate(t_list) if val is None]

[]

In [5]:
# Write the stop-time table to stop_time_data_path; a later cell reopens it from there.
stop_time_trip_df.export_hdf5(stop_time_data_path)

In [6]:
# Key columns used on both sides of the join.
cols = ["trip_id", "stop_sequence", "stop_id", "start_time"]

# Join the stop-time table onto the realtime records and display the result
# (not assigned). vaex_mjoin comes from util — presumably a multi-column join
# helper; verify its semantics there.
vaex_mjoin(df, stop_time_trip_df, cols, cols)

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,arrival_time,departure_time,shape_dist_traveled,direction,route_id,lat,lon,direction_angle,shape_dist_between
0,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8230DB007229,--,--,--,--,--,--,--,--,--
1,11379.2.60-13-b12-1.19.O,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8240DB000324,--,--,--,--,--,--,--,--,--
2,8111.2.60-27-b12-1.151.O,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8220DB004595,--,--,--,--,--,--,--,--,--
3,7705.2.60-40-b12-1.206.O,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8240DB007132,--,--,--,--,--,--,--,--,--
4,7751.2.60-40-b12-1.209.I,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8230DB004795,--,--,--,--,--,--,--,--,--
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168494,12640.3.60-130-b12-1.74.I,20210131,23:30:00,18,1140,1140,2021-01-31 23:58:50,8220DB001729,--,--,--,--,--,--,--,--,--
1168495,12750.3.60-40-b12-1.206.O,20210131,23:00:00,79,120,120,2021-01-31 23:59:45,8230DB004690,--,--,--,--,--,--,--,--,--
1168496,12355.3.60-37-b12-1.39.O,20210131,23:20:00,56,120,120,2021-01-31 23:59:45,8240DB007227,--,--,--,--,--,--,--,--,--
1168497,12640.3.60-130-b12-1.74.I,20210131,23:30:00,18,1200,1200,2021-01-31 23:59:45,8220DB001729,--,--,--,--,--,--,--,--,--


In [30]:
# Rewrite the combined CSV in place without duplicate records, then convert it with vaex.

# Rows sharing the first five entity columns count as duplicates.
combined_csv = pd.read_csv(gtfs_final_csv_path).drop_duplicates(subset=entity_cols[:5])

# Overwrite the source CSV with the de-duplicated rows.
combined_csv.to_csv(gtfs_final_csv_path, header=True, index=False)

# Drop any previously converted file next to the CSV — presumably so that
# convert=True regenerates it instead of reusing a stale copy (TODO confirm).
if os.path.exists(gtfs_final_csv_path + ".hdf5"):
    os.remove(gtfs_final_csv_path + ".hdf5")

# Last expression: the converted vaex frame is displayed as the cell output.
vaex.from_csv(gtfs_final_csv_path, chunk_size=1000000, convert=True, copy_index=False)

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id
0,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8230DB007229
1,11379.2.60-13-b12-1.19.O,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8240DB000324
2,8111.2.60-27-b12-1.151.O,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8220DB004595
3,7705.2.60-40-b12-1.206.O,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8240DB007132
4,7751.2.60-40-b12-1.209.I,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8230DB004795
...,...,...,...,...,...,...,...,...
1168494,12640.3.60-130-b12-1.74.I,20210131,23:30:00,18,1140,1140,2021-01-31 23:58:50,8220DB001729
1168495,12750.3.60-40-b12-1.206.O,20210131,23:00:00,79,120,120,2021-01-31 23:59:45,8230DB004690
1168496,12355.3.60-37-b12-1.39.O,20210131,23:20:00,56,120,120,2021-01-31 23:59:45,8240DB007227
1168497,12640.3.60-130-b12-1.74.I,20210131,23:30:00,18,1200,1200,2021-01-31 23:59:45,8220DB001729


In [116]:
def predict_traffic_from_scats(_df):
    """Prepare time and location features for the SCATS traffic model.

    Works on a copy of ``_df``: adds ``hour`` (from ``arrival_time``) and
    ``dow`` (via ``apply_dow`` on ``start_date``, ``start_time`` and
    ``arrival_time``), then fits and applies a 2-component PCA on
    ``lat``/``lon`` and cycle transforms on ``hour`` (n=24) and ``dow`` (n=7).

    The model loading and prediction steps are currently commented out, so
    the returned frame is restricted to the columns ``_df`` already had; the
    derived feature columns are not part of the result.
    """
    print("*** scats predictions ***")

    features = _df.copy()
    features["hour"] = features["arrival_time"].apply(lambda t: get_dt(t, "%H:%M:%S").hour)
    features["dow"] = features.apply(apply_dow, ["start_date", "start_time", "arrival_time"])

    # Applied in order: coordinate PCA, then the cyclic hour and day-of-week encodings.
    transformers = (
        vaex.ml.PCA(features=["lat", "lon"], n_components=2, prefix="pca"),
        vaex.ml.CycleTransformer(features=["hour"], n=24),
        vaex.ml.CycleTransformer(features=["dow"], n=7),
    )
    for transformer in transformers:
        features = transformer.fit_transform(features)

    # Disabled model step, kept for reference:
    # with parallel_backend("threading"):
    # load the scats ml model
    # scats_model = load(scats_model_path)

    # get the predictions from scats data
    # df = scats_model.transform(df)

    # return df[_df.get_column_names() + ["p_avg_vol"]]
    return features[_df.get_column_names()]

In [112]:
# if not os.path.exists(stop_time_data_path):
#     add_stop_data()

# Key columns used on both sides of the join.
cols = ["trip_id", "stop_sequence", "stop_id", "start_time"]

# d1: realtime records, d2: stop-time table written by an earlier cell.
d1, d2 = vaex.open(gtfs_final_csv_path), vaex.open(stop_time_data_path)

# Inner join of the two frames on the key columns.
df = vaex_mjoin(d1, d2, cols, cols, how="inner")

In [117]:
# Run the SCATS step on the joined frame and display what it returns.
t_df = predict_traffic_from_scats(df)

t_df
# df.export_hdf5(gtfs_processed_path)


*** scats predictions ***
