In [18]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from google.transit import gtfs_realtime_pb2
from google.protobuf.json_format import Parse, MessageToJson
import json
import psycopg2
import time
from django.contrib.gis.geos import Point, fromstr, GEOSGeometry
from datetime import datetime, timedelta
import geopandas as gpd
import rtree
import vaex
import vaex.ml
from joblib import delayed, Parallel, load, parallel_backend
from util import direction_angle, apply_dow, get_dt, get_conn, vaex_mjoin, find_trip_regex
# from haversine import haversine, Unit

dir = Path.cwd()
dir = os.path.join(dir, '../')
outdir = os.path.join(dir, 'output')
gtfs_records_zip = os.path.join(dir, "data", "GtfsRRecords.zip")
gtfs_csv_zip = os.path.join(outdir, "gtfsr_csv.zip")
gtfs_final_csv_path = os.path.join(outdir, "gtfsr.csv")
gtfs_processed_path = os.path.join(outdir, "gtfsr_processed.hdf5")
scats_model_path = os.path.join(outdir, "scats_model.json")
gtfsr_processing_temp = os.path.join(outdir, "processing_temp.hdf5")
gtfsr_arrival_means = os.path.join(outdir, "gtfsr_historical_means.hdf5")
stop_time_data_path = os.path.join(outdir, 'stop_time_data.hdf5')
gtfs_final_hdf5_path = os.path.join(outdir, "gtfsr.csv.hdf5")

entity_cols = [
    "trip_id",
    "start_date",
    "start_time",
    "stop_sequence",
    "departure",
    "arrival",
    "timestamp",
    "stop_id",
]

In [6]:
live_df = vaex.open(os.path.join(outdir, 'deploy_gtfsr.hdf5'))
live_df

#,trip_id,start_date,start_time,stop_sequence,arrival,timestamp,stop_id,arrival_time,shape_dist_traveled,direction,route_id,lat,lon,direction_angle,shape_dist_between,arr_dow,arr_hour,arrival_mean,p_mean_vol
0,19389.1.60-155-d12-1.89.O,20210300.0,19:20:00,24,5,2021-03-02 19:51:26,8220DB000264,19:38:38,7818.16,0,60-155-d12-1,53.3535,-6.26226,139.315,518.6,1,19,6,68.5386


In [7]:
# vaex.open(os.path.join(outdir, 'gtfsr_model.hdf5'))

In [9]:
# live_df.state_load(os.path.join(outdir, 'gtfsr_model.json'))

In [2]:
str(datetime.now())

'2021-03-04 16:43:34.367100'

In [10]:
from pandas import array

model_df = vaex.from_dict(
    {
        "trip_id": array(["19389.1.60-155-d12-1.89.O"], dtype=object),
        "start_date": array([20210302]),
        "start_time": array(["19:20:00"], dtype=object),
        "stop_sequence": array([24]),
        "arrival": array([5.0]),
        "timestamp": array(["2021-03-02 19:51:26"], dtype=object),
        "stop_id": array(["8220DB000264"], dtype=object),
        "arrival_time": array(["19:38:38"], dtype=object),
        "shape_dist_traveled": array([7818.16]),
        "direction": array(["0"], dtype=object),
        "route_id": array(["60-155-d12-1"], dtype=object),
        "lat": array([53.3535353]),
        "lon": array([-6.26225863]),
        "direction_angle": array([139.31470635]),
        "shape_dist_between": array([518.6]),
        "arr_dow": array([1]),
        "arr_hour": array([19]),
        "arrival_mean": array([6.0]),
        "p_mean_vol": array([68.53864425]),
    }
)
# model_df.state_load(gtfsr_model_path)
# model_state = model_df.state_get()

In [11]:
import io

path = os.path.join(outdir, "gtfsr_model.json")
f = open(path, 'r')

# df.state_load(path)
# tmp = io.StringIO(f)
# tmp.read()
# df = vaex.open(tmp)
# df

In [80]:
import vaex
from util import vaex_mjoin, apply_dow, get_dt

# converts input data into the correct format for predictions to happen.
# !IMPORTANT LESSONS LEARNT! -> everything must be the same, in a vaex pipeline/state,
# both datatypes must be the same for every feature, also, an expression wont be recognised when loading
# the state on.
st_df = vaex.open(stop_time_data_path)
hm_df = vaex.open(gtfsr_arrival_means)
# model = gtfsr_model_path

empty = ("", "")

data = {
    "trip_id": array(["19389.1.60-155-d12-1.89.O"], dtype=object),
    "start_date": array([20210302]),
    "start_time": array(["19:20:00"], dtype=object),
    "stop_sequence": array([24]),
    "arrival": array([5.0]),
    "timestamp": array(["2021-03-02 19:51:26"], dtype=object),
    "stop_id": array(["8220DB000264"], dtype=object)
}

live_df = vaex.from_dict(data)

# join stop time data, filtering improves speed by only copying relevant rows
cols = ["trip_id", "stop_sequence", "stop_id", "start_time"]

temp_df = st_df[
        (st_df["trip_id"] == data["trip_id"][0])
        & (st_df["stop_sequence"] == data["stop_sequence"][0])
        & (st_df["stop_id"] == data["stop_id"][0])
        & (st_df["start_time"] == data["start_time"][0])
    ].copy()

live_df['start_date'] = live_df['start_date'].astype('int64')
live_df['stop_sequence'] = live_df['stop_sequence'].astype('int64')
live_df['arrival'] = live_df['arrival'].astype('int64')

live_df = vaex_mjoin(
    live_df,
    temp_df,
    cols,
    cols,
    how="inner",
)

# if not len(live_df) == 1:
#     return empty

# join the historical means to our dataset
live_df["arr_dow"] = live_df.apply(apply_dow, ["start_date", "start_time", "arrival_time"])
live_df["arr_hour"] = live_df["arrival_time"].apply(lambda t: get_dt(t, "%H:%M:%S").hour)

temp_df = hm_df[
    (hm_df["trip_id"] == data["trip_id"][0])
    & (hm_df["stop_id"] == data["stop_id"][0])
    & (hm_df["arr_dow"] == live_df[["arr_dow"]][0][0])
    & (hm_df["arr_hour"] == live_df[["arr_hour"]][0][0])
]

# if not len(temp_df) > 0:
#     return empty

cols = ["trip_id", "stop_id", "arr_dow", "arr_hour"]
live_df = vaex_mjoin(
    live_df,
    temp_df.copy(),
    cols,
    cols,
    how="inner",
)

# if not len(live_df) == 1:
#     return empty

# assert same type
live_df["direction"] = live_df["direction"].astype("int64")

# # materialize virtual columns to match model state
live_df = live_df.materialize("arr_dow")
live_df = live_df.materialize("arr_hour")

live_df

# try:
#     live_df.state_set(model)

#     if len(live_df) == 1:
#         return (round(live_df[["p_arrival_lgbm"]][0][0]) * 60), live_df[["p_arrival_lgbm"]][0][0]
# except:
#     return empty
# return empty



#,trip_id,start_date,start_time,stop_sequence,arrival,timestamp,stop_id,arrival_time,departure_time,shape_dist_traveled,direction,route_id,lat,lon,direction_angle,shape_dist_between,arrival_mean,p_mean_vol,arr_dow,arr_hour
0,19389.1.60-155-d12-1.89.O,20210300.0,19:20:00,24,5,2021-03-02 19:51:26,8220DB000264,19:38:38,19:38:38,7818.16,0,60-155-d12-1,53.3535,-6.26226,139.315,518.6,6,68.5386,1,19


In [81]:
'19389.1.60-155-d12-1.89.O'

'19389.1.60-155-d12-1.89.O'

In [89]:
import re

def find_trip_regex(trip_list, trip_id):
    if not type(trip_id) == str:
        return None

    tokens = trip_id.split(".")
    print(tokens)
    if not len(tokens) == 5:
        return None

    route_id = tokens[2].split("-")

    if route_id[2] in ["ga2", "gad"]:
        route_id[2] = "ga[2|d]"
        tokens[2] = "-".join(route_id)
    elif route_id[2] in ["d12", "b12"]:
        route_id[2] = "[b|d]12"
        tokens[2] = "-".join(route_id)

    if 'y' in tokens[1]:
        tokens[1] = '*'

    tokens[3] = "*"

    reg = ".".join(tokens)

    r = re.compile(reg)
    matched_list = list(filter(r.match, trip_list))

    if len(matched_list) > 0:
        return matched_list[0]
    else:
        return None

find_trip_regex(['12953.3.60-27-b12-1.151.O', '12953.3.60-27-d12-1.151.O'], '12953.y1002.60-27-b12-1.151.O')

['12953', 'y1002', '60-27-b12-1', '151', 'O']


'12953.3.60-27-b12-1.151.O'

In [102]:
st_df = vaex.open(stop_time_data_path)
hm_df = vaex.open(gtfsr_arrival_means)

temp_df = {'trip_id': array(['12426.3.60-151-d12-1.85.O'], dtype='<U25'), 'stop_sequence': array([9]), 'stop_id': array(['8220DB004522'], dtype='<U12'), 'start_time': array(['23:00:00'], dtype='<U8'), 'start_date': array([20210307]), 'timestamp': array(['2021-03-07 22:51:27'], dtype='<U19'), 'arrival': array([0])}

t_id = '12426.3.60-151-d12-1.85.O'

st_df[st_df.trip_id == t_id]
# temp_df['trip_id'][0]

#,arrival_time,departure_time,stop_sequence,shape_dist_traveled,stop_id,direction,route_id,lat,lon,direction_angle,shape_dist_between,trip_id,start_time
,,,,,,,,,,,,,


In [106]:
print(
    len(st_df.trip_id.unique()),
    len(hm_df.trip_id.unique()),
)

29048 29012
