In [8]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
import time
from django.contrib.gis.geos import Point, fromstr, GEOSGeometry
from datetime import datetime, timedelta
import vaex
import vaex.ml
from joblib import delayed, Parallel, load, parallel_backend

# Project root: the parent of the notebook's working directory. Every other
# location below is derived from it.
# NOTE: `dir` shadows the built-in of the same name; the name is kept because
# other cells may reference it.
dir = os.path.join(Path.cwd(), '../')
outdir = os.path.join(dir, 'output')

# Files under <root>/data
gtfs_records_zip = os.path.join(dir, 'data', 'GtfsRRecords.zip')

# Files under <root>/output
gtfs_csv_zip = os.path.join(outdir, 'gtfsr_csv.zip')
gtfs_final_csv_path = os.path.join(outdir, 'gtfsr.csv')
gtfs_processed_path = os.path.join(outdir, "gtfsr_processed.hdf5")
gtfsr_arrival_means = os.path.join(outdir, "gtfsr_arrival_means.hdf5")
scats = os.path.join(outdir, 'scats_model.json')

In [None]:
# Read the CSV at gtfs_final_csv_path into a pandas DataFrame bound to `df`.
df = pd.read_csv(gtfs_final_csv_path)

In [None]:
# Export the historical arrival means to the HDF5 file that the rest of the
# notebook reads back via vaex.open(gtfsr_arrival_means).
#
# `arr_means_df` is produced by a pandas groupby, and pandas DataFrames have no
# `export_hdf5` method, so it is converted to a vaex DataFrame first (a frame
# that is already vaex is exported as-is). copy_index=False: the pandas index
# is not written to the file.
#
# NOTE(review): `arr_means_df` is created by the arrival-means cell further
# down the notebook; that cell must have been run before this one.
(
    vaex.from_pandas(arr_means_df, copy_index=False)
    if isinstance(arr_means_df, pd.DataFrame)
    else arr_means_df
).export_hdf5(gtfsr_arrival_means)

In [38]:

def get_dt(dt, format):
    """Parse ``dt`` into a ``datetime`` using the strptime pattern ``format``.

    ``dt`` is passed through ``str()`` before parsing, so non-string values
    (for example the integer date ``20210122``) are accepted.

    Raises:
        ValueError: propagated from ``datetime.strptime`` when the text does
            not match ``format``.
    """
    text = str(dt)
    return datetime.strptime(text, format)

def apply_dow(start_date, start_time, expected_time):
    """Return the weekday (Monday=0 .. Sunday=6) of a stop time.

    Args:
        start_date: date in ``%Y%m%d`` form (string or integer).
        start_time: time in ``%H:%M:%S`` form.
        expected_time: time in ``%H:%M:%S`` form.

    Returns:
        ``weekday()`` of ``start_date``, or of the following day when
        ``start_time`` is strictly later than ``expected_time`` (the stop
        time is then treated as having rolled past midnight).

    Raises:
        ValueError: if any argument does not match its expected format.
    """
    service_day = get_dt(start_date, "%Y%m%d")
    trip_start = get_dt(start_time, "%H:%M:%S")
    stop_time = get_dt(expected_time, "%H:%M:%S")
    # A stop time earlier than the start time belongs to the next calendar day.
    day_offset = 1 if trip_start > stop_time else 0
    return (service_day + timedelta(days=day_offset)).weekday()

# Weekday on which each row's arrival falls, computed row by row with apply_dow.
df["arr_dow"] = df.apply(
    lambda row: apply_dow(row["start_date"], row["start_time"], row["arrival_time"]),
    axis=1,
)

# Mean of `arrival` for every (trip_id, stop_id, arr_dow) combination, with the
# grouping keys kept as ordinary columns.
cols = ["trip_id", "stop_id", "arr_dow"]
arr_means_df = (
    df.groupby(cols, as_index=False)
    .agg({"arrival": "mean"})
    .rename(columns={"arrival": "arrival_mean"})
)

arr_means_df

Unnamed: 0,arr_dow,arrival_mean
count,681.0,681.0
mean,2.525698,80.853157
std,1.46217,195.263146
min,0.0,-360.0
25%,1.0,0.0
50%,3.0,0.0
75%,4.0,150.0
max,4.0,1500.0


In [4]:
# Open the HDF5 file at gtfsr_arrival_means with vaex; as the last expression
# of the cell, the resulting DataFrame is displayed.
vaex.open(gtfsr_arrival_means)

#,trip_id,stop_id,arr_dow,arrival_mean
0,1.10454.2-18-gad-1.12.O,8220DB000375,3,420.0
1,1.10454.2-18-gad-1.12.O,8220DB000414,3,420.0
2,1.10454.2-18-gad-1.12.O,8220DB000417,3,390.0
3,1.10454.2-18-gad-1.12.O,8220DB000779,3,360.0
4,1.10454.2-18-gad-1.12.O,8220DB000780,3,300.0
...,...,...,...,...
667313,9999.2.60-46A-b12-1.258.O,8220DB000819,5,-30.0
667314,9999.2.60-46A-b12-1.258.O,8220DB000846,5,0.0
667315,9999.2.60-46A-b12-1.258.O,8220DB000848,5,-30.0
667316,9999.2.60-46A-b12-1.258.O,8220DB006059,5,-60.0


In [None]:
# Open the file at gtfs_processed_path with vaex and bind it to `df`.
# NOTE(review): convert=True is passed although the path already ends in
# .hdf5 — confirm the conversion step is actually needed.
df = vaex.open(gtfs_processed_path, convert=True)


# join the arrival means to our dataset
# `cols` is passed as both the third and fourth argument.
# NOTE(review): `vaex_mjoin` is not defined in the visible cells and `cols` is
# assigned in the arrival-means cell; both must already exist in the kernel
# when this cell runs. Presumably the two `cols` arguments are the left/right
# join keys — verify against the helper's definition.
df = vaex_mjoin(df, vaex.open(gtfsr_arrival_means), cols, cols)

In [13]:
# Open the HDF5 file at gtfs_processed_path with vaex, rebinding `df` to it,
# then display the DataFrame as the cell's output.
df = vaex.open(gtfs_processed_path)
df

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,arrival_time,departure_time,shape_dist_traveled,direction,route_id,lat,lon,direction_angle,shape_dist_between,p_avg_vol
0,19920.4.60-33-b12-1.175.I,20210122,09:41:00,1,0,0,2021-01-22 11:13:19,8240DB006048,09:41:00,09:41:00,0.0,1,60-33-b12-1,53.6045004892092,-6.18446749406744,-164.50285739144564,0.0,64.8747124578529
1,19920.4.60-33-b12-1.175.I,20210125,09:41:00,1,0,0,2021-01-25 09:12:01,8240DB006048,09:41:00,09:41:00,0.0,1,60-33-b12-1,53.6045004892092,-6.18446749406744,-164.50285739144564,0.0,69.63686284208207
2,19920.4.60-33-b12-1.175.I,20210126,09:41:00,1,0,0,2021-01-26 09:12:25,8240DB006048,09:41:00,09:41:00,0.0,1,60-33-b12-1,53.6045004892092,-6.18446749406744,-164.50285739144564,0.0,66.75543225511986
3,19920.4.60-33-b12-1.175.I,20210127,09:41:00,1,0,0,2021-01-27 09:11:59,8240DB006048,09:41:00,09:41:00,0.0,1,60-33-b12-1,53.6045004892092,-6.18446749406744,-164.50285739144564,0.0,65.5552113135545
4,19920.4.60-33-b12-1.175.I,20210128,09:41:00,1,0,0,2021-01-28 09:12:22,8240DB006048,09:41:00,09:41:00,0.0,1,60-33-b12-1,53.6045004892092,-6.18446749406744,-164.50285739144564,0.0,68.06247626235033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025849,12640.3.60-130-b12-1.74.I,20210131,23:30:00,18,1200,1200,2021-01-31 23:59:45,8220DB001729,23:38:34,23:38:34,4808.93,1,60-130-b12-1,53.3584969058401,-6.19031550974047,-107.23111263684194,304.8600000000006,18.965954002092637
1025850,12877.3.60-44-b12-1.246.O,20210131,23:00:00,68,300,300,2021-01-31 23:58:50,8250DB003478,23:52:11,23:52:11,22858.06,0,60-44-b12-1,53.2390505484196,-6.19616830395543,155.4154238326518,487.3900000000031,5.037411138055553
1025851,12750.3.60-40-b12-1.206.O,20210131,23:00:00,79,120,120,2021-01-31 23:59:45,8230DB004690,23:57:43,23:57:43,28066.88,0,60-40-b12-1,53.3495094392524,-6.40519639715847,-118.90642649102342,264.3100000000013,28.85793655856444
1025852,12355.3.60-37-b12-1.39.O,20210131,23:20:00,56,120,120,2021-01-31 23:59:45,8240DB007227,23:56:18,23:56:18,19188.9,0,60-37-b12-1,53.3837882586066,-6.4031036640899,-67.67425709043938,240.34000000000012,25.439205758002338


True