In [1]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from google.transit import gtfs_realtime_pb2
from google.protobuf.json_format import Parse, MessageToJson
import json
import psycopg2
import time
from django.contrib.gis.geos import Point, fromstr, GEOSGeometry
from datetime import datetime
import geopandas as gpd
import rtree
import vaex
import vaex.ml
from joblib import delayed, Parallel, load, parallel_backend
# from haversine import haversine, Unit

dir = Path.cwd()
outdir = os.path.join(dir, 'output')
gtfs_records_zip = os.path.join(dir, 'data', 'GtfsRRecords.zip')
gtfs_csv_zip = os.path.join(outdir, 'gtfsr_csv.zip')
gtfs_final_csv_path = os.path.join(outdir, 'gtfsr.csv')
gtfs_processed_csv_path = os.path.join(outdir, "gtfsr_processed.csv")
scats = os.path.join(dir, 'output', 'scats_model.json')


In [2]:
# connect to the PostgreSQL server
def get_conn():
    return psycopg2.connect(host="localhost", port="25432", database="gis", user="docker", password="docker")

In [3]:
df = pd.read_csv(gtfs_final_csv_path)
df

Unnamed: 0,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id
0,16289.4.60-123-b12-1.71.I,20210119,11:05:00,19,-14640,-14640,2021-01-19 10:50:00,8220DB001278
1,16289.4.60-123-b12-1.71.I,20210119,11:05:00,3,-14640,-14640,2021-01-19 10:50:00,8220DB001493
2,16289.4.60-123-b12-1.71.I,20210119,11:05:00,5,-14580,-14580,2021-01-19 10:50:00,8220DB001494
3,16289.4.60-123-b12-1.71.I,20210119,11:05:00,17,-14580,-14580,2021-01-19 10:50:00,8220DB000272
4,17148.4.60-40-b12-1.209.I,20210118,15:40:00,83,0,-14580,2021-01-18 15:39:01,8240DB006348
...,...,...,...,...,...,...,...,...
1331935,17281.4.60-44-b12-1.244.I,20210118,09:00:00,2,5460,5460,2021-01-18 10:34:03,8350DB004114
1331936,17281.4.60-44-b12-1.244.I,20210118,09:00:00,35,5460,5460,2021-01-18 11:04:05,8250DB002892
1331937,17281.4.60-44-b12-1.244.I,20210118,09:00:00,3,5520,5520,2021-01-18 11:04:05,8350DB004115
1331938,17026.4.60-40-b12-1.209.I,20210119,16:00:00,2,6660,6660,2021-01-19 18:07:16,8230DB004688


In [3]:
# get all the stop data and add the lat lon columns
import geoplot

def get_stops_df(conn): 
    query = """select stop_id, point as geom from stop;"""

    gdf = gpd.GeoDataFrame.from_postgis(query, conn())
    # geoplot.pointplot(gdf)

    gdf['lat'] = gdf.apply(lambda row: row['geom'].x, axis=1)
    gdf['lon'] = gdf.apply(lambda row: row['geom'].y, axis=1)

    return pd.DataFrame(gdf.drop(columns='geom'))
stop_df = get_stops_df(get_conn)
stop_df

Unnamed: 0,stop_id,lat,lon
0,8220DB000002,-6.263723,53.352244
1,8220DB000003,-6.263811,53.352309
2,8220DB000004,-6.264175,53.352575
3,8220DB000006,-6.264454,53.352749
4,8220DB000007,-6.264570,53.352841
...,...,...,...
4706,8350DB007462,-6.062480,53.128801
4707,8350DB007522,-6.118873,53.188131
4708,8350DB007574,-6.130064,53.182348
4709,8350GD10395,-6.170880,53.192599


In [5]:
df = df.merge(stop_df, on='stop_id')
df.head()

Unnamed: 0,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lat,lon
0,16289.4.60-123-b12-1.71.I,20210119,11:05:00,19,-14640,-14640,2021-01-19 10:50:00,8220DB001278,-6.2613,53.344237
1,16287.4.60-123-b12-1.71.I,20210119,09:00:00,19,-8280,-8280,2021-01-19 08:51:57,8220DB001278,-6.2613,53.344237
2,16302.4.60-123-b12-1.71.I,20210119,09:15:00,19,-5700,-5700,2021-01-19 09:14:50,8220DB001278,-6.2613,53.344237
3,16316.4.60-123-b12-1.71.I,20210118,08:15:00,19,-3900,-3900,2021-01-18 08:02:08,8220DB001278,-6.2613,53.344237
4,1041.1.60-122-b12-1.65.O,20210114,14:00:00,23,-660,-660,2021-01-14 15:00:52,8220DB001278,-6.2613,53.344237


In [6]:
gb_trip = df['trip_id'].unique()
gb_trip

array(['16289.4.60-123-b12-1.71.I', '16287.4.60-123-b12-1.71.I',
       '16302.4.60-123-b12-1.71.I', ..., '2926.1.60-56A-b12-1.52.I',
       '7046.2.60-65-b12-1.274.I', '3400.1.60-46A-b12-1.251.I'],
      dtype=object)

In [4]:
# get all the stop data and add the lat lon columns
def get_stop_time_df(trip_id, conn): 
    query = """
    select stop_time.arrival_time, stop_time.departure_time, 
        stop_time.stop_sequence, stop_time.shape_dist_traveled
    from stop_time
    join trip on trip.id = stop_time.trip_id
    where trip.trip_id = '{}'
    group by stop_time.id
    order by stop_sequence
    ;
    """.format(trip_id).lstrip()
    df = pd.read_sql_query(query, conn())
    df['trip_id'] = trip_id
    return df

stop_time_df = get_stop_time_df('7218.10455.2-104-gad-1.83.O', get_conn)
stop_time_df.head()

Unnamed: 0,arrival_time,departure_time,stop_sequence,shape_dist_traveled,trip_id
0,28800,28800,1,0.0,7218.10455.2-104-gad-1.83.O
1,28920,28920,2,850.7,7218.10455.2-104-gad-1.83.O
2,28920,28920,3,1160.0,7218.10455.2-104-gad-1.83.O
3,28980,28980,4,1359.45,7218.10455.2-104-gad-1.83.O
4,29040,29040,5,1756.59,7218.10455.2-104-gad-1.83.O


In [8]:
# get a df with all the stop_time for that trip
delayed_funcs = [delayed(get_stop_time_df)(t_id, get_conn) for t_id in gb_trip]
parallel_pool = Parallel(n_jobs=8)
res = parallel_pool(delayed_funcs)

In [9]:
pd.concat(res)

Unnamed: 0,arrival_time,departure_time,stop_sequence,shape_dist_traveled,trip_id
0,32400,32400,1,0.00,16289.4.60-123-b12-1.71.I
1,32453,32453,2,377.15,16289.4.60-123-b12-1.71.I
2,32482,32482,3,583.63,16289.4.60-123-b12-1.71.I
3,32497,32497,4,683.71,16289.4.60-123-b12-1.71.I
4,32542,32542,5,853.23,16289.4.60-123-b12-1.71.I
...,...,...,...,...,...
11,22704,22704,12,3277.51,3400.1.60-46A-b12-1.251.I
12,22737,22737,13,3545.63,3400.1.60-46A-b12-1.251.I
13,22757,22757,14,3705.21,3400.1.60-46A-b12-1.251.I
14,22806,22806,15,4098.78,3400.1.60-46A-b12-1.251.I


In [33]:
df = df.merge(pd.concat(res), left_on=['trip_id', 'stop_sequence'], right_on=['trip_id', 'stop_sequence'], how='left')
df

Unnamed: 0,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lat,lon,arrival_time,departure_time,shape_dist_traveled
0,16289.4.60-123-b12-1.71.I,20210119,11:05:00,19,-14640,-14640,2021-01-19 10:50:00,8220DB001278,-6.261300,53.344237,33569.0,33569.0,5330.51
1,16287.4.60-123-b12-1.71.I,20210119,09:00:00,19,-8280,-8280,2021-01-19 08:51:57,8220DB001278,-6.261300,53.344237,26392.0,26392.0,5330.51
2,16302.4.60-123-b12-1.71.I,20210119,09:15:00,19,-5700,-5700,2021-01-19 09:14:50,8220DB001278,-6.261300,53.344237,27292.0,27292.0,5330.51
3,16316.4.60-123-b12-1.71.I,20210118,08:15:00,19,-3900,-3900,2021-01-18 08:02:08,8220DB001278,-6.261300,53.344237,83167.0,83167.0,5330.51
4,1041.1.60-122-b12-1.65.O,20210114,14:00:00,23,-660,-660,2021-01-14 15:00:52,8220DB001278,-6.261300,53.344237,52649.0,52649.0,7800.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331935,12475.3.60-31-b12-1.161.O,20210131,23:15:00,54,300,300,2021-01-31 23:53:38,8240DB000707,-6.058650,53.372490,85451.0,85451.0,17080.81
1331936,12333.3.60-65-b12-1.272.I,20210124,22:00:00,16,360,360,2021-01-24 22:20:43,8350DB004102,-6.515616,53.188873,79929.0,79929.0,11038.61
1331937,12304.3.60-65-b12-1.269.O,20210131,21:30:00,69,360,360,2021-01-31 22:26:53,8350DB004100,-6.515426,53.188736,80104.0,80104.0,28862.93
1331938,16460.4.60-65-b12-1.271.O,20210120,18:30:00,70,480,480,2021-01-20 19:45:43,8350DB004021,-6.518851,53.186181,59573.0,59573.0,29226.91


In [41]:
# start = time.time()
# with parallel_backend('threading'):
#     scats_model = load(scats)

# print(time.time()-start)

In [17]:
vaex.from_pandas(df).export_hdf5('./output/processing_temp.hdf5')
del df
import gc
gc.collect()

In [5]:
def add_stop_data(start):
    # read csv
    df = pd.read_csv(gtfs_final_csv_path)

    # get a list of all the stops
    stop_df = get_stops_df(get_conn)

    # merge the entity stop_id data with the stop lat lon from database
    df = pd.merge(df, stop_df, on=["stop_id"])
    print("merged stops, time: {}".format(round(time.time() - start)))

    # get all the stop_times for each trip in our realtime data
    trip_list = df["trip_id"].unique()
    delayed_funcs = [delayed(get_stop_time_df)(t_id, get_conn) for t_id in trip_list]

    parallel_pool = Parallel(n_jobs=8)
    res = parallel_pool(delayed_funcs)

    # stop times for each trip dataframe
    stop_time_trip_df = pd.concat(res)
    df = df.merge(
        stop_time_trip_df, left_on=["trip_id", "stop_sequence"], right_on=["trip_id", "stop_sequence"], how="left"
    )
    print("merged stop times, time: {}".format(round(time.time() - start)))

    # convert to hdf5
    vaex.from_pandas(df).export_hdf5("./output/processing_temp.hdf5")
add_stop_data(time.time())

merged stops, time: 3
merged stop times, time: 129


In [2]:
vx_df = vaex.open('./output/processing_temp.hdf5')
vx_df

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lat,lon,arrival_time,departure_time,shape_dist_traveled
0,16289.4.60-123-b12-1.71.I,20210119,11:05:00,19,-14640,-14640,2021-01-19 10:50:00,8220DB001278,-6.26130011810001,53.3442366224365,33569.0,33569.0,5330.51
1,16287.4.60-123-b12-1.71.I,20210119,09:00:00,19,-8280,-8280,2021-01-19 08:51:57,8220DB001278,-6.26130011810001,53.3442366224365,26392.0,26392.0,5330.51
2,16302.4.60-123-b12-1.71.I,20210119,09:15:00,19,-5700,-5700,2021-01-19 09:14:50,8220DB001278,-6.26130011810001,53.3442366224365,27292.0,27292.0,5330.51
3,16316.4.60-123-b12-1.71.I,20210118,08:15:00,19,-3900,-3900,2021-01-18 08:02:08,8220DB001278,-6.26130011810001,53.3442366224365,83167.0,83167.0,5330.51
4,1041.1.60-122-b12-1.65.O,20210114,14:00:00,23,-660,-660,2021-01-14 15:00:52,8220DB001278,-6.26130011810001,53.3442366224365,52649.0,52649.0,7800.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331935,12475.3.60-31-b12-1.161.O,20210131,23:15:00,54,300,300,2021-01-31 23:53:38,8240DB000707,-6.05864990209616,53.3724895313374,85451.0,85451.0,17080.81
1331936,12333.3.60-65-b12-1.272.I,20210124,22:00:00,16,360,360,2021-01-24 22:20:43,8350DB004102,-6.51561604212167,53.1888733556119,79929.0,79929.0,11038.61
1331937,12304.3.60-65-b12-1.269.O,20210131,21:30:00,69,360,360,2021-01-31 22:26:53,8350DB004100,-6.51542624487623,53.188736170197,80104.0,80104.0,28862.93
1331938,16460.4.60-65-b12-1.271.O,20210120,18:30:00,70,480,480,2021-01-20 19:45:43,8350DB004021,-6.51885133635455,53.1861812708552,59573.0,59573.0,29226.91


In [2]:
# with parallel_backend("threading"):
scats_model = load(scats)

In [4]:
def predict_traffic_from_gtfsr(_df):
    df = _df

    df['dow'] = df['start_date'].apply(lambda t: datetime.strptime(str(t), '%Y%m%d').weekday())
    df['month'] = df['start_date'].apply(lambda t: datetime.strptime(str(t), '%Y%m%d').month)
    df['day'] = df['start_date'].apply(lambda t: datetime.strptime(str(t), '%Y%m%d').day)
    df['hour'] = df['start_time'].apply(lambda t: datetime.strptime(t, '%H:%M:%S').hour)

    pca_coord = vaex.ml.PCA(features=['lat', 'lon'], n_components=2, prefix='pca')
    df = pca_coord.fit_transform(df)

    cycl_transform_hour = vaex.ml.CycleTransformer(features=['hour'], n=24)
    df = cycl_transform_hour.fit_transform(df)

    cycl_transform_dow = vaex.ml.CycleTransformer(features=['dow'], n=7)
    df = cycl_transform_dow.fit_transform(df)

    feats = df.get_column_names(regex='pca') + \
        df.get_column_names(regex='.*_x') + \
        df.get_column_names(regex='.*_y')

    # with parallel_backend("threading"):
        # scats_model = load(scats)
        # print("loaded scats model, time: {}".format(round(time.time() - start)))

    # with parallel_backend("threading"):
    #     df = scats_model.transform(df)

    return df[_df.column_names]
pred_df = predict_traffic_from_gtfsr(vx_df)
pred_df

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lat,lon,arrival_time,departure_time,shape_dist_traveled,dow,month,day,hour
0,16289.4.60-123-b12-1.71.I,20210119,11:05:00,19,-14640,-14640,2021-01-19 10:50:00,8220DB001278,-6.26130011810001,53.3442366224365,33569.0,33569.0,5330.51,1,1,19,11
1,16287.4.60-123-b12-1.71.I,20210119,09:00:00,19,-8280,-8280,2021-01-19 08:51:57,8220DB001278,-6.26130011810001,53.3442366224365,26392.0,26392.0,5330.51,1,1,19,9
2,16302.4.60-123-b12-1.71.I,20210119,09:15:00,19,-5700,-5700,2021-01-19 09:14:50,8220DB001278,-6.26130011810001,53.3442366224365,27292.0,27292.0,5330.51,1,1,19,9
3,16316.4.60-123-b12-1.71.I,20210118,08:15:00,19,-3900,-3900,2021-01-18 08:02:08,8220DB001278,-6.26130011810001,53.3442366224365,83167.0,83167.0,5330.51,0,1,18,8
4,1041.1.60-122-b12-1.65.O,20210114,14:00:00,23,-660,-660,2021-01-14 15:00:52,8220DB001278,-6.26130011810001,53.3442366224365,52649.0,52649.0,7800.24,3,1,14,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331935,12475.3.60-31-b12-1.161.O,20210131,23:15:00,54,300,300,2021-01-31 23:53:38,8240DB000707,-6.05864990209616,53.3724895313374,85451.0,85451.0,17080.81,6,1,31,23
1331936,12333.3.60-65-b12-1.272.I,20210124,22:00:00,16,360,360,2021-01-24 22:20:43,8350DB004102,-6.51561604212167,53.1888733556119,79929.0,79929.0,11038.61,6,1,24,22
1331937,12304.3.60-65-b12-1.269.O,20210131,21:30:00,69,360,360,2021-01-31 22:26:53,8350DB004100,-6.51542624487623,53.188736170197,80104.0,80104.0,28862.93,6,1,31,21
1331938,16460.4.60-65-b12-1.271.O,20210120,18:30:00,70,480,480,2021-01-20 19:45:43,8350DB004021,-6.51885133635455,53.1861812708552,59573.0,59573.0,29226.91,2,1,20,18
