In [23]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
import time
from django.contrib.gis.geos import Point, fromstr, GEOSGeometry
from datetime import datetime, timedelta
import vaex
import vaex.ml
from joblib import delayed, Parallel, load, parallel_backend
# from haversine import haversine, Unit

# Every path below is resolved relative to the notebook's current working directory.
# NOTE(review): `dir` shadows the builtin dir(); the name is kept because other cells may use it.
dir = Path.cwd()
outdir = os.path.join(dir, 'output')

# Input archive that lives under <cwd>/data.
gtfs_records_zip = os.path.join(dir, 'data', 'GtfsRRecords.zip')

# Artefacts that live under <cwd>/output.
gtfs_csv_zip, gtfs_final_csv_path, gtfs_processed_csv_path = (
    os.path.join(outdir, filename)
    for filename in ('gtfsr_csv.zip', 'gtfsr.csv', 'gtfsr_processed.csv')
)
scats = os.path.join(outdir, 'scats_model.json')

In [2]:
# Load the processed GTFS-R CSV into a vaex DataFrame.
# convert=True is passed through to vaex; per the vaex docs it converts the CSV to a
# binary file on first read so later loads can reuse it — TODO confirm for the installed version.
df = vaex.from_csv(
    gtfs_processed_csv_path,
    convert=True,
)

# Bare last expression: show the frame's rich preview as the cell output.
df

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lat,lon,arrival_time,departure_time,shape_dist_traveled,hour,dow,p_avg_vol
0,16289.4.60-123-b12-1.71.I,20210119,11:05:00,19,-14640,-14640,2021-01-19 10:50:00,8220DB001278,-6.26130011810001,53.3442366224365,10:19:29,10:19:29,5330.51,10,2,71.60020835813714
1,16287.4.60-123-b12-1.71.I,20210119,09:00:00,19,-8280,-8280,2021-01-19 08:51:57,8220DB001278,-6.26130011810001,53.3442366224365,08:19:52,08:19:52,5330.51,8,2,62.45976357578324
2,16302.4.60-123-b12-1.71.I,20210119,09:15:00,19,-5700,-5700,2021-01-19 09:14:50,8220DB001278,-6.26130011810001,53.3442366224365,08:34:52,08:34:52,5330.51,8,2,62.45976357578324
3,16316.4.60-123-b12-1.71.I,20210118,08:15:00,19,-3900,-3900,2021-01-18 08:02:08,8220DB001278,-6.26130011810001,53.3442366224365,00:06:07,00:06:07,5330.51,0,1,23.520948983404384
4,1041.1.60-122-b12-1.65.O,20210114,14:00:00,23,-660,-660,2021-01-14 15:00:52,8220DB001278,-6.26130011810001,53.3442366224365,15:37:29,15:37:29,7800.24,15,3,54.39009117546706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331128,12475.3.60-31-b12-1.161.O,20210131,23:15:00,54,300,300,2021-01-31 23:53:38,8240DB000707,-6.05864990209616,53.3724895313374,00:44:11,00:44:11,17080.81,0,0,125.06669632749724
1331129,12333.3.60-65-b12-1.272.I,20210124,22:00:00,16,360,360,2021-01-24 22:20:43,8350DB004102,-6.51561604212167,53.1888733556119,23:12:09,23:12:09,11038.61,23,6,0.0
1331130,12304.3.60-65-b12-1.269.O,20210131,21:30:00,69,360,360,2021-01-31 22:26:53,8350DB004100,-6.51542624487623,53.188736170197,23:15:04,23:15:04,28862.93,23,6,0.0
1331131,16460.4.60-65-b12-1.271.O,20210120,18:30:00,70,480,480,2021-01-20 19:45:43,8350DB004021,-6.51885133635455,53.1861812708552,17:32:53,17:32:53,29226.91,17,3,5.655465097158063


In [3]:
# Summary statistics, transposed so each data column becomes one row of the summary table.
df.describe().transpose()

Unnamed: 0,dtype,count,NA,mean,std,min,max
trip_id,str,1331133,0,--,--,--,--
start_date,int64,1331133,0,20210124.523001082,28.28869,20210109,20210131
start_time,str,1331133,0,--,--,--,--
stop_sequence,int64,1331133,0,31.461628552518793,20.889282,1,104
departure,int64,1331133,0,121.96154704300773,305.660074,-14640,6660
arrival,int64,1331133,0,118.25043778495463,300.882935,-14640,6660
timestamp,str,1331133,0,--,--,--,--
stop_id,str,1331133,0,--,--,--,--
lat,float64,1331133,0,-6.258904306910986,0.077611,-6.614865,-6.053311
lon,float64,1331133,0,53.34068158078088,0.06039,53.070678,53.606196


In [31]:
# Select the rows whose `hour` and `dow` columns are both 0.
hour_is_zero = df['hour'] == 0
dow_is_zero = df['dow'] == 0
df[hour_is_zero & dow_is_zero]

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lat,lon,arrival_time,departure_time,shape_dist_traveled,hour,dow,p_avg_vol
0,12105.3.60-122-b12-1.61.O,20210117,09:00:00,23,0,0,2021-01-17 09:40:07,8220DB001278,-6.26130011810001,53.3442366224365,00:22:08,00:22:08,7800.24,0,0,22.714894603447075
1,12229.3.60-123-b12-1.71.I,20210131,23:10:00,19,60,60,2021-01-31 23:24:42,8220DB001278,-6.26130011810001,53.3442366224365,00:26:02,00:26:02,5330.51,0,0,22.714894603447075
2,12105.3.60-122-b12-1.61.O,20210131,23:00:00,23,120,120,2021-01-31 23:20:26,8220DB001278,-6.26130011810001,53.3442366224365,00:22:08,00:22:08,7800.24,0,0,22.714894603447075
3,12105.3.60-122-b12-1.61.O,20210131,23:00:00,23,240,240,2021-01-31 23:23:30,8220DB001278,-6.26130011810001,53.3442366224365,00:22:08,00:22:08,7800.24,0,0,22.714894603447075
4,12105.3.60-122-b12-1.61.O,20210131,23:00:00,23,840,840,2021-01-31 23:51:29,8220DB001278,-6.26130011810001,53.3442366224365,00:22:08,00:22:08,7800.24,0,0,22.714894603447075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4590,3150.10447.2-185-gad-1.174.O,20210131,23:00:00,4,60,60,2021-01-31 23:02:57,8350DB004111,-6.19133907152751,53.1950960491494,00:01:00,00:01:00,957.32,0,0,34.88427118911486
4591,7779.10447.2-270-gad-1.236.O,20210124,23:03:00,6,120,120,2021-01-24 23:08:18,8310DB003337,-6.44897984739847,53.4140798813531,00:05:00,00:05:00,1875.87,0,0,3.9648713548811303
4592,12877.3.60-44-b12-1.246.O,20210131,23:00:00,79,240,240,2021-01-31 23:57:43,8350DB004093,-6.17351696534828,53.1988778931298,00:58:11,00:58:11,27882.07,0,0,34.73150895135262
4593,12475.3.60-31-b12-1.161.O,20210131,23:15:00,63,360,360,2021-01-31 23:55:34,8240DB000716,-6.08875424382869,53.3724901056998,00:47:13,00:47:13,20020.37,0,0,11.203738622452263


In [24]:
# Heatmap of stop locations (log1p-scaled counts on a 512x512 grid, axis limits from data min/max).
#
# NOTE(review): the coordinate columns appear to be swapped in the processed CSV:
# `lat` holds values around -6.26 (longitude-like) while `lon` holds values around 53.34
# (latitude-like). Plotting (df.lon, df.lat) therefore put latitude on the x-axis and
# drew the map transposed. Until the column labels are fixed upstream, pass `df.lat`
# (true longitude) as x and `df.lon` (true latitude) as y so the map is oriented north-up.
df.plot_widget(df.lat, df.lon, shape=512, colormap='plasma', f='log1p', limits='minmax')

Heatmap(children=[ToolsToolbar(interact_value=None, supports_normalize=False, template='<template>\n  <v-toolb…