In [4]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from google.transit import gtfs_realtime_pb2
from google.protobuf.json_format import Parse, MessageToJson
import json
import psycopg2
import time
from django.contrib.gis.geos import Point, fromstr, GEOSGeometry
from datetime import datetime, timedelta
import geopandas as gpd
import rtree
import vaex
import vaex.ml
from joblib import delayed, Parallel, load, parallel_backend
# from haversine import haversine, Unit

dir = Path.cwd()
outdir = os.path.join(dir, 'output')
gtfs_records_zip = os.path.join(dir, 'data', 'GtfsRRecords.zip')
gtfs_csv_zip = os.path.join(outdir, 'gtfsr_csv.zip')
gtfs_final_csv_path = os.path.join(outdir, 'gtfsr.csv')
gtfs_processed_csv_path = os.path.join(outdir, "gtfsr_processed.csv")
scats = os.path.join(dir, 'output', 'scats_model.json')


In [5]:
# connect to the PostgreSQL server
def get_conn():
    return psycopg2.connect(host="localhost", port="25432", database="gis", user="docker", password="docker")

In [6]:
df = pd.read_csv(gtfs_final_csv_path)
df

Unnamed: 0,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id
0,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8230DB007229
1,11379.2.60-13-b12-1.19.O,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8240DB000324
2,8111.2.60-27-b12-1.151.O,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8220DB004595
3,7705.2.60-40-b12-1.206.O,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8240DB007132
4,7751.2.60-40-b12-1.209.I,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8230DB004795
...,...,...,...,...,...,...,...,...
13194471,12355.3.60-37-b12-1.39.O,20210131,23:20:00,3,0,0,2021-01-31 23:59:45,8220DB000784
13194472,12355.3.60-37-b12-1.39.O,20210131,23:20:00,56,120,120,2021-01-31 23:59:45,8240DB007227
13194473,12640.3.60-130-b12-1.74.I,20210131,23:30:00,1,0,0,2021-01-31 23:59:45,8220DB001772
13194474,12640.3.60-130-b12-1.74.I,20210131,23:30:00,18,1200,1200,2021-01-31 23:59:45,8220DB001729


In [17]:
# get all the stop data and add the lat lon columns
# import geoplot

def get_stops_df(conn): 
    query = """select stop_id, point as geom from stop;"""

    gdf = gpd.GeoDataFrame.from_postgis(query, conn())
    # geoplot.pointplot(gdf)

    gdf['lat'] = gdf.apply(lambda row: row['geom'].x, axis=1)
    gdf['lon'] = gdf.apply(lambda row: row['geom'].y, axis=1)

    return pd.DataFrame(gdf.drop(columns='geom'))
stop_df = get_stops_df(get_conn)
stop_df

Unnamed: 0,stop_id,lat,lon
0,8220DB000002,-6.263723,53.352244
1,8220DB000003,-6.263811,53.352309
2,8220DB000004,-6.264175,53.352575
3,8220DB000006,-6.264454,53.352749
4,8220DB000007,-6.264570,53.352841
...,...,...,...
4711,8350DB007462,-6.062480,53.128801
4712,8350DB007522,-6.118873,53.188131
4713,8350DB007574,-6.130064,53.182348
4714,8350GD10395,-6.170880,53.192599


In [16]:
df = df.merge(stop_df, on='stop_id')
df.head()

KeyboardInterrupt: 

In [9]:
gb_trip = df['trip_id'].unique()
gb_trip

NameError: name 'df' is not defined

In [14]:
def direction_angle(theta_1, phi_1, theta_2, phi_2):
    dtheta = theta_2 - theta_1
    dphi = phi_2 - phi_1
    radians = np.arctan2(dtheta, dphi)
    return np.rad2deg(radians)

# get all the stop data and add the lat lon columns
def get_stop_time_df(trip_id, conn): 
    query = """
    select 
        stop_time.arrival_time, stop_time.departure_time, 
        stop_time.stop_sequence, stop_time.shape_dist_traveled, 
        stop.stop_id, stop.point as geom
    from stop_time
    join stop on stop.id = stop_time.stop_id
    join trip on trip.id = stop_time.trip_id
    where trip.trip_id = '{}'
    group by stop_time.id, stop.id
    order by stop_sequence
    ;
    """.format(trip_id).lstrip()
    gdf = gpd.read_postgis(query, conn()).to_crs(epsg=4326)

    # convert the times to human readable format
    gdf['arrival_time'] = gdf['arrival_time'].apply(lambda d: datetime.fromtimestamp(int(d)).strftime('%H:%M:%S'))
    gdf['departure_time'] = gdf['departure_time'].apply(lambda d: datetime.fromtimestamp(int(d)).strftime('%H:%M:%S'))

    # convert the geom to lat lon
    gdf['lat'] = gdf.apply(lambda row: row['geom'].y, axis=1)
    gdf['lon'] = gdf.apply(lambda row: row['geom'].x, axis=1)

    # find the direction angle of the trip
    first = gdf.iloc[0]
    last = gdf.iloc[-1]
    gdf['direction_angle'] = direction_angle(first.lon, first.lat, last.lon, last.lat) 

    gdf['distance_between'] = gdf.to_crs(epsg=3035).distance(gdf.to_crs(epsg=3035).shift())
    gdf['shape_dist_between'] = gdf.shape_dist_traveled - gdf.shape_dist_traveled.shift()

    gdf['shape_dist_between'].iloc[0] = 0
    gdf['distance_between'].iloc[0] = 0
    
    # set the trip id, no need to fetch from db
    gdf['trip_id'] = trip_id
    gdf['start_time'] = gdf['arrival_time'].iloc[0]

    # return a new pandas df dropping the geom column
    # return pd.DataFrame(gdf.drop(columns='geom'))
    return gdf

stop_time_df = get_stop_time_df('16422.4.60-65-b12-1.271.O', get_conn)
stop_time_df

Unnamed: 0,arrival_time,departure_time,stop_sequence,shape_dist_traveled,stop_id,geom,lat,lon,direction_angle,distance_between,shape_dist_between,trip_id,start_time
0,10:30:00,10:30:00,1,0.00,8220DB007564,POINT (-6.25453 53.34683),53.346834,-6.254525,-122.339823,0.000000,0.00,16422.4.60-65-b12-1.271.O,10:30:00
1,10:32:42,10:32:42,2,589.01,8220DB004521,POINT (-6.26017 53.34430),53.344301,-6.260171,-122.339823,469.609889,589.01,16422.4.60-65-b12-1.271.O,10:30:00
2,10:34:39,10:34:39,3,1136.75,8220DB001283,POINT (-6.26493 53.34181),53.341809,-6.264928,-122.339823,421.045844,547.74,16422.4.60-65-b12-1.271.O,10:30:00
3,10:35:17,10:35:17,4,1360.53,8220DB004456,POINT (-6.26570 53.33991),53.339914,-6.265696,-122.339823,217.704725,223.78,16422.4.60-65-b12-1.271.O,10:30:00
4,10:35:51,10:35:51,5,1563.46,8220DB001284,POINT (-6.26587 53.33813),53.338128,-6.265874,-122.339823,199.886478,202.93,16422.4.60-65-b12-1.271.O,10:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,11:23:38,11:23:38,70,29226.91,8350DB004021,POINT (-6.51885 53.18618),53.186181,-6.518851,-122.339823,365.439659,363.98,16422.4.60-65-b12-1.271.O,10:30:00
69,11:24:01,11:24:01,71,29617.07,8350DB004022,POINT (-6.52211 53.18330),53.183301,-6.522108,-122.339823,388.148763,390.16,16422.4.60-65-b12-1.271.O,10:30:00
70,11:24:33,11:24:33,72,30180.10,8350DB004023,POINT (-6.52520 53.17860),53.178602,-6.525201,-122.339823,563.992771,563.03,16422.4.60-65-b12-1.271.O,10:30:00
71,11:25:04,11:25:04,73,30503.67,8350DB004024,POINT (-6.52697 53.17587),53.175874,-6.526971,-122.339823,326.860753,323.57,16422.4.60-65-b12-1.271.O,10:30:00


In [15]:
stop_time_df.to_crs(epsg=4326).iloc[0].geom.distance(stop_time_df.to_crs(epsg=4326).iloc[-1].geom)

0.32752063889662414

In [12]:
# get a df with all the stop_time for that trip
delayed_funcs = [delayed(get_stop_time_df)(t_id, get_conn) for t_id in gb_trip]
parallel_pool = Parallel(n_jobs=8)
res = parallel_pool(delayed_funcs)

In [18]:
comb_df = pd.concat(res)
comb_df

Unnamed: 0,arrival_time,departure_time,stop_sequence,shape_dist_traveled,trip_id
0,10:00:00,10:00:00,1,0.00,16289.4.60-123-b12-1.71.I
1,10:00:53,10:00:53,2,377.15,16289.4.60-123-b12-1.71.I
2,10:01:22,10:01:22,3,583.63,16289.4.60-123-b12-1.71.I
3,10:01:37,10:01:37,4,683.71,16289.4.60-123-b12-1.71.I
4,10:02:22,10:02:22,5,853.23,16289.4.60-123-b12-1.71.I
...,...,...,...,...,...
20,11:09:00,11:09:00,21,12012.90,9537.10453.2-332-gad-1.271.I
21,11:09:00,11:09:00,22,12185.06,9537.10453.2-332-gad-1.271.I
22,11:10:00,11:10:00,23,12433.63,9537.10453.2-332-gad-1.271.I
23,11:10:00,11:10:00,24,12665.55,9537.10453.2-332-gad-1.271.I


In [14]:
df = df.merge(comb_df, left_on=['trip_id', 'stop_sequence'], right_on=['trip_id', 'stop_sequence'])
df

Unnamed: 0,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,arrival_time,departure_time,shape_dist_traveled
0,16289.4.60-123-b12-1.71.I,20210119,11:05:00,19,-14640,-14640,2021-01-19 10:50:00,8220DB001278,10:19:29,10:19:29,5330.51
1,16289.4.60-123-b12-1.71.I,20210121,11:05:00,19,0,0,2021-01-21 11:18:21,8220DB001278,10:19:29,10:19:29,5330.51
2,16289.4.60-123-b12-1.71.I,20210125,09:00:00,19,0,0,2021-01-25 09:13:10,8220DB001278,10:19:29,10:19:29,5330.51
3,16289.4.60-123-b12-1.71.I,20210127,09:00:00,19,0,0,2021-01-27 09:57:19,8220DB001278,10:19:29,10:19:29,5330.51
4,16289.4.60-123-b12-1.71.I,20210119,11:05:00,19,120,120,2021-01-19 11:20:01,8220DB001278,10:19:29,10:19:29,5330.51
...,...,...,...,...,...,...,...,...,...,...,...
1331128,8397.2.60-145-b12-1.358.O,20210116,20:10:00,4,5280,5280,2021-01-16 21:39:22,8220DB001478,17:49:38,17:49:38,1415.45
1331129,8397.2.60-145-b12-1.358.O,20210116,20:10:00,4,5400,5400,2021-01-16 21:43:33,8220DB001478,17:49:38,17:49:38,1415.45
1331130,8397.2.60-145-b12-1.358.O,20210116,20:10:00,3,5340,5340,2021-01-16 21:43:33,8220DB007453,17:48:08,17:48:08,1041.80
1331131,17281.4.60-44-b12-1.244.I,20210118,09:00:00,2,5400,5400,2021-01-18 10:33:00,8350DB004114,22:31:01,22:31:01,214.91


In [33]:
# df[df.start_time > df.arrival_time]
# comb_df[comb_df['trip_id'] == '11758.2.60-13-b12-1.22.I']
# df.loc[(df.start_time < df.arrival_time)]
# df[(df.stop_sequence == 1) & (df.start_time == df.arrival_time)]
df[(df.trip_id == '11758.2.60-13-b12-1.22.I') & (df.stop_sequence == 1)]

Unnamed: 0,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,arrival_time,departure_time,shape_dist_traveled
408824,11758.2.60-13-b12-1.22.I,20210109,22:00:00,1,0,0,2021-01-09 23:13:18,8230DB007229,19:15:00,19:15:00,0.0
408825,11758.2.60-13-b12-1.22.I,20210123,18:15:00,1,0,0,2021-01-23 19:36:14,8230DB007229,19:15:00,19:15:00,0.0
408826,11758.2.60-13-b12-1.22.I,20210130,18:15:00,1,0,0,2021-01-30 19:36:01,8230DB007229,19:15:00,19:15:00,0.0
408827,11758.2.60-13-b12-1.22.I,20210123,18:15:00,1,60,0,2021-01-23 18:17:35,8230DB007229,19:15:00,19:15:00,0.0
408828,11758.2.60-13-b12-1.22.I,20210123,18:15:00,1,120,0,2021-01-23 18:25:51,8230DB007229,19:15:00,19:15:00,0.0


In [10]:
# del df[df['arrival_time'].isnull()]
df[df['arrival_time'].isnull()]

Unnamed: 0,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,arrival_time,departure_time,shape_dist_traveled


In [41]:
# start = time.time()
# with parallel_backend('threading'):
#     scats_model = load(scats)

# print(time.time()-start)

In [17]:
vaex.from_pandas(df).export_hdf5('./output/processing_temp.hdf5')
del df
import gc
gc.collect()

In [5]:
def add_stop_data(start):
    # read csv
    df = pd.read_csv(gtfs_final_csv_path)

    # get a list of all the stops
    stop_df = get_stops_df(get_conn)

    # merge the entity stop_id data with the stop lat lon from database
    df = pd.merge(df, stop_df, on=["stop_id"])
    print("merged stops, time: {}".format(round(time.time() - start)))

    # get all the stop_times for each trip in our realtime data
    trip_list = df["trip_id"].unique()
    delayed_funcs = [delayed(get_stop_time_df)(t_id, get_conn) for t_id in trip_list]

    parallel_pool = Parallel(n_jobs=8)
    res = parallel_pool(delayed_funcs)

    # stop times for each trip dataframe
    stop_time_trip_df = pd.concat(res)
    df = df.merge(
        stop_time_trip_df, left_on=["trip_id", "stop_sequence"], right_on=["trip_id", "stop_sequence"], how="left"
    )
    print("merged stop times, time: {}".format(round(time.time() - start)))

    # convert to hdf5
    vaex.from_pandas(df).export_hdf5("./output/processing_temp.hdf5")
add_stop_data(time.time())

merged stops, time: 3
merged stop times, time: 129


In [31]:
vx_df = vaex.open('./output/processing_temp.hdf5')
vx_df

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lat,lon,arrival_time,departure_time,shape_dist_traveled
0,16289.4.60-123-b12-1.71.I,20210119,11:05:00,19,-14640,-14640,2021-01-19 10:50:00,8220DB001278,-6.26130011810001,53.3442366224365,10:19:29,10:19:29,5330.51
1,16287.4.60-123-b12-1.71.I,20210119,09:00:00,19,-8280,-8280,2021-01-19 08:51:57,8220DB001278,-6.26130011810001,53.3442366224365,08:19:52,08:19:52,5330.51
2,16302.4.60-123-b12-1.71.I,20210119,09:15:00,19,-5700,-5700,2021-01-19 09:14:50,8220DB001278,-6.26130011810001,53.3442366224365,08:34:52,08:34:52,5330.51
3,16316.4.60-123-b12-1.71.I,20210118,08:15:00,19,-3900,-3900,2021-01-18 08:02:08,8220DB001278,-6.26130011810001,53.3442366224365,00:06:07,00:06:07,5330.51
4,1041.1.60-122-b12-1.65.O,20210114,14:00:00,23,-660,-660,2021-01-14 15:00:52,8220DB001278,-6.26130011810001,53.3442366224365,15:37:29,15:37:29,7800.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331935,12475.3.60-31-b12-1.161.O,20210131,23:15:00,54,300,300,2021-01-31 23:53:38,8240DB000707,-6.05864990209616,53.3724895313374,00:44:11,00:44:11,17080.81
1331936,12333.3.60-65-b12-1.272.I,20210124,22:00:00,16,360,360,2021-01-24 22:20:43,8350DB004102,-6.51561604212167,53.1888733556119,23:12:09,23:12:09,11038.61
1331937,12304.3.60-65-b12-1.269.O,20210131,21:30:00,69,360,360,2021-01-31 22:26:53,8350DB004100,-6.51542624487623,53.188736170197,23:15:04,23:15:04,28862.93
1331938,16460.4.60-65-b12-1.271.O,20210120,18:30:00,70,480,480,2021-01-20 19:45:43,8350DB004021,-6.51885133635455,53.1861812708552,17:32:53,17:32:53,29226.91


In [13]:
def predict_traffic_from_gtfsr(_df):
    df = _df

    def apply_dow(start_date, start_time, arrival_time):
        date = datetime.strptime(str(start_date), "%Y%m%d") 
        if datetime.strptime(str(arrival_time), '%H:%M:%S') < datetime.strptime(str(start_time), '%H:%M:%S'):
            return (date + timedelta(days=1)).weekday()
        return date.weekday()

    df['hour'] = df['arrival_time'].apply(lambda t: datetime.strptime(str(t), '%H:%M:%S').hour if not type(t) == None else None )
    df['dow'] = df.apply(apply_dow, ['start_date', 'start_time', 'arrival_time'])

    pca_coord = vaex.ml.PCA(features=['lat', 'lon'], n_components=2, prefix='pca')
    df = pca_coord.fit_transform(df)

    cycl_transform_hour = vaex.ml.CycleTransformer(features=['hour'], n=24)
    df = cycl_transform_hour.fit_transform(df)

    cycl_transform_dow = vaex.ml.CycleTransformer(features=['dow'], n=7)
    df = cycl_transform_dow.fit_transform(df)

    # with parallel_backend("threading"):
        # scats_model = load(scats)
        # print("loaded scats model, time: {}".format(round(time.time() - start)))

    # with parallel_backend("threading"):
    #     df = scats_model.transform(df)

    # _df = _df.drop(['dow', 'hour'])

    return df[_df.column_names]
pred_df = predict_traffic_from_gtfsr(vx_df)
pred_df

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lat,lon,arrival_time,departure_time,shape_dist_traveled,hour,dow
0,16289.4.60-123-b12-1.71.I,20210119,11:05:00,19,-14640,-14640,2021-01-19 10:50:00,8220DB001278,-6.26130011810001,53.3442366224365,10:19:29,10:19:29,5330.51,10,2
1,16287.4.60-123-b12-1.71.I,20210119,09:00:00,19,-8280,-8280,2021-01-19 08:51:57,8220DB001278,-6.26130011810001,53.3442366224365,08:19:52,08:19:52,5330.51,8,2
2,16302.4.60-123-b12-1.71.I,20210119,09:15:00,19,-5700,-5700,2021-01-19 09:14:50,8220DB001278,-6.26130011810001,53.3442366224365,08:34:52,08:34:52,5330.51,8,2
3,16316.4.60-123-b12-1.71.I,20210118,08:15:00,19,-3900,-3900,2021-01-18 08:02:08,8220DB001278,-6.26130011810001,53.3442366224365,00:06:07,00:06:07,5330.51,0,1
4,1041.1.60-122-b12-1.65.O,20210114,14:00:00,23,-660,-660,2021-01-14 15:00:52,8220DB001278,-6.26130011810001,53.3442366224365,15:37:29,15:37:29,7800.24,15,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331935,12475.3.60-31-b12-1.161.O,20210131,23:15:00,54,300,300,2021-01-31 23:53:38,8240DB000707,-6.05864990209616,53.3724895313374,00:44:11,00:44:11,17080.81,0,0
1331936,12333.3.60-65-b12-1.272.I,20210124,22:00:00,16,360,360,2021-01-24 22:20:43,8350DB004102,-6.51561604212167,53.1888733556119,23:12:09,23:12:09,11038.61,23,6
1331937,12304.3.60-65-b12-1.269.O,20210131,21:30:00,69,360,360,2021-01-31 22:26:53,8350DB004100,-6.51542624487623,53.188736170197,23:15:04,23:15:04,28862.93,23,6
1331938,16460.4.60-65-b12-1.271.O,20210120,18:30:00,70,480,480,2021-01-20 19:45:43,8350DB004021,-6.51885133635455,53.1861812708552,17:32:53,17:32:53,29226.91,17,3


In [14]:
pred_df[vx_df.column_names].export_csv(gtfs_processed_csv_path)

ERROR:ThreadPoolExecutor-6_0:vaex.execution:error in task, flush task queue
Traceback (most recent call last):
  File "/home/vlad/projects/dynamoDublin/backend_d-dub/.denv/lib/python3.8/site-packages/vaex/scopes.py", line 97, in evaluate
    result = self[expression]
  File "/home/vlad/projects/dynamoDublin/backend_d-dub/.denv/lib/python3.8/site-packages/vaex/scopes.py", line 144, in __getitem__
    raise KeyError("Unknown variables or column: %r" % (variable,))
KeyError: "Unknown variables or column: 'lambda_function_8(arrival_time)'"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/vlad/projects/dynamoDublin/backend_d-dub/.denv/lib/python3.8/site-packages/vaex/execution.py", line 153, in execute_async
    async for element in self.thread_pool.map_async(self.process_part, parts,
  File "/home/vlad/projects/dynamoDublin/backend_d-dub/.denv/lib/python3.8/site-packages/vaex/multithreading.py", line 74, in map_async
   

ValueError: time data 'None' does not match format '%H:%M:%S'

In [84]:
def apply_dow(start_date=20210131, start_time='23:15:00', arrival_time='00:44:11'):
        date = datetime.strptime(str(start_date), "%Y%m%d") 
        if datetime.strptime(arrival_time, '%H:%M:%S') < datetime.strptime(start_time, '%H:%M:%S'):
            return (date + timedelta(days=1)).weekday()
        return date.weekday()

apply_dow()

0

In [32]:
vx_df.describe()

Unnamed: 0,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lat,lon,arrival_time,departure_time,shape_dist_traveled
dtype,str,int64,str,int64,int64,int64,str,str,float64,float64,str,str,float64
count,1331940,1331940,1331940,1331940,1331940,1331940,1331940,1331940,1331940,1331940,1331133,1331133,1331133
,0,0,0,0,0,0,0,0,0,0,807,807,807
mean,--,20210124.51881241,--,31.475787197621514,121.96977341321681,118.28221991981621,--,--,-6.2589183179536825,53.340680169783184,--,--,10853.391890722698
std,--,28.207933,--,20.897237,305.690759,300.947633,--,--,0.07761,0.060382,--,--,7471.358078
min,--,20210109,--,1,-14640,-14640,--,--,-6.614865,53.070678,--,--,0.0
max,--,20210131,--,104,6660,6660,--,--,-6.053311,53.606196,--,--,45761.8


In [5]:
vx_df = vaex.open(gtfs_final_csv_path+'.hdf5')
vx_df

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id
0,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8230DB007229
1,11379.2.60-13-b12-1.19.O,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8240DB000324
2,8111.2.60-27-b12-1.151.O,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8220DB004595
3,7705.2.60-40-b12-1.206.O,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8240DB007132
4,7751.2.60-40-b12-1.209.I,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8230DB004795
...,...,...,...,...,...,...,...,...
13194471,12355.3.60-37-b12-1.39.O,20210131,23:20:00,3,0,0,2021-01-31 23:59:45,8220DB000784
13194472,12355.3.60-37-b12-1.39.O,20210131,23:20:00,56,120,120,2021-01-31 23:59:45,8240DB007227
13194473,12640.3.60-130-b12-1.74.I,20210131,23:30:00,1,0,0,2021-01-31 23:59:45,8220DB001772
13194474,12640.3.60-130-b12-1.74.I,20210131,23:30:00,18,1200,1200,2021-01-31 23:59:45,8220DB001729


In [26]:
one_trip = vx_df[vx_df.trip_id == '11777.2.60-13-b12-1.22.I']
one_trip

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id
0,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8230DB007229
1,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,0,2021-01-09 19:28:54,8230DB007229
2,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,0,2021-01-09 19:30:02,8230DB007229
3,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,0,2021-01-09 19:31:08,8230DB007229
4,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,0,2021-01-09 19:32:08,8230DB007229
...,...,...,...,...,...,...,...,...
878,11777.2.60-13-b12-1.22.I,20210130,14:00:00,1,0,0,2021-01-30 15:31:00,8230DB007229
879,11777.2.60-13-b12-1.22.I,20210130,14:00:00,76,240,240,2021-01-30 15:31:00,8220DB000092
880,11777.2.60-13-b12-1.22.I,20210130,14:00:00,80,300,240,2021-01-30 15:31:00,8220DB006239
881,11777.2.60-13-b12-1.22.I,20210130,14:00:00,81,240,240,2021-01-30 15:31:00,8220DB000104


In [33]:
import dask.dataframe as dd

ddf = dd.read_csv(gtfs_final_csv_path)
ddf.head()

Unnamed: 0,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id
0,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8230DB007229
1,11379.2.60-13-b12-1.19.O,20210109,18:00:00,1,0,0,2021-01-09 19:28:02,8240DB000324
2,8111.2.60-27-b12-1.151.O,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8220DB004595
3,7705.2.60-40-b12-1.206.O,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8240DB007132
4,7751.2.60-40-b12-1.209.I,20210109,18:10:00,1,0,0,2021-01-09 19:28:02,8230DB004795


In [35]:
ddf.drop_duplicates().to_csv('./output/temp_dask.csv')

['/home/vlad/projects/dynamoDublin/backend_d-dub/ml/processing/output/temp_dask.csv/0.part']