In [1]:
import os
import time
from datetime import datetime
from pathlib import Path

import joblib
import vaex
import vaex.ml

# Input locations, resolved against the kernel's current working directory.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# other cells may refer to it.
dir = Path.cwd()
# Both paths are plain strings (os.path.join output), so later cells can
# append suffixes such as '.hdf5' with `+`.
csv, scats = (
    os.path.join(dir, 'output', filename)
    for filename in ('gtfsr.csv', 'scats_model.json')
)


In [2]:
# Load the object stored at `scats` into `scats_model` and print the elapsed
# seconds.
# NOTE(security): joblib.load unpickles its input, which can execute arbitrary
# code -- only point `scats` at a file you produced or otherwise trust.
start = time.perf_counter()  # monotonic clock: unaffected by wall-clock adjustments
with joblib.parallel_backend('threading'):
    scats_model = joblib.load(scats)

print(time.perf_counter()-start)

35.1014244556427


In [3]:
# One-off conversion of the raw CSV into an HDF5 file next to it; later runs
# skip the conversion and open the HDF5 file directly.
hdf5_path = csv + '.hdf5'
if not os.path.exists(hdf5_path):
    # convert=True makes vaex write the HDF5 copy while reading the CSV in
    # chunks of `chunk_size` rows. Nothing is exported back onto `csv`: that
    # path is the raw input, and every later cell reads the HDF5 file instead.
    vaex.from_csv(csv, convert=True, copy_index=False, chunk_size=100000)
df = vaex.open(hdf5_path)

df.head()

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lon,lat
0,11777.2.60-13-b12-1.22.I,20210100.0,18:00:00,1,0,0,2021-01-09 19:28:02,8230DB007229,53.3301,-6.45108
1,11756.2.60-13-b12-1.22.I,20210100.0,18:15:00,1,0,0,2021-01-09 19:28:02,8230DB007229,53.3301,-6.45108
2,11370.2.60-13-b12-1.23.I,20210100.0,19:00:00,1,0,0,2021-01-09 19:28:02,8230DB007229,53.3301,-6.45108
3,11379.2.60-13-b12-1.19.O,20210100.0,18:00:00,1,0,0,2021-01-09 19:28:02,8240DB000324,53.4177,-6.27864
4,11394.2.60-13-b12-1.19.O,20210100.0,18:15:00,1,0,0,2021-01-09 19:28:02,8240DB000324,53.4177,-6.27864
5,8595.2.60-4-b12-1.5.O,20210100.0,18:30:00,1,0,0,2021-01-09 19:28:02,8240DB000324,53.4177,-6.27864
6,8566.2.60-4-b12-1.5.O,20210100.0,18:45:00,1,0,0,2021-01-09 19:28:02,8240DB000324,53.4177,-6.27864
7,11401.2.60-13-b12-1.19.O,20210100.0,18:45:00,1,0,0,2021-01-09 19:28:02,8240DB000324,53.4177,-6.27864
8,11662.2.60-27B-b12-1.97.I,20210100.0,19:00:00,1,0,0,2021-01-09 19:28:02,8240DB000324,53.4177,-6.27864
9,8924.2.60-4-b12-1.5.O,20210100.0,19:00:00,1,0,0,2021-01-09 19:28:02,8240DB000324,53.4177,-6.27864


In [4]:
# Number of rows in the loaded frame.
len(df)

3100815

In [34]:
def apply_dow(t):
    """Return the day of the week (Monday=0 .. Sunday=6) of a ``YYYYMMDD`` date.

    ``t`` may be an int, an integral float or a digit string. It is normalised
    through ``int`` before formatting so that a float such as ``20210109.0``
    does not become ``'20210109.0'``, which ``strptime`` rejects.

    Raises ``ValueError`` when the value is not a valid ``YYYYMMDD`` date.
    """
    return datetime.strptime(str(int(t)), '%Y%m%d').weekday()

def apply_day(t):
    """Return the day of the month (1-31) of a ``YYYYMMDD`` date.

    ``t`` may be an int, an integral float or a digit string. It is normalised
    through ``int`` before formatting so that a float such as ``20210109.0``
    does not become ``'20210109.0'``, which ``strptime`` rejects.

    Raises ``ValueError`` when the value is not a valid ``YYYYMMDD`` date.
    """
    return datetime.strptime(str(int(t)), '%Y%m%d').day

def apply_month(t):
    """Return the month (1-12) of a ``YYYYMMDD`` date.

    ``t`` may be an int, an integral float or a digit string. It is normalised
    through ``int`` before formatting so that a float such as ``20210109.0``
    does not become ``'20210109.0'``, which ``strptime`` rejects.

    Raises ``ValueError`` when the value is not a valid ``YYYYMMDD`` date.
    """
    return datetime.strptime(str(int(t)), '%Y%m%d').month

def apply_hour(t):
    """Return the hour of day (0-23) of an ``HH:MM:SS`` time string.

    GTFS start times may run past ``24:00:00`` for trips that continue after
    midnight of the service day (e.g. ``25:15:35``). ``strptime`` rejects
    those, so they are handled separately and wrapped onto the 24-hour clock.

    Raises ``ValueError`` when ``t`` is not a well-formed time.
    """
    try:
        return datetime.strptime(t, '%H:%M:%S').hour
    except ValueError:
        # Fallback only for the "past midnight" form; anything else that
        # strptime rejected is still an error.
        hours, minutes, seconds = (int(part) for part in t.split(':'))
        if hours < 24 or not (0 <= minutes < 60 and 0 <= seconds < 60):
            raise
        return hours % 24

# Add the calendar features as new columns on df. Each entry names the new
# column, the column it is derived from, and the function applied to each value.
for derived, source, extractor in (
    ('dow', 'start_date', apply_dow),
    ('month', 'start_date', apply_month),
    ('day', 'start_date', apply_day),
    ('hour', 'start_time', apply_hour),
):
    df[derived] = df[source].apply(extractor)
df


#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lon,lat,dow,month,day,hour
0,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,8230DB007229,0,2021-01-09 19:28:02,53.3300889307077,-6.45108278609524,5,1,9,18
1,11756.2.60-13-b12-1.22.I,20210109,18:15:00,1,0,8230DB007229,0,2021-01-09 19:28:02,53.3300889307077,-6.45108278609524,5,1,9,18
2,11370.2.60-13-b12-1.23.I,20210109,19:00:00,1,0,8230DB007229,0,2021-01-09 19:28:02,53.3300889307077,-6.45108278609524,5,1,9,19
3,11379.2.60-13-b12-1.19.O,20210109,18:00:00,1,0,8240DB000324,0,2021-01-09 19:28:02,53.4177226807655,-6.27864416912571,5,1,9,18
4,11394.2.60-13-b12-1.19.O,20210109,18:15:00,1,0,8240DB000324,0,2021-01-09 19:28:02,53.4177226807655,-6.27864416912571,5,1,9,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3100810,4019.10447.2-76-gad-1.67.I,20210131,23:25:00,31,-120,8230DB005008,-120,2021-01-31 23:57:43,53.2904501890302,-6.3693569938866,6,1,31,23
3100811,7791.10447.2-270-gad-1.235.I,20210131,23:40:00,1,0,8240DB007026,0,2021-01-31 23:57:43,53.3938952323943,-6.39206445355138,6,1,31,23
3100812,7791.10447.2-270-gad-1.235.I,20210131,23:40:00,6,60,8240DB004765,60,2021-01-31 23:57:43,53.4035321061144,-6.41438109242276,6,1,31,23
3100813,7791.10447.2-270-gad-1.235.I,20210131,23:40:00,1,0,8240DB007026,0,2021-01-31 23:58:50,53.3938952323943,-6.39206445355138,6,1,31,23


In [37]:
def predict_traffic_from_gtfsr(df):
    """Return ``df`` with a ``p_avg_vol`` column appended.

    The frame is passed through a PCA over ``lat``/``lon`` and cyclical
    encodings of ``hour`` (period 24) and ``dow`` (period 7), then through the
    module-level ``scats_model``.

    Parameters
    ----------
    df : vaex dataframe
        Must contain the ``lat``, ``lon``, ``hour`` and ``dow`` columns.

    Returns
    -------
    vaex dataframe
        The columns of ``df`` followed by ``p_avg_vol`` (presumably added by
        ``scats_model.transform`` -- confirm against the model definition).
    """
    # NOTE(review): the transformers below are fitted on the frame being
    # scored. If scats_model was trained on features from transformers fitted
    # on other data, the PCA components here may not match the ones it saw --
    # consider persisting and reusing the fitted transformers. TODO confirm.
    # NOTE(review): in the displayed data `lon` holds ~53.x and `lat` ~-6.x,
    # i.e. the two column names look swapped. PCA uses both, but verify the
    # order matches what the model was trained with.
    pca_coord = vaex.ml.PCA(features=['lat', 'lon'], n_components=2, prefix='pca')
    new_df = pca_coord.fit_transform(df)

    cycl_transform_hour = vaex.ml.CycleTransformer(features=['hour'], n=24)
    new_df = cycl_transform_hour.fit_transform(new_df)

    cycl_transform_dow = vaex.ml.CycleTransformer(features=['dow'], n=7)
    new_df = cycl_transform_dow.fit_transform(new_df)

    # Score the frame prepared above, not a leftover global. The model is
    # assumed to know which feature columns to read -- TODO confirm.
    new_df = scats_model.transform(new_df)
    return new_df[df.column_names + ['p_avg_vol']]

# Run the prediction over the whole GTFS-R frame and display the result.
pred_df = predict_traffic_from_gtfsr(df)
pred_df


#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lon,lat,dow,month,day,hour,p_avg_vol
0,11777.2.60-13-b12-1.22.I,20210109,18:00:00,1,0,8230DB007229,0,2021-01-09 19:28:02,53.3300889307077,-6.45108278609524,5,1,9,18,2.0816397977575285
1,11756.2.60-13-b12-1.22.I,20210109,18:15:00,1,0,8230DB007229,0,2021-01-09 19:28:02,53.3300889307077,-6.45108278609524,5,1,9,18,2.0816397977575285
2,11370.2.60-13-b12-1.23.I,20210109,19:00:00,1,0,8230DB007229,0,2021-01-09 19:28:02,53.3300889307077,-6.45108278609524,5,1,9,19,0.6854268602814061
3,11379.2.60-13-b12-1.19.O,20210109,18:00:00,1,0,8240DB000324,0,2021-01-09 19:28:02,53.4177226807655,-6.27864416912571,5,1,9,18,63.324674301400876
4,11394.2.60-13-b12-1.19.O,20210109,18:15:00,1,0,8240DB000324,0,2021-01-09 19:28:02,53.4177226807655,-6.27864416912571,5,1,9,18,63.324674301400876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3100810,4019.10447.2-76-gad-1.67.I,20210131,23:25:00,31,-120,8230DB005008,-120,2021-01-31 23:57:43,53.2904501890302,-6.3693569938866,6,1,31,23,11.459041315839096
3100811,7791.10447.2-270-gad-1.235.I,20210131,23:40:00,1,0,8240DB007026,0,2021-01-31 23:57:43,53.3938952323943,-6.39206445355138,6,1,31,23,56.765784170384805
3100812,7791.10447.2-270-gad-1.235.I,20210131,23:40:00,6,60,8240DB004765,60,2021-01-31 23:57:43,53.4035321061144,-6.41438109242276,6,1,31,23,56.765784170384805
3100813,7791.10447.2-270-gad-1.235.I,20210131,23:40:00,1,0,8240DB007026,0,2021-01-31 23:58:50,53.3938952323943,-6.39206445355138,6,1,31,23,56.765784170384805


In [11]:
# Inspect the predicted-volume column on its own.
pred_df['p_avg_vol']

Expression = p_avg_vol
Length: 3,100,815 dtype: float64 (column)
-----------------------------------------
      0   2.08164
      1   2.08164
      2  0.685427
      3   63.3247
      4   63.3247
       ...       
3100810    11.459
3100811   56.7658
3100812   56.7658
3100813   56.7658
3100814   56.7658

In [38]:
# Keep only the rows whose trip_id contains the '-27-' marker.
route_27_mask = pred_df['trip_id'].str.contains('-27-')
trip_27 = pred_df[route_27_mask]
trip_27

#,trip_id,start_date,start_time,stop_sequence,departure,arrival,timestamp,stop_id,lon,lat,dow,month,day,hour,p_avg_vol
0,17608.4.60-27-b12-1.151.O,20210118,06:00:00,1,0,8220DB004595,0,2021-01-18 05:31:07,53.4021399240889,-6.1729725878937,0,1,18,6,22.267462864978395
1,17608.4.60-27-b12-1.151.O,20210118,06:00:00,1,0,8220DB004595,0,2021-01-18 05:32:08,53.4021399240889,-6.1729725878937,0,1,18,6,22.267462864978395
2,17673.4.60-27-b12-1.149.I,20210118,05:30:00,1,0,8230DB002353,0,2021-01-18 05:33:07,53.279150529518,-6.40175897234439,0,1,18,5,22.912470340780708
3,17608.4.60-27-b12-1.151.O,20210118,06:00:00,1,0,8220DB004595,0,2021-01-18 05:33:07,53.4021399240889,-6.1729725878937,0,1,18,6,22.267462864978395
4,17673.4.60-27-b12-1.149.I,20210118,05:30:00,1,0,8230DB002353,0,2021-01-18 05:34:09,53.279150529518,-6.40175897234439,0,1,18,5,22.912470340780708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64456,17628.4.60-27-b12-1.151.O,20210122,09:30:00,1,0,8220DB004595,0,2021-01-22 10:47:40,53.4021399240889,-6.1729725878937,4,1,22,9,63.66831483327159
64457,17628.4.60-27-b12-1.151.O,20210122,09:30:00,79,-60,8230DB004640,-60,2021-01-22 10:47:40,53.289395624798,-6.37374708548246,4,1,22,9,53.717319260508006
64458,17628.4.60-27-b12-1.151.O,20210122,09:30:00,80,0,8230DB004347,0,2021-01-22 10:47:40,53.2862220524214,-6.37500751308391,4,1,22,9,53.717319260508006
64459,17628.4.60-27-b12-1.151.O,20210122,09:30:00,1,0,8220DB004595,0,2021-01-22 10:48:44,53.4021399240889,-6.1729725878937,4,1,22,9,63.66831483327159
