In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString
import contextily as cx
import datetime as dt
import json

from functions_file import *
# Smoke test: `testfunction` is not defined in this notebook, so it is presumably
# provided by the star import from functions_file above; printing its result
# shows that the helper module loaded. TODO confirm what testfunction returns.
print(testfunction(4))

### Outline
1. Import data and add basic information
    1. Import data
    2. Define area & coarse filtering
    3. Add information
    4. Fine filtering
2. Clean data

### 1. Import data and add basic information

In [None]:
##################
# IMPORT DATA
##################

# Raw inputs: one table of waypoints and one table of per-vehicle information.
waypoints_w_dist_all = pd.read_csv('../data/waypoints_w_dist.csv')
veh_info_list_all = pd.read_csv('../data/veh_info_list.csv',sep=',')

# Both tables get the same key so they can be joined later:
# unique_id = exp_id * 1000000 + track_id.
for frame in (waypoints_w_dist_all, veh_info_list_all):
    frame['unique_id'] = frame['exp_id']*1000000 + frame['track_id']

In [None]:
# Experiment metadata, limited to experiments that occur in the waypoint table.
exp_info = pd.read_csv('../data/experiment_list_info.csv')
has_waypoints = exp_info['exp_id'].isin(waypoints_w_dist_all['exp_id'])
exp_info = exp_info.loc[has_waypoints]

# Attach the experiment metadata to every waypoint (outer join on exp_id).
tmp = exp_info.merge(waypoints_w_dist_all, how="outer", on=["exp_id"])

# Attach the vehicle type of each track (outer join on unique_id).
vehicle_types = veh_info_list_all[['unique_id','type']]
waypoints_all = vehicle_types.merge(tmp, how="outer", on=["unique_id"])

In [None]:
##################
# SELECT ROAD SEGMENT / POLYGON
# Every polygon contains a bus route. 'Direction' is the driving direction.
# Everything from this cell onwards is repeated once per polygon:
# polygon 0 to polygon 11, skipping polygon 4 (it contains several links).
##################

polygon_name = 'polygon_r0'

polygons = pd.read_csv('../data/polygons11.csv')
POLYGON = get_polygon(polygon_name,polygons)

# Show which names are available, then the attributes of the selected polygon.
print('Available polygon names:', list(polygons['name'].values))
summary_fields = ('name','length','lanes','direction','busstops','seplane','comment','coords')
print('\nName: %s\nLength (km): %s\nLanes: %i\nDirection (°): %s\nBus stops: %i\nSeparate lane: %i\nComment: %s\nCoordinates: %s'
      % tuple(POLYGON[field] for field in summary_fields))

In [None]:
##################
# COARSE FILTERING
##################

# Cheap pre-filter: keep only waypoints strictly inside the bounding box of
# the polygon's exterior ring.
lons,lats = POLYGON['coords'].exterior.coords.xy
lat_lo, lat_hi = min(lats), max(lats)
lon_lo, lon_hi = min(lons), max(lons)

in_lat_band = (waypoints_w_dist_all['lat'] > lat_lo) & (waypoints_w_dist_all['lat'] < lat_hi)
in_lon_band = (waypoints_w_dist_all['lon'] > lon_lo) & (waypoints_w_dist_all['lon'] < lon_hi)
waypoints_w_dist = waypoints_w_dist_all[in_lat_band & in_lon_band]

# Keep only the vehicles that still have at least one waypoint in the box.
ids_in_box = waypoints_w_dist['unique_id']
veh_info_list = veh_info_list_all[veh_info_list_all['unique_id'].isin(ids_in_box)]

In [None]:
##################
# ADD INFORMATION
##################

# Experiment metadata, limited to experiments that still have waypoints
# inside the coarse bounding box.
exp_info = pd.read_csv('../data/experiment_list_info.csv')
exp_info = exp_info[exp_info.exp_id.isin(waypoints_w_dist.exp_id)]

# Attach experiment metadata to every waypoint (outer join on exp_id) ...
tmp = pd.merge(exp_info, waypoints_w_dist, how="outer", on=["exp_id"])
# ... and the vehicle type of each track (outer join on unique_id).
waypoints_w_dist_mode = pd.merge(veh_info_list[['unique_id','type']], tmp, how="outer", on=["unique_id"])

# Global timestamp = date (DOY, formatted '%Y-%m-%d')
#                  + experiment start (ST: integer part is the hour, any
#                    fractional part means the experiment starts at :30)
#                  + seconds elapsed since the experiment start (time).
# Computed with vectorised datetime arithmetic instead of one strptime call
# per row: much faster, and it does not fail when start minutes plus elapsed
# minutes reach 60.
start_s = (waypoints_w_dist_mode.ST // 1) * 3600 + np.where(waypoints_w_dist_mode.ST % 1 > 0, 30, 0) * 60
elapsed_s = start_s + waypoints_w_dist_mode.time
waypoints_w_dist_mode['timestamp'] = (
    pd.to_datetime(waypoints_w_dist_mode.DOY, format='%Y-%m-%d')
    # rounded to whole microseconds, the resolution the '%f' formatting used before
    + pd.to_timedelta((elapsed_s * 1e6).round(), unit='us')
)

# NOTE: an earlier `waypoints_w_dist_mode.drop(columns=['DOY'])` here discarded
# its result, so 'DOY' was never actually removed. The column list documented
# in the SAVE DATA cell still contains 'DOY', so the column is kept on purpose.

# check
display(waypoints_w_dist_mode.head())
print(waypoints_w_dist_mode['lat'].isna().sum()) # 0 means every merged row has a waypoint

In [None]:
##################
# FINE FILTERING
##################

# Turn lat/lon into point geometries (EPSG:4326) and keep all other columns as attributes.
attributes = waypoints_w_dist_mode.drop(columns=['lat','lon'])
non_geo_cols = list(attributes.columns)
points = gpd.points_from_xy(waypoints_w_dist_mode['lon'], waypoints_w_dist_mode['lat'])
geo_df_full = gpd.GeoDataFrame(attributes, crs='epsg:4326', geometry=points)

# Exact filter: keep only the waypoints that lie within the polygon itself.
inside_polygon = geo_df_full.geometry.within(POLYGON['coords'])
waypoints_w_dist_mode = geo_df_full[inside_polygon]

### 2. Clean data

In [None]:
##################
# CLEAN DATA 1
##################

EDGE_TRIM_S = 60         # seconds cut from the start and the end of every experiment
EXCLUDED_EXP_IDS = [8]   # experiments dropped entirely

# remove cyclists and pedestrians
waypoints_w_dist_mode = waypoints_w_dist_mode[~waypoints_w_dist_mode['type'].isin(['Bicycle','Pedestrian'])]

# remove start and end of every experiment
leninit = len(waypoints_w_dist_mode)
# Aggregate only the 'time' column: a frame-wide .max() would also try to
# aggregate the geometry and string columns, which fails on recent pandas.
max_time_exp = waypoints_w_dist_mode.groupby('exp_id')['time'].max()
exp_end = waypoints_w_dist_mode['exp_id'].map(max_time_exp)
in_edge = (
    (waypoints_w_dist_mode['time'] < EDGE_TRIM_S)
    | (waypoints_w_dist_mode['time'] > exp_end - EDGE_TRIM_S)
)
waypoints_w_dist_mode = waypoints_w_dist_mode[~in_edge]
print('Start/end removed: %.1f%%. New length is %s.\n'%(100*(1-len(waypoints_w_dist_mode)/leninit),len(waypoints_w_dist_mode)))

# check for speed outliers: the maximum exposes outliers, the mean gives context
speed_stats = waypoints_w_dist_mode.groupby('exp_id')['speed'].agg(['max','mean'])
print('Speed per experiment:')
for exp, stats in speed_stats.iterrows():
    print('exp %i: max %.0f km/h, mean %.0f km/h'%(exp,stats['max'],stats['mean']))

# remove excluded experiments
# NOTE(review): the reason for excluding experiment 8 is not recorded in this
# notebook - presumably it was flagged by the speed check above; confirm.
waypoints_w_dist_mode = waypoints_w_dist_mode[~waypoints_w_dist_mode.exp_id.isin(EXCLUDED_EXP_IDS)]


In [None]:
# remove parked vehicles
# Per vehicle: largest (d1) and smallest (d0) travelled distance seen inside
# the polygon, and the span between them (ddiff).
group_df = (
    waypoints_w_dist_mode
    .groupby('unique_id')['trv_dist']
    .agg(d1='max', d0='min')
    .reset_index()
)
group_df['ddiff'] = group_df['d1'] - group_df['d0']
display(group_df.head())

# Vehicles spanning less than a tenth of the polygon length (km -> m) are removed.
cutoff_distance = POLYGON['length']*1000 / 10
is_short = group_df['ddiff'] < cutoff_distance
remove = group_df.loc[is_short, 'unique_id'].values
waypoints_w_dist_mode = waypoints_w_dist_mode[~waypoints_w_dist_mode['unique_id'].isin(remove)]
print('%.1f%% of probes cover less than %s of the %s m'%(100*int(is_short.sum())/len(group_df),cutoff_distance,POLYGON['length']*1000))

In [None]:
##################
# CLEAN DATA 2 # remove vehicles moving in wrong direction
##################

MAX_DEVIATION_DEG = 30  # tolerated difference between travel and driving direction

# First and last observed position of every vehicle inside the polygon.
# idxmin/idxmax give exactly one row label per unique_id, so start and end
# points are paired by vehicle id instead of by row position (which silently
# misaligns if rows are not grouped by vehicle or if timestamps tie).
time_by_vehicle = waypoints_w_dist_mode.groupby('unique_id')['time']
first_rows = time_by_vehicle.idxmin()
last_rows = time_by_vehicle.idxmax()
ids = first_rows.index
loc0 = waypoints_w_dist_mode.geometry.loc[first_rows.to_numpy()]
loc1 = waypoints_w_dist_mode.geometry.loc[last_rows.to_numpy()]

# Heading from first to last point, in degrees within [0, 360).
# NOTE(review): the angle is measured counter-clockwise from east on raw
# lon/lat degrees (no cos(latitude) scaling), exactly as before - this assumes
# POLYGON['direction'] follows the same convention; confirm.
dx = loc1.x.to_numpy() - loc0.x.to_numpy()
dy = loc1.y.to_numpy() - loc0.y.to_numpy()
direction = np.degrees(np.arctan2(dy, dx)) % 360

# Signed angular difference in [-180, 180): handles the wrap-around at 0/360
# (e.g. 359 deg vs 1 deg is 2 deg apart, not 358 deg).
correct = POLYGON['direction']
deviation = (direction - correct + 180) % 360 - 180
wrong = np.abs(deviation) > MAX_DEVIATION_DEG

# check each vehicle
for i, d in zip(ids[wrong], direction[wrong]):
    print('id: %s, direction: %s, correct direction:%s'%(i,d,correct))
remove = ids[wrong].tolist()
ds = direction[~wrong]
# Average via the signed deviations so headings on both sides of 0/360 do not
# average out to a meaningless value.
mean_direction = (correct + deviation[~wrong].mean()) % 360 if (~wrong).any() else float('nan')
print('Average direction: %s, correct direction: %s'%(mean_direction,correct))

# remove faulty vehicle ids
leninit = len(waypoints_w_dist_mode)
waypoints_w_dist_mode = waypoints_w_dist_mode[~waypoints_w_dist_mode.unique_id.isin(remove)]
print('removed %.1f%%.'%(100*(1-len(waypoints_w_dist_mode)/leninit)))

In [None]:
##################
# SAVE DATA
##################

# COLUMNS: 'unique_id','type','exp_id','DOW','ST','ET','DOY','track_id','time','speed','trv_dist','timestamp','geometry'

# Output format switch; any value other than 'pickle' or 'csv' skips saving.
save = 'pickle' # 'pickle','csv'
if save == 'pickle':
    file_name = '../output/data_clean/prepared_data_%s.pkl'%(POLYGON['name'])
    waypoints_w_dist_mode.to_pickle(file_name)
    print('Saved as pickle.')
elif save == 'csv':
    # preview what is about to be written
    print(waypoints_w_dist_mode.head())
    file_name = '../output/data_clean/prepared_data_%s.csv'%(POLYGON['name'])
    waypoints_w_dist_mode.to_csv(file_name,encoding='iso-8859-1',index=False)
    print('Saved as csv.')
else:
    print('Not saved.')

In [None]:
#########################
# IMPORT DATA
#########################
# Reload the cleaned data that the SAVE DATA step wrote for one polygon.
polygon_name = 'polygon_r0'
file_type = 'pickle'

# The file path depends on the storage format.
if file_type == 'csv':
    file_name = '../output/data_clean/prepared_data_%s.csv'%(polygon_name)
elif file_type == 'pickle':
    file_name = '../output/data_clean/prepared_data_%s.pkl'%(polygon_name)

waypoints_w_dist_mode = import_clean_data(file_name,file_type,polygon_name)
display(waypoints_w_dist_mode.head(3))