data_processing.ipynb
by: 
- Sheil Kumar (sk17@illinois.edu)
- Anirudh Eswara (aeswara2@illinois.edu)
- Lloyd Fernandes (lloydf2@illinois.edu)

Project director: Richard Sowers
r-sowers@illinois.edu, https://publish.illinois.edu/r-sowers/
Copyright 2019 University of Illinois Board of Trustees. All Rights Reserved. Licensed under the MIT license


# ETL on pNEUMA dataset
This notebook shows how the functions in `data.py` and `feature_eng.py` are used to clean and augment the data downloaded from the pNEUMA dataset.



In [1]:
import sys
sys.path.append('../src/')
sys.path.append('../data/')

from data_loader import *
from feature_eng import *

## 1.0 Data.py

- Functions:
  - `csv_to_df`
  - `pickle_df`

In [2]:
# Peek at the raw sample exactly as stored on disk. It is read with the default
# ',' separator, so each record shows up as a single ';'-delimited column.
(
    pd.read_csv('../data/block4_sample.csv')
    .head()
)

Unnamed: 0,track_id; type; traveled_d; avg_speed; lat; lon; speed; lon_acc; lat_acc; time
0,1; Taxi; 54.52; 30.666639; 37.980927; 23.73476...
1,2; Taxi; 196.24; 12.481974; 37.982899; 23.7332...
2,3; Taxi; 3.00; 27.033729; 37.980567; 23.735136...
3,4; Taxi; 37.81; 32.412429; 37.980794; 23.73487...
4,5; Car; 181.72; 11.937486; 37.982973; 23.73335...


In [3]:
# Parse the raw CSV shown above into a DataFrame with a MultiIndex,
# then cache the parsed frame as a pickle.
sample_csv_path = '../data/block4_sample.csv'
sample_pkl_path = '../data/block4_sample.pkl'

df = csv_to_df(sample_csv_path)  # only the path of the raw .csv file is needed
df.to_pickle(sample_pkl_path)    # destination path of the .pkl file
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed
id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.0,37.980927,23.734767,33.6878,0.2358,0.0018,Taxi,54.52,30.666639
1,0.04,37.980924,23.734769,33.7222,0.2427,0.0021,Taxi,54.52,30.666639
1,0.08,37.980921,23.734772,33.7586,0.2625,0.0024,Taxi,54.52,30.666639
1,0.12,37.980918,23.734774,33.797,0.2704,0.0027,Taxi,54.52,30.666639
1,0.16,37.980915,23.734776,33.8351,0.259,0.003,Taxi,54.52,30.666639


## 2.0 feature_eng.py

- Functions:
    - `bearing`
    - `nearest_graph_data`
    - `direction`
    - `vehicle_density` 
    - `cross_track`
    - `edge_average_speed`
    - `split_trajectories`

In [4]:
# Append a `bearing` column to the trajectory frame.
df = df.pipe(bearing)  # equivalent to bearing(df); the result replaces df
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing
id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,37.980927,23.734767,33.6878,0.2358,0.0018,Taxi,54.52,30.666639,3.027448
1,0.04,37.980924,23.734769,33.7222,0.2427,0.0021,Taxi,54.52,30.666639,2.97129
1,0.08,37.980921,23.734772,33.7586,0.2625,0.0024,Taxi,54.52,30.666639,3.027444
1,0.12,37.980918,23.734774,33.797,0.2704,0.0027,Taxi,54.52,30.666639,3.027443
1,0.16,37.980915,23.734776,33.8351,0.259,0.003,Taxi,54.52,30.666639,2.971285


In [5]:
# Add the nearest road-network nodes and edges for every sample.
#
# The drive network is fetched around a single observed position (the second
# row of the frame). The coordinates are selected by column label rather than
# by column position, so the lookup keeps working if the column order changes.
lat = df['lat'].iloc[1]
lon = df['lon'].iloc[1]

# NOTE: osmnx retrieves this graph from OpenStreetMap, so the step needs
# network access unless the request is already cached locally.
graph = ox.graph_from_point((lat, lon), network_type='drive', dist=700)

# Alternative kept from the original notebook: build the graph from an address.
#graph = ox.graph_from_address('Athens, Municipality of Athens, Regional Unit of Central Athens, Attica, 10667, Greece', network_type='drive')

# nearest_graph_data needs the osmnx graph as well; it returns the frame with
# several node and edge features added for each sample.
df = nearest_graph_data(df, graph, mode='balltree')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing,nearest_edge_start_node,nearest_edge_end_node,edge_progress
id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.0,37.980927,23.734767,33.6878,0.2358,0.0018,Taxi,54.52,30.666639,3.027448,250691795,250699359,0.545202
1,0.04,37.980924,23.734769,33.7222,0.2427,0.0021,Taxi,54.52,30.666639,2.97129,250691795,250699359,0.543026
1,0.08,37.980921,23.734772,33.7586,0.2625,0.0024,Taxi,54.52,30.666639,3.027444,250691795,250699359,0.540545
1,0.12,37.980918,23.734774,33.797,0.2704,0.0027,Taxi,54.52,30.666639,3.027443,250691795,250699359,0.538369
1,0.16,37.980915,23.734776,33.8351,0.259,0.003,Taxi,54.52,30.666639,2.971285,250691795,250699359,0.536192


In [6]:
# Minimum edge length to keep (the original notes give this threshold as 65 m).
MIN_EDGE_LEN = 65

# Identify every edge by "<start_node>_<end_node>".
df["edge_id"] = df["nearest_edge_start_node"].astype(str)+"_"+df["nearest_edge_end_node"].astype(str)

# One row per distinct edge that appears in the trajectories.
df_nodes = df.reset_index()[['nearest_edge_start_node','nearest_edge_end_node','edge_id']].drop_duplicates()

# Length of each edge. The row values are read by column label: integer keys
# such as row[0] on a label-indexed Series are deprecated positional access.
df_nodes['len'] = df_nodes.apply(
    lambda row: length(row['nearest_edge_start_node'], row['nearest_edge_end_node'], graph),
    axis=1,
)

# Keep only samples that lie on edges of at least MIN_EDGE_LEN. The explicit
# copy makes df an independent frame, so adding columns to it later does not
# write to a slice of the unfiltered data.
long_edge_ids = df_nodes.loc[df_nodes['len'] >= MIN_EDGE_LEN, 'edge_id']
df = df[df['edge_id'].isin(long_edge_ids)].copy()

In [7]:
# Add the vehicle's direction with respect to its matched edge.
df = df.pipe(direction)  # equivalent to direction(df); the result replaces df
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing,nearest_edge_start_node,nearest_edge_end_node,edge_progress,edge_id,dir
id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.0,37.980927,23.734767,33.6878,0.2358,0.0018,Taxi,54.52,30.666639,3.027448,250691795,250699359,0.545202,250691795_250699359,0
1,0.04,37.980924,23.734769,33.7222,0.2427,0.0021,Taxi,54.52,30.666639,2.97129,250691795,250699359,0.543026,250691795_250699359,0
1,0.08,37.980921,23.734772,33.7586,0.2625,0.0024,Taxi,54.52,30.666639,3.027444,250691795,250699359,0.540545,250691795_250699359,0
1,0.12,37.980918,23.734774,33.797,0.2704,0.0027,Taxi,54.52,30.666639,3.027443,250691795,250699359,0.538369,250691795_250699359,0
1,0.16,37.980915,23.734776,33.8351,0.259,0.003,Taxi,54.52,30.666639,2.971285,250691795,250699359,0.536192,250691795_250699359,0


In [8]:
# Add the cross-track distance of every sample, computed against the road graph.
df = df.pipe(cross_track, graph)  # equivalent to cross_track(df, graph)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,speed,lon_acc,lat_acc,type,traveled_d,avg_speed,bearing,nearest_edge_start_node,nearest_edge_end_node,edge_progress,edge_id,dir,xtrack_dist
id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0.0,37.980927,23.734767,33.6878,0.2358,0.0018,Taxi,54.52,30.666639,3.027448,250691795,250699359,0.545202,250691795_250699359,0,-5.168306
1,0.04,37.980924,23.734769,33.7222,0.2427,0.0021,Taxi,54.52,30.666639,2.97129,250691795,250699359,0.543026,250691795_250699359,0,-5.202807
1,0.08,37.980921,23.734772,33.7586,0.2625,0.0024,Taxi,54.52,30.666639,3.027444,250691795,250699359,0.540545,250691795_250699359,0,-5.163781
1,0.12,37.980918,23.734774,33.797,0.2704,0.0027,Taxi,54.52,30.666639,3.027443,250691795,250699359,0.538369,250691795_250699359,0,-5.198282
1,0.16,37.980915,23.734776,33.8351,0.259,0.003,Taxi,54.52,30.666639,2.971285,250691795,250699359,0.536192,250691795_250699359,0,-5.232783


In [9]:
# Per-edge lane data (provides the `lanes` column, keyed by `edge_id`).
edge_lane = pd.read_pickle("../data/edge_lane.pkl")
file_name = 'sample'

# Attach edge length and lane count to every sample. `assign` returns a new
# frame, so nothing is written into a slice of an earlier DataFrame.
df = df.assign(
    len=df['edge_id'].map(df_nodes.set_index('edge_id')['len']),
    lanes=df['edge_id'].map(edge_lane.set_index('edge_id')['lanes']),
)

# Remove trajectory outside the lane; vehicles which for some duration move
# outside of the lane are collected in the df_out_of_lane dataframe.
df, df_out_of_lane = remove_traj_outside_lane(df, xtrack_lim=0.75)

# Remove trajectory within 10 meters of an intersection.
df = remove_traj_near_nodes(df, node_rad=10)
# Add a column defining which segment of the edge the vehicle lies in.
df = split_edge_to_seg(df, seg_len=20, seg_lim=17)

# Number of vehicles in the segment where the subject vehicle is located.
df = vehicle_density_by_seg(df)
# Average surrounding vehicle speed where the subject vehicle is located.
df = avg_surr_speed_by_seg(df)

# Save the lane-filtered frame (written to the current working directory).
df.to_pickle('block4_'+file_name+'_filt_lane.pkl')

# Remove all vehicles which are in the df_out_of_lane list.
df = df[~df.index.get_level_values('id').isin(df_out_of_lane['id'].to_list())]

# Keep cars and taxis only, then add file_name to the index and save.
# reorder_levels returns a NEW frame; the original notebook discarded that
# result, so file_name never became the leading index level in the pickle.
df = (
    df[df['type'].isin(['Car', 'Taxi'])]
    .assign(file_name=file_name)
    .set_index('file_name', append=True)
    .reorder_levels([2, 0, 1])
)
df.to_pickle('block4_'+file_name+'_ct_lane.pkl')

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
  return array(a, dtype, copy=False, order=order)
