### Trajectory Feature Extraction Data Processing Script

Segment into trips and extract trajectory features for classification modeling:

- direction
- Distance traveled
- Average speed
- Total span (min, max lat/lon, median)
- Total time

In [97]:
import pandas as pd
import os
import numpy as np
import datetime
import pickle
import geopy.distance
import matplotlib.pyplot as plt

In [98]:
#load labeled MMSI data
mmsi_check = pd.read_csv('mmsis.csv')

#load raw trajectory data
trajectories = pd.read_csv('trajectories.csv')

In [99]:
trajectories = trajectories[['mmsi','timestamp','lat','lon']]

#change time to datetime and round time to nearest 20 minutes
trajectories['timestamp'] = pd.to_datetime(trajectories['timestamp'])
trajectories['new_time'] = trajectories['timestamp'].dt.round('20min')

In [100]:
#helper functions for segmenting trajectories

#define distance function
def distance_calc(df):
    try:
        dist = geopy.distance.vincenty(df['lat_lon'], df['next_latlon']).km
    except:
        dist = 0
    return dist

#create a trip id tag based on stationarity of boat
def trip_tag(df):
    '''tags trajectories of movement with a trip id'''
    if df['stationary_flag'] == 1:
        return 0
    else:
        return df['trip_id']
    
#create stationary flag based on speed threshold
def station_flag(df):
    '''creates a binary flag for when a boat is stationary'''
    if df['km_per_hr'] < 0.3:
        return 1
    else: 
        return 0

In [101]:
def trip_segmentation_prep(df,boat):
    '''function for preprocessing dataframe'''
    trajectories_test = trajectories[df['mmsi']==boat] #filter on single boat
    trajectories_test = trajectories_test[['mmsi','new_time','lat','lon']] #filter only needed columns
    
    uniform_traj = pd.merge(left=date_test,right=trajectories_test,how='left',left_on=0,right_on='new_time')

    uniform_traj = uniform_traj[[0,'mmsi','lat','lon']] #filter to only needed columns
    uniform_traj.drop_duplicates(subset=[0],inplace=True) #drop duplicate time values, keep first record at each time
    uniform_traj.reset_index(inplace=True) #reset index so concatenation will align
    uniform_traj = uniform_traj.interpolate(method='linear') #perform linear interpolation
    
    #returns index value of the first non NaN, which is where we want the trajectory to start from
    uniform_traj = uniform_traj[uniform_traj['lat'].index.get_loc(uniform_traj['lat'].first_valid_index()):]
    
    #some preprocessing for the lat-lon columns
    #create a new column with a tuple 
    uniform_traj['lat_lon'] = list(zip(uniform_traj['lat'],uniform_traj['lon']))
    #create column of next location
    uniform_traj['next_latlon'] = uniform_traj['lat_lon'].shift(-1)
    #filter last row which will be a NaN
    uniform_traj = uniform_traj[:-1]
    
    return uniform_traj

In [102]:
#create a data frame with uniform time stamps of 20 minutes
date_range = []

date1 = '2012-01-01 00:00:00'
date2 = '2016-12-31 00:00:00'
start = datetime.datetime.strptime(date1, '%Y-%m-%d %H:%M:%S')
end = datetime.datetime.strptime(date2, '%Y-%m-%d %H:%M:%S')
step = datetime.timedelta(minutes=20)
while start <= end:
    date_range.append(start)
    start += step   

date_test = pd.DataFrame(date_range)

In [103]:
trajectory_features = pd.DataFrame() #initialize a dataframe to fill with trajectory features

mmsi_list = list(trajectories.mmsi.unique())

for mmsi in mmsi_list:
    
    uniform_traj = trip_segmentation_prep(trajectories,mmsi)

    #apply distance function and create new column with distance calc
    uniform_traj['distance'] = uniform_traj.apply(distance_calc,axis=1)
    uniform_traj['km_per_hr'] = (uniform_traj['distance'] / 20) * 60

    uniform_traj.dropna(subset=['km_per_hr'],inplace=True) #drop NaN / duplicates
    uniform_traj = uniform_traj[~uniform_traj['km_per_hr'].isin([np.nan, np.inf, -np.inf])]  #filter out infinity values
    uniform_traj['stationary_flag'] = uniform_traj.apply(station_flag,axis=1) #

    #create a cumulative sum that tags changes in stationary movement with a new id 
    uniform_traj['trip_id'] = (uniform_traj['stationary_flag'].shift(1) != \
                                uniform_traj['stationary_flag']).astype(int).cumsum() 

    #the trip id will reset for each boat, e.g. boat ABC will have trip_id = 2,4,6,..,n and boat ZYX will have trip_id = 2,4,6,etc.
    uniform_traj['trip_id'] = uniform_traj.apply(trip_tag,axis=1) #reassign stationary values all as 0 in trip id

    #since you don't care when a boat is stationary we can drop these records
    uniform_traj = uniform_traj[uniform_traj['stationary_flag']!=1]

    #we also want to exclude short trips
    trip_length = pd.DataFrame(uniform_traj.groupby('trip_id')['stationary_flag'].count()) #get trip length
    trip_length.reset_index(inplace=True)

    trip_segments = pd.merge(left=uniform_traj,right=trip_length,how='left',left_on='trip_id',right_on='trip_id')
    trip_segments = trip_segments[trip_segments['stationary_flag_y']>5]
    
    trip_list = list(filtered_df.trip_id.unique()) #get a unique list of trip id's for the mmsi
    
    for trip in trip_list:
        dict_list = []
        filtered_trips = filtered_df[filtered_df['trip_id']==trip]
        new_row = {'mmsi':mmsi,
                   'distance':filtered_trips['distance'].sum(),
                   'avg_speed':filtered_trips['km_per_hr'].mean(),
                   'total_time':filtered_trips[0].max()-filtered_trips[0].min(),
                   'min_lat':filtered_trips['lat'].min(),
                   'min_lon':filtered_trips['lon'].min(),
                   'max_lat':filtered_trips['lat'].max(),
                   'max_lon':filtered_trips['lon'].max(),
                   'med_lat':filtered_trips['lat'].median(),
                   'med_lon':filtered_trips['lon'].median()}
        dict_list.append(new_row)
        new_df = pd.DataFrame(dict_list)
        trajectory_features = pd.concat([trajectory_features,new_df])
        
    print(mmsi)  

  


170052967564863
105579478788595
130155297254428
108705926933614
209123096662320
251545767275046
221946427834059
224786612746287
146855328418227
138705062821220
472004530520
194671993660064
260351244058145
23770783250938
26740506301377
183690246677486
192665586399581
13275176023916
256792384000220
116578315107420
253844832070541
102029098096261
184417002225525
130679899491312
214743087662983
115904004134405
236614363512057
49946321646941
187863545115697
103998194181708
201416986422368
109983223516401
256926157663553
232049438520401
190845284027868
244602438606339
259125516190601
184994959626596
244511605554921
212820497932329
196896378492676
103576446797335
150151863317535
101554492253652
107308862771865
172864471892552
180663791083120
28271485059106
261683006747890
121426486551523
139028728648447
208987624720865
147790950389381
138495562862686
247928547083693
262588566762668
180853884696213
19859547683322
159023274509168
205298006856593
227964845509425
22967306557743
31360332891350
109

150799508133438
189588287999655
145170421637214
230116816157187
116302826373535
173370905016385
130528914446151
262800915147570
198753800773401
274677012401415
24060466082208
171227197635973
104640075350819
243672000801887
235709439727079
33272339789416
177523928951928
205324415477552
117473549532614
33266086194351
140426460258330
136929052181490
104456033115132
26546654870606
164396145656466
23593130178765
38322969102051
202700751091436
113583450025875
182326604324132
39174193362812
196808761044383
235991361138082
104784130331242
133554754525857
38485248052422
201376812742573
3250200328878
104051385412267
212673454430811
166981727229351
255237681871477
268536739139182
274683790319798
15236962385493
222443496133416
278374764356221
167353196910359
134393118622376
1252339803566
208925550869609
227048092823071
258237566084521
180719089545656
126098391219686
42552734527429
2124802087028
187759007488957
28161301541509
26616040923734
32323597276678
103887972271799
41302015830985
167058315879

In [104]:
trajectory_features.shape

(515155, 10)

In [105]:
trajectory_features = pd.merge(left=trajectory_features,right=mmsi_check,left_on='mmsi',right_on='mmsi')

col_list = ['avg_speed','distance','max_lat','max_lon','med_lat','med_lon','min_lat',
            'min_lon','mmsi','total_time','is_fishing','label','sublabel']

trajectory_features = trajectory_features[col_list]

In [106]:
trajectory_features.to_csv('trajectory_features.csv')