### Resampling Method Testing for Trajectory Clustering

In [24]:
#import needed packages
import pandas as pd
import os
import numpy as np
import datetime
import pickle
import geopy.distance

In [3]:
#let pandas print up to 1000 rows before truncating a displayed frame
pd.options.display.max_rows = 1000

In [4]:
#read only the first SAMPLE_ROWS rows of the trajectory file as a test sample
SAMPLE_ROWS = 8000000
trajectories = pd.read_csv('trajectories.csv', nrows=SAMPLE_ROWS)

In [5]:
#count the distinct boat ids (mmsi) present in the sample
len(trajectories['mmsi'].unique())

125

### Build Interpolation Based on Full Boat Trajectory

In [8]:
#parse the timestamps, then snap each one to the nearest 20-minute mark
parsed_timestamps = pd.to_datetime(trajectories['timestamp'])
trajectories['timestamp'] = parsed_timestamps
trajectories['new_time'] = parsed_timestamps.dt.round('20min')

In [9]:
#create a data frame with uniform time stamps of 20 minutes
date1 = '2012-01-01 00:00:00'
date2 = '2016-12-31 00:00:00'
start = datetime.datetime.strptime(date1, '%Y-%m-%d %H:%M:%S')
end = datetime.datetime.strptime(date2, '%Y-%m-%d %H:%M:%S')
step = datetime.timedelta(minutes=20)

#number of grid points from start to end (inclusive); computing the grid
#from a count leaves `start` untouched, so the cell can be re-run safely and
#`start` keeps meaning "first grid time" for any later cell
n_steps = (end - start) // step + 1
date_range = [start + k * step for k in range(n_steps)]

#single-column frame (column label 0) holding the time grid
date_test = pd.DataFrame(date_range)

In [11]:
#every distinct boat id (mmsi) found in the sample
boat_list = list(trajectories.mmsi.unique())

df_list = [] #one single-column frame per boat, all aligned on the shared time grid

#for each boat, build a column holding a [lat, lon] pair at every grid time step
for boat_id in boat_list:

    single_boat = trajectories[trajectories['mmsi'] == boat_id] #rows for this boat only

    #left-join onto the grid, keep the first record per grid time, then fill
    #the gaps between known positions by linear interpolation
    on_grid = (
        pd.merge(left=date_test, right=single_boat, how='left', left_on=0, right_on='new_time')
        .loc[:, [0, 'lat', 'lon']]
        .drop_duplicates(subset=[0])
        .reset_index() #fresh 0..n-1 index so the per-boat columns line up when concatenated
        .interpolate(method='linear')
    )

    on_grid['lat_lon'] = on_grid[['lat', 'lon']].values.tolist() #pack each row as [lat, lon]

    #keep only the packed column, labelled with the boat id
    df_list.append(on_grid[['lat_lon']].rename(columns={'lat_lon': boat_id}))

#put the boats side by side and attach the time grid as the first column
uniform_boats = pd.concat(df_list, axis=1)
uniform_boats = pd.merge(left=date_test, right=uniform_boats, left_index=True, right_index=True)

In [79]:
#generate a list of interpolated trajectories, one entry per boat
uniform_boats_col_list = list(uniform_boats.columns[1:]) #boat columns only (first column is the time grid)

#for every boat keep the [lat, lon] pairs whose latitude is not NaN,
#i.e. drop the grid steps that come before the boat's first known position
interpolated_list = [
    [point for point in uniform_boats[boat_col].tolist() if str(point[0]) != 'nan']
    for boat_col in uniform_boats_col_list
]

In [85]:
#save the list object
#writes interpolated_list (one list of [lat, lon] pairs per boat) to the
#binary file 'full_trajectories' in the current working directory
with open('full_trajectories', 'wb') as fp:
    pickle.dump(interpolated_list, fp)
    
#NOTE: the bare string below is not a comment -- as the last expression of the
#cell it is echoed as the cell output. Its 'outfile' is only a placeholder; the
#file actually written above is 'full_trajectories'. Only unpickle files you trust.
'''
Read it back by calling:
with open ('outfile', 'rb') as fp:
    itemlist = pickle.load(fp)'''

"\nRead it back by calling:\nwith open ('outfile', 'rb') as fp:\n    itemlist = pickle.load(fp)"

### Segment Trajectories into Trips

#### The first step is linear interpolation at the MMSI (per-boat) level
#### The second step is measuring the speed between consecutive positions

In [None]:
#define functions needed for trip segmentation calculations

#define distance function
def distance_calc(df):
    '''returns the distance in km between a row's lat_lon and next_latlon points

    df is a single row (or any mapping) holding two (lat, lon) pairs under the
    keys 'lat_lon' and 'next_latlon'; meant to be used with DataFrame.apply(axis=1)
    '''
    #geopy 2.0 removed `vincenty`; `geodesic` is its replacement. Fall back to
    #`vincenty` only on old geopy installs that do not provide `geodesic` yet.
    measure = getattr(geopy.distance, 'geodesic', None)
    if measure is None:
        measure = geopy.distance.vincenty
    return measure(df['lat_lon'], df['next_latlon']).km

#create stationary flag based on speed threshold
def station_flag(df):
    '''binary stationary flag: 1 when the row's km_per_hr is below 0.3, otherwise 0'''
    return 1 if df['km_per_hr'] < 0.3 else 0

#create a trip id tag based on stationarity of boat
def trip_tag(df):
    '''trip id for a row: 0 while the boat is stationary, else the row's own trip_id'''
    is_stationary = df['stationary_flag'] == 1
    return 0 if is_stationary else df['trip_id']

In [78]:
def trip_segmentation_prep(df,boat,date_frame=None):
    '''resamples one boat's positions onto the uniform time grid

    df         : frame with 'mmsi', 'new_time', 'lat' and 'lon' columns
    boat       : mmsi value of the boat to extract from df
    date_frame : optional single-column frame (column label 0) of grid
                 timestamps; defaults to the notebook-level date_test

    returns a new frame that starts at the boat's first known position and has
    the columns 'index', 0 (grid time), 'lat', 'lon', 'lat_lon' (tuple) and
    'next_latlon' (the following row's tuple); the final grid row is dropped
    because it has no following position
    '''
    if date_frame is None:
        date_frame = date_test #shared time grid defined earlier in the notebook

    #filter the frame that was passed in (not the global one) on a single boat
    #and keep only the needed columns
    trajectories_test = df.loc[df['mmsi']==boat, ['mmsi','new_time','lat','lon']]

    uniform_traj = pd.merge(left=date_frame,right=trajectories_test,how='left',left_on=0,right_on='new_time')

    uniform_traj = uniform_traj[[0,'lat','lon']] #filter to only needed columns
    uniform_traj = uniform_traj.drop_duplicates(subset=[0]) #keep first record at each grid time
    uniform_traj = uniform_traj.reset_index() #fresh 0..n-1 index (old index kept as 'index' column)

    #linear interpolation of the coordinates; the other columns have no gaps to fill
    uniform_traj[['lat','lon']] = uniform_traj[['lat','lon']].interpolate(method='linear')

    #position of the first non NaN latitude, which is where the trajectory should start
    start_pos = uniform_traj['lat'].index.get_loc(uniform_traj['lat'].first_valid_index())
    #copy so the column assignments below act on an independent frame, not a slice
    uniform_traj = uniform_traj.iloc[start_pos:].copy()

    #pair up the coordinates as (lat, lon) tuples
    uniform_traj['lat_lon'] = list(zip(uniform_traj['lat'],uniform_traj['lon']))
    #location at the next grid step
    uniform_traj['next_latlon'] = uniform_traj['lat_lon'].shift(-1)

    #drop the last row, whose next location is NaN; copy so callers can add columns freely
    return uniform_traj.iloc[:-1].copy()

In [80]:
mmsi_list = list(trajectories['mmsi'].unique())

#collect the trips of every boat in one list; creating this list inside the
#loop would throw away all boats but the last one before the list is exported
segmented_trips = []

for i in mmsi_list:
    
    #work on a copy so the column assignments below never write into a slice
    uniform_traj = trip_segmentation_prep(trajectories,i).copy()
    
    #distance (km) to the next grid point, converted to speed over the 20-minute step
    uniform_traj['distance'] = uniform_traj.apply(distance_calc,axis=1)
    uniform_traj['km_per_hr'] = (uniform_traj['distance'] / 20) * 60
    
    #discard rows whose speed is missing or infinite
    uniform_traj = uniform_traj.dropna(subset=['km_per_hr'])
    uniform_traj = uniform_traj[~uniform_traj['km_per_hr'].isin([np.nan, np.inf, -np.inf])].copy()
    uniform_traj['stationary_flag'] = uniform_traj.apply(station_flag,axis=1) #1 = stationary, 0 = moving

    #create a cumulative sum that tags changes in stationary movement with a new id 
    uniform_traj['trip_id'] = (uniform_traj['stationary_flag'].shift(1) != \
                                uniform_traj['stationary_flag']).astype(int).cumsum() 

    #the trip id will reset for each boat, e.g. boat ABC will have trip_id = 2,4,6,..,n and boat ZYX will have trip_id = 2,4,6,etc.
    uniform_traj['trip_id'] = uniform_traj.apply(trip_tag,axis=1) #reassign stationary values all as 0 in trip id
    
    #stationary records are of no interest, so drop them
    uniform_traj = uniform_traj[uniform_traj['stationary_flag']!=1]

    #we also want to exclude short trips: count the rows (grid steps) in each trip
    trip_length = uniform_traj.groupby('trip_id')['stationary_flag'].count().reset_index()

    #after the merge 'stationary_flag_y' holds the length of the trip each row belongs to
    trip_segments = pd.merge(left=uniform_traj,right=trip_length,how='left',on='trip_id')
    trip_segments = trip_segments[trip_segments['stationary_flag_y']>12].copy() #keep trips of more than 12 grid steps
    
    trip_segments['lat_lon'] = trip_segments[['lat','lon']].values.tolist() #pack each row as [lat, lon]
    trip_segments = trip_segments[['lat_lon','trip_id']]
    
    #append one list of [lat, lon] pairs per remaining trip of this boat
    for trip in trip_segments['trip_id'].unique():
        trip_test = trip_segments[trip_segments['trip_id']==trip]
        segmented_trips.append(trip_test['lat_lon'].tolist())
    
    print("{} processed".format(i))


  This is separate from the ipykernel package so we can avoid doing imports until


170052967564863 processed
105579478788595 processed
130155297254428 processed
108705926933614 processed
209123096662320 processed
251545767275046 processed
221946427834059 processed
224786612746287 processed
146855328418227 processed
138705062821220 processed
472004530520 processed
194671993660064 processed
260351244058145 processed
23770783250938 processed
26740506301377 processed
183690246677486 processed
192665586399581 processed
13275176023916 processed
256792384000220 processed
116578315107420 processed
253844832070541 processed
102029098096261 processed
184417002225525 processed
130679899491312 processed
214743087662983 processed
115904004134405 processed
236614363512057 processed
49946321646941 processed
187863545115697 processed
103998194181708 processed
201416986422368 processed
109983223516401 processed
256926157663553 processed
232049438520401 processed
190845284027868 processed
244602438606339 processed
259125516190601 processed
184994959626596 processed
244511605554921 pro

In [81]:
#export the structured list of trips
#pickle segmented_trips into the binary file 'segmented_trajectories'
with open('segmented_trajectories', 'wb') as out_file:
    pickle.dump(segmented_trips, out_file)
    