# Dive Prediction - Data

*Predicting Seabird Diving Behaviour from GPS data*

This notebook prepares and formats a dataset of **74 foraging trips of seabirds** (*Sula variegata*) with both GPS and Time Depth Recorder (TDR) tracking data. Data were collected on **Isla Pescadores, Peru** between 2009 and 2013.

More precisely, data have been prepared and selected as follows:

* Only trajectories with a raw temporal sampling of 1 s
* Gaps have been linearly interpolated (they are all shorter than 10 s)
* Maximal step speed is below 50 m·s⁻¹
* Birds never stay static for longer than 10 minutes
* The number of dives in each trajectory is between 1 and 99

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools

In [2]:
# Load the per-trip summary table (distance, duration, gaps, nb_dive, resolution,
# species, tdr, trip, ...) that the selection cells below filter on.
check = pd.read_csv('/home/amdroy/MEGA/DATA/FORMAT/interpolation/check.csv')
check

Unnamed: 0,distance,duration,fieldwork,gaps,longest_gap,longest_stop,max_speed,nb_dive,nb_fixes,resolution,species,stop,tdr,trip
0,131.0,230.0,F0417,0.000870,0.0,166.0,28.16,14.0,1216.0,11.0,SD,0.14,1.0,F0417_10_SD_T1
1,119.0,191.0,F0417,0.003235,6.0,120.0,24.25,50.0,2274.0,5.0,SD,0.14,1.0,F0417_11_SD_T1
2,224.0,534.0,F0417,0.000999,3.0,325.0,40.45,66.0,6382.0,5.0,SD,0.36,1.0,F0417_11_SD_T2
3,80.0,285.0,F0417,0.000856,2.0,231.0,22.28,7.0,1555.0,11.0,SD,0.41,1.0,F0417_12_SD_T1
4,149.0,262.0,F0417,0.047679,69.0,0.0,38.75,34.0,972.0,15.0,SD,0.00,1.0,F0417_13_SD_T1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1102,15.0,23.0,P1115,0.064655,6.0,8.0,21.06,0.0,1264.0,1.0,SV,0.01,0.0,P1115_6_SV_T1
1103,123.0,164.0,P1115,0.038309,5.0,72.0,33.88,0.0,9273.0,1.0,SV,0.01,0.0,P1115_6_SV_T2
1104,115.0,191.0,P1115,0.101240,8.0,18.0,32.32,0.0,9827.0,1.0,SV,0.00,0.0,P1115_7_SV_T1
1105,133.0,256.0,P1115,0.170844,8.0,90.0,37.93,0.0,11820.0,1.0,SV,0.01,0.0,P1115_7_SV_T2


In [3]:
def get_trip_duration(data, resolution = 1):
    """Return the duration of every trip, in minutes.

    The duration of a trip is its number of rows in `data` multiplied by
    `resolution` (seconds per row) and divided by 60. Trips are reported in
    order of first appearance in `data.trip`.
    """
    trip_ids = data.trip.unique()
    nb_fixes = []
    for trip_id in trip_ids:
        nb_fixes.append(np.sum(data.trip == trip_id))
    return np.array(nb_fixes) * resolution / 60

def get_nb_trip(data):
    """Return the number of distinct values found in `data.trip`."""
    distinct_trips = data.trip.unique()
    return len(distinct_trips)

def get_trip_distance(data, resolution = 1):
    """Return, for every trip, the summed step speed scaled by `resolution` / 1e3.

    Rows holding a missing value in any column are discarded first. Trips are
    reported in order of first appearance among the remaining rows.
    """
    complete = data.dropna()
    totals = []
    for trip_id in complete.trip.unique():
        in_trip = complete.trip == trip_id
        totals.append(np.sum(complete.step_speed[in_trip]))
    return np.array(totals) * resolution / 1e3

def get_dive_prop(data):
    """Return the mean over trips of the per-trip mean of `dive`, as a percentage.

    Every trip carries the same weight, whatever its number of rows.
    """
    per_trip_means = []
    for trip_id in data.trip.unique():
        in_trip = data.trip == trip_id
        per_trip_means.append(np.mean(data.dive[in_trip]))
    return np.mean(per_trip_means) * 100

def get_dive_duration(data, resolution = 1):
    """Return the length of every run of consecutive truthy `dive` values.

    Each run length (in rows) is multiplied by `resolution`. Runs are detected
    on the whole `data.dive` column, in row order.
    """
    durations = []
    for is_dive, run in itertools.groupby(data.dive):
        if not is_dive:
            continue
        durations.append(sum(1 for _ in run) * resolution)
    return np.array(durations)

def get_gaps_prop(data):
    """Return the mean of the `gaps` column expressed as a percentage."""
    gap_fraction = np.mean(data.gaps)
    return gap_fraction * 100

def get_resting_prop(data):
    """Return the percentage of rows with `step_speed` below 1 and `dive` equal to 0."""
    is_slow = data.step_speed < 1
    is_not_diving = data.dive == 0
    return np.mean(is_slow & is_not_diving) * 100

# Datasets - supervised learning

In [4]:
# SV PESCADORES
# Trips of species 'SV' from every fieldwork except 'G1107' (that fieldwork is
# used on its own in the transfer-learning section), with resolution == 1,
# TDR data available, longest_gap < 60 and longest_stop < 3600.
is_sv_pescadores = (check.species == 'SV') & (check.fieldwork != 'G1107')
SV_selected = check[is_sv_pescadores & (check.resolution == 1) & (check.tdr == 1) & (check.longest_gap < 60) & (check.longest_stop < 3600)]
# Shuffle the trips with a fixed seed so that the split below is reproducible.
SV_selected = SV_selected.sample(frac = 1, random_state = 17)
available = check[is_sv_pescadores & (check.tdr == 1)]

# Read every selected trajectory, then concatenate once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame again at each iteration.
trajectories = []
for trip in SV_selected.trip:
    traj = pd.read_table('/home/amdroy/MEGA/DATA/FORMAT/interpolation/1s/' + trip + '.csv', sep = ',')
    traj['fieldwork'] = trip[:5]
    trajectories.append(traj)
data = pd.concat(trajectories, ignore_index=True)

# 50 % / 30 % / 20 % split by trip. The test size is the remainder rather than
# an independently rounded share: three separate round() calls do not always
# sum to the number of trips (e.g. 17 trips -> 8 + 5 + 3 = 16), which silently
# dropped a trip from every subset.
nb_trips = len(data.trip.unique())
nb_train = round(0.5 * nb_trips)
nb_validation = round(0.3 * nb_trips)
nb_test = nb_trips - nb_train - nb_validation

train = SV_selected[0:nb_train]
validation = SV_selected[nb_train:(nb_train+nb_validation)]
test = SV_selected[(nb_train+nb_validation):(nb_train+nb_validation+nb_test)]

SV_train = data[data.trip.isin(train.trip)]
SV_validation = data[data.trip.isin(validation.trip)]
SV_test = data[data.trip.isin(test.trip)]

SV_train.to_csv('./data/SV_train.csv')
SV_validation.to_csv('./data/SV_validation.csv')
SV_test.to_csv('./data/SV_test.csv')

# Compute each per-trip / per-dive array once; both feed a mean and a std.
trip_durations = get_trip_duration(data)
dive_durations = get_dive_duration(data)

print('Summary statistics  -----------------------------------------------\n')
print('Nb of trips : {} out of {} \nTrip Duration : {} +/- {} min \nDives : {} % \nDives Duration : {} +/- {} s \nGaps : {} % \nResting : {} %'
.format(get_nb_trip(data), len(available), np.round(np.mean(trip_durations), 2), np.round(np.std(trip_durations), 2),
 np.round(get_dive_prop(data), 2), np.round(np.mean(dive_durations), 2), np.round(np.std(dive_durations), 2),
 np.round(get_gaps_prop(data), 2), np.round(get_resting_prop(data), 2)))


Summary statistics  -----------------------------------------------

Nb of trips : 132 out of 194 
Trip Duration : 63.67 +/- 36.69 min 
Dives : 1.33 % 
Dives Duration : 2.48 +/- 1.28 s 
Gaps : 2.18 % 
Resting : 4.4 %


In [None]:
# LB
# Trips of species 'LB' with resolution == 1, TDR data available,
# longest_gap < 60 and longest_stop < 3600.
LB_selected = check[(check.species == 'LB') & (check.resolution == 1) & (check.tdr == 1) & (check.longest_gap < 60) & (check.longest_stop < 3600)]
# Shuffle the trips with a fixed seed so that the split below is reproducible.
LB_selected = LB_selected.sample(frac = 1, random_state = 17)
available = check[(check.species == 'LB') & (check.tdr == 1)]

# Read every selected trajectory, then concatenate once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame again at each iteration.
trajectories = []
for trip in LB_selected.trip:
    traj = pd.read_table('/home/amdroy/MEGA/DATA/FORMAT/interpolation/1s/' + trip + '.csv', sep = ',')
    traj['fieldwork'] = trip[:5]
    trajectories.append(traj)
data = pd.concat(trajectories, ignore_index=True)

# 50 % / 30 % / 20 % split by trip. The test size is the remainder rather than
# an independently rounded share: three separate round() calls do not always
# sum to the number of trips (e.g. 17 trips -> 8 + 5 + 3 = 16), which silently
# dropped a trip from every subset.
nb_trips = len(data.trip.unique())
nb_train = round(0.5 * nb_trips)
nb_validation = round(0.3 * nb_trips)
nb_test = nb_trips - nb_train - nb_validation

train = LB_selected[0:nb_train]
validation = LB_selected[nb_train:(nb_train+nb_validation)]
test = LB_selected[(nb_train+nb_validation):(nb_train+nb_validation+nb_test)]

LB_train = data[data.trip.isin(train.trip)]
LB_validation = data[data.trip.isin(validation.trip)]
LB_test = data[data.trip.isin(test.trip)]

LB_train.to_csv('./data/LB_train.csv')
LB_validation.to_csv('./data/LB_validation.csv')
LB_test.to_csv('./data/LB_test.csv')

# Compute each per-trip / per-dive array once; both feed a mean and a std.
trip_durations = get_trip_duration(data)
dive_durations = get_dive_duration(data)

print('Summary statistics  -----------------------------------------------\n')
print('Nb of trips : {} out of {} \nTrip Duration : {} +/- {} min \nDives : {} % \nDives Duration : {} +/- {} s \nGaps : {} % \nResting : {} %'
.format(get_nb_trip(data), len(available), np.round(np.mean(trip_durations), 2), np.round(np.std(trip_durations), 2),
 np.round(get_dive_prop(data), 2), np.round(np.mean(dive_durations), 2), np.round(np.std(dive_durations), 2),
 np.round(get_gaps_prop(data), 2), np.round(get_resting_prop(data), 2)))

# Datasets - transfer learning

In [None]:
# SV GUANAPE
# Trips of species 'SV' from fieldwork 'G1107' only, with resolution == 1,
# TDR data available, longest_gap < 60 and longest_stop < 3600.
# No shuffle and no train/validation/test split here: the whole selection is
# written to a single file.
is_sv_guanape = (check.species == 'SV') & (check.fieldwork == 'G1107')
SV_selected = check[is_sv_guanape & (check.resolution == 1) & (check.tdr == 1) & (check.longest_gap < 60) & (check.longest_stop < 3600)]
available = check[is_sv_guanape & (check.tdr == 1) ]

# Read every selected trajectory, then concatenate once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame again at each iteration.
trajectories = []
for trip in SV_selected.trip:
    traj = pd.read_table('/home/amdroy/MEGA/DATA/FORMAT/interpolation/1s/' + trip + '.csv', sep = ',')
    traj['fieldwork'] = trip[:5]
    trajectories.append(traj)
data = pd.concat(trajectories, ignore_index=True)

data.to_csv('./data/SV_guanape.csv')

# Compute each per-trip / per-dive array once; both feed a mean and a std.
trip_durations = get_trip_duration(data)
dive_durations = get_dive_duration(data)

print('Summary statistics  -----------------------------------------------\n')
print('Nb of trips : {} out of {} \nTrip Duration : {} +/- {} min \nDives : {} % \nDives Duration : {} +/- {} s \nGaps : {} % \nResting : {} %'
.format(get_nb_trip(data), len(available), np.round(np.mean(trip_durations), 2), np.round(np.std(trip_durations), 2),
 np.round(get_dive_prop(data), 2), np.round(np.mean(dive_durations), 2), np.round(np.std(dive_durations), 2),
 np.round(get_gaps_prop(data), 2), np.round(get_resting_prop(data), 2)))

In [None]:
# SD
# Trips of species 'SD' with TDR data available and fewer than 200 dives,
# minus the trips listed in `visual_error`.
# NOTE(review): presumably these trips were flagged by visual inspection - confirm.
visual_error = ['F0418_30_SD_T4', 'F0419_6_SD_T1', 'F0419_9_SD_T2']
SD_selected = check[(check.species == 'SD') &  (check.tdr == 1) & (check.nb_dive < 200) & (~check.trip.isin(visual_error))]
# Shuffle the trips with a fixed seed so that the split below is reproducible.
SD_selected = SD_selected.sample(frac = 1, random_state = 17)
available = check[(check.species == 'SD') &  (check.tdr == 1)]

# Read every selected trajectory, then concatenate once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame again at each iteration.
trajectories = []
for trip in SD_selected.trip:
    traj = pd.read_table('/home/amdroy/MEGA/DATA/FORMAT/interpolation/1s/' + trip + '.csv', sep = ',')
    traj['fieldwork'] = trip[:5]
    trajectories.append(traj)
data = pd.concat(trajectories, ignore_index=True)

# 50 % / 30 % / 20 % split by trip. The test size is the remainder rather than
# an independently rounded share: three separate round() calls do not always
# sum to the number of trips (e.g. 17 trips -> 8 + 5 + 3 = 16), which silently
# dropped a trip from every subset.
nb_trips = len(data.trip.unique())
nb_train = round(0.5 * nb_trips)
nb_validation = round(0.3 * nb_trips)
nb_test = nb_trips - nb_train - nb_validation

train = SD_selected[0:nb_train]
validation = SD_selected[nb_train:(nb_train+nb_validation)]
test = SD_selected[(nb_train+nb_validation):(nb_train+nb_validation+nb_test)]

SD_train = data[data.trip.isin(train.trip)]
SD_validation = data[data.trip.isin(validation.trip)]
SD_test = data[data.trip.isin(test.trip)]

SD_train.to_csv('./data/SD_train.csv')
SD_validation.to_csv('./data/SD_validation.csv')
SD_test.to_csv('./data/SD_test.csv')

# Compute each per-trip / per-dive array once; both feed a mean and a std.
trip_durations = get_trip_duration(data)
dive_durations = get_dive_duration(data)

print('Summary statistics  -----------------------------------------------\n')
print('Nb of trips : {} out of {} \nTrip Duration : {} +/- {} min \nDives : {} % \nDives Duration : {} +/- {} s \nGaps : {} % \nResting : {} %'
.format(get_nb_trip(data), len(available), np.round(np.mean(trip_durations), 2), np.round(np.std(trip_durations), 2),
 np.round(get_dive_prop(data), 2), np.round(np.mean(dive_durations), 2), np.round(np.std(dive_durations), 2),
 np.round(get_gaps_prop(data), 2), np.round(get_resting_prop(data), 2)))

In [None]:
# Disabled exploratory cell: interactive map of one trip ('F0418_25_SD_T1') with plotly express.
# NOTE(review): if re-enabled as is, the scatter_mapbox figure replaces the
# line_mapbox one assigned just before it, so only the scatter would be shown.
# import plotly.express as px

# tt = data[data.trip == 'F0418_25_SD_T1']
# fig = px.line_mapbox(tt, lat="lat", lon="lon", color = 'trip', zoom=3, height=300)
# fig = px.scatter_mapbox(tt, lat="lat", lon="lon",size = 'dive')
# fig.update_layout(mapbox_style="stamen-terrain", mapbox_zoom=4, mapbox_center_lat = 41,
#     margin={"r":0,"t":0,"l":0,"b":0})

# fig.show()

In [None]:
# Disabled exploratory cell: for every trip in `data`, draw the lon/lat track
# with the fixes where dive == 1 in red (left panel) next to the pressure
# series (right panel).
# # data = SD_test

# for i in data.trip.unique():
#     traj = data[data.trip == i].copy()

#     plt.figure(figsize=(12,4))
#     plt.subplot(1,2,1)
#     plt.plot(traj.lon, traj.lat)
#     plt.scatter(traj.lon[traj.dive == 1], traj.lat[traj.dive == 1], color='red')
#     plt.title(i)

#     plt.subplot(1,2,2)
#     plt.plot(traj.pressure)

#     plt.show()