# Dive Prediction - Data

*Predicting Seabird Diving Behaviour from GPS data*

This notebook prepares and formats a dataset of **74 foraging trips of seabirds** (*Sula Variegata*) with both GPS and Time Depth Recorder (TDR) tracking data. Data has been collected on **Ilha Pescadores, Peru** between 2009 and 2013.

More precisely, data have been prepared and selected as follow:

* Trajectories with raw temporal sampling at 1s only
* Gaps have been interpolated linearly (yet there are shorter than 10s)
* Maximal step speed is inferior to 50 m.s-1
* Birds never stay static longer than 10 minutes
* Number of dive in each trajectory is within the following range 1-99

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils.trip import * 

In [39]:
s = 'LB'

data = pd.DataFrame()
dataset = ['train', 'test', 'validation']
for d in dataset:
    dd =  pd.read_csv('./../data/'+s+'_'+ d +'.csv')
    
    data = data.append(dd)

data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d %H:%M:%S')

In [41]:
data =  pd.read_csv('./../data/SV_test_guanape.csv')
np.mean((data.step_speed < 1) & (data.dive == 0))

0.06381863647934355

In [35]:
var = []
for t in data.trip.unique():
    traj = data[data.trip == t].copy()
    
    var.append((data.step_speed < 1) & (data.dive == 0))

In [36]:
np.mean(var)

0.06537788147313818

In [21]:
species = ['SV', 'LB']


for s in species:
    data =  pd.read_csv('./../data/'+s+'_test.csv')
    
    print(np.mean((data.step_speed < 5) & (data.dive == 0)))
#     print(np.mean((data.step_speed < 5) ))

0.04856635266898034
0.34275397997702284


In [2]:
 np.random.seed(1)

## load raw data 

In [3]:
# connect to database
conn = sqlite3.connect('/home/amdroy/MEGA/DATA/seabirdbank.db')

In [93]:
request = "SELECT trip, datetime, lon, lat, pressure, fastlog, gaps, step_speed, step_direction FROM dive \
            INNER JOIN trip ON dive.trip = trip.id \
            INNER JOIN bird ON trip.bird = bird.id \
            WHERE bird.species = 'LB' \
            ORDER BY trip, datetime"

data = pd.read_sql_query(request, conn)
data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d %H:%M:%S')
data['dive'] = 0

# convert to boolean
data['gaps'] = (data['gaps']=='TRUE')
data['fastlog'] = (data['fastlog']=='TRUE')

data

Unnamed: 0,trip,datetime,lon,lat,pressure,fastlog,gaps,step_speed,step_direction,dive
0,P1108_55_LB_T1,2008-11-25 13:47:46,-77.275330,-11.759353,-0.13,False,False,,,0
1,P1108_55_LB_T1,2008-11-25 13:47:47,-77.275333,-11.759297,-0.13,False,False,6.310532,,0
2,P1108_55_LB_T1,2008-11-25 13:47:48,-77.275345,-11.759220,-0.04,False,False,8.632336,-5.226917,0
3,P1108_55_LB_T1,2008-11-25 13:47:49,-77.275360,-11.759138,-0.07,False,False,9.239958,-1.696009,0
4,P1108_55_LB_T1,2008-11-25 13:47:50,-77.275382,-11.759052,-0.04,False,False,9.925472,-3.593677,0
...,...,...,...,...,...,...,...,...,...,...
1242597,P1113_9_LB_T2,2013-11-15 16:37:39,-77.264040,-11.774440,1.25,False,False,7.704756,40.354250,0
1242598,P1113_9_LB_T2,2013-11-15 16:37:40,-77.264080,-11.774470,1.25,False,False,5.962112,-2.406246,0
1242599,P1113_9_LB_T2,2013-11-15 16:37:41,-77.264130,-11.774490,1.25,False,False,5.484760,8.833600,0
1242600,P1113_9_LB_T2,2013-11-15 16:37:42,-77.264170,-11.774510,1.31,False,False,4.755096,-5.497819,0


## filter data

In [94]:
check = pd.read_csv("/home/amdroy/MEGA/DATA/FORMAT/dive_check.csv", sep = ';')
check = check[check.trip.isin(data.trip)]
check

Unnamed: 0,trip,res_sampling_gps,res_sampling_tdr,nb_sampling_gps,nb_sampling_tdr,total_duration,error_sampling_tdr,error_sampling_gps,gap_sampling_tdr,gap_sampling_gps,dupl_sampling_tdr,dupl_sampling_gps,nb_fastlog_event,dist_start_end,total_distance,max_speed,nb_dive,longest_gap,stop
43,P1108_55_LB_T1,1.0,1.0,6353,7562,2.100278,0,273,0,273,0,0,103,0.486669,76.191504,24.090682,732,51,23
44,P1108_55_LB_T2,1.0,1.0,22599,25489,7.080000,0,932,0,932,0,0,109,2.233985,46.690963,36.157990,1249,43,12816
45,P1108_56_LB_T1,41.0,1.0,78,4016,1.115278,0,72,0,6,0,0,44,0.552560,27.843723,18.630666,21,6,0
46,P1108_56_LB_T2,42.0,1.0,79,4067,1.129444,0,65,0,7,0,0,35,0.537194,27.451203,15.920335,19,12,0
47,P1108_56_LB_T3,39.0,1.0,102,5048,1.401944,0,85,0,9,0,0,44,9.827461,35.810463,16.451186,25,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,P1113_49_LB_T3,1.0,1.0,5718,9583,2.661667,0,909,0,680,0,229,421,0.646110,46.068153,46.921830,1643,32,5413
419,P1113_8_LB_T1,1.0,1.0,4724,7583,2.106111,0,545,0,445,0,100,0,0.430558,39.907932,35.759790,1498,34,4170
420,P1113_8_LB_T2,1.0,1.0,4311,6897,1.915556,0,581,0,436,0,145,0,0.418736,48.188924,49.200430,1782,39,2995
421,P1113_9_LB_T1,1.0,1.0,7179,10447,2.901667,0,652,0,504,0,148,0,0.457851,61.465263,49.727980,2626,52,5556


In [95]:
# # SV
# list_ok = check.trip[(check.res_sampling_gps == 1) & (check.gap_sampling_tdr == 0) & 
#                      (check.nb_dive > 0) & (check.stop < 600)]
# data = data[data.trip.isin(list_ok)]
# data

In [96]:
## LB
list_ok = check.trip[(check.gap_sampling_tdr == 0) &
                     (check.res_sampling_gps == 1) & 
                     (check.nb_sampling_tdr > 1000) &
                     (check.stop < 3000)]
data = data[data.trip.isin(list_ok)]
data

Unnamed: 0,trip,datetime,lon,lat,pressure,fastlog,gaps,step_speed,step_direction,dive
0,P1108_55_LB_T1,2008-11-25 13:47:46,-77.275330,-11.759353,-0.13,False,False,,,0
1,P1108_55_LB_T1,2008-11-25 13:47:47,-77.275333,-11.759297,-0.13,False,False,6.310532,,0
2,P1108_55_LB_T1,2008-11-25 13:47:48,-77.275345,-11.759220,-0.04,False,False,8.632336,-5.226917,0
3,P1108_55_LB_T1,2008-11-25 13:47:49,-77.275360,-11.759138,-0.07,False,False,9.239958,-1.696009,0
4,P1108_55_LB_T1,2008-11-25 13:47:50,-77.275382,-11.759052,-0.04,False,False,9.925472,-3.593677,0
...,...,...,...,...,...,...,...,...,...,...
1224911,P1113_8_LB_T2,2013-11-15 17:37:27,-77.261410,-11.772650,2.68,False,False,16.221330,7.289993,0
1224912,P1113_8_LB_T2,2013-11-15 17:37:28,-77.261280,-11.772630,2.68,False,False,13.398880,15.866960,0
1224913,P1113_8_LB_T2,2013-11-15 17:37:29,-77.261190,-11.772610,2.68,False,False,10.347060,-7.167097,0
1224914,P1113_8_LB_T2,2013-11-15 17:37:30,-77.261120,-11.772630,2.68,False,False,7.572113,26.202130,0


In [85]:
# # SD
# list_ok = check.trip[(check.res_sampling_gps == 15) &(check.nb_dive > 0) & (check.res_sampling_gps <= 15)]
# data = data[data.trip.isin(list_ok)]
# data

In [69]:
# # SS
# list_ok = check.trip[(check.res_sampling_gps == 15) & (check.longest_gap < 100) & (check.nb_fastlog_event > 5)]
# data = data[data.trip.isin(list_ok)]
# data

In [97]:
trip_duration = []
trip_dives = []

In [98]:
for i in data.trip.unique():
    t = data[data.trip == i]
    
    # compute dives
    trip = Trip(t)
    trip.add_dive(2)
    
#     trip.df['dive'] = 1*trip.df['fastlog']
    
    trip_duration.append(trip.get_duration().total_seconds()/60)
    trip_dives.append(sum(trip.df.dive))
    
    # fill dive in main table
    data = data.copy()
    data.loc[data.trip == i, 'dive'] = trip.df.dive.values
    
#     # plot individual and global plots
#     trip.plot('./data/SS/'+ str(i)+'.png')
#     plt.plot(np.array(t.lon), np.array(t.lat))
    
print('Total nb of trips: '+ str(len(data.trip.unique())))

Total nb of trips: 76


In [99]:
def rle(inarray):
        """ run length encoding. Partial credit to R rle function. 
            Multi datatype arrays catered for including non Numpy
            returns: tuple (runlengths, startpositions, values) """
        ia = np.asarray(inarray)                # force numpy
        n = len(ia)
        if n == 0: 
            return (None, None, None)
        else:
            y = ia[1:] != ia[:-1]               # pairwise unequal (string safe)
            i = np.append(np.where(y), n - 1)   # must include last element posi
            z = np.diff(np.append(-1, i))       # run lengths
            p = np.cumsum(np.append(0, z))[:-1] # positions
            return(z, p, ia[i])

In [101]:
duration,pos, dive = rle(data.dive)
np.mean([duration[i] for i in range(len(duration)) if dive[i] == 1])
# np.std([duration[i] for i in range(len(duration)) if dive[i] == 1])

17.626161837069436

In [None]:
sum(data.dive)/len(data)

In [None]:
np.std(trip_dives)

In [None]:
# data.to_csv('./data/SS_all.csv', index = False)

## split to train, validation, test dataset

In [None]:
data_guanape = data[data.trip.str[:1]=='G']
data = data[data.trip.str[:1]=='P']

In [None]:
data_guanape.to_csv('./data/SV_test_guanape.csv', index = False)

In [None]:
trajs = data.trip.unique()
np.random.shuffle(trajs)
trajs

In [None]:
nb_train = round(0.7 * len(data.trip.unique()))
nb_validation = round(0.2 * len(data.trip.unique()))
nb_test = round(0.1 * len(data.trip.unique()))

In [None]:
train_trajs = trajs[0:nb_train]
validation_trajs = trajs[nb_train:(nb_train+nb_validation)]
test_trajs = trajs[(nb_train+nb_validation):(nb_train+nb_validation+nb_test)]

In [None]:
data_train = data[data.trip.isin(train_trajs)]
data_validation = data[data.trip.isin(validation_trajs)]
data_test = data[data.trip.isin(test_trajs)]

In [None]:
data_train.to_csv('./data/SV_train.csv', index = False)
data_validation.to_csv('./data/SV_validation.csv', index = False)
data_test.to_csv('./data/SV_test.csv', index = False)

In [None]:
(nb_train, nb_validation, nb_test)