# Dive Prediction - Data

*Predicting Seabird Diving Behaviour from GPS data*

This notebook prepares and formats a dataset of **74 foraging trips of seabirds** (*Sula variegata*) with both GPS and Time Depth Recorder (TDR) tracking data. The data were collected on **Isla Pescadores, Peru** between 2009 and 2013.

More precisely, the data have been prepared and selected as follows:

* Trajectories with raw temporal sampling at 1s only
* Gaps have been interpolated linearly (all gaps are shorter than 10 s)
* Maximum step speed is below 50 m·s⁻¹
* Birds never stay static longer than 10 minutes
* The number of dives in each trajectory is between 1 and 99

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils.trip import * 

In [2]:
 # Seed NumPy's global RNG so that the np.random.shuffle call used for the trip split is repeatable.
 np.random.seed(1)

## load raw data 

In [3]:
# connect to database
# Location of the SQLite database read by the query cell of this notebook.
seabird_db_path = '/home/amdroy/MEGA/DATA/seabirdbank.db'
conn = sqlite3.connect(seabird_db_path)

In [5]:
# Select the rows of the `dive` table for trips made by birds of species 'LB',
# joined through the `trip` and `bird` tables and ordered by trip, then by time.
# NOTE(review): the selection and export cells of this notebook are labelled 'SV' —
# confirm that 'LB' is the species intended here.
request = "SELECT trip, datetime, lon, lat, pressure, fastlog, gaps, step_speed, step_direction FROM dive \
            INNER JOIN trip ON dive.trip = trip.id \
            INNER JOIN bird ON trip.bird = bird.id \
            WHERE bird.species = 'LB' \
            ORDER BY trip, datetime"

data = pd.read_sql_query(request, conn).assign(
    # parse the timestamp text into datetime64 values
    datetime=lambda rows: pd.to_datetime(rows['datetime'], format='%Y-%m-%d %H:%M:%S'),
    # dive label, initialised to 0 for every row
    dive=0,
    # the flags are stored as the text 'TRUE'; anything else becomes False
    gaps=lambda rows: rows['gaps'].eq('TRUE'),
    fastlog=lambda rows: rows['fastlog'].eq('TRUE'),
)

data

Unnamed: 0,trip,datetime,lon,lat,pressure,fastlog,gaps,step_speed,step_direction,dive
0,P1108_55_LB_T1,2008-11-25 13:47:46,-77.275330,-11.759353,-0.13,False,False,,,0
1,P1108_55_LB_T1,2008-11-25 13:47:47,-77.275333,-11.759297,-0.13,False,False,6.310532,,0
2,P1108_55_LB_T1,2008-11-25 13:47:48,-77.275345,-11.759220,-0.04,False,False,8.632336,-5.226917,0
3,P1108_55_LB_T1,2008-11-25 13:47:49,-77.275360,-11.759138,-0.07,False,False,9.239958,-1.696009,0
4,P1108_55_LB_T1,2008-11-25 13:47:50,-77.275382,-11.759052,-0.04,False,False,9.925472,-3.593677,0
...,...,...,...,...,...,...,...,...,...,...
1242597,P1113_9_LB_T2,2013-11-15 16:37:39,-77.264040,-11.774440,1.25,False,False,7.704756,40.354250,0
1242598,P1113_9_LB_T2,2013-11-15 16:37:40,-77.264080,-11.774470,1.25,False,False,5.962112,-2.406246,0
1242599,P1113_9_LB_T2,2013-11-15 16:37:41,-77.264130,-11.774490,1.25,False,False,5.484760,8.833600,0
1242600,P1113_9_LB_T2,2013-11-15 16:37:42,-77.264170,-11.774510,1.31,False,False,4.755096,-5.497819,0


In [None]:
# TODO: this cell held only an unfinished `for` statement, which is a SyntaxError and
# stops "Restart Kernel -> Run All". The stub is commented out until the loop is written.
# for

## filter data

In [None]:
# Read the semicolon-separated trip check table and keep only the rows whose
# trip id also appears in `data`.
check = (
    pd.read_csv("/home/amdroy/MEGA/DATA/FORMAT/dive_check.csv", sep=';')
    .loc[lambda checks: checks.trip.isin(data.trip)]
)
check

In [None]:
# SV
# A trip is kept when res_sampling_gps is 1, gap_sampling_tdr is 0, nb_dive is
# positive and stop is below 600.
# NOTE(review): 600 is presumably seconds (the 10-minute limit stated in the
# introduction) — confirm the unit of `stop`.
passes_checks = (
    check.res_sampling_gps.eq(1)
    & check.gap_sampling_tdr.eq(0)
    & check.nb_dive.gt(0)
    & check.stop.lt(600)
)
list_ok = check.trip[passes_checks]

# restrict the samples to the selected trips
data = data[data.trip.isin(list_ok)]
data

In [None]:
# Disabled alternative trip selection, labelled LB (kept commented out).
# ## LB
# list_ok = check.trip[(check.gap_sampling_tdr == 0) &
#                      (check.res_sampling_gps == 1) & 
#                      (check.nb_sampling_tdr > 1000) &
#                      (check.stop < 3000)]
# data = data[data.trip.isin(list_ok)]
# data

In [None]:
# Disabled alternative trip selection, labelled SD (kept commented out).
# NOTE(review): `res_sampling_gps <= 15` is already implied by `res_sampling_gps == 15`.
# # SD
# list_ok = check.trip[(check.res_sampling_gps == 15) &(check.nb_dive > 0) & (check.res_sampling_gps <= 15)]
# data = data[data.trip.isin(list_ok)]
# data

In [None]:
# Disabled alternative trip selection, labelled SS (kept commented out).
# # SS
# list_ok = check.trip[(check.res_sampling_gps == 15) & (check.longest_gap < 100) & (check.nb_fastlog_event > 5)]
# data = data[data.trip.isin(list_ok)]
# data

In [None]:
# Two independent, initially empty accumulators: one value per trip is appended
# to each by the trip loop (duration and dive total).
trip_duration, trip_dives = [], []

In [None]:
# Label dives trip by trip and collect one duration and one dive total per trip.
#
# The accumulators are (re)initialised in this cell so that running it again does
# not append a second entry for every trip.
trip_duration = []
trip_dives = []

# Take one private copy of the filtered frame before the loop. Copying the whole
# table on every iteration, as was done before, repeats the same work once per trip.
data = data.copy()

for i in data.trip.unique():
    # rows of the current trip; the mask is reused for the write-back below
    in_trip = data.trip == i
    t = data[in_trip]

    # compute dives
    trip = Trip(t)
    # NOTE(review): 2 is presumably the threshold used to detect a dive — confirm in utils.trip
    trip.add_dive(2)

#     trip.df['dive'] = 1*trip.df['fastlog']

    # duration converted from seconds to minutes
    trip_duration.append(trip.get_duration().total_seconds()/60)
    trip_dives.append(sum(trip.df.dive))

    # fill dive in main table
    data.loc[in_trip, 'dive'] = trip.df.dive.values

#     # plot individual and global plots
#     trip.plot('./data/SS/'+ str(i)+'.png')
#     plt.plot(np.array(t.lon), np.array(t.lat))

print('Total nb of trips: '+ str(len(data.trip.unique())))

In [None]:
# Total of the `dive` column divided by the number of rows, i.e. the share of
# samples labelled as dive when the labels are 0/1.
sum(data['dive']) / data.shape[0]

In [None]:
# Standard deviation (NumPy default, ddof=0) of the per-trip dive totals.
np.asarray(trip_dives).std()

In [None]:
# data.to_csv('./data/SS_all.csv', index = False)

## split into train, validation and test datasets

In [None]:
# The first character of the trip id decides where a trip goes: 'G' trips are set
# aside in `data_guanape`, 'P' trips stay in `data`. Any other prefix is dropped.
trip_prefix = data['trip'].str[:1]
data_guanape = data[trip_prefix == 'G']
data = data[trip_prefix == 'P']

In [None]:
# Write the trips held in `data_guanape` to their own CSV file, without the index column.
data_guanape.to_csv('./data/SV_test_guanape.csv', index = False)

In [None]:
# Distinct trip ids, shuffled in place with NumPy's global RNG; the order obtained
# here determines which trips end up in each split.
trajs = data.trip.unique()
np.random.shuffle(trajs)
trajs

In [None]:
# Sizes of a 70 % / 20 % / 10 % train / validation / test partition of the trips.
# Rounding the three shares independently does not always add up to the number of
# trips (8 trips -> 6 + 2 + 1 = 9, 12 trips -> 8 + 2 + 1 = 11), which leaves the test
# slice shorter than reported or drops a trip from every set. The test set therefore
# takes whatever remains after train and validation, so each trip is used exactly once.
nb_trips = len(data.trip.unique())
nb_train = round(0.7 * nb_trips)
nb_validation = round(0.2 * nb_trips)
nb_test = nb_trips - nb_train - nb_validation

In [None]:
# Cut the shuffled trip array into three consecutive, non-overlapping slices.
train_end = nb_train
validation_end = train_end + nb_validation
test_end = validation_end + nb_test

train_trajs = trajs[:train_end]
validation_trajs = trajs[train_end:validation_end]
test_trajs = trajs[validation_end:test_end]

In [None]:
# One frame per split: keep the rows whose trip id belongs to that split's trip list.
data_train, data_validation, data_test = (
    data[data.trip.isin(split_trips)]
    for split_trips in (train_trajs, validation_trajs, test_trajs)
)

In [None]:
# Write each split to its CSV file, without the index column.
for split_frame, csv_path in (
    (data_train, './data/SV_train.csv'),
    (data_validation, './data/SV_validation.csv'),
    (data_test, './data/SV_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
# Display the number of trips assigned to the train, validation and test sets.
(nb_train, nb_validation, nb_test)