In [210]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import IPython
import datetime
import random
import pickle

import trip
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import pytorch_lightning as pl

In [3]:
# connect to database
# The location is kept in one named constant so it is easy to repoint.
DB_PATH = '/home/amdroy/MEGA/DATA/seabirdbank.db'

# sqlite3.connect() creates a new, empty database when the file does not exist;
# fail here with a clear message instead of a later "no such table" error.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError("SQLite database not found: {}".format(DB_PATH))

conn = sqlite3.connect(DB_PATH)

In [252]:
# Both queries are restricted to the same trips: birds of species 'SV' from the
# field campaigns listed below, whose trip has both a GPS file and a TDR file.
# (Alternative filter kept from the original cell: substr(bird.fieldwork,1,1) = 'P')
FIELDWORKS = ['P1108', 'P1111', 'P1112', 'P1113']
SPECIES = 'SV'


def _read_trip_table(table, columns):
    """Read `columns` of `table` (plus the trip id) for the selected trips.

    Args:
        table: name of the sensor table to read ('gps' or 'tdr').
        columns: names of the columns of `table` to return.

    Returns:
        DataFrame with a `trip` column followed by `columns`.
    """
    selected = ', '.join('{}.{}'.format(table, col) for col in columns)
    placeholders = ', '.join('?' for _ in FIELDWORKS)
    query = (
        "SELECT trip.id AS trip, {selected} "
        "FROM {table} "
        "INNER JOIN trip ON {table}.trip = trip.id "
        "INNER JOIN bird ON trip.bird = bird.id "
        "WHERE trip.file_gps IS NOT NULL "
        "AND trip.file_tdr IS NOT NULL "
        "AND bird.fieldwork IN ({placeholders}) "
        "AND bird.species = ?"
    ).format(selected=selected, table=table, placeholders=placeholders)
    return pd.read_sql_query(query, conn, params=FIELDWORKS + [SPECIES])


def _fill_positions(values):
    """Fill missing coordinates of ONE trip from the nearest valid fix of that trip.

    Nearest-neighbour interpolation fills the gaps between valid fixes; the
    leading / trailing gaps are then filled from the first / last valid fix.
    A trip without any valid fix is returned unchanged (all NaN).
    """
    filled = values
    if values.notna().sum() >= 2:
        # 'nearest' needs at least two valid points to interpolate between
        filled = values.interpolate(method='nearest')
    return filled.ffill().bfill()


gps = _read_trip_table('gps', ['datetime', 'lon', 'lat'])
tdr = _read_trip_table('tdr', ['datetime', 'pressure'])

# A trip must not contain the same GPS timestamp twice. Keep the last fix of
# each duplicated (trip, datetime); this works for any number of duplicates,
# whether or not they are adjacent.
gps = gps.drop_duplicates(subset=['trip', 'datetime'], keep='last')

tdr['datetime'] = pd.to_datetime(tdr['datetime'], format='%Y-%m-%d %H:%M:%S')
gps['datetime'] = pd.to_datetime(gps['datetime'], format='%Y-%m-%d %H:%M:%S')

# TDR is the reference time base; GPS fixes are attached where timestamps match.
df = pd.merge(tdr, gps, on=['trip', 'datetime'], how='left')

bad_trip = ['P1111_9_SV_T1', 'P1111_13_SV_T6', 'P1111_41_SV_T2', 'P1111_46_SV_T5', 'P1111_52_SV_T1',
            'P1112_4_SV_T2', 'P1112_26_SV_T5', 'P1112_36_SV_T1', 'P1112_36_SV_T2', 'P1113_41_SV_T3',
            'P1113_50_SV_T2']

no_loop = ['P1112_26_SV_T1', 'P1112_26_SV_T2', 'P1112_26_SV_T3', 'P1112_26_SV_T4']

### remove bad data
df = df[~df['trip'].isin(bad_trip + no_loop)]

### order rows by trip and time, then reindex from 0
# The SQL has no ORDER BY, and the windowing done later assumes that the rows
# of a trip are contiguous and chronological.
df = df.sort_values(['trip', 'datetime'], kind='mergesort').reset_index(drop=True)

### fill missing positions, one trip at a time
# Interpolating over the whole frame would copy positions across trip boundaries.
for col in ('lon', 'lat'):
    df[col] = df.groupby('trip')[col].transform(_fill_positions)

df

Unnamed: 0,trip,datetime,pressure,lon,lat
0,P1108_4_SV_T1,2008-11-25 13:00:35,-0.35,-77.264128,-11.773317
1,P1108_4_SV_T1,2008-11-25 13:00:36,-0.38,-77.264118,-11.773235
2,P1108_4_SV_T1,2008-11-25 13:00:37,-0.35,-77.264098,-11.773152
3,P1108_4_SV_T1,2008-11-25 13:00:38,-0.29,-77.264113,-11.773060
4,P1108_4_SV_T1,2008-11-25 13:00:39,-0.29,-77.264142,-11.772955
...,...,...,...,...,...
862916,P1113_50_SV_T1,2013-11-24 13:09:17,-1.01,-77.264229,-11.773601
862917,P1113_50_SV_T1,2013-11-24 13:09:18,-1.07,-77.264221,-11.773701
862918,P1113_50_SV_T1,2013-11-24 13:09:19,-1.07,-77.264191,-11.773771
862919,P1113_50_SV_T1,2013-11-24 13:09:20,-1.01,-77.264168,-11.773826


In [253]:
# Hyperparameters
# NOTE(review): learning_rate and batch_size are not read by any cell shown in
# this notebook (the optimizer and the DataLoader pass their own literals).
learning_rate = 0.01
batch_size = 8
# Window length, in rows, handed to TrajDataSet; not a number of training epochs.
epoch = 600

In [312]:
class TrajDataSet(Dataset):
    """Fixed-length sliding windows that never straddle two trips.

    Every sample is a pair ``(traj, dive)``:
      * ``traj`` -- array of shape ``(2, epoch)``: row 0 is ``lon``, row 1 is ``lat``;
      * ``dive`` -- array of shape ``(epoch,)`` holding ``pressure`` for the same rows.

    Args:
        df: DataFrame with the columns ``trip``, ``lon``, ``lat`` and ``pressure``.
            The rows of a trip are expected to be contiguous; only the first and
            last row of a window are compared.
        epoch: window length, in rows (must be >= 1).
        transform: optional callable applied to the ``(traj, dive)`` pair.

    Raises:
        ValueError: if ``epoch`` is smaller than 1.
    """

    def __init__(self,  df, epoch, transform=None):
        if epoch < 1:
            raise ValueError("epoch must be >= 1, got {}".format(epoch))

        self.df = df
        self.epoch = epoch
        self.transform = transform

        # A window starting at row i covers rows i .. i+epoch-1, so it is valid
        # when the first and the LAST row of the window belong to the same trip.
        # (Comparing with row i+epoch, as before, lost one window per trip.)
        trips = np.asarray(df['trip'])
        n_candidates = len(trips) - epoch + 1
        if n_candidates > 0:
            same_trip = trips[:n_candidates] == trips[epoch - 1:]
            self.start_idx = np.flatnonzero(same_trip)
        else:
            # fewer rows than one window
            self.start_idx = np.array([], dtype=np.int64)

    def __len__(self):
        return len(self.start_idx)

    def __getitem__(self, idx):
        start = int(self.start_idx[idx])

        # Positional slicing: start_idx holds row positions, so this stays
        # correct even when the frame index is not 0..n-1.
        window = self.df.iloc[start:start + self.epoch]

        traj = np.array(window[['lon', 'lat']]).T
        dive = np.array(window['pressure'])

        sample = (traj, dive)

        if self.transform:
            sample = self.transform(sample)

        return sample

class Threshold(object):
    """Turn the dive signal of a sample into 0/1 labels.

    A value is labelled 1 when it exceeds the median of the signal by strictly
    more than ``threshold``; every other value is labelled 0.

    Args:
        threshold: margin above the median required for a 1 label
    """

    def __init__(self, threshold):
        self.threshold = threshold

    def __call__(self, sample):
        trajectory, signal = sample

        # centre the signal on its own median, then compare with the threshold
        centred = signal - np.median(signal)
        above = centred > self.threshold
        labels = np.where(above, 1, 0)

        return (trajectory, labels)
    
    
class Rescale(object):
    """Down-sample the dive signal of a sample by aggregating consecutive bins.

    The signal is cut into disjoint bins of ``ratio`` samples (the last bin may
    be shorter) and each bin is reduced to one value. The trajectory is passed
    through unchanged.

    Args:
        ratio (int): number of consecutive samples merged into one (>= 1).
        method (str): 'max' or 'mean', the reduction applied to each bin.

    Raises:
        ValueError: if ``ratio`` is smaller than 1 or ``method`` is unknown.
    """

    # reduction applied to every bin, by method name
    _REDUCERS = {'max': np.max, 'mean': np.mean}

    def __init__(self, ratio, method='max'):
        if ratio < 1:
            raise ValueError("ratio must be >= 1, got {}".format(ratio))
        if method not in self._REDUCERS:
            raise ValueError("method must be 'max' or 'mean', got {!r}".format(method))
        self.ratio = ratio
        self.method = method

    def __call__(self, sample):
        traj, dive = sample
        dive = np.asarray(dive)
        reduce_bin = self._REDUCERS[self.method]

        # change resolution: one value per disjoint bin of `ratio` samples
        # (the previous slice i:i+ratio+1 made neighbouring bins share a sample)
        dive_new = np.array([reduce_bin(dive[start:start + self.ratio])
                             for start in range(0, len(dive), self.ratio)])

        # returned as an ndarray so that ToTensor can consume it directly
        return (traj, dive_new)
    
    
class ToTensor(object):
    """Turn both numpy arrays of a (traj, dive) sample into torch tensors."""

    def __call__(self, sample):
        traj_array, dive_array = sample
        traj_tensor = torch.from_numpy(traj_array)
        dive_tensor = torch.from_numpy(dive_array)
        return (traj_tensor, dive_tensor)

In [313]:
# Transforms applied to every window, in this order:
# down-sample the pressure by a factor 10 (mean), binarise it, convert to tensors.
train_transform = transforms.Compose([
    Rescale(10, 'mean'),
    Threshold(1),
    ToTensor(),
])
dataset_train = TrajDataSet(df, epoch, transform=train_transform)

# Pull the first sample to check the shape of the trajectory tensor.
x, y = dataset_train[0]
x.size()

torch.Size([2, 600])

In [314]:
# Display the trajectory tensor of the first sample (row 0: lon, row 1: lat).
x

tensor([[-77.2641, -77.2641, -77.2641,  ..., -77.3376, -77.3377, -77.3379],
        [-11.7733, -11.7732, -11.7732,  ..., -11.7703, -11.7703, -11.7704]],
       dtype=torch.float64)

In [315]:
# Shuffled mini-batches of windows, loaded by two worker processes.
# NOTE(review): the batch size is the literal 4 here, while the hyperparameter
# cell defines batch_size = 8 -- confirm which value is intended.
train_loader = DataLoader(
    dataset_train,
    batch_size=4,
    shuffle=True,
    num_workers=2,
)

In [324]:
class ConvNet(pl.LightningModule):
    """1-D convolutional network: 2-channel sequence in, per-step probability out.

    Three convolution stages (8, 16 and 32 channels), each followed by a
    stride-2 max-pooling, then a convolution head that reduces to one channel
    and applies a sigmoid. Input is ``(batch, 2, length)``; output is
    ``(batch, 1, reduced_length)``.

    Args:
        learning_rate: step size given to the Adam optimizer. Defaults to 1e-3,
            the value that used to be hard-coded.
    """

    def __init__(self, learning_rate=1e-3):
        super().__init__()
        self.learning_rate = learning_rate

        self.cnn_input_1 = self._conv_pair(2, 8)
        self.pooling_1 = self._max_pool(dilation=1)

        self.cnn_input_2 = self._conv_pair(8, 16)
        self.pooling_2 = self._max_pool(dilation=1)

        self.cnn_input_3 = self._conv_pair(16, 32)
        # NOTE(review): dilation=4 only in this pooling. With a 600-step input the
        # lengths go 600 -> 300 -> 150 -> 60, which presumably is meant to match a
        # target down-sampled by 10 -- confirm before changing window or ratio.
        self.pooling_3 = self._max_pool(dilation=4)

        self.cnn_4 = nn.Sequential(
            nn.Conv1d(32, 16, kernel_size=11, stride=1, padding=5, dilation=1),
            nn.ReLU(),
            nn.Conv1d(16, 8, kernel_size=11, stride=1, padding=5, dilation=1),
            nn.ReLU(),
            nn.Conv1d(8, 1, kernel_size=11, stride=1, padding=5, dilation=1),
            nn.Sigmoid()
        )

    @staticmethod
    def _conv_pair(in_channels, out_channels):
        """Two length-preserving Conv1d(kernel 11) + ReLU layers."""
        return nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=11, stride=1, padding=5, dilation=1),
            nn.ReLU(),
            nn.Conv1d(out_channels, out_channels, kernel_size=11, stride=1, padding=5, dilation=1),
            nn.ReLU()
        )

    @staticmethod
    def _max_pool(dilation):
        """Stride-2 max-pooling (kernel 11, padding 5) with the given dilation."""
        return nn.Sequential(
            nn.MaxPool1d(kernel_size=11, stride=2, padding=5, dilation=dilation)
        )

    def forward(self, x):
        # in lightning, forward defines the prediction/inference actions
        out = self.cnn_input_1(x)
        out = self.pooling_1(out)
        out = self.cnn_input_2(out)
        out = self.pooling_2(out)
        out = self.cnn_input_3(out)
        out = self.pooling_3(out)
        out = self.cnn_4(out)
        return out

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop
        criterion = nn.BCELoss()

        x, y = batch
        # Call this module itself, not a notebook-level `model` variable.
        out = self(x.float())
        target = y.float()

        # The network emits (batch, 1, length) while the target may be
        # (batch, length); BCELoss needs identical shapes, so drop the channel axis.
        if out.dim() == target.dim() + 1 and out.size(1) == 1:
            out = out.squeeze(1)

        loss = criterion(out, target)

        # Logging to TensorBoard by default
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer


In [None]:
# init model: instantiate the network with its default constructor arguments
model = ConvNet()

# most basic trainer, uses good defaults (auto-tensorboard, checkpoints, logs, and more)
# No arguments are passed, so the number of epochs and the device are the library defaults.
# NOTE(review): no random seed is set anywhere in the cells shown, so runs are not repeatable.
# trainer = pl.Trainer(gpus=8) (if you have GPUs)
trainer = pl.Trainer()
# Train on `train_loader` only; no validation loader is given.
trainer.fit(model, train_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name        | Type       | Params
-------------------------------------------
0 | cnn_input_1 | Sequential | 896   
1 | pooling_1   | Sequential | 0     
2 | cnn_input_2 | Sequential | 4 K   
3 | pooling_2   | Sequential | 0     
4 | cnn_input_3 | Sequential | 16 K  
5 | pooling_3   | Sequential | 0     
6 | cnn_4       | Sequential | 7 K   


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…