### Import packages and set global variables

In [12]:
import sys
sys.path.append('../')

from stepselector.data_loader import ZebraDataset, ZebraBatchSampler, custom_collate
from torch.utils.data import Dataset, DataLoader, Sampler
import numpy as np
import pandas as pd
from scipy.special import logit
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import joblib
from osgeo import gdal
from tqdm.notebook import tqdm

gdal.UseExceptions()

### Set variables for data loader and file saving

In [13]:
# Root of the mounted server share - edit this to match your system
server_mount = '/home/blair/server/herd_hover'

# Which step length to load: five meter steps or ten meter steps?
step_length = 'ten'

# Viewshed radius (in meters)
viewshed_radius = 30
# Height/width of the downsampled viewshed (e.g. 512 will return an array of 512x512 pixels)
viewshed_hw = 512
# Radius (in meters) used to define social density (number of conspecifics within radius)
social_radius = 10

# Number of fake (reference) steps to use per real step (max 20)
n_ref_steps = 5

# Number of threads to use for viewshed generation (used 1 before, 4 should be faster)
n_threads = 1

# Columns to keep - these will be fetched by the dataloader
columns_to_keep = [
    'target_id', 'observation', 'step_speed_mps',
    'angle_to_observers', 'dist_to_observer', 'delta_observer_dist',
    'road', 'ground_class', 'ground_slope',
    'visibility', 'social_dens', 'social_vis',
    'age_class', 'species', 'individual_ID',
]

# Name of the CSV file the loaded data is written to
filename = 'loaded_data_%smsteps.csv' %step_length

In [14]:
# Root folder holding all movement data on the server
data_folder = os.path.join(server_mount, 'zebra_movement_data')

# Observed and simulated steps live side by side in the folder for the chosen step length
steps_folder = os.path.join(data_folder, '%s_meter_steps' %step_length)
observed_steps_directory = os.path.join(steps_folder, 'observed')
simulated_steps_directory = os.path.join(steps_folder, 'simulated')

# Rasters and metadata tables
rasters_directory = os.path.join(data_folder, 'rasters')
ob_metadata_file = os.path.join(data_folder, 'observation_metadata.csv')
track_metadata_file = os.path.join(data_folder, 'track_metadata.csv')

### Load data

In [15]:
# Gather the dataset settings from the configuration cells, then build the dataset
dataset_kwargs = dict(
    target_dir = observed_steps_directory,
    reference_dir = simulated_steps_directory,
    rasters_dir = rasters_directory,
    ob_metadata_file = ob_metadata_file,
    viewshed_radius = viewshed_radius,
    viewshed_hw = viewshed_hw,
    social_radius = social_radius,
    num_ref_steps = n_ref_steps,
    threads = n_threads,
    columns_to_keep = columns_to_keep,
)
dataset = ZebraDataset(**dataset_kwargs)

# Batches are drawn by the project's batch sampler and assembled by its collate function
batch_sampler = ZebraBatchSampler(dataset)
dataloader = DataLoader(
    dataset,
    batch_sampler = batch_sampler,
    collate_fn = custom_collate,
    num_workers = 18,
)

In [16]:
# Keys read from every step, in output order (same names and order as columns_to_keep)
_BATCH_FEATURE_KEYS = (
    'target_id',
    'observation',
    'step_speed_mps',
    'angle_to_observers',
    'dist_to_observer',
    'delta_observer_dist',
    'road',
    'ground_class',
    'ground_slope',
    'visibility',
    'social_dens',
    'social_vis',
    'age_class',
    'species',
    'individual_ID',
)


def process_batch(batch, columns = None):
    """Flatten one dataloader batch into feature rows and labels.

    Parameters
    ----------
    batch : tuple
        A ``(target, references)`` pair. Only ``target[0]`` (a mapping of
        feature name -> value) and ``references[0]`` (an iterable of such
        mappings) are read.
        NOTE(review): presumably each batch holds a single observed step and
        its reference steps - confirm against ZebraBatchSampler/custom_collate.
    columns : sequence of str, optional
        Keys to extract from each step, in output order. Defaults to the 15
        feature keys listed in ``_BATCH_FEATURE_KEYS``.

    Returns
    -------
    target_features : list
        Values of the observed step, one per requested key.
    ref_features : list of list
        One row of values per reference step.
    labels : list of int
        ``1`` for the observed step followed by one ``0`` per reference step.

    Raises
    ------
    KeyError
        If a requested key is missing from a step.
    """
    keys = _BATCH_FEATURE_KEYS if columns is None else columns
    target, references = batch

    # The same key list is used for the observed step and every reference step,
    # so the two kinds of row can never drift out of column order.
    target_features = [target[0][key] for key in keys]
    ref_features = [[ref[key] for key in keys] for ref in references[0]]
    labels = [1] + [0] * len(ref_features)

    return target_features, ref_features, labels

In [17]:
# Accumulate one row per step (observed step first, then its reference steps)
features = []
labels = []

for batch in tqdm(dataloader):
    target_features, ref_features, batch_labels = process_batch(batch)
    features.append(target_features)
    features.extend(ref_features)
    labels.extend(batch_labels)

# Label the feature columns by name: process_batch emits values in the same order as
# columns_to_keep, and the transform/scaling cells address columns by name
# (the unlabelled frame only had integer column labels 0..14).
df_features = pd.DataFrame(features, columns = columns_to_keep)
df_labels = pd.Series(labels, name = 'label')

df = pd.concat([df_features, df_labels], axis =1)

  0%|          | 0/3968 [00:00<?, ?it/s]

In [18]:
# Temporary checkpoint of the loaded data, kept until the transformations and scaling are finalized
loaded_data_path = os.path.join(data_folder, filename)
df.to_csv(loaded_data_path)

### Transform, encode, and scale features
Features **social_dens** and **social_vis** will need ordinal embedding implemented in the model

Features **observation** and **individual_ID** will need word embedding in the model

In [None]:
# One-hot encoder shared by the categorical features; returns dense arrays
onehot_encode = OneHotEncoder(sparse_output=False)

In [None]:
# Transform data

# Work on a copy of the loaded frame so df stays untouched and this cell can be re-run.
# Column labels are (re)assigned from columns_to_keep so features can be addressed by name.
data = df.copy()
data.columns = [*columns_to_keep, 'label']

# log transform step_speed_mps
data['step_speed_mps'] = np.log1p(data['step_speed_mps'])

# convert angle_to_observers to proportion (divide by 180)
data['angle_to_observers'] = data['angle_to_observers']/180

# logit transform angle_to_observers
# NOTE(review): logit is infinite at exactly 0 or 1; if such values occur they will
# propagate into the Z-scoring step - confirm the value range or clip beforehand.
data['angle_to_observers'] = logit(data['angle_to_observers'])

# log transform dist_to_observer
data['dist_to_observer'] = np.log1p(data['dist_to_observer'])

# one-hot encode ground_class
# NOTE(review): assumes exactly these four classes, in the encoder's sorted category
# order (unclassified, bare, grass, tree) - confirm against the ground_class codes.
ground_class_onehot = onehot_encode.fit_transform(data[['ground_class']])
ground_class_columns = ['ground_unclassified', 'ground_bare', 'ground_grass', 'ground_tree']
for n,colname in enumerate(ground_class_columns):
    data[colname] = ground_class_onehot[:,n]
data = data.drop(columns = ['ground_class'])

# one-hot encode species
# Column names come from the fitted encoder's own categories, so every label is
# guaranteed to match the column of the one-hot array it is paired with.
species_onehot = onehot_encode.fit_transform(data[['species']])
species = onehot_encode.categories_[0]
for n,i in enumerate(species):
    data['spp_%s' % i] = species_onehot[:,n]
data = data.drop(columns = ['species'])

# one-hot encode age_class
age_class_onehot = onehot_encode.fit_transform(data[['age_class']])
classes = onehot_encode.categories_[0]
for n,i in enumerate(classes):
    data['age_%s' % i] = age_class_onehot[:,n]
data = data.drop(columns = ['age_class'])

# logit transform visibility
# NOTE(review): same caveat as above - visibility of exactly 0 or 1 maps to -inf/+inf.
data['visibility'] = logit(data['visibility'])

In [None]:
# function for Z-scoring that uses only observed steps (label = 1) to define mean and standard deviation
def Zscore(colname, df):
    """Z-score one column of ``df`` in place, using observed steps as the reference.

    The mean and standard deviation are computed from the rows where
    ``df['label'] == 1`` only, then applied to every row of ``df[colname]``.

    Parameters
    ----------
    colname : str
        Name of the column to standardize.
    df : pandas.DataFrame
        Frame containing ``colname`` and a ``label`` column. Modified in place.

    Returns
    -------
    None

    Notes
    -----
    Missing values are skipped when computing the statistics, and the standard
    deviation is the population one (``ddof = 0``). If the observed rows have
    zero spread, or there are no observed rows, the result is inf/NaN rather
    than an error.
    """
    observed = df.loc[df['label'] == 1, colname]
    mean = observed.mean()
    std = observed.std(ddof = 0)
    # Vectorized arithmetic instead of a per-element Python loop
    df[colname] = (df[colname] - mean) / std

In [None]:
# Scale data

# Continuous features to Z-score (mean and standard deviation are defined by the observed steps).
# The angle column is 'angle_to_observers', matching the name fetched by the dataloader.
columns_to_scale = ['step_speed_mps',
                    'angle_to_observers',
                    'dist_to_observer',
                    'delta_observer_dist',
                    'ground_slope',
                    'visibility']

for colname in columns_to_scale:
    Zscore(colname, data)

### Save loaded data

In [None]:
# Save the transformed and scaled data (data), not the raw loaded frame (df).
# NOTE(review): this writes to the same path as the earlier save of the raw loaded data
# and therefore replaces that file - use a different filename if both are needed.
data.to_csv(os.path.join(data_folder, filename))