In [11]:
import sys
sys.path.append('../')

from stepselector.data_loader import ZebraDataset, ZebraBatchSampler, custom_collate
from torch.utils.data import Dataset, DataLoader, Sampler
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import joblib
from osgeo import gdal
from tqdm.notebook import tqdm

gdal.UseExceptions()

In [12]:
# Root of the mounted data server. Set the HERD_HOVER_MOUNT environment
# variable to point at your own mount; otherwise the default below is used.
server_mount = os.environ.get('HERD_HOVER_MOUNT', '/home/blair/server/herd_hover')

# Do you want to load five meter steps or ten meter steps?
# (used to build the '<step_length>_meter_steps' folder name)
step_length = 'five'

# Specify radius of viewshed (in meters)
viewshed_radius = 50
# Specify height/width of downsampled viewshed
# (e.g. 512 will return an array of 512x512 pixels)
viewshed_hw = 512
# Specify radius (in meters) to define social density
# (number of conspecifics within radius)
social_radius = 10

# How many fake steps to use per real step? (max 20)
n_ref_steps = 5
if n_ref_steps > 20:
    # Fail fast here instead of part-way through a long data-loading run.
    raise ValueError('n_ref_steps must be at most 20, got %d' % n_ref_steps)

# How many threads to use for viewshed generation? (used 1 before, 4 should be faster)
n_threads = 4

# Make list of columns to keep - these will be fetched by the dataloader
columns_to_keep = ['target_id',
                   #'step_length_m',
                   #'step_duration_s',
                   #'step_speed_mps',
                   'angle_to_observers',
                   'dist_to_observer',
                   'delta_observer_dist',
                   'road',
                   'ground_slope',
                   'visibility',
                   'social_dens',
                   'social_vis']

# Filename for loaded data (written inside the data folder)
filename = 'loaded_data.csv'

In [13]:
# Root folder that holds all zebra movement data on the server
data_folder = os.path.join(server_mount, 'zebra_movement_data')

# Observed and simulated step folders for the configured step length
steps_folder = os.path.join(data_folder, '%s_meter_steps' % step_length)
observed_steps_directory = os.path.join(steps_folder, 'observed')
simulated_steps_directory = os.path.join(steps_folder, 'simulated')

# Raster folder and metadata tables sit directly under the data folder
rasters_directory = os.path.join(data_folder, 'rasters')
ob_metadata_file = os.path.join(data_folder, 'observation_metadata.csv')
track_metadata_file = os.path.join(data_folder, 'track_metadata.csv')

In [7]:
# Gather the dataset settings in one place so they are easy to inspect,
# then build the dataset from the configuration values defined above.
dataset_kwargs = {
    'target_dir': observed_steps_directory,
    'reference_dir': simulated_steps_directory,
    'rasters_dir': rasters_directory,
    'ob_metadata_file': ob_metadata_file,
    'viewshed_radius': viewshed_radius,
    'viewshed_hw': viewshed_hw,
    'social_radius': social_radius,
    'num_ref_steps': n_ref_steps,
    'threads': n_threads,
    'columns_to_keep': columns_to_keep,
}
dataset = ZebraDataset(**dataset_kwargs)

# Batches are produced by the project's sampler and merged by its collate
# function; loading runs in 4 worker processes.
batch_sampler = ZebraBatchSampler(dataset)
dataloader = DataLoader(
    dataset,
    batch_sampler=batch_sampler,
    collate_fn=custom_collate,
    num_workers=4,
)

In [8]:
def process_batch(batch, columns=None):
    """Flatten one dataloader batch into feature rows and binary labels.

    Parameters
    ----------
    batch : tuple
        A ``(target, references)`` pair. ``target[0]`` must support lookup by
        column name, and ``references[0]`` must be an iterable of such
        mappings. (Presumably index 0 is the batch dimension added by the
        collate function -- confirm against ``custom_collate``.)
    columns : sequence of str, optional
        Names of the values to extract, in output order. Defaults to the nine
        feature columns this notebook has always used; pass
        ``columns_to_keep`` to follow the configuration cell instead.

    Returns
    -------
    target_features : list
        Values of ``columns`` for the target step.
    ref_features : list of list
        One row of ``columns`` values per reference step.
    labels : list of int
        ``1`` for the target followed by one ``0`` per reference step.
    """
    if columns is None:
        # Single source of truth for the default feature order (previously
        # written out twice, once for the target and once for references).
        columns = (
            'target_id',
            'angle_to_observers',
            'dist_to_observer',
            'delta_observer_dist',
            'road',
            'ground_slope',
            'visibility',
            'social_dens',
            'social_vis',
        )

    target, references = batch

    target_data = target[0]
    target_features = [target_data[col] for col in columns]
    ref_features = [[ref[col] for col in columns] for ref in references[0]]

    # The target row is the positive example; every reference row is a negative.
    labels = [1] + [0] * len(ref_features)

    return target_features, ref_features, labels

In [9]:
# Accumulate one feature row per step and a parallel list of labels.
# For every batch the observed (target) row comes first, followed by its
# reference rows, matching the order of the labels from process_batch.
features, labels = [], []

for step_batch in tqdm(dataloader):
    observed_row, reference_rows, step_labels = process_batch(step_batch)
    features.extend([observed_row, *reference_rows])
    labels.extend(step_labels)

# Build the table once from the collected rows. Feature columns are left
# unnamed here, so they appear as 0..N-1 in the frame and in the CSV header.
df_features = pd.DataFrame(features)
df_labels = pd.Series(labels, name='label')
df = pd.concat([df_features, df_labels], axis=1)

# Persist the assembled table next to the raw data.
output_path = os.path.join(data_folder, filename)
df.to_csv(output_path)

# Optional mean-imputation of missing feature values (currently disabled):
# imputer = SimpleImputer(strategy = 'mean')
# df_imputed = pd.DataFrame(imputer.fit_transform(features), columns=df_features.columns)

  0%|          | 0/7869 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

