In [2]:
import sys
sys.path.append('../')

from stepselector.data_loader import ZebraDataset, ZebraBatchSampler, custom_collate
from torch.utils.data import Dataset, DataLoader, Sampler
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import joblib
from osgeo import gdal

gdal.UseExceptions()

In [3]:
# Root of the mounted server share. Set the HERD_HOVER_SERVER_MOUNT environment
# variable to point at your own mount; when it is unset the original default
# location is used, so existing setups keep working unchanged.
server_mount = os.environ.get('HERD_HOVER_SERVER_MOUNT', '/home/blair/server/herd_hover')

# Specify radius of viewshed (in meters)
viewshed_radius = 100
# Specify height/width of downsampled viewshed (e.g. 512 will return an array of 512x512 pixels)
viewshed_hw = 512
# Specify radius (in meters) to define social density (number of conspecifics within radius)
social_radius = 10


# Make list of columns to keep - these will be fetched by the dataloader
columns_to_keep = [
    'angle_to_observers',
    'dist_to_observer',
    'delta_observer_dist',
    'road',
    'ground_slope',
    'visibility',
    'social_dens',
    'social_vis',
]

In [4]:
# Everything the notebook reads lives under one folder on the server mount.
data_folder = os.path.join(server_mount, 'zebra_movement_data')

# Observed and simulated five-meter steps sit side by side in the same parent folder.
observed_steps_directory, simulated_steps_directory = (
    os.path.join(data_folder, 'five_meter_steps', step_kind)
    for step_kind in ('observed', 'simulated')
)

# Raster layers and the two metadata tables.
rasters_directory = os.path.join(data_folder, 'rasters')
ob_metadata_file, track_metadata_file = (
    os.path.join(data_folder, csv_name)
    for csv_name in ('observation_metadata.csv', 'track_metadata.csv')
)

In [5]:
# Build the dataset: observed steps are the targets, simulated steps are the
# references, and the settings from the configuration cell control the viewshed
# and social-radius parameters passed to the dataset.
dataset = ZebraDataset(
    target_dir=observed_steps_directory,
    reference_dir=simulated_steps_directory,
    rasters_dir=rasters_directory,
    ob_metadata_file=ob_metadata_file,
    viewshed_radius=viewshed_radius,
    viewshed_hw=viewshed_hw,
    social_radius=social_radius,
    threads=4,
    columns_to_keep=columns_to_keep,
)

# Batching is delegated to the project's sampler and collate function.
batch_sampler = ZebraBatchSampler(dataset)
dataloader = DataLoader(
    dataset,
    batch_sampler=batch_sampler,
    collate_fn=custom_collate,
)

In [6]:
# Order of the values inside every feature vector. The target row and the
# reference rows are both built from this single tuple so they always have the
# same length and column order.
_FEATURE_COLUMNS = (
    'angle_to_observers',
    'dist_to_observer',
    'delta_observer_dist',
    'road',
    'ground_slope',
    'visibility',
    'social_dens',
    'social_vis',
)


def process_batch(batch, feature_columns=_FEATURE_COLUMNS):
    """Convert one dataloader batch into feature rows and class labels.

    Parameters
    ----------
    batch : tuple
        A ``(target, references)`` pair. ``target[0]`` is a mapping indexed by
        feature name, and ``references[0]`` is an iterable of such mappings.
    feature_columns : sequence of str, optional
        Keys to read from each mapping, in output order. Defaults to the eight
        columns used throughout this notebook.

    Returns
    -------
    target_features : list
        Feature values of the target step (labelled 1).
    ref_features : list of list
        One feature row per reference step (each labelled 0).
    labels : list of int
        ``[1, 0, 0, ...]`` - the target label followed by one 0 per reference.

    Raises
    ------
    KeyError
        If a mapping lacks one of ``feature_columns`` (assuming dict-like
        inputs).
    """
    target, references = batch

    def as_row(step):
        # Read the columns in a fixed order so every row lines up.
        return [step[column] for column in feature_columns]

    # Previously the target row skipped 'delta_observer_dist', giving it 7
    # values against 8 for each reference row, which cannot be stacked into a
    # rectangular feature matrix.
    target_features = as_row(target[0])
    ref_features = [as_row(ref) for ref in references[0]]
    labels = [1] + [0] * len(ref_features)

    return target_features, ref_features, labels

In [None]:
# Collect one feature row per step (the target first, then its references)
# together with the matching 1/0 labels.
features = []
labels = []

# Batches are consumed in the order the dataloader yields them. process_batch
# only re-packs values that are already in memory, so a thread pool brings no
# speed-up, and gathering results with as_completed() made the row order - and
# therefore the seeded train/test split below - change from run to run.
for batch in dataloader:
    target_features, ref_features, batch_labels = process_batch(batch)
    features.append(target_features)
    features.extend(ref_features)
    labels.extend(batch_labels)

features = np.array(features)
labels = np.array(labels)

# Fail early with a clear message if the rows could not be stacked into a matrix.
if features.ndim != 2:
    raise ValueError(
        f'Expected a 2-D feature matrix but got shape {features.shape}; '
        'check that every feature row has the same length.'
    )

# Split the data into training and testing sets. Stratifying keeps the ratio of
# targets (1) to references (0) the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Handle missing values (nan) by imputing with the column mean. The means are
# learned from the training rows only and then applied to the test rows, so no
# information from the test set leaks into training.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Initialize and train the logistic regression model.
# NOTE(review): per the scikit-learn docs n_jobs parallelizes over classes, so
# it likely has little effect on this two-class problem - confirm before tuning.
model = LogisticRegression(max_iter=1000, n_jobs=4)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
# Destination for the serialized model. Set the LOGISTIC_MODEL_PATH environment
# variable to save somewhere else; when it is unset the original location is used.
model_path = os.environ.get(
    'LOGISTIC_MODEL_PATH',
    '/home/blair/fine-scale-movement/logistic_regression_model.joblib',
)

# Create the destination folder when needed so saving works on a fresh machine.
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

joblib.dump(model, model_path)

In [None]:
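# NOTE: Everything in this cell is commented out and never runs. It is a serial
# (no thread pool) version of the feature-extraction and training steps done in
# an earlier cell. Be aware that the target feature list below omits
# 'delta_observer_dist' while the reference list includes it, so the two kinds
# of rows have different lengths.
# features = []
# labels = []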

# # Iterate over data loader
# for batch in dataloader:
#     # Extract target and reference data
#     target, references = batch

#     # Extract features for target point and label as 1
#     target_data = target[0]
#     target_features = [
#         target_data['angle_to_observers'],
#         target_data['dist_to_observer'],
#         target_data['road'],
#         target_data['ground_slope'],
#         target_data['visibility'],
#         target_data['social_dens'],
#         target_data['social_vis']
#     ]
#     features.append(target_features)
#     labels.append(1)

#     for ref in references[0]:
#         ref_features = [
#             ref['angle_to_observers'],
#             ref['dist_to_observer'],
#             ref['delta_observer_dist'],
#             ref['road'],
#             ref['ground_slope'],
#             ref['visibility'],
#             ref['social_dens'],
#             ref['social_vis']
#         ]
#         features.append(ref_features)
#         labels.append(0)

# # Convert to numpy arrays
# features = np.array(features)
# labels = np.array(labels)

# # Handle missing values (nan) by imputing with the mean of the column
# imputer = SimpleImputer(strategy = 'mean')
# features = imputer.fit_transform(features)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state=42)

# # Initialize and train the logistic regression model
# model = LogisticRegression(max_iter = 1000)
# model.fit(X_train, y_train)



In [None]:
# Predict labels for the held-out test rows with the fitted model.
# This repeats the prediction step already performed in the training cell.
y_pred = model.predict(X_test)

In [None]:
# Score the predictions against the true test labels and print the result
# rounded to two decimals. This repeats the evaluation from the training cell.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
# Show the unrounded accuracy value as the cell output.
accuracy

In [None]:
# Show the unrounded accuracy value again (same expression as the previous cell).
accuracy

In [None]:
# Serialize the trained model to disk with joblib. This cell repeats an earlier
# joblib.dump call in the notebook.
# NOTE(review): the destination is an absolute, machine-specific path - adjust
# it before running on another system.
joblib.dump(model, '/home/blair/fine-scale-movement/logistic_regression_model.joblib')