In [50]:
import sys
sys.path.append('../')

from stepselector.data_loader import ZebraDataset, ZebraBatchSampler, custom_collate
from torch.utils.data import Dataset, DataLoader, Sampler
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import joblib
from osgeo import gdal
from tqdm.notebook import tqdm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neural_network import MLPClassifier

gdal.UseExceptions()

In [51]:
# Change server_mount for your system
server_mount = '/home/blair/server/herd_hover'
data_folder = os.path.join(server_mount, 'zebra_movement_data')

# Import pre-loaded data
data_file = os.path.join(data_folder, 'loaded_data.csv')
data = pd.read_csv(data_file)
data.drop(labels = ['Unnamed: 0'], axis = 1, inplace = True)
data.rename(columns={'0': 'angle_to_observer',
                     '1': 'dist_to_observer',
                     '2': 'road',
                     '3': 'ground_slope',
                     '4': 'visibility',
                     '5': 'social_dens',
                     '6': 'social_vis'}, inplace = True)

In [52]:
data

Unnamed: 0,angle_to_observer,dist_to_observer,road,ground_slope,visibility,social_dens,social_vis,label
0,,313.353999,0.0,,0.051937,1.0,0.7,0
1,,313.278518,0.0,,0.052157,1.0,0.7,0
2,,313.087000,0.0,,0.052371,1.0,0.7,0
3,,311.517244,0.0,,0.054823,2.0,0.7,0
4,,311.394117,0.0,,0.054778,2.0,0.7,0
...,...,...,...,...,...,...,...,...
35018,142.385135,671.698277,0.0,1.437439,0.099055,11.0,1.0,1
35019,142.258953,675.666246,0.0,1.324872,0.091859,6.0,1.0,1
35020,161.759522,680.430323,0.0,1.991050,0.087393,10.0,1.0,1
35021,167.839593,685.402186,0.0,1.600556,0.073806,11.0,1.0,1


In [53]:
# impute missing values
imputer = SimpleImputer(strategy = 'mean')
df_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
df_imputed = df_imputed.astype({'road': 'int32', 'social_dens':'int32', 'label':'int32'})

In [54]:
# Log-transform distances
df_imputed['log_dist_to_observer'] = np.log1p(df_imputed['dist_to_observer'])

# convert angles to sines and cosines
df_imputed['sin_angle_to_observers'] = np.sin(np.deg2rad(df_imputed['angle_to_observer']))
df_imputed['cos_angle_to_observers'] = np.cos(np.deg2rad(df_imputed['angle_to_observer']))
df_imputed['sin_ground_slope'] = np.sin(np.deg2rad(df_imputed['ground_slope']))
df_imputed['cos_ground_slope'] = np.cos(np.deg2rad(df_imputed['ground_slope']))

# drop untransformed columns
df = df_imputed.drop(columns =['dist_to_observer', 'angle_to_observer', 'ground_slope'])

In [55]:
# initialize scalers
#slope_scaler = StandardScaler()
visibility_scaler = MinMaxScaler(feature_range=(0,1))
density_scaler = MinMaxScaler(feature_range=(0,1))
log_distance_scaler = StandardScaler()
sin_cos_scaler = StandardScaler()

In [56]:
# apply scalers
#df['ground_slope'] = slope_scaler.fit_transform(df[['ground_slope']])
df['visibility'] = visibility_scaler.fit_transform(df[['visibility']])
df['social_dens'] = density_scaler.fit_transform(df[['social_dens']])
df['social_vis'] = visibility_scaler.fit_transform(df[['social_vis']])
df['log_dist_to_observer'] = log_distance_scaler.fit_transform(df[['log_dist_to_observer']])
df[['sin_angle_to_observers', 'cos_angle_to_observers', 'sin_ground_slope', 'cos_ground_slope']] = sin_cos_scaler.fit_transform(df[['sin_angle_to_observers', 'cos_angle_to_observers', 'sin_ground_slope', 'cos_ground_slope']])

In [57]:
X = df.drop(columns='label')
y = df['label']

In [58]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
# Initialize and train the logistic regression model with parallelization
model = LogisticRegression(max_iter=1000, n_jobs=1, verbose=True, class_weight = 'balanced', penalty = None, C=1.0)  # n_jobs=-1 uses all available cores
model.fit(X_train, y_train)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           10     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  2.24271D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   10     14     17      1     0     0   6.332D-05   6.915D-01
  F =  0.69152207872323579     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


 This problem is unconstrained.


In [60]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.52


In [27]:
# check for multicolinnearity 
vif_data = pd.DataFrame()
vif_data['feature'] = df.columns
vif_data['VIF'] = [variance_inflation_factor(df.values, i) for i in range(len(df.columns))]
vif_data

Unnamed: 0,feature,VIF
0,road,1.04034
1,ground_slope,1.130081
2,visibility,5.192439
3,social_dens,3.770457
4,social_vis,6.620433
5,label,1.17102
6,log_dist_to_observer,1.494856
7,sin_angle_to_observers,1.260283
8,cos_angle_to_observers,1.366675


In [28]:
# Drop social_vis
df_new = df.drop(columns = ['social_vis'])

In [29]:
X = df_new.drop(columns='label')
y = df_new['label']

In [30]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Initialize and train the logistic regression model with parallelization
model = LogisticRegression(max_iter=1000, n_jobs=1, verbose=True, class_weight = 'balanced', penalty = None, C=1.0)  # n_jobs=-1 uses all available cores
model.fit(X_train, y_train)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  2.24271D-02



 This problem is unconstrained.


           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8     15     19      1     0     0   1.457D-05   6.917D-01
  F =  0.69167694039440242     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


In [32]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.52


In [35]:
# Try with MLPClassifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

clf = MLPClassifier(random_state=1, max_iter=300, verbose = True, hidden_layer_sizes=(128,128,128)).fit(X_train, y_train)

Iteration 1, loss = 0.46286479
Iteration 2, loss = 0.44927551
Iteration 3, loss = 0.44888213
Iteration 4, loss = 0.44912423
Iteration 5, loss = 0.44814108
Iteration 6, loss = 0.44789657
Iteration 7, loss = 0.44695919
Iteration 8, loss = 0.44766729
Iteration 9, loss = 0.44689352
Iteration 10, loss = 0.44621411
Iteration 11, loss = 0.44569760
Iteration 12, loss = 0.44582813
Iteration 13, loss = 0.44538760
Iteration 14, loss = 0.44532585
Iteration 15, loss = 0.44472185
Iteration 16, loss = 0.44425662
Iteration 17, loss = 0.44379699
Iteration 18, loss = 0.44350174
Iteration 19, loss = 0.44348023
Iteration 20, loss = 0.44348328
Iteration 21, loss = 0.44296063
Iteration 22, loss = 0.44237390
Iteration 23, loss = 0.44184131
Iteration 24, loss = 0.44156798
Iteration 25, loss = 0.44142499
Iteration 26, loss = 0.44204816
Iteration 27, loss = 0.44089897
Iteration 28, loss = 0.44017370
Iteration 29, loss = 0.44008054
Iteration 30, loss = 0.44049182
Iteration 31, loss = 0.43948167
Iteration 32, los



In [36]:
# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.80


In [37]:
# Try dropping nans instead of imputing
data_noNAN = data.dropna()

In [38]:
# Log-transform distances
df_noNAN['log_dist_to_observer'] = np.log1p(df_noNAN['dist_to_observer'])

# convert angles to sines and cosines
df_noNAN['sin_angle_to_observers'] = np.sin(np.deg2rad(df_noNAN['angle_to_observer']))
df_noNAN['cos_angle_to_observers'] = np.cos(np.deg2rad(df_noNAN['angle_to_observer']))
df_noNAN['sin_ground_slope'] = np.sin(np.deg2rad(df_noNAN['ground_slope']))
df_noNAN['cos_ground_slope'] = np.cos(np.deg2rad(df_noNAN['ground_slope']))

# drop untransformed columns
df = df_noNAN.drop(columns =['dist_to_observer', 'angle_to_observer', 'ground_slope'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_noNAN['log_dist_to_observer'] = np.log1p(data_noNAN['dist_to_observer'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_noNAN['sin_angle_to_observers'] = np.sin(np.deg2rad(data_noNAN['angle_to_observer']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_noNAN['cos_angle_to_observers']

In [39]:
# apply scalers
#df['ground_slope'] = slope_scaler.fit_transform(df[['ground_slope']])
df['visibility'] = visibility_scaler.fit_transform(df[['visibility']])
df['social_dens'] = density_scaler.fit_transform(df[['social_dens']])
df['social_vis'] = visibility_scaler.fit_transform(df[['social_vis']])
df['log_dist_to_observer'] = log_distance_scaler.fit_transform(df[['log_dist_to_observer']])
df[['sin_angle_to_observers', 'cos_angle_to_observers', 'sin_ground_slope', 'cos_ground_slope']] = sin_cos_scaler.fit_transform(df[['sin_angle_to_observers', 'cos_angle_to_observers', 'sin_ground_slope', 'cos_ground_slope']])

In [40]:
X = df.drop(columns='label')
y = df['label']

In [41]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
# Initialize and train the logistic regression model with parallelization
model = LogisticRegression(max_iter=1000, n_jobs=1, verbose=True, class_weight = 'balanced', penalty = None, C=1.0)  # n_jobs=-1 uses all available cores
model.fit(X_train, y_train)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            9     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  2.71753D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    9     14     17      1     0     0   3.615D-05   6.910D-01
  F =  0.69096656690986458     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


 This problem is unconstrained.


In [43]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.51
