### **Overview**
This notebook is a variation of "processing for Mamba".
It was used in the research to produce test samples that are cleaned but not interpolated, in order to see the effect on the results.

The notebook contains tools for preprocessing the Walaris dataset into Mamba-compatible inputs. Each file is processed into samples of a known duration. The samples are interpolated to also have the same length (within the same duration). The data is derived and collected into an 8-dimensional input. In addition, two inputs are calculated for each sample: the time interval vector and the scale:
1. The time interval vector for the interpolated variation is trivial, but the implementation can take any real-time interval.
2. The "local Std" is calculated for each parameter to serve as a normalizer during training, so an 8 dimensional vector of scales is saved with each sample.

As explained in the report, we use samples of different but known durations to evaluate the performance as a function of the duration. The samples we extract have durations of 5, 10, 15, 20, 25, 30 and 60 seconds.


### Imports and loading

In [None]:
# imports

import io
import os
import sys
from datetime import datetime
import pickle
import warnings
warnings.filterwarnings('ignore')
import json
import matplotlib.pyplot as plt
import numpy as np
import torch
from scipy.fft import fft
import pandas as pd
import seaborn as sns
from scipy import interpolate
from scipy.interpolate import interp1d
from scipy.spatial.distance import cdist
from scipy.stats import pearsonr

# Machine Learning
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

from google.colab import drive
drive.mount('/content/drive/')
# helper files
sys.path.append('/content/drive/MyDrive/Final Project UAV/')
from UAV_project_preprocessing_and_visualization_helper_functions_full import *

Mounted at /content/drive/


In [None]:
cd /content/drive/MyDrive/Final Project UAV/

/content/drive/MyDrive/Final Project UAV


In [None]:
# Dataset root, relative to the working directory; the extraction code joins it
# with a class sub-folder name to locate the track files.
folder = 'track_data'

#### Segmentation for training

with derivation

In [None]:
def get_unique_time_and_value(time_vec, value_vec):
  """Collapse runs of identical consecutive timestamps into single entries.

  Without interpolation every timestamp must appear only once, otherwise the
  later derivation divides by a zero time step and produces inf values.

  Only adjacent equal timestamps are merged. The value kept for a merged
  timestamp is the running mean of all values that shared it.

  Returns a pair of numpy arrays: (unique timestamps, averaged values).
  """
  kept_times = []
  kept_values = []
  for idx in range(len(time_vec)):
    is_repeat = idx > 0 and time_vec[idx] == time_vec[idx - 1]
    if not is_repeat:
      # a new timestamp opens a new run
      run_length = 1
      kept_times.append(time_vec[idx])
      kept_values.append(value_vec[idx])
      continue
    # same timestamp as the previous entry: fold the value into the running mean
    run_length += 1
    kept_values[-1] = ((run_length - 1) * kept_values[-1] + value_vec[idx]) / run_length
  return np.array(kept_times), np.array(kept_values)


In [None]:
def derive(theta_data, phi_data, time):
  """Finite-difference derivative of two signals that share one time axis.

  Used for both velocity (from the angles) and acceleration (from the velocities).
  The derivative between consecutive points is attributed to the midpoint of
  their timestamps.

  Entries where either derivative is inf or NaN are dropped from all three
  outputs. With unique timestamps this protection should not trigger.

  Returns (d_theta/dt, d_phi/dt, midpoint times).
  """
  midpoints = (time[:-1] + time[1:])/2
  step = np.diff(time)
  theta_rate = np.diff(theta_data)/step
  phi_rate = np.diff(phi_data)/step
  # keep only the positions where both derivatives are finite numbers
  keep = np.isfinite(theta_rate) & np.isfinite(phi_rate)
  return theta_rate[keep], phi_rate[keep], midpoints[keep]

In [None]:
def segment_file_by_time_classic(file_path, samples_config):
  '''Segment one track file into interpolated samples with a fixed number of points.

  The angles are recomputed from the raw coordinates and cleaned. They are then
  passed, together with the two size signals, through interpolate_data with
  step `delta`. Velocity and acceleration are obtained by differentiating the
  interpolated angles, and a sliding window over the interpolated series cuts
  out the samples.

  Args:
    file_path: path of the track file, handed to raw_angles_data_from_json.
    samples_config: dict read for the keys 'sample_duration', 'skip_duration',
      'delta' and 'min_samples'.

  Returns:
    (sub_samples, sub_dt, scale):
      sub_samples - list of float32 tensors holding the 8 feature columns
        (theta, phi, their velocities, their accelerations, horizontal and
        vertical size) of each window.
      sub_dt - list of float32 tensors holding the time-interval column of
        each window.
      scale - stacked local_std value of each of the 8 features, computed
        over the whole file.
  '''
  sample_duration = samples_config['sample_duration']
  skip_duration = samples_config['skip_duration']
  delta = samples_config['delta']

  tt, xx, yy, zz, theta, phi, size_hor, size_ver, light_domain = raw_angles_data_from_json(file_path)
  # the angles read from the file are replaced by ones recomputed from the coordinates
  theta, phi = convert_to_angles(xx, yy, zz)
  theta, phi = clean_2D_data_w_split(tt, theta, phi, factor = 3, window = 5, threshold = -999)

  interp_tt, interp_theta = interpolate_data(tt, theta, dt=delta, fixed = True)
  interp_tt, interp_phi = interpolate_data(tt, phi, dt=delta, fixed = True)
  interp_tt, interp_size_hor = interpolate_data(tt, size_hor, dt=delta, fixed = True)
  interp_tt, interp_size_ver = interpolate_data(tt, size_ver, dt=delta, fixed = True)
  dtt = np.diff(interp_tt)
  delta_tt = np.pad(dtt, (1, 0)) #this implementation works for time that is not interpolated - like during inference

  # derive
  # expect possible trouble if np.interp doesn't work perfectly with the length of the outputs (in case of nans)
  interp_vel_theta, interp_vel_phi, interp_t_vel = derive(interp_theta, interp_phi, interp_tt)
  interp_acc_theta, interp_acc_phi, interp_t_acc = derive(interp_vel_theta, interp_vel_phi, interp_t_vel)
  # each derivation shortens the series by one point; zero-pad at the front to restore the length
  interp_vel_theta = np.pad(interp_vel_theta, (1, 0))
  interp_vel_phi = np.pad(interp_vel_phi, (1, 0))
  interp_acc_theta = np.pad(interp_acc_theta, (2, 0))
  interp_acc_phi = np.pad(interp_acc_phi, (2, 0))
  # column 0 is the time interval, columns 1..8 are the features
  complete_sample = np.stack([delta_tt, interp_theta, interp_phi, interp_vel_theta, interp_vel_phi, interp_acc_theta, interp_acc_phi, interp_size_hor, interp_size_ver]).T

  #segmenting
  sample_length = int(np.round(sample_duration/delta) + 1)
  skip = int(np.round(skip_duration/delta) + 1)
  sub_samples = []
  sub_dt = []
  start_index = 0
  end_index = start_index + sample_length

  while end_index <= len(interp_tt):
    #make sure initial sample has enough information
    # NOTE(review): the window bounds are converted to times as index*delta, which
    # presumably assumes the raw time axis starts at 0 - confirm against the helper.
    start_tt_index = np.nonzero(tt>=(start_index-1)*delta)[0][0]
    end_tt_index = np.nonzero(tt>=(end_index-1)*delta)[0][0]
    if end_tt_index - start_tt_index < samples_config['min_samples']:
      # too few raw measurements behind this window: move on without keeping it
      start_index += skip
      end_index = start_index + sample_length
      continue
    #extract segment (the slice holds sample_length - 1 rows)
    complete_tensor = torch.tensor(complete_sample[start_index:end_index-1, 1:], dtype = torch.float32)
    dt_tensor = torch.tensor(complete_sample[start_index:end_index-1, 0], dtype = torch.float32)
    sub_samples.append(complete_tensor)
    sub_dt.append(dt_tensor)
    start_index += skip
    end_index = start_index + sample_length

  #scales
  # One local_std per feature, in the same order as the feature columns.
  # Building them from a single ordered list keeps every scale tied to its own
  # signal (the theta-velocity scale used to be computed from the phi velocity).
  scaled_features = [interp_theta, interp_phi, interp_vel_theta, interp_vel_phi, interp_acc_theta, interp_acc_phi, interp_size_hor, interp_size_ver]
  scale = torch.stack([torch.tensor(local_std(feature, 10), dtype = torch.float32) for feature in scaled_features]).T

  return sub_samples, sub_dt, scale

In [None]:
def segment_file_by_time(file_path, samples_config):
  '''Segment one track file into samples of a fixed duration, without interpolation.

  This variation cleans the data but does not interpolate it. Repeated
  timestamps are merged so that the derivation is well defined, and the angles
  are derived to obtain the velocity and acceleration, which are added as inputs.
  It also produces the time interval vector and the local Std (scale) vector,
  which is used for normalization of the loss function.

  Because the data is not resampled, samples of the same duration can hold a
  different number of points.

  Args:
    file_path: path of the track file, handed to raw_angles_data_from_json.
    samples_config: dict read for the keys 'sample_duration', 'skip_duration'
      and 'min_samples'.

  Returns:
    (sub_samples, sub_dt, scale):
      sub_samples - list of float32 tensors holding the 8 feature columns
        (theta, phi, their velocities, their accelerations, horizontal and
        vertical size) of each window.
      sub_dt - list of float32 tensors holding the time-interval column of
        each window.
      scale - stacked local_std value of each of the 8 features, computed
        over the whole file.

  Raises:
    ValueError: if the window start cannot advance (for example when
      'skip_duration' is not positive), which would otherwise loop forever.
  '''
  sample_duration = samples_config['sample_duration']
  tt, xx, yy, zz, theta, phi, size_hor, size_ver, light_domain = raw_angles_data_from_json(file_path)
  # the angles read from the file are replaced by ones recomputed from the coordinates
  theta, phi = convert_to_angles(xx, yy, zz)
  theta, phi = clean_2D_data_w_split(tt, theta, phi, factor = 3, window = 5, threshold = -999)
  #return unique values (for later derivation)
  _, theta = get_unique_time_and_value(tt, theta)
  _, phi = get_unique_time_and_value(tt, phi)
  _, size_hor = get_unique_time_and_value(tt, size_hor)
  unique_tt, size_ver = get_unique_time_and_value(tt, size_ver)
  dtt = np.diff(unique_tt)
  delta_tt = np.pad(dtt, (1, 0))
  vel_theta, vel_phi, t_vel = derive(theta, phi, unique_tt)
  acc_theta, acc_phi, t_acc = derive(vel_theta, vel_phi, t_vel)
  # each derivation shortens the series by one point; zero-pad at the front to restore the length
  vel_theta = np.pad(vel_theta, (1, 0))
  vel_phi = np.pad(vel_phi, (1, 0))
  acc_theta = np.pad(acc_theta, (2, 0))
  acc_phi = np.pad(acc_phi, (2, 0))
  # column 0 is the time interval, columns 1..8 are the features
  complete_sample = np.stack([delta_tt, theta, phi, vel_theta, vel_phi, acc_theta, acc_phi, size_hor, size_ver]).T

  #segmenting
  skip = samples_config['skip_duration']
  min_samples = samples_config['min_samples']
  current_index = [0]
  end_index = np.nonzero(unique_tt>=unique_tt[0]+sample_duration)[0]

  sub_samples = []
  sub_dt = []
  while end_index.size:
    start, stop = current_index[0], end_index[0]
    if stop - start > min_samples: ### a threshold for a minimum number of datapoints in a sample
      sub_samples.append(torch.tensor(complete_sample[start:stop, 1:], dtype = torch.float32))
      sub_dt.append(torch.tensor(complete_sample[start:stop, 0], dtype = torch.float32))
    # the next window starts `skip` after the start time of the current one
    next_start_time = unique_tt[start]+skip
    end_index = np.nonzero(unique_tt>=next_start_time+sample_duration)[0]
    current_index = np.nonzero(unique_tt>=next_start_time)[0]
    if end_index.size and (not current_index.size or current_index[0] <= start):
      # without this check the same window would be appended forever
      raise ValueError(
          f"window start did not advance with skip_duration={skip!r}; "
          "skip_duration must be positive and the timestamps increasing")

  #scales
  # One local_std per feature, in the same order as the feature columns.
  # Building them from a single ordered list keeps every scale tied to its own
  # signal (the theta-velocity scale used to be computed from the phi velocity).
  scaled_features = [theta, phi, vel_theta, vel_phi, acc_theta, acc_phi, size_hor, size_ver]
  scale = torch.stack([torch.tensor(local_std(feature, 10), dtype = torch.float32) for feature in scaled_features]).T

  return sub_samples, sub_dt, scale

In [None]:
# The class names are the entries of the dataset folder; the numeric label of a
# class is its position in the os.listdir result.
# NOTE(review): os.listdir returns entries in arbitrary order, so the label
# assignment can differ between machines - verify before reusing saved labels.
subfolders = os.listdir("track_data/")
subf_dict = dict(enumerate(subfolders))
labels_dict = {name: label for label, name in enumerate(subfolders)}

### Extract samples

In [None]:
labels_dict

{'airplane': 0, 'uav': 1, 'bird': 2, 'static-object': 3}

In [None]:
def extract_samples(folder, samples_config):
  '''Segment every file of one class sub-folder.

  The sub-folder name is taken from samples_config['subfolder'] and each file
  in it is passed to segment_file_by_time together with the configuration.

  Returns a dict that maps each file name to the (sub_samples, sub_dt, scale)
  tuple obtained for that file. Files that yield no samples are kept in the
  dict with whatever segment_file_by_time returned for them.
  '''
  class_dir = os.path.join(folder, samples_config['subfolder'])
  per_file = {}
  for name in os.listdir(class_dir):
    segments, intervals, file_scale = segment_file_by_time(os.path.join(class_dir, name), samples_config)
    per_file[name] = (segments, intervals, file_scale)
  return per_file

In [None]:
# Configuration handed to the extraction functions. Several entries are only
# initial values: the extraction loop overwrites 'subfolder', 'skip_duration',
# 'sample_duration', 'min_samples' and 'for_test' on every iteration.
samples_config = {
    'subfolder' : 'bird',  # class sub-folder of the dataset to process
    'delta' : 0.04,  # interpolation step; read only by segment_file_by_time_classic
    'sample_durations' : [5, 10, 15, 20, 25, 30, 60],  # sample durations (seconds) the extraction loop iterates over
    'sample_duration' : 10,  # duration of a single sample
    'skip_duration' : 2,  # offset between the start times of consecutive samples
    'min_samples' : 10,  # threshold on the number of data points for a sample to be kept
    'for_test': False  # set by the extraction loop; not read by the functions defined in this notebook
}
# Both variants are produced for every class and duration.
for_test = [False, True]

In [None]:
# Class sub-folders to process and, position by position, the skip duration used
# for each of them (the two lists are zipped together by the loops below).
flying_objects = ['airplane', 'uav', 'bird', 'static-object']
skips = [15, 15, 2, 40]

In [None]:
# Execute the samples extraction for all subfolders and all durations according
# to the configuration, and save the results as pickles.
#
# Two files are written per (class, duration): one named with the class skip
# and one named with 'skip0', which prepare_test_dataset loads later.
# NOTE(review): the extraction functions never read 'for_test', and
# 'skip_duration' is the same for both variants, so the two files hold
# identical samples. In particular the 'skip0' file is NOT extracted with zero
# overlap. The extraction is therefore run once and its result written twice.

sample_durations = samples_config['sample_durations']

for f_object, skip in zip(flying_objects, skips):
  samples_config['subfolder'] = f_object
  samples_config['skip_duration'] = skip

  for dur in sample_durations:
    print('Sample Duration = ', dur)
    samples_config['sample_duration'] = dur
    samples_config['min_samples'] = dur # this sets the minimal average rate of points to be 1 per sec for a valid sample

    samples_dict = extract_samples(folder, samples_config)

    for bo in for_test:
      samples_config['for_test'] = bo
      # only the file name differs between the two variants; the loop variable
      # `skip` is left untouched so it always holds the class skip duration
      name_skip = 0 if bo else skip
      save_path = './Samples/mamba_samples_'+ samples_config['subfolder'] + str(dur) + 'skip' + str(name_skip) + '_cleaned'
      with open(save_path , 'wb') as f:
          pickle.dump(samples_dict, f)

Sample Duration =  5
Sample Duration =  10
Sample Duration =  15
Sample Duration =  20
Sample Duration =  25
Sample Duration =  30
Sample Duration =  60
Sample Duration =  5
Sample Duration =  10
Sample Duration =  15
Sample Duration =  20
Sample Duration =  25
Sample Duration =  30
Sample Duration =  60
Sample Duration =  5
Sample Duration =  10
Sample Duration =  15
Sample Duration =  20
Sample Duration =  25
Sample Duration =  30
Sample Duration =  60
Sample Duration =  5
Sample Duration =  10
Sample Duration =  15
Sample Duration =  20
Sample Duration =  25
Sample Duration =  30
Sample Duration =  60


### Prepare dataset

In [None]:
def split_by_scale(samples_config, split_config):
  '''Split the files of one class into train / validation / test sets.

  The split tries to preserve the distribution of the local std: the files
  are ranked by the first entry of their scale vector and every k-th ranked
  file goes to the test set, the one ranked just before it to the validation
  set, and the rest to the training set, with k = int(1 / test_split_ratio).

  The file names are read from the saved samples dict of the smallest
  duration (5), so that no file name is missing. The three lists are also
  pickled under './Samples/'.

  Returns (files_train, files_val, files_test).
  '''
  reference_duration = 5
  skip = samples_config['skip_duration']
  dict_path = './Samples/mamba_samples_'+ samples_config['subfolder'] + str(reference_duration) + 'skip' + str(skip)
  with open(dict_path , 'rb') as f:
    file_names = np.array(list(pickle.load(f)))

  # scale of every file, taken from the interpolated ("classic") segmentation
  class_dir = os.path.join(folder, samples_config['subfolder'])
  first_scales = np.array([
      segment_file_by_time_classic(os.path.join(class_dir, name), samples_config)[2][0]
      for name in file_names
  ])

  ranked_files = file_names[np.argsort(first_scales)]
  stride = int(1/split_config['test_split_ratio'])
  files_test = ranked_files[stride-1::stride]
  files_val = ranked_files[stride-2::stride]
  held_out = set(files_test) | set(files_val)
  files_train = [name for name in file_names if name not in held_out]

  split_path = './Samples/mamba_samples_'+ samples_config['subfolder'] + '_split' + str(split_config['test_split_ratio'])
  with open(split_path , 'wb') as f:
    pickle.dump((files_train, files_val, files_test), f)

  return files_train, files_val, files_test

In [None]:
def prepare_test_dataset(samples_config, split_config):
  '''Collect the validation and test samples of one class for every evaluation duration.

  The files are split with split_by_scale. For each duration in
  samples_config['eval_sample_durations'] the saved '..skip0_cleaned' samples
  dict is loaded, and the samples of the validation and test files are
  gathered into flat lists. Both collections are pickled together under
  './Samples/'.

  Returns (val_sizes, test_sizes): the number of validation and test samples
  per evaluation duration, in the order of 'eval_sample_durations'.
  '''
  ts = split_config['test_split_ratio']
  skip = samples_config['skip_duration']
  _, files_val, files_test = split_by_scale(samples_config, split_config)

  def gather(file_names, samples_dict):
    # Flatten the per-file entries; the file name and the file scale are
    # repeated once per sample so that all four lists stay aligned.
    samples, names, intervals, scales = [], [], [], []
    for name in file_names:
      file_samples, file_dt, file_scale = samples_dict[name]
      count = len(file_samples)
      samples.extend(file_samples)
      names.extend([name]*count)
      intervals.extend(file_dt)
      scales.extend([file_scale]*count)
    return samples, names, intervals, scales

  all_val_data = {}
  all_test_data = {}
  val_sizes = []
  test_sizes = []

  for dur in samples_config['eval_sample_durations']:
    load_path = './Samples/mamba_samples_'+ samples_config['subfolder'] + str(dur) + 'skip' + str(0) + '_cleaned'
    with open(load_path , 'rb') as f:
      samples_dict = pickle.load(f)

    all_val_data[dur] = gather(files_val, samples_dict)
    val_sizes.append(len(all_val_data[dur][0]))

    all_test_data[dur] = gather(files_test, samples_dict)
    test_sizes.append(len(all_test_data[dur][0]))

  save_path = './Samples/mamba_samples_' + samples_config['subfolder'] + '_skip' + str(skip) +'_split' + str(ts) + '_val_test_cleaned_samples'
  with open(save_path , 'wb') as f:
    pickle.dump((all_val_data, all_test_data), f)
  return val_sizes, test_sizes

In [None]:
# Configuration for building the validation / test sets. 'subfolder' and
# 'skip_duration' are overwritten for every class by the loop below.
samples_config = {
    'subfolder' : 'bird',
    'delta' : 0.04,
    'sample_durations' : [5, 10, 30, 60],
    'eval_sample_durations' : [5, 10, 15, 20, 25, 30],  # durations collected by prepare_test_dataset
    'sample_duration' : 10,
    'skip_duration' : 2,
    'min_samples' : 10,
    'for_test': False
}

split_config = {
    'random_state' : 24,  # not read by the functions defined in this notebook
    'test_split_ratio' : 0.2,  # split_by_scale sends every int(1/ratio)-th ranked file to the test set
  }
# One summary row per class, one column per evaluation duration.
# train_summary = pd.DataFrame(columns = samples_config['sample_durations'])
val_summary = pd.DataFrame(columns = samples_config['eval_sample_durations'])
test_summary = pd.DataFrame(columns = samples_config['eval_sample_durations'])

# Build and save the validation / test collections of every class, recording
# how many samples each evaluation duration produced.
for f_object, skip in zip(flying_objects, skips):
  # summary_columns = ['object', 'train set size', 'validation set size', 'test set size']
  # summary = pd.DataFrame(columns = summary_columns)
  samples_config['subfolder'] = f_object
  samples_config['skip_duration'] = skip

  val_sizes, test_sizes = prepare_test_dataset(samples_config, split_config)
  # train_summary.loc[len(train_summary)] = train_sizes
  val_summary.loc[len(val_summary)] = val_sizes
  test_summary.loc[len(test_summary)] = test_sizes

# Rows were appended in the order of flying_objects, so label them accordingly.
# train_summary.index = flying_objects
val_summary.index = flying_objects
test_summary.index = flying_objects

# print(train_summary)
print(val_summary)
print(test_summary)

                5    10  15  20  25  30
airplane        67   59  48  42  37  31
uav             81   79  76  71  69  66
bird            67   54  39  31  22  18
static-object  106  105  99  94  89  87
                5    10  15  20  25  30
airplane        62   57  45  37  33  27
uav             89   85  81  79  76  71
bird            89   75  67  60  52  45
static-object  109  105  98  92  87  82


In [None]:
# Reload one of the validation / test pickles written by prepare_test_dataset.
# NOTE(review): the path is built from samples_config['subfolder'] and `skip` as
# left behind by the last iteration of the loop above, so only the collections
# of the last processed class are loaded here.
save_path = './Samples/mamba_samples_' + samples_config['subfolder'] + '_skip' + str(skip) +'_split' + str(split_config['test_split_ratio']) + '_val_test_cleaned_samples'
with open(save_path , 'rb') as f:
  (all_val_data, all_test_data) = pickle.load(f)

In [None]:
from torch.utils.data import DataLoader

Since there is no interpolation here, the samples have varying lengths, so we must use a batch size of 1 in the DataLoader.

In [None]:
# Build one DataLoader per evaluation duration (durations without test samples
# are skipped) and print the tensor shapes of its first batches as a sanity check.
dataloaders = []
for dur, (test_samples, test_samples_filenames, test_dt, test_scales) in all_test_data.items():
  dur_vec = [dur]*len(test_samples)
  test_data = list(zip(test_samples, test_dt, test_scales, dur_vec))
  if not test_data:
    continue
  # the samples differ in length, so they cannot be stacked: batch size must be 1
  dataloaders.append(DataLoader(test_data, batch_size = 1))
  it_data = iter(dataloaders[-1])
  # Preview at most two batches. Zipping with range(2) stops cleanly when the
  # loader holds a single sample instead of raising StopIteration, and never
  # fetches a third batch.
  for _, (sample, dt, scale, duration) in zip(range(2), it_data):
    print(sample.shape)
    print(dt.shape)
    print(scale.shape)
    print(duration.shape)

torch.Size([1, 125, 8])
torch.Size([1, 125])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 113, 8])
torch.Size([1, 113])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 250, 8])
torch.Size([1, 250])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 237, 8])
torch.Size([1, 237])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 375, 8])
torch.Size([1, 375])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 361, 8])
torch.Size([1, 361])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 500, 8])
torch.Size([1, 500])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 485, 8])
torch.Size([1, 485])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 625, 8])
torch.Size([1, 625])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 545, 8])
torch.Size([1, 545])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 750, 8])
torch.Size([1, 750])
torch.Size([1, 8])
torch.Size([1])
torch.Size([1, 665, 8])
torch.Size([1, 665])
torch.Size([1, 8])
torch.Size([1])
