# Import packages

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

project_folder = '/Users/dusiyi/Documents/Multifirefly-Project'
os.chdir(project_folder)
sys.path.append(os.path.join(project_folder, 'multiff_analysis', 'methods'))

from data_wrangling import specific_utils, process_monkey_information
from pattern_discovery import pattern_by_trials, pattern_by_trials, cluster_analysis, organize_patterns_and_features
from visualization.matplotlib_tools import plot_behaviors_utils
from non_behavioral_analysis.neural_data_analysis.get_neural_data import neural_data_processing
from non_behavioral_analysis.neural_data_analysis.visualize_neural_data import plot_neural_data, plot_modeling_result
from non_behavioral_analysis.neural_data_analysis.model_neural_data import cca_class, pgam_class, neural_data_modeling, reduce_multicollinearity
from non_behavioral_analysis.neural_data_analysis.neural_vs_behavioral import prep_monkey_data, prep_target_data, neural_vs_behavioral_class
from non_behavioral_analysis.neural_data_analysis.planning_neural import planning_neural_class, planning_neural_utils

import sys
import math
import gc
import subprocess
from pathlib import Path
from importlib import reload

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc
from scipy import linalg, interpolate
from scipy.signal import fftconvolve
from scipy.io import loadmat
from scipy import sparse
import torch
from numpy import pi

# Machine Learning imports
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.cross_decomposition import CCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.multivariate.cancorr import CanCorr

# Neuroscience specific imports
import neo
import rcca

plt.rcParams["animation.html"] = "html5"
os.environ['KMP_DUPLICATE_LIB_OK']='True'
rc('animation', html='jshtml')
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
matplotlib.rcParams['animation.embed_limit'] = 2**128
pd.set_option('display.float_format', lambda x: '%.5f' % x)
np.set_printoptions(suppress=True)
print("done")

%load_ext autoreload
%autoreload 2

done
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Basic data

In [4]:
## Retrieve monkey data
PLAYER = "monkey"
raw_data_folder_path = "all_monkey_data/raw_monkey_data/monkey_Bruno/data_0330"
data_item = neural_vs_behavioral_class.NeuralVsBehavioralClass(raw_data_folder_path=raw_data_folder_path)
data_item.retrieve_or_make_monkey_data()
data_item.make_or_retrieve_ff_dataframe(exists_ok=True)
data_item.find_patterns()
data_item.make_PlotTrials_args()

monkey_information = data_item.monkey_information
ff_dataframe = data_item.ff_dataframe

ff_life_sorted = data_item.ff_life_sorted
ff_real_position_sorted = data_item.ff_real_position_sorted
ff_believed_position_sorted = data_item.ff_believed_position_sorted
cluster_around_target_indices = data_item.cluster_around_target_indices
ff_caught_T_new = data_item.ff_caught_T_new
caught_ff_num = len(ff_caught_T_new)
ff_flash_sorted = data_item.ff_flash_sorted
ff_flash_end_sorted = data_item.ff_flash_end_sorted
max_point_index = data_item.max_point_index
min_point_index = data_item.min_point_index




data_item.make_or_retrieve_target_clust_last_vis_df()
target_clust_last_vis_df = data_item.target_clust_last_vis_df
target_clust_last_vis_df['ff_index'] = target_clust_last_vis_df['target_index']


PlotTrials_args = (monkey_information, ff_dataframe, ff_life_sorted, ff_real_position_sorted, ff_believed_position_sorted, cluster_around_target_indices, ff_caught_T_new)


plot_polar_args = (monkey_information,
                    ff_dataframe, 
                    ff_life_sorted,
                    ff_real_position_sorted,
                    ff_caught_T_new,
                    ff_flash_sorted,)


trial_total_num = 2
PLAYER = "monkey"

Retrieved monkey_information
The number of points that were removed due to delta_position exceeding the ceiling is 0
Note: ff_caught_T_sorted is replaced with ff_caught_T_new
Retrieved ff_dataframe from all_monkey_data/processed_data/monkey_Bruno/data_0330/ff_dataframe.h5
When take out monkey subset for GUAT, 643 clusters out of 856 are too close to the target or the last target. Those clusters are filtered out.
The number of new trials that are used to separate stop clusters is 1338
Retrieved target_clust_last_vis_df


# retrieve chunks of time of no cluster around target)

In [5]:
exists_ok = True
data_item.make_or_retrieve_target_clust_last_vis_df(exists_ok=exists_ok)
data_item.make_or_retrieve_target_last_vis_df(exists_ok=True)

data_item.target_clust_last_vis_df['nearby_vis_ff_indices'] = data_item.target_clust_last_vis_df['nearby_vis_ff_indices'].apply(
    lambda x: [int(i) for i in x.strip('[]').split(',') if i.strip().isdigit()])

data_item.target_clust_last_vis_df['num_nearby_vis_ff'] = data_item.target_clust_last_vis_df['nearby_vis_ff_indices'].apply(lambda x: len(x))

# add ff_caught_time and ff_caught_point_index
data_item.target_clust_last_vis_df['ff_caught_time'] = data_item.ff_caught_T_new[data_item.target_clust_last_vis_df['target_index'].values]
data_item.target_clust_last_vis_df['ff_caught_point_index'] = np.searchsorted(data_item.monkey_information['time'], data_item.target_clust_last_vis_df['ff_caught_time'].values)

Retrieved target_clust_last_vis_df
Retrieved target_last_vis_df


In [6]:
clust_sub = data_item.target_clust_last_vis_df[data_item.target_clust_last_vis_df['num_nearby_vis_ff'] == 1]
# print percentage of clust_sub
print("Percentage of targets not in a visible cluster out of all targets", len(clust_sub) / len(data_item.target_clust_last_vis_df) * 100)

Percentage of targets not in a visible cluster out of all targets 66.4424514200299


In [7]:
# need 'last_vis_point_index' and 'ff_caught_point_index' and the point index in between 

# then get the neural data and behavioral data for those points

# oh btw....should i actually align each section, as if they are trials???
# maybe i can try both that and continuous time... both can shed light on different behavioral variables
# but for aligning trials, it may require alignment or warping since trial durations vary.

# btw, what does it mean stitch data?

# also, what does it look like to use RNN to model it?
# I thought about the paper that Noah presented on


# btw.......IME

# get neural data (as in other notebooks)

In [42]:
data_item = neural_vs_behavioral_class.NeuralVsBehavioralClass(raw_data_folder_path=raw_data_folder_path)
data_item.streamline_preparing_neural_and_behavioral_data()

Retrieved monkey_information
The number of points that were removed due to delta_position exceeding the ceiling is 0
Note: ff_caught_T_sorted is replaced with ff_caught_T_new
Retrieved ff_dataframe from all_monkey_data/processed_data/monkey_Bruno/data_0330/ff_dataframe.h5
When take out monkey subset for GUAT, 643 clusters out of 856 are too close to the target or the last target. Those clusters are filtered out.
The number of new trials that are used to separate stop clusters is 1338
Retrieved all_trial_patterns
Retrieved pattern_frequencies
Retrieved all_trial_features
Retrieved feature_statistics
Retrieved scatter_around_target_df
Updated window width (to get convolved data):  1.25


  rebinned_monkey_info_essential['stop_success_rate'] = num_caught_ff_convolved / \
  rebinned_monkey_info_essential['stop_success_rate'] = num_caught_ff_convolved / \


Number of bins in valid intervals based on ff caught time: 14293 out of 19291 (74.09%)


In [None]:
ref_point_mode='distance'
ref_point_value=-150

normalize = False
eliminate_outliers = False
use_curvature_to_ff_center = False
curv_of_traj_mode = 'distance'
window_for_curv_of_traj=[-25, 25]

pn = planning_neural_class.PlanningAndNeural(raw_data_folder_path=raw_data_folder_path)
pn.streamline_organizing_info(ref_point_mode='time after cur ff visible', ref_point_value=0.1, 
                               curv_of_traj_mode=curv_of_traj_mode, window_for_curv_of_traj=window_for_curv_of_traj, truncate_curv_of_traj_by_time_of_capture=True,
                               use_curvature_to_ff_center=use_curvature_to_ff_center,  eliminate_outliers=eliminate_outliers)
pn.retrieve_neural_data()
pn.get_all_planning_info()


Retrieved monkey_information
The number of points that were removed due to delta_position exceeding the ceiling is 0
Note: ff_caught_T_sorted is replaced with ff_caught_T_new
Removed 0 rows out of 768 rows where cur_ff was not visible bbas or nxt_ff was not visible both bbas and bsans
shared_stops_near_ff_df has 768 rows
Retrieving shared_stops_near_ff_df succeeded
Retrieving stop_0_1_window_-50cm_0cm from all_monkey_data/planning/monkey_Bruno/data_0330/diff_in_curv_df/norm_opt_arc/test/stop_0_1_window_-50cm_0cm succeeded
Retrieving Bruno_stop_0_1 from all_monkey_data/planning/monkey_Bruno/data_0330/heading_info_df/norm_opt_arc/test/Bruno_stop_0_1 succeeded


In [19]:
columns_to_drop = ['nxt_opt_arc_dheading',
    'nxt_arc_curv',
  'angle_from_cur_ff_landing_to_nxt_ff',]

In [29]:
# drop rows with NA in all_planning_info and print the percentage of rows dropped
all_planning_info2 = pn.all_planning_info.drop(columns=columns_to_drop, errors='ignore').copy()
all_planning_info2 = all_planning_info2.dropna().copy()
all_planning_info2.drop(columns={'stop_point_index', 'point_index'}, inplace=True)
print("Percentage of rows dropped: ", 1 - all_planning_info2.shape[0] / len(pn.all_planning_info))
data_item.y_var = all_planning_info2.copy()

_, pn.binned_spikes_df = neural_data_processing.prepare_binned_spikes_matrix_and_df(pn.all_binned_spikes, data_item.max_bin)
pn.binned_spikes_df['bin'] = np.arange(pn.binned_spikes_df.shape[0])
neural_bins_sub2 = pn.binned_spikes_df[pn.binned_spikes_df['bin'].isin(all_planning_info2['bin'].values)]
data_item.x_var = neural_bins_sub2.drop(columns=['bin'])

Percentage of rows dropped:  0.47553855445943227


## get behavioral data (new version)

In [None]:
data_item = neural_vs_behavioral_class.NeuralVsBehavioralClass(raw_data_folder_path=raw_data_folder_path,
                                                               bin_width=0.02, window_width=0.06)
data_item.streamline_preparing_neural_and_behavioral_data()


In [48]:
monkey_features_to_keep = [
'bin', 'point_index', # those are just identifiers
'time', 'monkey_x', 'monkey_y', 'monkey_angle',
'monkey_speed', 'monkey_speeddummy', 'monkey_dw', 'monkey_ddw', 'monkey_ddv',
'LDy', 'RDz', 'LDz', 'RDy',
'gaze_mky_view_x', 'gaze_mky_view_y', 'gaze_mky_view_angle',
'gaze_world_x', 'gaze_world_y', 
'gaze_world_x_l', 'gaze_world_y_l', 
'gaze_world_x_r', 'gaze_world_y_r',
'gaze_mky_view_x_l', 'gaze_mky_view_y_l', 'gaze_mky_view_angle_l', 
'gaze_mky_view_x_r', 'gaze_mky_view_y_r', 'gaze_mky_view_angle_r', 
'valid_view_point_l', 'valid_view_point_r', 'valid_view_point', 'eye_world_speed', 
'crossing_boundary', 'whether_new_distinct_stop',
]

In [None]:
bin_width = 0.02
min_time = data_item.monkey_information['time'].min()
max_time = data_item.monkey_information['time'].max()
time_bins = np.arange(min_time, max_time, bin_width)

new_binned_features = data_item.monkey_information.sort_values(by=['bin', 'point_index']).groupby('bin').first().reset_index()
new_binned_features = new_binned_features[monkey_features_to_keep].copy()
new_binned_features = prep_monkey_data._add_ff_info_to_binned_features(
    new_binned_features, ff_dataframe, ff_caught_T_new, time_bins)
new_binned_features = prep_monkey_data._add_whether_any_ff_is_visible(new_binned_features)

# clip columns
for column in ['gaze_mky_view_x', 'gaze_mky_view_y', 'gaze_world_x', 'gaze_world_y']:
    new_binned_features.loc[:, column] = np.clip(new_binned_features.loc[:, column], -1000, 1000)
  
data_item.binned_features = new_binned_features


# might replace the line below
data_item._add_all_target_info()

In [None]:
def _add_all_target_info(self):
    self._make_or_retrieve_target_df()
    self.target_average_info, self.target_min_info, self.target_max_info = prep_target_data.get_max_min_and_avg_info_from_target_df(
        self.target_df)
    for df in [self.target_average_info, self.target_min_info, self.target_max_info]:
        self.binned_features = self.binned_features.merge(
            df, how='left', on='bin')

In [54]:
data_item.target_df

Unnamed: 0,bin,time,monkey_x,monkey_y,monkey_angle,point_index,target_index,target_x,target_y,target_distance,...,target_last_seen_angle_frozen,target_last_seen_angle_to_boundary_frozen,target_has_disappeared_for_last_time_dummy,target_cluster_has_disappeared_for_last_time_dummy,target_visible_dummy,target_cluster_visible_dummy,current_target_caught_time,last_target_caught_time,time_since_last_capture,while_last_seeing_target_cluster
0,0,0.09134,0.00000,30.80000,1.57080,0,0,99.91141,-39.93396,122.41561,...,0.00000,0.00000,0,0,0,0,25.70465,0.00000,0.09134,0
1,0,0.10820,0.00000,30.80000,-1.53228,1,0,99.91141,-39.93396,122.41561,...,0.00000,0.00000,0,0,0,0,25.70465,0.00000,0.10820,0
2,0,0.12574,0.00000,30.80000,-1.53228,2,0,99.91141,-39.93396,122.41561,...,0.00000,0.00000,0,0,0,0,25.70465,0.00000,0.12574,0
3,0,0.14121,0.00000,30.80000,-1.53228,3,0,99.91141,-39.93396,122.41561,...,0.00000,0.00000,0,0,0,0,25.70465,0.00000,0.14121,0
4,0,0.15774,0.00000,30.80000,-1.53228,4,0,99.91141,-39.93396,122.41561,...,0.00000,0.00000,0,0,0,0,25.70465,0.00000,0.15774,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219079,14395,3598.95314,-307.92563,255.96219,1.03888,219079,1337,-309.14423,252.21994,3.93566,...,0.03930,0.00000,1,1,0,0,3599.01955,3591.56634,7.38681,0
219080,14395,3598.96975,-307.79947,256.17801,1.03888,219080,1337,-309.14423,252.21994,4.18027,...,0.03930,0.00000,1,1,0,0,3599.01955,3591.56634,7.40341,0
219081,14395,3598.98639,-307.73642,256.28595,1.03888,219081,1337,-309.14423,252.21994,4.30283,...,0.03930,0.00000,1,1,0,0,3599.01955,3591.56634,7.42006,0
219082,14396,3599.00288,-307.73639,256.28592,1.03888,219082,1337,-309.14423,252.21994,4.30281,...,0.03930,0.00000,1,1,0,0,3599.01955,3591.56634,7.43655,0


In [52]:
# delete all columns tht contain 'target_cluster'
# Drop all columns containing 'target_cluster'
target_cluster_cols = [col for col in data_item.binned_features.columns if 'target_cluster' in col]
data_item.binned_features = data_item.binned_features.drop(columns=target_cluster_cols)


In [None]:
# maybe get: cum time since flash & cum distance since flash

In [None]:
# originally thought I'll just keep those columns from final_behavioral_data
['bin', 'LDy',
'LDz', 'RDy', 'RDz', 'gaze_mky_view_x', 'gaze_mky_view_y',  'gaze_world_x', 'gaze_world_y', 
'monkey_speed', 'monkey_angle','any_ff_visible'
'monkey_dw', 'monkey_ddw', 'monkey_ddv', 
'avg_target_distance', 'avg_target_angle',
'avg_target_last_seen_time', 'avg_target_last_seen_distance_frozen',
'avg_target_last_seen_angle_frozen',
]
       
       
# might want to not drop angle to boundary, especially since it's close to target