# Import packages

In [None]:
%load_ext autoreload
%autoreload 2

import os, sys, sys
from pathlib import Path
# Locate the nearest directory named 'Multifirefly-Project' among the current
# working directory and its ancestors; if one exists, make it the working
# directory and put the project's methods folder first on the import path.
_project_root = next(
    (
        candidate
        for candidate in (Path.cwd(), *Path.cwd().parents)
        if candidate.name == 'Multifirefly-Project'
    ),
    None,
)
if _project_root is not None:
    os.chdir(_project_root)
    sys.path.insert(0, str(_project_root / 'multiff_analysis/multiff_code/methods'))
    
from data_wrangling import specific_utils, process_monkey_information, general_utils
from pattern_discovery import pattern_by_trials, pattern_by_trials, cluster_analysis, organize_patterns_and_features
from visualization.matplotlib_tools import plot_behaviors_utils
from neural_data_analysis.neural_analysis_tools.get_neural_data import neural_data_processing
from neural_data_analysis.neural_analysis_tools.visualize_neural_data import plot_neural_data, plot_modeling_result
from neural_data_analysis.neural_analysis_tools.model_neural_data import transform_vars, neural_data_modeling, drop_high_corr_vars, drop_high_vif_vars
from neural_data_analysis.topic_based_neural_analysis.neural_vs_behavioral import prep_monkey_data, prep_target_data, neural_vs_behavioral_class
from neural_data_analysis.topic_based_neural_analysis.planning_and_neural import planning_and_neural_class, pn_utils, pn_helper_class, pn_aligned_by_seg
from neural_data_analysis.neural_analysis_tools.cca_methods import cca_class
from neural_data_analysis.neural_analysis_tools.cca_methods import cca_class, cca_utils, cca_cv_utils
from neural_data_analysis.neural_analysis_tools.cca_methods.cca_plotting import cca_plotting, cca_plot_lag_vs_no_lag, cca_plot_cv
from machine_learning.ml_methods import regression_utils, regz_regression_utils, ml_methods_class, classification_utils, ml_plotting_utils, ml_methods_utils
from planning_analysis.show_planning import nxt_ff_utils, show_planning_utils
from neural_data_analysis.neural_analysis_tools.align_trials import time_resolved_regression, time_resolved_gpfa_regression,plot_time_resolved_regression
from neural_data_analysis.neural_analysis_tools.gpfa_methods import elephant_utils, fit_gpfa_utils, plot_gpfa_utils, plot_gpfa_utils2, gpfa_helper_class

import sys
import math
import gc
import subprocess
from pathlib import Path
from importlib import reload

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc
from scipy import linalg, interpolate
from scipy.signal import fftconvolve
from scipy.io import loadmat
from scipy import sparse
import torch
from numpy import pi

# Machine Learning imports
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.multivariate.cancorr import CanCorr

# Neuroscience specific imports
import neo
import rcca
import quantities as pq

# NOTE(review): "animation.html" is set twice with different values ("html5" here, "jshtml"
# via rc() below), and the later rcParams.update(rcParamsDefault) appears to reset both —
# confirm which renderer is intended and whether it should be set after the reset.
plt.rcParams["animation.html"] = "html5"
# presumably works around the duplicate-OpenMP-runtime abort — TODO confirm it is still needed
os.environ['KMP_DUPLICATE_LIB_OK']='True'
rc('animation', html='jshtml')
# Restore every matplotlib rcParam to its default value.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
matplotlib.rcParams['animation.embed_limit'] = 2**128
# Show pandas floats with 5 decimal places and suppress scientific notation in numpy printing.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
np.set_printoptions(suppress=True)
print("done")

%load_ext autoreload
%autoreload 2

# retrieve data

In [None]:
raw_data_folder_path = "all_monkey_data/raw_monkey_data/monkey_Bruno/data_0402"

In [None]:
raw_data_folder_path = "all_monkey_data/raw_monkey_data/monkey_Schro/data_0416"

In [None]:
# Flags forwarded to the preparation calls below.
# presumably the *_exists_ok flags allow reuse of previously stored results — TODO confirm
reduce_y_var_lags = False
planning_data_by_point_exists_ok = True
y_data_exists_ok = True

# Build the segment-aligned planning/neural object for the session selected via
# raw_data_folder_path (whichever path cell was executed last).
pn = pn_aligned_by_seg.PlanningAndNeuralSegmentAligned(raw_data_folder_path=raw_data_folder_path)
pn.prep_data_to_analyze_planning(planning_data_by_point_exists_ok=planning_data_by_point_exists_ok)
# Replace planning_data_by_point with the filtered frame; cols_to_drop receives the
# second value returned by the helper.
pn.planning_data_by_point, cols_to_drop = general_utils.drop_columns_with_many_nans(
    pn.planning_data_by_point)
pn.get_x_and_y_data_for_modeling(exists_ok=y_data_exists_ok, reduce_y_var_lags=reduce_y_var_lags)

# get planning_data by segment

## get data and fit gpfa

In [None]:
pn.prepare_seg_aligned_data(segment_duration=2, rebinned_max_x_lag_number=2)

In [None]:
pn.get_gpfa_traj(latent_dimensionality=5)

In [None]:
# for regression later
# NOTE(review): this flag is set to False but is never passed on — the call below
# hard-codes use_raw_spike_data_instead=True. Confirm which value is intended and
# pass the variable so the two cannot drift apart.
use_raw_spike_data_instead = False

pn.get_concat_data_for_regression(use_raw_spike_data_instead=True,
                                    use_lagged_raw_spike_data=True,
                                    apply_pca_on_raw_spike_data=False,
                                    num_pca_components=7)
# Report the dimensions of the prepared data.
pn.print_data_dimensions()

In [None]:
# example trajectories
for traj in pn.trajectories[:5]:
    print(traj.shape)

In [None]:
stop!

## point-wise segment regression

In [None]:
pn.make_time_resolved_cv_scores()

In [None]:

# Behavioral variables whose time-resolved scores are drawn together.
features_to_plot = [
    'time',
    'time_rel_to_stop',
    'target_distance',
    'target_angle',
    'target_rel_x',
    'target_rel_y',
    'speed',
    'stop',
]

# Relabel the 'monkey_speeddummy' rows as 'stop' (in place) so they match the
# name used in features_to_plot.
_cv_scores = pn.time_resolved_cv_scores
_is_speed_dummy = _cv_scores['feature'] == 'monkey_speeddummy'
_cv_scores.loc[_is_speed_dummy, 'feature'] = 'stop'

pn.plot_time_resolved_regression(
    features_to_plot=features_to_plot,
    n_behaviors_per_plot=8,
)




In [None]:
# Make sure the 'monkey_speeddummy' rows carry the 'stop' label before plotting.
_cv_scores = pn.time_resolved_cv_scores
_cv_scores.loc[_cv_scores['feature'] == 'monkey_speeddummy', 'feature'] = 'stop'

# One figure per pair of related behavioral variables.
_feature_pairs = (
    ['target_distance', 'target_rel_y'],
    ['target_rel_x', 'target_angle'],
    ['time', 'time_rel_to_stop'],
    ['speed', 'stop'],
)
for features in _feature_pairs:
    pn.plot_time_resolved_regression(features_to_plot=features)




In [None]:
pn.plot_trial_counts_by_timepoint()  # 

## concat data regression

In [None]:
pn.separate_test_and_control_data()
print(pn.concat_neural_trials.shape)
print(pn.concat_behav_trials.shape)

In [None]:
general_utils.check_na_in_df(pn.concat_neural_trials)
general_utils.check_na_in_df(pn.concat_behav_trials)

In [None]:
# # Multivariate linear regression
# pn.y_var_lr_df = neural_data_modeling.get_y_var_lr_df(
#                 pn.concat_neural_trials.drop(columns=['new_segment', 'new_bin'], errors='ignore'), 
#                 pn.concat_behav_trials)

# pn.y_var_lr_df.head(10)

## segment split regress CV

In [None]:
reload(ml_methods_utils)

In [None]:
columns_of_interest = ['whether_test', 'cur_ff_distance', 'cur_ff_angle', 'cur_ff_rel_x', 'cur_ff_rel_y', 'nxt_ff_distance', 'nxt_ff_rel_y', 'nxt_opt_arc_dheading', 'nxt_ff_rel_x', 'nxt_ff_angle', 'nxt_ff_angle_at_ref']

In [None]:
# Run the segment-split CV regression once per data subset and stack the
# per-subset summaries, tagging every row with the subset it came from.
_summaries = []
for test_or_control in ('test', 'control', 'both'):
    x_var, y_var = pn.get_concat_x_and_y_var_for_lr(test_or_control=test_or_control)
    results_summary = ml_methods_utils.run_segment_split_regression_cv(
        x_var, y_var, columns_of_interest, num_folds=5,
    )
    results_summary['test_or_control'] = test_or_control
    _summaries.append(results_summary)

all_results = pd.concat(_summaries)

# Split the stacked summary by the value of its 'Model' column.
_model_name = all_results['Model']
reg_results = all_results[_model_name == 'Linear Regression']
class_results = all_results[_model_name == 'Logistic Regression']

In [None]:

# Segment-split CV regression restricted to the 'both' subset.
test_or_control = 'both'
x_var, y_var = pn.get_concat_x_and_y_var_for_lr(test_or_control=test_or_control)

results_summary = ml_methods_utils.run_segment_split_regression_cv(
    x_var, y_var, columns_of_interest, num_folds=5,
)
results_summary['test_or_control'] = test_or_control

all_results = pd.concat([results_summary])

# Split the summary by the value of its 'Model' column.
_model_name = all_results['Model']
reg_results = all_results[_model_name == 'Linear Regression']
class_results = all_results[_model_name == 'Logistic Regression']

In [None]:
print(reg_results)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the 'test_r2' rows of the regression summary: one bar per
# feature (height = Mean, error bar = Std) with a dashed reference line at 0.
df = reg_results.copy()
df_test_r2 = df.loc[df["Metric"] == "test_r2"]

_bar_style = dict(capsize=4, color="skyblue", edgecolor="k")

plt.figure(figsize=(8, 4))
plt.bar(
    df_test_r2["Feature"],
    df_test_r2["Mean"],
    yerr=df_test_r2["Std"],
    **_bar_style,
)
plt.axhline(0, color="red", linestyle="--", linewidth=1)

plt.title("Test R² Across Features")
plt.ylabel("Test R² (Mean ± Std)")
plt.xticks(rotation=45, ha="right")

plt.tight_layout()
plt.show()


In [None]:
# regression results
for metric in ['test_pearson_r', 'test_r2']:
    ml_methods_utils.make_barplot_to_compare_results(
            reg_results, 
            metric=metric, 
        )

In [None]:
# classification results
for metric in ['test_accuracy']:
    ml_methods_utils.make_barplot_to_compare_results(
        class_results, 
        metric=metric, 
    )

In [None]:
stop!

## segment split regress train-test
Warning: results can be very unstable because they depend on a single random train-test split.

In [None]:
# Regress the selected behavioral columns on the concatenated neural data.
x_var = pn.concat_neural_trials
y_var = pn.concat_behav_trials

# NOTE(review): this rebinds the notebook-level `columns_of_interest` to a shorter list;
# re-running the CV cells after this one will silently use these five columns.
columns_of_interest = ['nxt_ff_rel_y', 'nxt_opt_arc_dheading', 'nxt_ff_rel_x', 'nxt_ff_angle', 'nxt_ff_angle_at_ref']
ml_methods_utils.run_segment_split_regression(x_var, y_var, columns_of_interest)

## plot latent dimensions

In [None]:
plot_gpfa_utils.plot_gpfa_traj_3d_timecolored_average(pn.trajectories)


In [None]:
plot_gpfa_utils.plot_gpfa_traj_3d_uniform_color(pn.trajectories)


In [None]:
plot_gpfa_utils.plot_gpfa_traj_3d_timecolored_average(pn.trajectories)


In [None]:
# Keep your coordinates, but auto-pick the best azimuth/elevation
plot_gpfa_utils2.plot_gpfa_traj_3d_timecolored_average(pn.trajectories, auto_view="grid", grid_step=5)

# Rotate data to PCA (PC1/PC2/PC3) for maximal in-plane variance
plot_gpfa_utils2.plot_gpfa_traj_3d_timecolored_average(pn.trajectories, auto_view="pca")


In [None]:
# Render figures inline as static images. (The inline backend is not interactive;
# an interactive backend would be needed to rotate the 3D view with the mouse.)
%matplotlib inline

# Import required modules
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

# Draw the GPFA trajectories in 3D from a fixed viewpoint (view_azim / view_elev);
# the styling arguments are forwarded to the project plotting helper.
fig, ax = plot_gpfa_utils.plot_gpfa_traj_3d(
    trajectories=pn.trajectories,
    figsize=(15, 5),
    linewidth_single_trial=0.75,
    alpha_single_trial=0.3,
    linewidth_trial_average=2,
    title='Latent dynamics extracted by GPFA',
    view_azim=-5,
    view_elev=60
)

plt.show()

In [None]:
# fig = plot_gpfa_utils.plot_gpfa_traj_3d_plotly(trajectories)

In [None]:
# Share of the total latent variance carried by each latent dimension.
# NOTE(review): this is a fraction of the variance of the latent trajectories themselves,
# not of the neural data — the "variance explained" wording in the print may mislead.
traj_stack = np.stack(pn.trajectories, axis=0)  # presumably (n_trials, n_latent_dims, T); np.stack needs every trajectory to have the same shape — TODO confirm
var_by_dim = np.var(traj_stack, axis=(0, 2))    # variance over axes 0 and 2, leaving one value per entry of axis 1
var_by_dim /= var_by_dim.sum()               # normalize so the values sum to 1
print("Variance explained by each latent dimension:", var_by_dim)


In [None]:

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Plot the across-trajectory mean of every latent dimension against time.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set(title='Latent dynamics extracted by GPFA', xlabel='Time [s]')

# Average over the first axis; this requires all trajectories to share one shape.
average_trajectory = np.mean(pn.trajectories, axis=0)
_n_bins = len(average_trajectory[0])
time = pn.bin_width * np.arange(_n_bins)

for _dim_number, _latent in enumerate(average_trajectory, start=1):
    ax.plot(time, _latent, label=f'Dim {_dim_number}')

ax.legend()
plt.tight_layout()
plt.show()


## why poor performance?

In [None]:
# NOTE(review): this rebinds the name `time_resolved_regression`, which the setup cell
# already imported from `...neural_analysis_tools.align_trials`. Confirm which module
# actually provides the helpers used below.
import neural_data_analysis.neural_analysis_tools.gpfa_methods.time_resolved_regression as time_resolved_regression

# 1. Print number of trials per timepoint
time_resolved_regression.print_trials_per_timepoint(pn.gpfa_neural_trials)

# 2. Check for NaNs
time_resolved_regression.check_for_nans_in_trials(pn.gpfa_neural_trials, name='latent')
time_resolved_regression.check_for_nans_in_trials(pn.behav_trials, name='behavioral')

# 3. Standardize trials
latent_trials_std = time_resolved_regression.standardize_trials(pn.gpfa_neural_trials)
behav_trials_std = time_resolved_regression.standardize_trials(pn.behav_trials)

# 4. Plot latent and behavioral variables for a few trials
time_resolved_regression.plot_latents_and_behav_trials(latent_trials_std, behav_trials_std, pn.bin_width, n_trials=5)


## hyperparams (still need to debug)

In [None]:
stop! # this section is not finished yet

# grid search

import itertools
from joblib import Parallel, delayed, cpu_count
print(f"Detected CPU cores: {cpu_count()}")

# # can add for smoothing:
# # other forms of smoothing like (currently it's only uniform_filter1d)
# from scipy.ndimage import gaussian_filter1d
# # gpfa_neural_trials: list of trials, each trial shape (time_bins, n_neurons)
# smoothed_trials = [
#     gaussian_filter1d(trial, sigma=smooth_sigma, axis=0)
#     for trial in gpfa_neural_trials
# ]


# Define your grid
smoothing_windows = [1, 3]
use_sqrt = [True, False]
gpfa_dims = [3, 5]
bin_widths = [0.02]
ridge_alphas = [0.1, 1]
regression_types = ['ridge']
align_at_beginning_opts = [True]
pca_components = [5, 10]

param_grid_gpfa = list(itertools.product(
    smoothing_windows, use_sqrt, gpfa_dims, bin_widths, ridge_alphas, regression_types, align_at_beginning_opts
))

# Baseline configs
param_grid_raw = list(itertools.product(
    smoothing_windows, use_sqrt, bin_widths, ridge_alphas, regression_types, align_at_beginning_opts
))
param_grid_pca = list(itertools.product(
    smoothing_windows, use_sqrt, bin_widths, ridge_alphas, regression_types, align_at_beginning_opts, pca_components
))

# NOTE(review): `gpfa_tuning` and `dec` are used in the three runs below but are not
# imported or assigned anywhere in the visible notebook — they must be defined before
# this (unfinished) section can run.

# Run GPFA grid
results_gpfa = Parallel(n_jobs=-1, verbose=10)(
    delayed(gpfa_tuning.run_gpfa_experiment_time_resolved)(
        dec, smoothing, sqrt, gpfa_dim, bin_width, ridge_alpha, regression_type, align_at_beginning, baseline=None
    )
    for (smoothing, sqrt, gpfa_dim, bin_width, ridge_alpha, regression_type, align_at_beginning) in param_grid_gpfa
)

# Run raw baseline grid (no GPFA dimensionality: None is passed in its place)
results_raw = Parallel(n_jobs=-1, verbose=10)(
    delayed(gpfa_tuning.run_gpfa_experiment_time_resolved)(
        dec, smoothing, sqrt, None, bin_width, ridge_alpha, regression_type, align_at_beginning, baseline='raw'
    )
    for (smoothing, sqrt, bin_width, ridge_alpha, regression_type, align_at_beginning) in param_grid_raw
)

# Run PCA baseline grid (additionally sweeps the number of PCA components)
results_pca = Parallel(n_jobs=-1, verbose=10)(
    delayed(gpfa_tuning.run_gpfa_experiment_time_resolved)(
        dec, smoothing, sqrt, None, bin_width, ridge_alpha, regression_type, align_at_beginning, baseline='pca', pca_components=pca_comp
    )
    for (smoothing, sqrt, bin_width, ridge_alpha, regression_type, align_at_beginning, pca_comp) in param_grid_pca
)

# Combine all results
# presumably each run returns a dict containing at least 'mean_r2' — TODO confirm
results_summary = results_gpfa + results_raw + results_pca
df = pd.DataFrame(results_summary)
print(df.sort_values('mean_r2', ascending=False).head(10))

In [None]:
import matplotlib.pyplot as plt
# Pick the configuration with the highest mean R². idxmax() returns an index
# *label*, so the row has to be fetched with .loc; pairing it with .iloc is only
# correct while the index is the default 0..n-1 range (it breaks after sorting,
# filtering or concatenating the results).
best = df.loc[df['mean_r2'].idxmax()]

# Average the best configuration's R² along axis 1 at every time point, ignoring NaNs.
plt.plot(best['times'], np.nanmean(np.array(best['r2_by_time']), axis=1))
plt.xlabel('Time (s)')
plt.ylabel('Mean R²')
plt.title(f"Best config: {best['model']} R² by time")
plt.show()

# Compare models
import seaborn as sns
sns.catplot(data=df, x='model', y='mean_r2', kind='bar')

# ML to decode single vars

## decode

In [None]:
# neural_data = pn.x_var_lags
# behavioral_data = pn.y_var_reduced

neural_data = pn.concat_neural_trials
behavioral_data = pn.concat_behav_trials

In [None]:
from neural_data_analysis.neural_analysis_tools.model_neural_data import transform_vars, ml_decoder_class, neural_data_modeling, drop_high_corr_vars, drop_high_vif_vars

In [None]:
# Decode each listed behavioral variable from the neural data with every requested
# model, keeping only the variables for which decode_variable returned a result.
decoder = ml_decoder_class.MLBehavioralDecoder()
models_to_use = ['rf', 'nn', 'lr']
vars_to_decode = ['nxt_ff_rel_y', 'nxt_ff_distance']
successful_decodings = {}

for var in vars_to_decode:
    result = decoder.decode_variable(neural_data, behavioral_data, var, models_to_use=models_to_use)
    if result is not None:
        successful_decodings[var] = result

# The best-model lookup and the plot must target a variable that was decoded in the
# loop above; querying any other name (e.g. 'target_rel_y') asks the decoder for
# results it never produced in this cell.
var_to_inspect = next(iter(successful_decodings), None)
if var_to_inspect is None:
    print("No variable was decoded successfully; skipping best-model lookup and plot.")
else:
    best_model, best_results = decoder.get_best_model(var_to_inspect, 'test_r2')

    # Plot rf results for the inspected variable
    decoder.plot_ml_results(var_to_inspect, 'rf')

successful_decodings

## compare different Models

Let's compare the performance of different machine learning models.


In [None]:
from neural_data_analysis.topic_based_neural_analysis.target_decoder import behav_features_to_keep, target_decoder_class, prep_target_decoder, eval_target_decoder

In [None]:
comparison_df = eval_target_decoder.compare_models(successful_decodings)

## plot feature importance for RF

In [None]:
# Report and plot Random Forest feature importances for every decoded variable
# that has an 'rf' entry.
_banner = '=' * 50
for target_var, _model_results in successful_decodings.items():
    if 'rf' not in _model_results:
        continue

    print(f"\n{_banner}")
    print(f"FEATURE IMPORTANCE: {target_var}")
    print(_banner)

    rf_model = _model_results['rf']['model']
    if not hasattr(rf_model, 'feature_importances_'):
        continue

    # NOTE(review): feature names are taken from pn.neural_data.columns, while the
    # decoding cell passes `neural_data` (pn.concat_neural_trials) to the decoder —
    # confirm both have the same columns in the same order.
    importance_df = regression_utils._get_rf_feature_importances(rf_model, pn.neural_data.columns)

    # First ten rows of the importance table
    print(f"Top 10 most important neurons for {target_var}:")
    print(importance_df.head(10))

    regression_utils.plot_feature_importance(importance_df, target_var)


# Save Results (not yet tested)

Finally, let's save our results for future analysis.


In [None]:
import pickle
import json
import pandas as pd
from typing import Dict, Any

def create_experiment_info(decoder, monkey: str, session: str) -> Dict[str, Any]:
    """Collect session identifiers and data dimensions in one dictionary.

    Args:
        decoder: Object exposing ``bin_width`` plus ``neural_data`` and
            ``target_data`` attributes that each have a ``shape``.
        monkey: Monkey identifier, stored unchanged.
        session: Session identifier, stored unchanged.

    Returns:
        Dictionary with the keys ``monkey``, ``session``, ``bin_width``,
        ``neural_data_shape`` and ``target_data_shape``.
    """
    info: Dict[str, Any] = {'monkey': monkey, 'session': session}
    info['bin_width'] = decoder.bin_width
    info['neural_data_shape'] = decoder.neural_data.shape
    info['target_data_shape'] = decoder.target_data.shape
    return info

def create_cca_results(decoder) -> Dict[str, Any]:
    """Summarize the CCA part of ``decoder.results``.

    Args:
        decoder: Object whose ``results`` mapping may contain a ``'cca'`` entry
            holding ``'canonical_correlations'`` (sliceable, with ``tolist()``).

    Returns:
        ``{'top_3_correlations': [...]}`` with at most the first three
        canonical correlations as a plain list, or ``None`` as the value when
        no ``'cca'`` entry exists.
    """
    if 'cca' not in decoder.results:
        return {'top_3_correlations': None}

    canonical = decoder.results['cca']['canonical_correlations']
    return {'top_3_correlations': canonical[:3].tolist()}

def find_best_performances(successful_decodings: Dict) -> Dict[str, Dict[str, Any]]:
    """Find the best-performing model for each target variable.

    A model's score is the first available of ``'test_r2'``,
    ``'test_accuracy'`` and ``'cv_mean'`` in its result dictionary (0 when none
    is present). The model with the highest score wins; ties keep the model
    encountered first.

    Args:
        successful_decodings: Mapping ``target_var -> {model_name: results}``.

    Returns:
        Mapping ``target_var -> {'best_model': name, 'best_score': score}``.
        When a target has no model with a usable score, ``best_model`` is
        ``None`` and ``best_score`` is ``-1``.
    """
    best_performances = {}
    for target_var, models in successful_decodings.items():
        best_model = None
        best_score = None

        for model_name, results in models.items():
            score = results.get('test_r2', results.get('test_accuracy', results.get('cv_mean', 0)))
            # Missing (None) and NaN scores cannot be ranked; skip them rather
            # than raising or letting them shadow real scores.
            if score is None or score != score:
                continue
            # Compare against the best score seen so far instead of a fixed -1
            # floor: R² is unbounded below, so a target whose models all score
            # <= -1 must still report its (least bad) best model.
            if best_score is None or score > best_score:
                best_score = score
                best_model = model_name

        best_performances[target_var] = {
            'best_model': best_model,
            # -1 is kept as the "no usable model" placeholder so the value
            # stays numeric for formatting and JSON export.
            'best_score': -1 if best_model is None else best_score,
        }
    return best_performances

def create_summary_report(decoder, successful_decodings: Dict, monkey: str, session: str) -> Dict[str, Any]:
    """Assemble the full summary report from its three sections.

    Args:
        decoder: Decoder object passed on to ``create_experiment_info`` and
            ``create_cca_results``.
        successful_decodings: Mapping ``target_var -> {model_name: results}``.
        monkey: Monkey identifier.
        session: Session identifier.

    Returns:
        Dictionary with the keys ``experiment_info``, ``cca_results`` and
        ``ml_results_summary``; the latter lists the decoded targets and the
        best model per target.
    """
    experiment_info = create_experiment_info(decoder, monkey, session)
    cca_results = create_cca_results(decoder)
    ml_results_summary = {
        'successful_targets': list(successful_decodings.keys()),
        'best_performances': find_best_performances(successful_decodings),
    }
    return {
        'experiment_info': experiment_info,
        'cca_results': cca_results,
        'ml_results_summary': ml_results_summary,
    }

def print_summary_report(summary_report: Dict[str, Any]):
    """Print a human-readable version of a summary report.

    Args:
        summary_report: Dictionary with the sections ``experiment_info``,
            ``cca_results`` and ``ml_results_summary`` (the structure built by
            ``create_summary_report``). The CCA line is printed only when
            ``top_3_correlations`` is truthy.
    """
    print("\nEXPERIMENT SUMMARY")
    print("="*50)

    info = summary_report['experiment_info']
    print(f"Neural data shape: {info['neural_data_shape']}")
    print(f"Target data shape: {info['target_data_shape']}")

    top_correlations = summary_report['cca_results']['top_3_correlations']
    if top_correlations:
        print(f"Top 3 CCA correlations: {top_correlations}")

    ml_summary = summary_report['ml_results_summary']
    print(f"Successfully decoded targets: {ml_summary['successful_targets']}")

    print("\nBest model performance for each target:")
    for target, perf in ml_summary['best_performances'].items():
        model = perf['best_model']
        score = perf['best_score']
        print(f"  {target}: {model} (score: {score:.4f})")

def save_experiment_results(decoder, successful_decodings: Dict, monkey: str, session: str, 
                          base_filename: str = None):
    """Persist the decoder's detailed results and a JSON summary report.

    Args:
        decoder: Decoder object; its ``save_results`` method is called with the
            pickle filename.
        successful_decodings: Mapping ``target_var -> {model_name: results}``.
        monkey: Monkey identifier.
        session: Session identifier.
        base_filename: Common filename stem. Defaults to
            ``target_decoding_results_{monkey}_{session}``.

    Returns:
        Tuple ``(pkl_filename, json_filename)`` of the two paths written.
    """
    stem = base_filename
    if stem is None:
        stem = f"target_decoding_results_{monkey}_{session}"
    pkl_filename, json_filename = f"{stem}.pkl", f"{stem}_summary.json"

    # Detailed results are written by the decoder itself.
    print("Saving results...")
    decoder.save_results(pkl_filename)

    # The summary is echoed to stdout before being written as JSON.
    summary_report = create_summary_report(decoder, successful_decodings, monkey, session)
    print_summary_report(summary_report)
    with open(json_filename, 'w') as json_file:
        json.dump(summary_report, json_file, indent=2)

    print(f"\nResults saved to: {pkl_filename}")
    print(f"Summary saved to: {json_filename}")
    return pkl_filename, json_filename

def load_experiment_results(base_filename: str = None, monkey: str = None, session: str = None):
    """Load the pickled detailed results and the JSON summary report.

    Args:
        base_filename: Common filename stem of the two files. When omitted it
            is built as ``target_decoding_results_{monkey}_{session}``.
        monkey: Monkey identifier, used only when ``base_filename`` is omitted.
        session: Session identifier, used only when ``base_filename`` is omitted.

    Returns:
        Tuple ``(decoder_results, summary_report)``. If either file is missing
        or cannot be loaded, a message is printed and ``(None, None)`` is
        returned.

    Raises:
        ValueError: If ``base_filename`` is omitted and ``monkey`` or
            ``session`` is missing.

    Note:
        Unpickling can execute arbitrary code; only load files from a trusted
        source.
    """
    if base_filename is None:
        if not (monkey and session):
            raise ValueError("Must provide either base_filename or both monkey and session")
        base_filename = f"target_decoding_results_{monkey}_{session}"

    pkl_filename = f"{base_filename}.pkl"
    json_filename = f"{base_filename}_summary.json"

    try:
        with open(pkl_filename, 'rb') as pkl_file:
            decoder_results = pickle.load(pkl_file)
        with open(json_filename, 'r') as json_file:
            summary_report = json.load(json_file)
    except FileNotFoundError as e:
        print(f"File not found: {e}")
        return None, None
    except Exception as e:
        # Best effort: any other load failure is reported, not raised.
        print(f"Error loading results: {e}")
        return None, None

    print(f"Loaded results from: {pkl_filename}")
    print(f"Loaded summary from: {json_filename}")
    return decoder_results, summary_report

# --- Usage Examples ---

# Saving (replaces your original code):
# save_experiment_results(decoder, successful_decodings, 'Bruno', 'data_0328')

# Loading:
# decoder_results, summary_report = load_experiment_results(monkey='Bruno', session='data_0328')
# OR
# decoder_results, summary_report = load_experiment_results(base_filename="target_decoding_results_bruno_0328")

# If you want to print the loaded summary:
# if summary_report:
#     print_summary_report(summary_report)

# Appendix

## check NA

In [None]:
pn.x_var.shape

In [None]:
general_utils.check_na_in_df(pn.planning_data_by_bin)

In [None]:
pn.planning_data_by_point.shape

In [None]:
general_utils.check_na_in_df(pn.planning_data_by_point)

In [None]:
general_utils.check_na_in_df(pn.ctrl_inst.both_ff_across_time_df)

## analyze_trial_lengths

In [None]:
from neural_data_analysis.neural_analysis_tools.gpfa_methods import fix_gpfa_trial_length
# First, analyze the trial lengths
analysis = fix_gpfa_trial_length.analyze_trial_lengths(pn.spiketrains)
print("Trial length analysis:")
for key, value in analysis.items():
    print(f"  {key}: {value}")