# Imports

In [1]:
import sys
import time

import numpy as np
import torch
from bnpm import file_helpers, optimization
import sklearn.utils.class_weight
from torch import nn, optim
from tqdm import tqdm
import sklearn.linear_model
import multiprocessing as mp

import roicat.classification.classifier_util as cu
import scipy.sparse
import roicat
import bnpm.h5_handling
from pathlib import Path
import shutil
import warnings
import umap

import matplotlib.pyplot as plt
import matplotlib.offsetbox
import json
import os
import natsort
from roicat import helpers
# from kymatio.torch import Scattering2D
import gc
import functools

2023-06-09 17:06:31.391878: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Specify Initial Parameters

In [2]:
# --- Run configuration -------------------------------------------------------
# Optional JSON parameter file. When left as None, no parameter file is copied
# or loaded by the setup cell below.
path_params = None # Path(r"")

# Inputs / outputs.
# NOTE: despite its name, `directory_data` currently points at a single .npy
# file (the combined features + labels), not at a directory.
# directory_data = r'/Users/josh/analysis/outputs/ROICaT/classification/00_data_ingestion'
directory_save = r'/Users/josh/analysis/outputs/ROICaT/classification/02_classifier_train'
directory_data = r'/Users/josh/analysis/outputs/ROICaT/classification/01_labels/feature_label_combo.npy'

# Behaviour flags.
save_latents = True
save_ROIs = True
testing = True


# --- Previous configuration style (separate ROI / latent / label files) ------
# Kept for reference only; none of these names are defined in this cell.
# path_params = None # Path(r"")
# # filepath_ROIs = r'/Users/josh/analysis/outputs/ROICaT/classification/01_labels/arr_ROIs.npy'
# filepath_ROIs = None
# filepath_latents = r'/Users/josh/analysis/outputs/ROICaT/classification/01_labels/arr_latents.npy'
# filepath_labels = r'/Users/josh/analysis/outputs/ROICaT/classification/01_labels/arr_labels.npy'
# testing = True
# assert (filepath_ROIs is None) != (filepath_latents is None), 'Exactly one of filepath_ROIs or filepath_latents should be set'
# assert Path(filepath_labels).exists(), 'File located at filepath_labels does not exist'

In [3]:
# Normalise the save location to a Path and make sure it exists on disk.
directory_save = Path(directory_save)
directory_save.mkdir(parents=True, exist_ok=True)

labelingRun_interim = dict()

# Best-effort archive of the parameter file alongside the outputs: a failure
# here is reported but deliberately does not stop the run.
if path_params is not None:
    try:
        Path(str(directory_save.resolve())).mkdir(parents=True, exist_ok=True)
        destination = str(Path(directory_save) / Path(path_params).name)
        shutil.copy2(path_params, destination)
    except Exception as e:
        print(f'JZ: Error copying params to {directory_save}')
        print(e)

# `tic` is the wall-clock reference; `tictoc` collects elapsed seconds per stage.
tic = time.time()
tictoc = {}
tictoc['start'] = time.time() - tic

# Load the parameter file when one was supplied; otherwise there are no params.
if path_params is None:
    params = None
else:
    params = file_helpers.json_load(str(Path(path_params).resolve()))

In [4]:
# # %matplotlib notebook
# # dir_allOuterFolders = str(Path('/Users/josh/analysis/data/ROICaT/classification/raw_images').resolve())
# dir_allOuterFolders = str(Path('/Users/josh/analysis/data/ROICaT/classification/stat_s2p').resolve())

# pathSuffixToStat = 'stat.npy'
# pathSuffixToOps = 'ops.npy'

# paths_allStat = test = helpers.find_paths(
#     dir_outer=dir_allOuterFolders,
#     reMatch=pathSuffixToStat,
#     depth=4,
# )
# paths_allOps = test = helpers.find_paths(
#     dir_outer=dir_allOuterFolders,
#     reMatch=pathSuffixToOps,
#     depth=4,
# )

# display(paths_allStat)
# display(paths_allOps)

# #Import data
# data = roicat.data_importing.Data_suite2p(
#     paths_statFiles=paths_allStat,
#     paths_opsFiles=paths_allOps,
#     um_per_pixel=2.0,
#     new_or_old_suite2p='new',
#     out_height_width=[36, 36],
#     type_meanImg='meanImgE',
#     verbose=True,
# );
# # Neural network embedding distances
# roinet = roicat.ROInet.ROInet_embedder(
#     device=roicat.util.helpers.set_device('cuda:0'),
#     dir_networkFiles=r"/Users/josh/analysis/models",
#     download_method="check_local_first",
#     download_url="https://osf.io/xwzhp/download",
#     download_hash="134b170242141c26b0adbd9e0fd80d0e",
#     forward_pass_version="head",
#     verbose=True,
# )

# roinet.generate_dataloader(
#     ROI_images=data.ROI_images,
#     um_per_pixel=data.um_per_pixel,
#     pref_plot=False,
#     batchSize_dataloader=8,
#     pinMemory_dataloader=True,
#     numWorkers_dataloader=mp.cpu_count(),
#     persistentWorkers_dataloader=True,
#     prefetchFactor_dataloader=2,    
# );

# # roicat.visualization.display_toggle_image_stack(roinet.ROI_images_rs)

# roinet.generate_latents();

# gc.collect()
# torch.cuda.empty_cache()
# gc.collect()
# torch.cuda.empty_cache()


# Load Precomputed Latents and Labels

In [5]:
# The file is read with pickling enabled and then indexed with `[()]`, which
# unwraps a 0-d object array into the object it holds (used below as a mapping
# with 'latents' and 'labels' entries).
# NOTE(review): allow_pickle=True can execute arbitrary code from the file;
# only load files produced by a trusted pipeline.
feature_label_archive = np.load(directory_data, allow_pickle=True)
feature_label_combo = feature_label_archive[()]
# TODO: Add alternative data importing method for raw data when feature_label_combo does not exist

# Train / Validation / Test Split Data, Hyperparameter Tune on Validation Set, and Fit Model

In [6]:
# Largest int64 value, passed as `n_train` to the splitter.
# NOTE(review): presumably this makes the training-set size cap a no-op
# (i.e. no downsampling) -- confirm against cu.Datasplit.
INTEGER_MAX = np.iinfo(np.int64(0).dtype).max

# TODO: JZ, IMPLEMENT AS LOOP FOR OPTUNA FOR HYPERPARAMETER TUNING
print('Splitting data...')
# Two-stage split using the data splitting object:
#   1) all data            -> (train + test pool) / validation, held-out fraction 0.2
#   2) train + test pool   -> train / test, held-out fraction 0.2 / (1 - 0.2) = 0.25
# so that validation and test each hold 20% of the full dataset and training 60%.
data_split_val = cu.Datasplit(
    features=feature_label_combo['latents'],
    labels=feature_label_combo['labels'],
    n_train=INTEGER_MAX,
    test_size=0.2,
)
data_split_test = cu.Datasplit(
    features=data_split_val.features_train,
    labels=data_split_val.labels_train,
    n_train=INTEGER_MAX,
    test_size = 0.2/(1 - 0.2),
)

print('Creating X and y matrices for training data...')
X_train = data_split_test.features_train
y_train = data_split_test.labels_train

X_val = data_split_val.features_val
y_val = data_split_val.labels_val

# FIX: the test set is the held-out partition of the *second* split. It was
# previously read from `data_split_val`, which made the test set an exact copy
# of the validation set and left the second split's held-out data unused.
X_test = data_split_test.features_val
y_test = data_split_test.labels_val

y_train = y_train.astype(int)
y_val = y_val.astype(int)
y_test = y_test.astype(int)

tictoc['loaded_data'] = time.time() - tic
print('Calculating class weights...')
# Balanced class weights are computed from the labels of the full dataset
# (not just the training split). `class_weights[i]` belongs to `classes_all[i]`.
classes_all = np.unique(feature_label_combo['labels'])
num_classes = len(classes_all)
class_weights = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced', classes=classes_all, y=feature_label_combo['labels'])

# Flatten to 1-D label vectors and 2-D (n_samples, n_features) feature matrices.
labels_train = y_train.reshape(-1)
features_train = X_train.reshape(-1, X_train.shape[-1])

labels_val = y_val.reshape(-1)
features_val = X_val.reshape(-1, X_val.shape[-1])

# The test split is prepared here but not evaluated in this cell.
labels_test = y_test.reshape(-1)
features_test = X_test.reshape(-1, X_test.shape[-1])

n_train_actual = X_train.shape[0]
n_val_actual = X_val.shape[0]
n_test_actual = X_test.shape[0]

tictoc['splitted_data'] = time.time() - tic

print(f'Fitting model to data of dimensions: X: {X_train.shape}, y: {y_train.shape}...')
# Create the logistic-regression classifier and fit it on the training split.
model = sklearn.linear_model.LogisticRegression(
   solver='lbfgs',
   fit_intercept=True,
   max_iter=10000,
   C=1e5,
   # FIX: key each weight by its actual class label (cast to int to match the
   # int-cast training labels) instead of by its position. Positional keys are
   # only correct when the labels happen to be exactly 0..K-1.
   class_weight={int(class_label): class_weight for class_label, class_weight in zip(classes_all, class_weights)},
)
model.fit(features_train, labels_train)

print(f'Calculating tracker outputs and saving to {directory_save}...')
# `tictoc` is passed by reference, so timings added below are still visible to
# the tracker when it saves.
training_tracker = cu.TrainingTracker(
    directory_save=directory_save,
    class_weights=class_weights, # Class Weights
    tictoc=tictoc, # Time Tracker
    n_train_actual=n_train_actual,
    model=({'coef':model.coef_, 'intercept':model.intercept_})
)

y_train_preds = model.predict(features_train).astype(int)
y_train_true = labels_train
y_val_preds = model.predict(features_val).astype(int)
y_val_true = labels_val

# Record training-set results under epoch index 0 (there is a single fit, no epochs)
training_tracker.add_accuracy(0, 'accuracy_training', y_train_true, y_train_preds) # Training accuracy
training_tracker.add_confusion_matrix(0, 'confusionMatrix_training', y_train_true, y_train_preds) # Training confusion matrix

# Record validation-set results under the same epoch index
training_tracker.add_accuracy(0, 'accuracy_val', y_val_true, y_val_preds) # Validation accuracy
training_tracker.add_confusion_matrix(0, 'confusionMatrix_val', y_val_true, y_val_preds) # Validation confusion matrix

tictoc[f'completed_training_in_{0}'] = time.time() - tic

training_tracker.save_results() # TODO: JZ, ADJUST RESULTS SAVING TO SAVE CONFUSION MATRICES AS NOT A DATAFRAME CSV
training_tracker.print_results()

# Fitted parameters that are written to disk by the save cell.
model_save = {
    'intercept_': model.intercept_,
    'coef_': model.coef_,
    'classes_': model.classes_,
}

Splitting data...
Creating X and y matrices for training data...
Calculating class weights...
Fitting model to data of dimensions: X: torch.Size([2938, 100]), y: (2938,)...
Calculating tracker outputs and saving to /Users/josh/analysis/outputs/ROICaT/classification/02_classifier_train...
Saving results:  /Users/josh/analysis/outputs/ROICaT/classification/02_classifier_train/results_training.csv /Users/josh/analysis/outputs/ROICaT/classification/02_classifier_train/results_timing.json
self.tictoc={'start': 6.699562072753906e-05, 'loaded_data': 0.2793619632720947, 'splitted_data': 0.28069400787353516, 'completed_training_in_0': 1.374708890914917}
self.model={'coef': array([[ 1.69406483e-04, -5.27381853e-04, -2.60215527e-04,
        -2.68778208e-04,  3.91715543e-04, -4.79283612e-04,
         2.32144896e-04,  2.07393075e-04,  6.25804876e-04,
         3.00665636e-04,  6.02871984e-04,  6.75283610e-04,
        -2.07972300e-04, -1.61987044e-04,  1.55279871e-04,
         4.83258452e-04,  3.8750

# Save Outputs

In [8]:
# Persist the fitted-parameter dictionary as a pickled .npy file in the save directory.
path_model = (Path(directory_save) / 'model.npy').resolve()
np.save(str(path_model), model_save, allow_pickle=True)

# Disabled: saving of an interim training-run object (not defined in this notebook).
# with open(str((Path(directory_save) / 'classifierTrainingRun_interim.pkl').resolve()), 'wb') as f:
#     np.save(
#         file=f,
#         arr=classifierTrainingRun_interim,
#         allow_pickle=True
#     )

print(f'Saved model fit results.')

Saved model fit results.
