# Package

In [2]:
# ! pip install cebra

import sys
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import random
import cebra
import cebra.datasets
from cebra import CEBRA
import cebra.grid_search
import time
import os
from collections import Counter

In [6]:
# Directory of extracted calcium data; presumably the `conf_dir` argument for process_data (no call is visible in this notebook — confirm)
conf_dir = "/scratch/09117/xz6783/Xu/PL_miniscope/PL/Analysis/Cat_Odor_Conflict/Conflict/extracted_data/Calcium"
# Directory from which the per-animal training CSV (`<animal_id>.csv`) is read
source_dir = "/scratch/09117/xz6783/Xu/CEBRA/Cebra_behavior/Cat_odor_conflict/training/data"
# Directory into which training-result CSVs are written
output_dir = "/scratch/09117/xz6783/Xu/CEBRA/Cebra_behavior/Cat_odor_conflict/training/output"
# Animal identifier; used to build the input CSV name and to tag result files
animal_id = '9M6'

# Preprocessing

In [16]:
def process_data(conf_dir, file_name, bin_width=20):
    """Load one recording pickle and return it as a binned, time-major table.

    The pickle is read as a DataFrame whose columns are samples and whose rows
    are signals. The column labels are cut into ``n_bins`` equal-width
    intervals (``n_bins = number of columns // bin_width``) and the samples in
    each interval are averaged. The result is transposed so that each row is
    one bin, and the columns are rearranged by position:

    * column 0 is a 1-based ``Timepoint`` counter,
    * columns 1-6 are the six source rows found at positions ``-8:-2``
      (the first two of them are renamed to ``'Time'`` and ``'Speed'``),
    * the remaining leading source rows follow and are renamed ``0, 1, 2, ...``.

    The last two source rows are not included in the output. Any column that
    contains NaN or +/-inf after binning is dropped before the reordering.

    Parameters
    ----------
    conf_dir : str
        Directory containing the pickle.
    file_name : str
        File name of the pickle inside ``conf_dir``.
    bin_width : int, optional
        Number of sample columns averaged per bin. The default of 20 is what
        the original code used for its "one-second bin" (presumably a 20 Hz
        recording — confirm against the acquisition settings).

    Returns
    -------
    pandas.DataFrame
        One row per bin, columns laid out as described above.

    Raises
    ------
    ValueError
        If ``bin_width`` is not positive or the file has fewer than
        ``bin_width`` sample columns.
    KeyError
        If the file has no ``"Tone active"`` row. The original implementation
        looked this row up (for cue on/offset values it never used); the check
        is kept so that files with an unexpected row layout still fail loudly.
    """
    if bin_width < 1:
        raise ValueError(f"bin_width must be a positive integer, got {bin_width}")

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    raw = pd.read_pickle(os.path.join(conf_dir, file_name))

    n_bins = raw.shape[1] // bin_width
    if n_bins < 1:
        raise ValueError(
            f"need at least {bin_width} sample columns to form one bin, got {raw.shape[1]}"
        )

    # Group on the transposed frame instead of using groupby(axis=1), which is
    # deprecated in recent pandas. observed=False keeps empty intervals so the
    # result always has exactly n_bins rows.
    bin_of_sample = pd.cut(raw.columns, n_bins)
    binned = raw.T.groupby(bin_of_sample, observed=False).mean()

    if "Tone active" not in binned.columns:
        raise KeyError("Tone active")

    # Rows are bins from here on; replace the interval index by 0..n_bins-1 and
    # add the 1-based bin counter as the first column.
    binned = binned.reset_index(drop=True)
    binned.insert(0, "Timepoint", np.arange(1, n_bins + 1))
    binned = binned.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

    # Positional reorder: Timepoint, the six columns at [-8:-2], then the
    # leading columns. The final two columns are intentionally left out.
    cols = list(binned.columns)
    cols = cols[:1] + cols[-8:-2] + cols[1:-8]
    binned = binned[cols]
    binned = binned.rename(columns={binned.columns[1]: 'Time', binned.columns[2]: 'Speed'})

    # Relabel everything from the 8th column onwards as 0, 1, 2, ...
    signal_cols = binned.columns[7:]
    binned = binned.rename(columns={old: new for new, old in enumerate(signal_cols)})

    return binned

# 1. Dataset

In [18]:
# Load this animal's table; the first CSV column is used as the row index
Dt = pd.read_csv(os.path.join(source_dir, f'{animal_id}.csv'), index_col=0)
# Split by position: the first seven columns are the time/behaviour labels (Y),
# everything after them is the numbered signal columns (X)
Y, X = Dt.iloc[:, :7], Dt.iloc[:, 7:]
# View data
Dt

Unnamed: 0,Timepoint,Time,Speed,Freezing,In Food Area,In Hidden Area,Bar Press active,0,1,2,...,35,36,37,38,39,40,41,42,43,44
0,1,0.463857,0.010170,0.0,0.0,1.0,0.0,0.000000,0.000000,0.000000e+00,...,0.000000,0.000000,0.000000,0.033187,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000
1,2,1.500900,0.040032,0.0,0.0,1.0,0.0,0.000000,0.000000,0.000000e+00,...,0.000000,0.000000,0.000000,0.447949,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000
2,3,2.512950,0.021748,0.0,0.0,1.0,0.0,0.000000,0.000000,0.000000e+00,...,0.000000,0.000000,0.000000,0.423676,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000
3,4,3.525550,0.008394,0.0,0.0,1.0,0.0,0.000000,0.000000,0.000000e+00,...,0.000000,0.000000,0.000000,0.394329,0.000000,0.047105,0.000000,7.818765e-10,0.000000e+00,0.000000
4,5,4.537200,0.023210,0.0,0.0,1.0,0.0,0.000000,0.000000,0.000000e+00,...,0.000000,0.092810,0.000000,0.367015,0.000000,0.261374,0.054188,6.545757e-01,4.199436e-10,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,1723,1744.519150,0.016032,0.0,0.0,0.0,0.0,0.537072,1.685947,2.335534e-08,...,0.676480,4.079527,0.613445,1.040939,0.161407,1.026348,2.604121,0.000000e+00,1.319064e-03,2.268557
1723,1724,1745.531500,0.028584,0.0,0.0,0.0,0.0,0.482538,1.564312,2.011682e-08,...,0.647195,3.778004,0.582228,0.978935,0.144029,1.089173,3.347503,0.000000e+00,1.202144e-03,2.126459
1724,1725,1746.543550,0.027268,0.0,0.0,0.0,0.0,0.433542,1.451453,1.732317e-08,...,0.619177,3.498768,0.552599,0.911128,0.128522,1.009110,2.995678,0.000000e+00,1.095588e-03,1.934140
1725,1726,1747.555450,0.017227,0.0,0.0,0.0,0.0,0.389520,1.346736,1.491559e-08,...,0.592372,3.240170,0.524478,0.848017,0.114685,1.551771,2.667912,0.000000e+00,9.984770e-04,1.827837


In [19]:
# Label block: Timepoint, Time, Speed and the four behaviour flag columns
Y

Unnamed: 0,Timepoint,Time,Speed,Freezing,In Food Area,In Hidden Area,Bar Press active
0,1,0.463857,0.010170,0.0,0.0,1.0,0.0
1,2,1.500900,0.040032,0.0,0.0,1.0,0.0
2,3,2.512950,0.021748,0.0,0.0,1.0,0.0
3,4,3.525550,0.008394,0.0,0.0,1.0,0.0
4,5,4.537200,0.023210,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...
1722,1723,1744.519150,0.016032,0.0,0.0,0.0,0.0
1723,1724,1745.531500,0.028584,0.0,0.0,0.0,0.0
1724,1725,1746.543550,0.027268,0.0,0.0,0.0,0.0
1725,1726,1747.555450,0.017227,0.0,0.0,0.0,0.0


In [20]:
# Feature block: the numbered signal columns that follow the seven label columns
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.033187,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000
1,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.447949,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000
2,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.423676,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000
3,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.394329,0.000000,0.047105,0.000000,7.818765e-10,0.000000e+00,0.000000
4,0.000000,0.000000,0.000000e+00,0.008079,0.000000,0.014879,0.000000,0.000000,0.000000,0.451172,...,0.000000,0.092810,0.000000,0.367015,0.000000,0.261374,0.054188,6.545757e-01,4.199436e-10,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,0.537072,1.685947,2.335534e-08,0.118103,0.033334,0.001051,0.349872,0.008583,0.000382,0.178944,...,0.676480,4.079527,0.613445,1.040939,0.161407,1.026348,2.604121,0.000000e+00,1.319064e-03,2.268557
1723,0.482538,1.564312,2.011682e-08,0.100563,0.028107,0.000881,0.309043,0.007353,0.000305,0.154162,...,0.647195,3.778004,0.582228,0.978935,0.144029,1.089173,3.347503,0.000000e+00,1.202144e-03,2.126459
1724,0.433542,1.451453,1.732317e-08,0.085628,0.023699,0.000739,0.272979,0.006299,0.000243,0.132813,...,0.619177,3.498768,0.552599,0.911128,0.128522,1.009110,2.995678,0.000000e+00,1.095588e-03,1.934140
1725,0.389520,1.346736,1.491559e-08,0.072911,0.019982,0.000620,0.241124,0.005397,0.000194,0.114420,...,0.592372,3.240170,0.524478,0.848017,0.114685,1.551771,2.667912,0.000000e+00,9.984770e-04,1.827837


# 2. Model setup
## We consider three combinations of categorical outcomes in this demo
1. [Freezing, In Hidden Area] 
2. [In Food Area, Bar Press Active] 
3. [Freezing, In Hidden Area, In Food Area, Bar Press Active]

## (1). Hyperparameter space

In [21]:
# Candidate values per hyperparameter; single-element lists are held fixed
distance = ['cosine']
temperature_mode = ['auto']
time_offsets = [10]
batch_size = [None]
learning_rate = [0.0001, 0.001, 0.01, 0.1, 1]
conditional = ['time_delta']
output_dimension = [2, 3, 5, 10, 15]
num_hidden_units = [32, 50, 75]
max_iterations = [7500]

# Full Cartesian grid, one tuple per combination (the last list varies fastest)
Hyperparameter_Space = list(itertools.product(
    distance,
    temperature_mode,
    time_offsets,
    batch_size,
    learning_rate,
    conditional,
    output_dimension,
    num_hidden_units,
    max_iterations,
))

# One row per combination, one named column per hyperparameter
Hyperparameter_Mat = pd.DataFrame(
    Hyperparameter_Space,
    columns=['distance', 'temperature_mode', 'time_offsets', 'batch_size',
             'learning_rate', 'conditional', 'output_dimension', 'num_hidden_units',
             'max_iterations'],
)
Hyperparameter_Mat

Unnamed: 0,distance,temperature_mode,time_offsets,batch_size,learning_rate,conditional,output_dimension,num_hidden_units,max_iterations
0,cosine,auto,10,,0.0001,time_delta,2,32,7500
1,cosine,auto,10,,0.0001,time_delta,2,50,7500
2,cosine,auto,10,,0.0001,time_delta,2,75,7500
3,cosine,auto,10,,0.0001,time_delta,3,32,7500
4,cosine,auto,10,,0.0001,time_delta,3,50,7500
...,...,...,...,...,...,...,...,...,...
70,cosine,auto,10,,1.0000,time_delta,10,50,7500
71,cosine,auto,10,,1.0000,time_delta,10,75,7500
72,cosine,auto,10,,1.0000,time_delta,15,32,7500
73,cosine,auto,10,,1.0000,time_delta,15,50,7500


## (2). Training function

In [22]:
# Self-defined training function
def Self_CEBRA_Time_Training(Toy, X, y):
    """Fit one CEBRA model per hyperparameter row and record its final loss.

    Parameters
    ----------
    Toy : pandas.DataFrame
        One row per configuration, with the columns ``distance``,
        ``temperature_mode``, ``time_offsets``, ``batch_size``,
        ``learning_rate``, ``conditional``, ``output_dimension``,
        ``num_hidden_units`` and ``max_iterations``.
    X : array-like
        Input data passed unchanged to ``CEBRA.fit``.
    y : array-like
        Labels passed unchanged to ``CEBRA.fit``.

    Returns
    -------
    pandas.DataFrame
        ``Toy`` with one extra column, ``'Accuracy_Testing'``. Despite its
        name, this column holds the last value of the model's training loss
        (``state_dict_['loss']``), not an accuracy; the name is kept so that
        previously saved result files keep the same schema.

    Notes
    -----
    * ``num_hidden_units`` is forwarded to the model. It used to be read from
      the grid but never passed on, so every model was trained with the
      library default regardless of the grid value.
    * The final loss is taken with ``[-1]`` rather than a fixed index, so
      ``max_iterations`` values other than 7500 work.
    * The loss column is aligned on ``Toy.index``, so ``Toy`` does not need a
      0..n-1 index for the rows to line up.
    * Prints the total wall-clock time once all rows have been trained.
    """
    # Timer
    start_time = time.time()

    final_losses = []

    for row in range(len(Toy)):
        hyper = Toy.iloc[row, :]

        # A missing batch size may arrive as NaN (e.g. after a CSV round trip);
        # the grid uses None for it, so map NaN back to None.
        batch_size = hyper['batch_size']
        batch_size = None if pd.isna(batch_size) else int(batch_size)

        cebra_behavior_model = CEBRA(
            batch_size=batch_size,
            learning_rate=float(hyper['learning_rate']),
            temperature_mode=hyper['temperature_mode'],
            output_dimension=int(hyper['output_dimension']),
            num_hidden_units=int(hyper['num_hidden_units']),
            max_iterations=int(hyper['max_iterations']),
            distance=hyper['distance'],
            conditional=hyper['conditional'],
            time_offsets=hyper['time_offsets'],
            device='cuda_if_available',
            verbose=True,
        )

        # Model fitting
        cebra_behavior_model.fit(X, y)

        # Loss value at the last training iteration
        final_losses.append(float(cebra_behavior_model.state_dict_['loss'][-1]))

    print("--- %s seconds ---" % (time.time() - start_time))

    # Training performance, aligned row-for-row with the hyperparameter table
    Performance = pd.DataFrame({'Accuracy_Testing': final_losses}, index=Toy.index)

    # Hyperparameters and their resulting loss side by side
    Combined_Rsult = pd.concat([Toy, Performance], axis=1)
    return Combined_Rsult

In [23]:
# Split the row positions of the hyperparameter grid into 5 batches (pieces 0-4)
Hyp_Pieces = np.array_split(np.arange(len(Hyperparameter_Mat)), 5)

## (3). Train test split

In [24]:
# Train/test split (80/20, fixed seed), with every part put back into row-index order.
# NOTE(review): the split is a random shuffle, so after sorting each part is a time
# series with gaps — confirm this is acceptable for the time-offset based training.
X_train, X_test, y_train, y_test = (
    part.sort_index()
    for part in train_test_split(X, Y, test_size=0.2, random_state=2024)
)

# 3. Model training for different outcome combination

## (1). [Freezing, In Hidden Area]

In [25]:
# Extract specific combination: positional columns 3 and 5 of the label table
# ('Freezing' and 'In Hidden Area' in the table displayed earlier), as a NumPy array
y_train_Comb1 = y_train.iloc[:, [3, 5]].to_numpy()

In [None]:
# Batch 0 (Comb 1): train on piece 0 of the hyperparameter grid and save the results
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[0], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb1)
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 0 (Comb 1) {animal_id}.csv'))

pos:  0.5056 neg:  6.3822 total:  6.8878 temperature:  0.6276: 100%|██████████| 7500/7500 [03:25<00:00, 36.44it/s]
pos:  0.4912 neg:  6.4594 total:  6.9506 temperature:  0.7721:  49%|████▉     | 3675/7500 [01:43<01:50, 34.55it/s]

In [14]:
# Batch 1 (Comb 1): train on piece 1 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[1], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb1)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 1 (Comb 1) {animal_id}.csv'))

pos:  0.4384 neg:  6.3003 total:  6.7387 temperature:  0.4306: 100%|██████████| 7500/7500 [03:16<00:00, 38.17it/s]
pos:  0.4767 neg:  6.3141 total:  6.7908 temperature:  0.4343: 100%|██████████| 7500/7500 [03:22<00:00, 37.12it/s]
pos:  0.4609 neg:  6.2913 total:  6.7522 temperature:  0.4382: 100%|██████████| 7500/7500 [03:08<00:00, 39.69it/s]
pos:  0.6248 neg:  5.9505 total:  6.5753 temperature:  0.1363: 100%|██████████| 7500/7500 [03:13<00:00, 38.67it/s]
pos:  0.6821 neg:  5.9670 total:  6.6491 temperature:  0.1449: 100%|██████████| 7500/7500 [03:10<00:00, 39.36it/s]
pos:  0.6744 neg:  5.9592 total:  6.6336 temperature:  0.1373: 100%|██████████| 7500/7500 [03:10<00:00, 39.43it/s]
pos:  1.1961 neg:  5.2089 total:  6.4051 temperature:  0.1000: 100%|██████████| 7500/7500 [03:32<00:00, 35.34it/s]
pos:  1.1651 neg:  5.2052 total:  6.3703 temperature:  0.1000: 100%|██████████| 7500/7500 [03:31<00:00, 35.48it/s]
pos:  1.2474 neg:  5.1946 total:  6.4421 temperature:  0.1000: 100%|██████████| 

--- 3197.175255537033 seconds ---


In [15]:
# Batch 2 (Comb 1): train on piece 2 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[2], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb1)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 2 (Comb 1) {animal_id}.csv'))

pos:  0.4205 neg:  6.3108 total:  6.7313 temperature:  0.4636: 100%|██████████| 7500/7500 [03:07<00:00, 39.93it/s]
pos:  0.4848 neg:  6.3017 total:  6.7864 temperature:  0.4500: 100%|██████████| 7500/7500 [03:06<00:00, 40.29it/s]
pos:  0.4559 neg:  6.2958 total:  6.7517 temperature:  0.4276: 100%|██████████| 7500/7500 [03:06<00:00, 40.12it/s]
pos:  0.6781 neg:  5.9492 total:  6.6273 temperature:  0.1000: 100%|██████████| 7500/7500 [03:16<00:00, 38.17it/s]
pos:  0.6398 neg:  5.9544 total:  6.5942 temperature:  0.1000: 100%|██████████| 7500/7500 [03:09<00:00, 39.49it/s]
pos:  0.6421 neg:  5.9488 total:  6.5909 temperature:  0.1000: 100%|██████████| 7500/7500 [03:05<00:00, 40.46it/s]
pos:  1.1814 neg:  5.1972 total:  6.3786 temperature:  0.1000: 100%|██████████| 7500/7500 [03:19<00:00, 37.63it/s]
pos:  1.1749 neg:  5.2184 total:  6.3933 temperature:  0.1000: 100%|██████████| 7500/7500 [03:19<00:00, 37.53it/s]
pos:  1.2006 neg:  5.2076 total:  6.4082 temperature:  0.1000: 100%|██████████| 

KeyboardInterrupt: 

In [None]:
# Batch 3 (Comb 1): train on piece 3 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[3], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb1)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 3 (Comb 1) {animal_id}.csv'))

In [None]:
# Batch 4 (Comb 1): train on piece 4 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[4], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb1)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 4 (Comb 1) {animal_id}.csv'))

## (2). [In Food Area, Bar Press Active]

In [None]:
# Extract specific combination: positional columns 4 and 6 of the label table
# ('In Food Area' and 'Bar Press active' in the table displayed earlier).
# Converted to a NumPy array so the labels reach the training function in the
# same form as y_train_Comb1.
y_train_Comb2 = y_train.iloc[:, [4, 6]].to_numpy()

In [None]:
# Batch 0 (Comb 2): train on piece 0 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[0], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb2)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 0 (Comb 2) {animal_id}.csv'))

In [None]:
# Batch 1 (Comb 2): train on piece 1 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[1], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb2)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 1 (Comb 2) {animal_id}.csv'))

In [None]:
# Batch 2 (Comb 2): train on piece 2 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[2], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb2)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 2 (Comb 2) {animal_id}.csv'))

In [None]:
# Batch 3 (Comb 2): train on piece 3 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[3], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb2)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 3 (Comb 2) {animal_id}.csv'))

In [None]:
# Batch 4 (Comb 2): train on piece 4 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[4], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb2)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 4 (Comb 2) {animal_id}.csv'))

## (3). [Freezing, In Hidden Area, In Food Area, Bar Press Active]

In [None]:
# Extract specific combination: positional columns 3-6 of the label table
# ('Freezing', 'In Food Area', 'In Hidden Area' and 'Bar Press active' in the
# table displayed earlier). Converted to a NumPy array so the labels reach the
# training function in the same form as y_train_Comb1.
y_train_Comb3 = y_train.iloc[:, [3, 4, 5, 6]].to_numpy()

In [None]:
# Batch 0 (Comb 3): train on piece 0 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[0], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb3)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 0 (Comb 3) {animal_id}.csv'))

In [None]:
# Batch 1 (Comb 3): train on piece 1 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[1], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb3)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 1 (Comb 3) {animal_id}.csv'))

In [None]:
# Batch 2 (Comb 3): train on piece 2 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[2], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb3)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 2 (Comb 3) {animal_id}.csv'))

In [None]:
# Batch 3 (Comb 3): train on piece 3 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[3], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb3)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 3 (Comb 3) {animal_id}.csv'))

In [None]:
# Batch 4 (Comb 3): train on piece 4 of the hyperparameter grid
Toy0 = Hyperparameter_Mat.iloc[Hyp_Pieces[4], :].reset_index(drop=True)
Result_0 = Self_CEBRA_Time_Training(Toy0, X_train, y_train_Comb3)
# Save under output_dir with the animal ID in the file name, so results for
# different animals do not overwrite each other in the working directory
Result_0.to_csv(os.path.join(output_dir, f'CEBRA_Behavior Training Results 4 (Comb 3) {animal_id}.csv'))