# __Quantify the clustering performance and the stability of cluster assignment__

# Part 1) Compute UMAPs of continuous or interval-binned data using the rapids-0.18 environment

In [None]:
# Load all dependencies from the rapids environment:

from __future__ import print_function
import time
from datetime import datetime

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import math

from cuml.manifold import UMAP as cuML_UMAP

%matplotlib inline

## Load, clean, and shuffle the data:

In [None]:
# Toggle for the UMAP input: True -> use the continuous measures listed in `l_col_names`,
# False -> use their interval-binned counterparts ('<name>_intervals_<n>').
USE_CONT_DATA = True
# Number of bins passed to pd.cut for every measure when interval binning is applied.
INTERVALLS_PER_DIMENSION = 100

In [None]:
filename = 'States_ceiling_reduced.csv'

#filename = 'States_ceiling_mix2_with_UMAP.csv'

In [None]:
df = pd.read_csv(filename)
df.columns

In [None]:
df.shape[0]

In [None]:
# Measure columns that serve as UMAP input dimensions and as source columns for the interval binning.
# Entries that are commented out are currently excluded from the feature set.
l_col_names = ['norm_HeartRate', 
               #'norm_HR_Low_Signal',
               #'norm_HR_Med_Delta',
               #'norm_HR_Med_Amp',
               'norm_HR_High_Amp',
               #'norm_HR_CoV_10s',
               'norm_Ceiling',
               'norm_DistanceToCeiling',
               'norm_Motion',
               'norm_AreaExplored',
               'norm_Speed',
               'norm_Temperature']

In [None]:
if filename != 'States_ceiling_reduced.csv':
    # Any other file is read with its first column as the index
    # (presumably a file written by an earlier run of this notebook - TODO confirm).
    df = pd.read_csv(filename, index_col=0)

else:
    # The binned columns were integrated "on the fly"; it is not verified that this branch still works from scratch.
    df = pd.read_csv(filename)

    # Keep only the datapoints that are not flagged for exclusion:
    keep_mask = df['Exclude'] == False
    df = df.loc[keep_mask]

    # The former index column becomes 'Data_idx', so the clusterings can be matched again after shuffling:
    df.columns = ['Data_idx'] + list(df.columns[1:])

    # Append one interval-binned column per measure behind the existing columns.
    # Other binnings can be added the same way; only the insert position needs updating.
    column_count = len(df.columns)
    n_dim_bins = INTERVALLS_PER_DIMENSION
    for offset, col_name in enumerate(l_col_names):
        new_col_name = '{}_intervals_{}'.format(col_name, n_dim_bins)
        binned_values = pd.cut(df[col_name], n_dim_bins, labels=False, duplicates='drop') / n_dim_bins
        df.insert(column_count + offset, new_col_name, binned_values)

# Randomise the row order for this run:
df_shuffled = df.sample(frac=1).copy()

# Display the shuffled DataFrame for visual inspection:
df_shuffled

In [None]:
if USE_CONT_DATA:
    # Continuous mode: the measure columns are used as they are.
    l_dimensions = l_col_names
else:
    # Binned mode: (re)compute the interval-binned columns on the shuffled data and use those instead.
    l_dimensions = []
    n_dim_bins = INTERVALLS_PER_DIMENSION
    for col_name in l_col_names:
        new_col_name = '{}_intervals_{}'.format(col_name, n_dim_bins)
        df_shuffled[new_col_name] = pd.cut(df_shuffled[col_name], n_dim_bins, labels=False, duplicates='drop') / n_dim_bins
        l_dimensions.append(new_col_name)

print('\nThese columns will be used as input for UMAP: \n', l_dimensions, '\n \n', sep='\n')

For now, `n_neighbors` is set to 103, which is the result of the following computation if all valid datapoints are used: 

##### `(Square root of the number of all datapoints / number of dimensions) + 1`
##### `int((df.shape[0] ** (1/2)) / len(l_dimensions)) +1`

In [None]:
# n_neighbors = floor(sqrt(number of datapoints) / number of input dimensions) + 1
n_datapoints = len(df_shuffled[l_dimensions])
n_neighbors = int(n_datapoints ** (1/2) / len(l_dimensions)) + 1

In [None]:
#n_neighbors = 103

In [None]:
print(n_neighbors)

In [None]:
# Double n_neighbors when the computed value is below 75.
# NOTE: this cell is not idempotent - running it again may double the value a second time.
n_neighbors = n_neighbors * 2 if n_neighbors < 75 else n_neighbors

print(n_neighbors)

In [None]:
df_shuffled[l_dimensions]

### Compute UMAP:

Alright, let's go. This step may take up to half an hour. To keep track of the time, the start time and, finally, the elapsed computation time will be printed.

In [None]:
now = datetime.now()
print('UMAP computations were started at:')
print(now.strftime("%d/%m/%Y %H:%M:%S"))

In [None]:
# Fit a 2D UMAP embedding on the selected dimensions and report the wall-clock time it took.
time_start = time.time()
umap = cuML_UMAP(n_neighbors=n_neighbors, n_components=2, random_state=42)
umap_results = umap.fit_transform(df_shuffled[l_dimensions])
elapsed_seconds = time.time() - time_start
print('UMAP done! Time elapsed: {} seconds'.format(elapsed_seconds))

### Inspect the embedded space:

Especially for the interval binned data the UMAP space is quite heavily spread out (yet only single points at these extremes). <br>
To keep everything focused, both x- and y-axis are fixed between -25 and 25.

In [None]:
size = 0.25

# Single-panel scatter of the embedding; both axes are clamped to [-25, 25].
fig, ax = plt.subplots(figsize=(10, 10), facecolor='white')
sns.scatterplot(x=umap_results[:, 0], y=umap_results[:, 1], s=size, ax=ax)
ax.set_xlim(-25, 25)
ax.set_ylim(-25, 25)

plt.show()

### Append the embedding coordinates to the DataFrame

In [None]:
# Column prefix of the embedding: 'UMAP_cont_r' for continuous input, 'UMAP_intv_<n>_r' for binned input.
embedding_col_name = 'UMAP_cont_r' if USE_CONT_DATA else 'UMAP_intv_{}_r'.format(INTERVALLS_PER_DIMENSION)

# Store both embedding axes of run 1 as '<prefix>1_1' and '<prefix>1_2':
for axis_no in (1, 2):
    df_shuffled['{}1_{}'.format(embedding_col_name, axis_no)] = umap_results[:, axis_no - 1]

In [None]:
# Bring the run-1 coordinates back into the unshuffled frame, matched on 'Data_idx'.
# NOTE: running this cell twice merges the same columns again (pandas then appends '_x'/'_y' suffixes).
run_1_columns = ['Data_idx', embedding_col_name + '1_1', embedding_col_name + '1_2']
df_shuffled_results = df_shuffled[run_1_columns].copy()

df = df.merge(df_shuffled_results, on='Data_idx', how='outer')

df

### If everything looks good so far, run the following cell to save the DataFrame

In [None]:
df.to_csv('States_ceiling_reduced3_with_UMAP.csv')

### Since we now validated the workflow, let´s run the remaining `X` runs in one go:

In [None]:
X = 2

In [None]:
now = datetime.now()
print('Remaining computations were started at:')
print(now.strftime("%d/%m/%Y %H:%M:%S"))

# Run 1 already exists, so the X additional runs are numbered 2 .. X+1.
for run in range(2, X + 2):
    col_axis_1 = '{}{}_1'.format(embedding_col_name, run)
    col_axis_2 = '{}{}_2'.format(embedding_col_name, run)

    # Every run gets its own fresh shuffling of the data:
    df_shuffled = df.sample(frac=1).copy()

    now = datetime.now()
    print('UMAP computations of run {} were started at:'.format(run))
    print(now.strftime("%d/%m/%Y %H:%M:%S"))

    time_start = time.time()
    umap = cuML_UMAP(n_neighbors=n_neighbors, n_components=2, random_state=42)
    umap_results = umap.fit_transform(df_shuffled[l_dimensions])
    print('UMAP done! Time elapsed: {} seconds'.format(time.time() - time_start))

    df_shuffled[col_axis_1] = umap_results[:, 0]
    df_shuffled[col_axis_2] = umap_results[:, 1]

    # Merge the coordinates of this run back into the main frame via 'Data_idx':
    df_shuffled_results = df_shuffled[['Data_idx', col_axis_1, col_axis_2]].copy()
    df = df.merge(df_shuffled_results, on='Data_idx', how='outer')

    print('Done with run {}! :)'.format(run))

In [None]:
df.columns

### Now let´s compare the three UMAP "state spaces":

In [None]:
size = 1

column_specifier = 'cont'
#column_specifier = 'intv_100'

fig = plt.figure(figsize=(20, 10), facecolor='white')
gs = fig.add_gridspec(1, 3)

# One panel per run, side by side:
for panel_idx, run_no in enumerate((1, 2, 3)):
    ax = fig.add_subplot(gs[0, panel_idx])
    sns.scatterplot(
        data=df,
        x='UMAP_{}_r{}_1'.format(column_specifier, run_no),
        y='UMAP_{}_r{}_2'.format(column_specifier, run_no),
        s=size,
        ax=ax)
    ax.set_title('Run {}'.format(run_no))

plt.show()

In [None]:
df.head()

### All good? Then save the results and either start again to compute UMAPs of interval binned data or continue with the clustering of the data! :)

In [None]:
df.to_csv('States_ceiling_reduced3_with_UMAP.csv')

# Part 2) Cluster the embedded spaces using the HDBSCAN environment

In [None]:
# Load all dependencies from the HDBSCAN environment:

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from hdbscan import HDBSCAN

# Until the joblib bug is fixed (https://github.com/scikit-learn-contrib/hdbscan/issues/436) - joblib version 0.17 (instead of latest: 1.0.0) has to be used!
import joblib

%matplotlib inline

### Confirm that joblib version 0.17 is used:

In [None]:
joblib.__version__

### Load the data:

In [None]:
df = pd.read_csv('States_ceiling_reduced3_with_UMAP.csv', index_col = 0)

In [None]:
df

### Before we even start with the clustering, let's define the parameter `min_cluster_size`:

In [None]:
df.loc[df['Exclude'] == False].groupby('behaviors').count()

#### min_cluster_size is set to 161, which reflects 25% of the bin count of the least frequently occurring behavior, calculated as:
###### `int(df.loc[df['behaviors'].isin(['Flight', 'Rearing', 'StretchAttend', 'TailRattling', 'Grooming', 'Immobility', 'HeadDips'])].groupby('behaviors').count().min().min()*0.25)`

In [None]:
min_cluster_size = 161

In [None]:
# Cluster the 2D embedding of every run with HDBSCAN and store the labels as 'Cluster_<code>_r<run>'.
#for code in ['cont', 'intv_100']:
for code in ['cont']:
    for run in range(1, 4):
        l_features = ['UMAP_{}_r{}_{}'.format(code, run, axis_no) for axis_no in (1, 2)]
        cluster_column = 'Cluster_{}_r{}'.format(code, run)

        df_to_cluster = df[l_features].copy()
        hdbscan = HDBSCAN(min_cluster_size=min_cluster_size,
                          cluster_selection_method='leaf',
                          allow_single_cluster=True)
        df[cluster_column] = hdbscan.fit(df_to_cluster).labels_

    # Save once all runs of this embedding type are clustered:
    df.to_csv('Clustered_nN-{}_States_ceiling_reduced3_with_UMAP.csv'.format(min_cluster_size))

#### Set the variables to inspect the clustering. You can also highlight the cluster IDs of a different run to visually inspect the consistency

In [None]:
l_features = 'UMAP_cont_r1_1', 'UMAP_cont_r1_2'

In [None]:
cluster_column = 'Cluster_cont_r1'

In [None]:
# Embedding coloured by cluster ID; both axes are clamped to [-25, 25].
fig, ax = plt.subplots(figsize=(14, 14))
sns.scatterplot(
    data=df,
    x=l_features[0],
    y=l_features[1],
    hue=cluster_column,
    palette='Spectral',
    legend=False,
    #alpha=0.3,
    s=1,
    ax=ax)
ax.set_xlim(-25, 25)
ax.set_ylim(-25, 25)

plt.show()

In [None]:
fig = plt.figure(figsize=(30, 15), facecolor='white')

gs = fig.add_gridspec(2, 4)

# Left half: one small panel per session, each in its own colour.
session_panels = [((0, 0), 'OF', 'goldenrod'),
                  ((0, 1), 'EPM', 'forestgreen'),
                  ((1, 0), 'CD1', 'dodgerblue'),
                  ((1, 1), 'CD2', 'magenta')]

for (grid_row, grid_col), session_name, session_color in session_panels:
    ax = fig.add_subplot(gs[grid_row, grid_col])
    sns.scatterplot(
        x=l_features[0],
        y=l_features[1],
        color=session_color,
        data=df.loc[df['Session'] == session_name],
        legend=False,
        #alpha=0.3,
        s=0.5,
        ax=ax)
    ax.set_ylim(-15, 15)
    ax.set_xlim(-15, 15)
    ax.set_title(session_name)

# Right half: all sessions overlaid in one large panel, coloured by session.
ax_overlay = fig.add_subplot(gs[0:2, 2:4])
sns.scatterplot(
    x=l_features[0],
    y=l_features[1],
    hue='Session',
    palette=['goldenrod', 'forestgreen', 'dodgerblue', 'magenta'],
    data=df,
    legend=True,
    alpha=0.3,
    s=2,
    ax=ax_overlay)
ax_overlay.set_ylim(-15, 15)
ax_overlay.set_xlim(-15, 15)

#plt.savefig('State_spaces_reduced2.png', dpi=600)

plt.show()



# Part 3) Finally, let´s compare the clustering performance and the consistency of the results:

In [None]:
# Load all dependencies of the base environment:
from __future__ import print_function
import time
from datetime import datetime

import pickle
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import pairwise_distances

import math


%matplotlib inline

In [None]:
filename = 'Clustered_nN-161_States_ceiling_reduced3_with_UMAP.csv'

#filename = 'Clustered_nN-161_States_ceiling_reduced_with_UMAP_with_similarity_results.csv'


In [None]:
df = pd.read_csv(filename, index_col = 0)
df

### This first section is intended to quantify the general clustering performance, using metrics that are implemented in sklearn

In [None]:
# Davies-Bouldin and Calinski-Harabasz scores for every run, stored as [d_b_score, c_h_score]:
#   results          -> computed on all datapoints (noise, label -1, is treated like a cluster of its own)
#   results_no_noise -> computed after dropping the datapoints labelled as noise (-1)
results = {}
results_no_noise = {}
#for code in ['cont', 'intv_100']:
for code in ['cont']:
    results[code] = {}
    results_no_noise[code] = {}
    for run in range(1, 4):
        run_key = 'run_{}'.format(str(run))
        l_features = ['UMAP_{}_r{}_1'.format(code, str(run)), 'UMAP_{}_r{}_2'.format(code, str(run))]
        cluster_column = 'Cluster_{}_r{}'.format(code, str(run))

        d_b_score = metrics.davies_bouldin_score(df[l_features], df[cluster_column])
        c_h_score = metrics.calinski_harabasz_score(df[l_features], df[cluster_column])

        results[code][run_key] = [d_b_score, c_h_score]

        df_temp = df.loc[df[cluster_column] != -1].copy()
        d_b_score_no_noise = metrics.davies_bouldin_score(df_temp[l_features], df_temp[cluster_column])
        c_h_score_no_noise = metrics.calinski_harabasz_score(df_temp[l_features], df_temp[cluster_column])

        # Store the noise-free scores here. The with-noise scores were stored before, which made both
        # dictionaries identical no matter how many datapoints were classified as noise.
        results_no_noise[code][run_key] = [d_b_score_no_noise, c_h_score_no_noise]

Clustering results from UMAPs with continuous values seem to be more stable compared to the larger fluctuations after interval binning

In [None]:
results

Clustering performance measures after excluding those datapoints that are classified as noise (compare with the scores above to see whether noise affects them):

In [None]:
results_no_noise

### This section is based on custom written code to quantify the similarity between the cluster results of two runs
#### These similarities should provide a measure for stability of the detected clusters across runs. Be aware that these computations take ~ 1.5 hours

In [None]:
Execute_similarity_computations = True
filename_suffix = 'ceiling_reduced3' 

In [None]:
# Pairwise cluster-stability comparison between runs.
# For every ordered run pair (a, b) and every datapoint this records whether the point sits in a cluster ('c')
# or in noise ('n') in each of the two runs. For points clustered in both runs it also computes the percentage
# of cluster a's members that are members of the point's cluster in run b.
# Outputs: the nested dict `similarity_results`, one pickle file per run pair, and one new
# 'Comparison_<code>_run_<a>_vs_<b>' column per run pair that is merged into `df` and saved as .csv.
if Execute_similarity_computations:
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))

    time_start = time.time()
    similarity_results = {}

    # for cont & intv
    code = 'cont'

    similarity_results[code] = {}

    # for all possible run combinations:
    for run_combo in [('1', '2'), ('1', '3'), ('2', '1'), ('2', '3'), ('3', '1'), ('3', '2')]:
    #for run_combo in [('1', '2')]:
    #for run_combo in [('1', '3'), ('2', '1'), ('2', '3'), ('3', '1'), ('3', '2')]:

        run_a, run_b = run_combo[0], run_combo[1]

        similarity_results[code]['run_{}_vs_{}'.format(run_a, run_b)] = {}

        cluster_col_a = 'Cluster_{}_r{}'.format(code, run_a)
        cluster_col_b = 'Cluster_{}_r{}'.format(code, run_b)

        # If this is written to automatically iterate through all clusterings, remember to add the proper data selection from the main df:
        df_to_check = df[['Data_idx', cluster_col_a, cluster_col_b]].copy()

        # NOTE: every lookup inside this loop compares the whole 'Data_idx' column against a single value,
        # i.e. one full column scan per datapoint and lookup. `.values[0]` takes the first matching row.
        for Data_idx in df_to_check['Data_idx'].unique():
            cluster_id_a = df_to_check.loc[df_to_check['Data_idx'] == Data_idx, cluster_col_a].values[0]
            if cluster_id_a != -1:
                # Check if this cluster has already its own key and create one, if not:
                if cluster_id_a not in similarity_results[code]['run_{}_vs_{}'.format(run_a, run_b)].keys():
                    similarity_results[code]['run_{}_vs_{}'.format(run_a, run_b)][cluster_id_a] = {'Matching_cluster_IDs': [],
                                                                                                   'Similarities_in_perc': []}
                # Retrieve the cluster id to which Data_idx was assigned to in run_b:
                cluster_id_b = df_to_check.loc[df_to_check['Data_idx'] == Data_idx, cluster_col_b].values[0]

                # Check whether it belongs to a real cluster and not to noise:
                if cluster_id_b != -1:

                    # Okay, the datapoint was assigned to a cluster in both runs. Let´s check for the similarity (if it was not already computed):
                    if cluster_id_b not in similarity_results[code]['run_{}_vs_{}'.format(run_a, run_b)][cluster_id_a]['Matching_cluster_IDs']:

                        # The similarities between these two clusters has not been computed yet, so let´s do it:
                        cluster_id_a_members = df_to_check.loc[df_to_check[cluster_col_a] == cluster_id_a, 'Data_idx'].values
                        cluster_id_b_members = df_to_check.loc[df_to_check[cluster_col_b] == cluster_id_b, 'Data_idx'].values

                        # Directional similarity: share of cluster a's members that are also members of cluster b (in percent).
                        points_in_both_clusters = np.intersect1d(cluster_id_a_members, cluster_id_b_members)
                        similarity_a_to_b = points_in_both_clusters.shape[0] / cluster_id_a_members.shape[0] * 100

                        # Append the results to the dictionary:
                        similarity_results[code]['run_{}_vs_{}'.format(run_a, run_b)][cluster_id_a]['Matching_cluster_IDs'].append(cluster_id_b)
                        similarity_results[code]['run_{}_vs_{}'.format(run_a, run_b)][cluster_id_a]['Similarities_in_perc'].append((cluster_id_b, similarity_a_to_b))

                    else:
                        # The similarity between these two clusters has already been calculated. So we can simply retrieve the result:
                        similarity_a_to_b = [item for item in similarity_results[code]['run_{}_vs_{}'.format(run_a, run_b)][cluster_id_a]['Similarities_in_perc'] if item[0] == cluster_id_b][0][1]

                    # Label format: 'ac_bc_' followed by the similarity rounded to an integer and zero-padded to three digits.
                    sim_suffix = str(int(round(similarity_a_to_b, 0))).zfill(3)
                    classification = 'ac_bc_{}'.format(sim_suffix)



                else: # Specify error type. Here the datapoint was in a cluster in a but was noise in b
                    classification = 'ac_bn'

            else: # Check for potential errors. Is the datapoint also assigned to noise or into a cluster in b? 
                cluster_id_b = df_to_check.loc[df_to_check['Data_idx'] == Data_idx, cluster_col_b].values[0]

                if cluster_id_b != -1:
                    # Noise in run a, but part of a cluster in run b.
                    classification = 'an_bc'

                else:
                    # Noise in both runs.
                    classification = 'an_bn'

            df_to_check.loc[df_to_check['Data_idx'] == Data_idx, 'Comparison_{}_run_{}_vs_{}'.format(code, run_a, run_b)] = classification 

            # Because these computations take quite some time, let´s print from time to time how it´s progressing:
            # (only triggered for rows whose index label is a multiple of 50000)
            if df_to_check.loc[df_to_check['Data_idx'] == Data_idx].index[0] % 50000 == 0:

                print('This is run {} vs run {}'.format(run_a, run_b))
                print(list(df_to_check['Data_idx'].unique()).index(Data_idx))    
                print('Time passed so far:')
                print(time.time()-time_start)
                print('######################')

        # Save the results:
        # NOTE: the complete `similarity_results` dict accumulated so far is dumped, so every file also
        # contains the results of all run combinations that were processed before this one.
        with open('similarity_results_{}_r{}_vs_r{}_{}.p'.format(code, run_a, run_b, filename_suffix), 'wb') as fp:
            pickle.dump(similarity_results, fp, protocol=pickle.HIGHEST_PROTOCOL)

        # To automatize the merging into the main df, check whether this actually works. Otherwise, use the cells below
        if df.shape[0] == df_to_check.shape[0]:
            df = pd.merge(df, df_to_check[['Data_idx', 'Comparison_{}_run_{}_vs_{}'.format(code, run_a, run_b)]], on='Data_idx', how='outer')
            # As safety net, let´s save the data in another .csv:
            # NOTE: the output file name is hard-coded here and does not use `filename_suffix`.
            df.to_csv('Clustered_nN-161_States_ceiling_reduced3_with_UMAP_with_similarity_results.csv')
            print('All data was successfully saved!')
        else:
            print('There was an issue with saving the DataFrame!')

else:
    print('If you want to execute this code, please remember to set `Execute_similarity_computations` to True')

##### Inspect, whether the column with the classifications of each data point was correctly added to the DataFrame

In [None]:
df

#### Get some rough overview on the cluster similarities, subdivided into pairs of high (66-100%), medium (33-66%), and low (0-33%) similarities:

In [None]:
# Sort the cluster pairs of one run combination into similarity bands:
#   high: > 66 %, medium: (33, 66] %, low: (0, 33] %, none: exactly 0 %.
# NOTE: `code`, `run_a` and `run_b` are whatever earlier cells left behind, i.e. after the similarity loop
# above this inspects the last run combination that was processed.
no_cluster_similarity = {}
low_cluster_similarity = {}
medium_cluster_similarity = {}
high_cluster_similarity = {}

combo_results = similarity_results[code]['run_{}_vs_{}'.format(run_a, run_b)]

for cluster_id, cluster_entry in combo_results.items():
    similarities = cluster_entry['Similarities_in_perc']

    l_high = [item for item in similarities if item[1] > 66]
    l_medium = [item for item in similarities if 33 < item[1] <= 66]
    l_low = [item for item in similarities if 0 < item[1] <= 33]
    l_none = [item for item in similarities if item[1] == 0]

    # Only clusters that have at least one partner in a band get an entry in that band:
    for band, members in ((high_cluster_similarity, l_high),
                          (medium_cluster_similarity, l_medium),
                          (low_cluster_similarity, l_low),
                          (no_cluster_similarity, l_none)):
        if members:
            band[cluster_id] = members

In [None]:
high_cluster_similarity

In [None]:
medium_cluster_similarity

### Let´s see, if we can identify clusters that are found with a certain similarity threshold across all three runs

Ideas how this could be extended:
* If we used soft clustering, we could check whether those datapoints that are assigned to a cluster in run a but not in run b <br> have at least a high probability of belonging to the respective matching cluster in b.
* Sticking to the "hard" clustering, we could also check how much of the remaining points (not assigned to the matching cluster) are assigned to a different cluster (potentially problematic if this cluster is not matching), or whether they are classified as noise (low distance to cluster border)

In [None]:
filename_suffix

In [None]:
code = 'cont'
run_a = '1'
run_b = '2'
run_c = '3'

similarity_threshold = 50

In [None]:
def _load_similarity_results(first_run, second_run):
    """Load the pickled similarity results that were saved for `first_run` vs `second_run`."""
    path = 'similarity_results_{}_r{}_vs_r{}_{}.p'.format(code, first_run, second_run, filename_suffix)
    # SECURITY NOTE: pickle.load can execute arbitrary code - only load files that this notebook wrote itself.
    with open(path, 'rb') as fp:
        return pickle.load(fp)


def _clusters_above_threshold(loaded_results, first_run, second_run):
    """Return {cluster_id: [(matching_id, similarity), ...]} keeping only matches >= similarity_threshold."""
    per_cluster = loaded_results[code]['run_{}_vs_{}'.format(first_run, second_run)]
    selected = {}
    for cluster_id, entry in per_cluster.items():
        hits = [item for item in entry['Similarities_in_perc'] if item[1] >= similarity_threshold]
        if len(hits) > 0:
            selected[cluster_id] = hits
    return selected


def _unique_pairs(forward_matches, backward_matches):
    """Combine both directions into unique (id_in_first_run, id_in_second_run) pairs."""
    pairs = [(first_id, match[0]) for first_id, matches in forward_matches.items() for match in matches]
    pairs += [(match[0], second_id) for second_id, matches in backward_matches.items() for match in matches]
    return list(set(pairs))


# Load the similarity results of all six ordered run combinations:
similarity_results_a_vs_b = _load_similarity_results(run_a, run_b)
similarity_results_b_vs_a = _load_similarity_results(run_b, run_a)
similarity_results_a_vs_c = _load_similarity_results(run_a, run_c)
similarity_results_c_vs_a = _load_similarity_results(run_c, run_a)
similarity_results_b_vs_c = _load_similarity_results(run_b, run_c)
similarity_results_c_vs_b = _load_similarity_results(run_c, run_b)

key_ab, key_ba = 'run_{}_vs_{}'.format(run_a, run_b), 'run_{}_vs_{}'.format(run_b, run_a)
key_ac, key_ca = 'run_{}_vs_{}'.format(run_a, run_c), 'run_{}_vs_{}'.format(run_c, run_a)
key_bc, key_cb = 'run_{}_vs_{}'.format(run_b, run_c), 'run_{}_vs_{}'.format(run_c, run_b)

# For each combination of two runs, collect the cluster pairs that reach the similarity_threshold (both directions):

# For run_a & run_b:
matching_clusters_a_b = {key_ab: _clusters_above_threshold(similarity_results_a_vs_b, run_a, run_b),
                         key_ba: _clusters_above_threshold(similarity_results_b_vs_a, run_b, run_a)}

# For run_a & run_c:
matching_clusters_a_c = {key_ac: _clusters_above_threshold(similarity_results_a_vs_c, run_a, run_c),
                         key_ca: _clusters_above_threshold(similarity_results_c_vs_a, run_c, run_a)}

# For run_b & run_c:
matching_clusters_b_c = {key_bc: _clusters_above_threshold(similarity_results_b_vs_c, run_b, run_c),
                         key_cb: _clusters_above_threshold(similarity_results_c_vs_b, run_c, run_b)}


# Both directions are now available for every run combination. Reduce them to unique pairs
# (the information about the direction is dropped in this step, which does not matter from here on):

# For run_a & run_b:
l_matching_pairs_ab = _unique_pairs(matching_clusters_a_b[key_ab], matching_clusters_a_b[key_ba])

# For run_a & run_c:
l_matching_pairs_ac = _unique_pairs(matching_clusters_a_c[key_ac], matching_clusters_a_c[key_ca])

# For run_b & run_c:
l_matching_pairs_bc = _unique_pairs(matching_clusters_b_c[key_bc], matching_clusters_b_c[key_cb])


# A triplet (a, b, c) requires that (a, b), (a, c) and (b, c) are all matching pairs:
l_matching_triplets_abc = []
for cluster_id_a, cluster_id_b in l_matching_pairs_ab:
    l_matching_cluster_ids_c_from_a = [pair[1] for pair in l_matching_pairs_ac if pair[0] == cluster_id_a]
    l_matching_cluster_ids_c_from_b = [pair[1] for pair in l_matching_pairs_bc if pair[0] == cluster_id_b]

    for cluster_id_c in l_matching_cluster_ids_c_from_a:
        if cluster_id_c in l_matching_cluster_ids_c_from_b:
            l_matching_triplets_abc.append([(run_a, cluster_id_a), (run_b, cluster_id_b), (run_c, cluster_id_c)])

n_triplets_found = len(l_matching_triplets_abc)
if n_triplets_found > 0:
    print('Congratulations! You found {} matching cluster tripletts across all runs! :-)'.format(n_triplets_found))

else:
    print('I´m sorry. I could not find any cluster tripletts that fullfill your criteria! :-(')

### Now let's plot all matching cluster triplets to inspect what the individual measurements look like:

In [None]:
# Measure columns to plot per embedding type: the binned columns themselves for 'intv_100', and the same
# names with the '_intervals_100' suffix cut off (the underlying measure columns) for 'cont'.
binned_columns = [column for column in df.columns if column.endswith('_intervals_100')]
d_measure_columns = {'intv_100': binned_columns,
                     'cont': [column[:column.index('_intervals_100')] for column in binned_columns]}

n_triplets = len(l_matching_triplets_abc)

if n_triplets == 0:
    # Without triplets the figure would get a height of 0 and the gridspec 0 rows, and matplotlib
    # rejects both with a ValueError. Skip the plot instead of crashing.
    print('No matching cluster triplets available - nothing to plot.')

else:
    # One row of boxplots per triplet; within a row, the three clusters of the triplet are compared per measure.
    fig = plt.figure(figsize=(20, 5 * n_triplets), facecolor='white')
    gs = fig.add_gridspec(n_triplets, 1)

    for row, triplet in enumerate(l_matching_triplets_abc):
        l_dfs_single_measurements = []
        for run, cluster_id in triplet:
            # Members of this cluster in this run; the counts are the same for every measure, so compute them once.
            df_cluster = df.loc[df['Cluster_{}_r{}'.format(code, run)] == cluster_id]
            n_datapoints = df_cluster.shape[0]
            N_mice = df_cluster['Animal_ID'].unique().shape[0]
            cluster_label = 'Cluster-{}_Run-{} ({} datapoints from {} mice)'.format(str(cluster_id), run, n_datapoints, N_mice)

            # Long format: one block of rows per measure, with the value in a common 'Data' column.
            for measure in d_measure_columns[code]:
                df_temp = df_cluster[['Data_idx', 'Animal_ID', measure]].copy()
                df_temp.columns = ['Data_idx', 'Animal_ID', 'Data']
                df_temp['Measure'] = measure
                df_temp['Cluster_Run'] = cluster_label
                l_dfs_single_measurements.append(df_temp)

        df_for_boxplots = pd.concat(l_dfs_single_measurements)

        ax = fig.add_subplot(gs[row, 0])
        sns.boxplot(x="Measure", y="Data", hue="Cluster_Run", data=df_for_boxplots, ax=ax)

    plt.tight_layout()
    plt.show()
    


## Plot these data back onto the original traces. What does it look like?

## Plotting only those bins that belong to the stable clusters and were found exclusively in run 1

In [None]:
# Mice to plot: all animals by default; the commented-out alternatives restrict the run to a single mouse.
l_mice = df['Animal_ID'].unique()
#l_mice = [df['Animal_ID'].unique()[3]]
#l_mice = ['175_F4-19']


# Name of the sub-folder that the plotting cell below inserts into its save path.
folder = 'Plots_states_over_raw_data_all_mice_ceiling_reduced3'

# True: figures are saved to disk and closed; False: figures are shown inline instead.
SAVE = True

In [None]:
# Run-1 cluster ids that are part of a matching triplet; only these are drawn in the 'state' row.
l_matching_triplets_r1 = [elem[0][1] for elem in l_matching_triplets_abc]

# Behaviors that are drawn in the 'behavior' row; all other labels are hidden.
l_selected_behaviors = ['Immobility', 'Grooming', 'Rearing', 'Flight', 'StretchAttend', 'TailRattling', 'HeadDips']

# plt.savefig() does not create missing directories, so make sure the target folder exists.
output_dir = '/home/ds/DCL/Defensive_states/{}'.format(folder)
if SAVE:
    os.makedirs(output_dir, exist_ok=True)

for mouse in l_mice:
    l_sessions = df.loc[df['Animal_ID'] == mouse, 'Session'].unique()
    #l_sessions = ['CD2']

    for session in l_sessions:
        # Select the corresponding session data
        df_session = df.loc[(df['Animal_ID'] == mouse) & (df['Session'] == session)].copy()

        # df was cleaned from NaNs already, i.e. bins without data are missing entirely. Re-create them
        # by a right merge onto the full range 1..last_bin so that gaps show up as breaks in the traces.
        last_bin = int(df_session['Bin'].max())
        df_bins = pd.DataFrame(data={'Bin': np.arange(1, last_bin + 1)})
        df_merged = pd.merge(df_session, df_bins, on='Bin', how='right')

        # Hue columns that keep only the relevant states / behaviors (everything else becomes NaN)
        df_merged['matching_cluster_ids'] = df_merged['Cluster_cont_r1'].where(df_merged['Cluster_cont_r1'].isin(l_matching_triplets_r1))
        df_merged['selected_behaviors'] = df_merged['behaviors'].where(df_merged['behaviors'].isin(l_selected_behaviors))

        # Fixed y-value for the scatterplots of behaviors and states
        df_merged['plain_one'] = 1

        fig = plt.figure(figsize=(20, 8), facecolor='white')
        gs = fig.add_gridspec(14, 1)

        ax1 = fig.add_subplot(gs[0, 0])
        sns.scatterplot(data=df_merged, x='Bin', y='plain_one', hue='selected_behaviors', marker="|", palette='colorblind', legend=False, s=100, ax=ax1)
        ax1.set_title('{} during {}'.format(mouse, session))
        ax1.set_xlim(0, last_bin)
        ax1.set_ylabel('behavior', rotation=0)
        ax1.set_xlabel('')
        ax1.yaxis.set_label_coords(-0.05, 0.3)

        ax2 = fig.add_subplot(gs[1, 0], sharex=ax1)
        sns.scatterplot(data=df_merged, x='Bin', y='plain_one', hue='matching_cluster_ids', marker="|", palette='Spectral', legend=False, s=100, ax=ax2)
        ax2.set_ylabel('state', rotation=0)
        ax2.set_xlabel('')
        ax2.yaxis.set_label_coords(-0.05, 0.3)

        # The traces are plotted against 'Bin' (not the 0-based row index of df_merged), so that they
        # line up with the behavior / state rows above, which also use 'Bin' as x.
        ax3 = fig.add_subplot(gs[2:6, 0], sharex=ax1)
        ax3.plot(df_merged['Bin'], df_merged['norm_HeartRate'], color='darkorange')
        ax3.set_ylabel('HeartRate', rotation=0)
        ax3.yaxis.set_label_coords(-0.05, 0.5)

        ax4 = fig.add_subplot(gs[6:10, 0], sharex=ax1)
        ax4.plot(df_merged['Bin'], df_merged['norm_Motion'])
        ax4.set_ylabel('Motion', rotation=0)
        ax4.yaxis.set_label_coords(-0.05, 0.5)

        # NOTE(review): ax5 does not share its x-axis with ax1 - presumably on purpose, because the tick
        # removal on ax1/ax2 below would otherwise also remove the time ticks here. Confirm before changing.
        ax5 = fig.add_subplot(gs[10:14, 0])
        ax5.plot(df_merged['Bin'], df_merged['norm_Temperature'], color='k')
        ax5.set_ylabel('Temperature', rotation=0)
        ax5.set_xlabel('time')
        ax5.set_xlim(0, last_bin)
        ax5.yaxis.set_label_coords(-0.05, 0.5)

        # Event rows: no frame and no ticks at all
        for ax in [ax1, ax2]:
            for side in ['top', 'left', 'right', 'bottom']:
                ax.spines[side].set_visible(False)
            ax.get_xaxis().set_ticks([])
            ax.get_yaxis().set_ticks([])

        # Upper traces: keep only the left spine
        for ax in [ax3, ax4]:
            for side in ['top', 'right', 'bottom']:
                ax.spines[side].set_visible(False)

        # Lowest trace: keep the left and bottom spine
        for side in ['top', 'right']:
            ax5.spines[side].set_visible(False)

        plt.tight_layout()

        if SAVE:
            fig.savefig('{}/{}_{}.png'.format(output_dir, mouse, session), dpi=300)
            # Close the figure to avoid accumulating open figures while looping over all sessions
            plt.close(fig)
        else:
            plt.show()

## Plot all clusters from one run - not checking for stable clusters

In [None]:
# Mice to plot: all animals by default; the commented-out alternatives restrict the run to a single mouse.
l_mice = df['Animal_ID'].unique()
#l_mice = [df['Animal_ID'].unique()[1]]
#l_mice = ['175_F4-19']

# Name of the sub-folder that the plotting cell below inserts into its save path.
folder = 'Plots_states_over_raw_data_all_mice_ceiling_reduced3_all_run1_cluster'


# True: figures are saved to disk and closed; False: figures are shown inline instead.
SAVE = True

In [None]:
# Not used for the figures of this cell (all run-1 clusters are drawn, stable or not); the list is
# still computed so that the variable stays defined exactly as before.
l_matching_triplets_r1 = [elem[0][1] for elem in l_matching_triplets_abc]

# Behaviors that are drawn in the 'behavior' row; all other labels are hidden.
l_selected_behaviors = ['Immobility', 'Grooming', 'Rearing', 'Flight', 'StretchAttend', 'TailRattling', 'HeadDips']

# plt.savefig() does not create missing directories, so make sure the target folder exists.
output_dir = '/home/ds/DCL/Defensive_states/{}'.format(folder)
if SAVE:
    os.makedirs(output_dir, exist_ok=True)

for mouse in l_mice:
    l_sessions = df.loc[df['Animal_ID'] == mouse, 'Session'].unique()
    #l_sessions = ['CD2']

    for session in l_sessions:
        # Select the corresponding session data
        df_session = df.loc[(df['Animal_ID'] == mouse) & (df['Session'] == session)].copy()

        # df was cleaned from NaNs already, i.e. bins without data are missing entirely. Re-create them
        # by a right merge onto the full range 1..last_bin so that gaps show up as breaks in the traces.
        last_bin = int(df_session['Bin'].max())
        df_bins = pd.DataFrame(data={'Bin': np.arange(1, last_bin + 1)})
        df_merged = pd.merge(df_session, df_bins, on='Bin', how='right')

        # Hue column that keeps only the relevant behaviors (everything else becomes NaN)
        df_merged['selected_behaviors'] = df_merged['behaviors'].where(df_merged['behaviors'].isin(l_selected_behaviors))

        # Fixed y-value for the scatterplots of behaviors and states
        df_merged['plain_one'] = 1

        fig = plt.figure(figsize=(20, 8), facecolor='white')
        gs = fig.add_gridspec(14, 1)

        ax1 = fig.add_subplot(gs[0, 0])
        sns.scatterplot(data=df_merged, x='Bin', y='plain_one', hue='selected_behaviors', marker="|", palette='colorblind', legend=False, s=100, ax=ax1)
        ax1.set_title('{} during {}'.format(mouse, session))
        ax1.set_xlim(0, last_bin)
        ax1.set_ylabel('behavior', rotation=0)
        ax1.set_xlabel('')
        ax1.yaxis.set_label_coords(-0.05, 0.3)

        # State row: every run-1 cluster except the rows labelled -1
        ax2 = fig.add_subplot(gs[1, 0], sharex=ax1)
        sns.scatterplot(data=df_merged.loc[df_merged['Cluster_cont_r1'] != -1], x='Bin', y='plain_one', hue='Cluster_cont_r1', marker="|", palette='Spectral', legend=False, s=100, ax=ax2)
        ax2.set_ylabel('state', rotation=0)
        ax2.set_xlabel('')
        ax2.yaxis.set_label_coords(-0.05, 0.3)

        # The traces are plotted against 'Bin' (not the 0-based row index of df_merged), so that they
        # line up with the behavior / state rows above, which also use 'Bin' as x.
        ax3 = fig.add_subplot(gs[2:6, 0], sharex=ax1)
        ax3.plot(df_merged['Bin'], df_merged['norm_HeartRate'], color='darkorange')
        ax3.set_ylabel('HeartRate', rotation=0)
        ax3.yaxis.set_label_coords(-0.05, 0.5)

        ax4 = fig.add_subplot(gs[6:10, 0], sharex=ax1)
        ax4.plot(df_merged['Bin'], df_merged['norm_Motion'])
        ax4.set_ylabel('Motion', rotation=0)
        ax4.yaxis.set_label_coords(-0.05, 0.5)

        # NOTE(review): ax5 does not share its x-axis with ax1 - presumably on purpose, because the tick
        # removal on ax1/ax2 below would otherwise also remove the time ticks here. Confirm before changing.
        ax5 = fig.add_subplot(gs[10:14, 0])
        ax5.plot(df_merged['Bin'], df_merged['norm_Temperature'], color='k')
        ax5.set_ylabel('Temperature', rotation=0)
        ax5.set_xlabel('time')
        ax5.set_xlim(0, last_bin)
        ax5.yaxis.set_label_coords(-0.05, 0.5)

        # Event rows: no frame and no ticks at all
        for ax in [ax1, ax2]:
            for side in ['top', 'left', 'right', 'bottom']:
                ax.spines[side].set_visible(False)
            ax.get_xaxis().set_ticks([])
            ax.get_yaxis().set_ticks([])

        # Upper traces: keep only the left spine
        for ax in [ax3, ax4]:
            for side in ['top', 'right', 'bottom']:
                ax.spines[side].set_visible(False)

        # Lowest trace: keep the left and bottom spine
        for side in ['top', 'right']:
            ax5.spines[side].set_visible(False)

        plt.tight_layout()

        if SAVE:
            fig.savefig('{}/{}_{}.png'.format(output_dir, mouse, session), dpi=300)
            # Close the figure to avoid accumulating open figures while looping over all sessions
            plt.close(fig)
        else:
            plt.show()

## Plot all bins that belong to a stable cluster (in any of the three runs)

In [None]:
# Cluster ids of each run that take part in at least one matching triplet
l_matching_triplets_r1 = [elem[0][1] for elem in l_matching_triplets_abc]
l_unique_matching_triples_r1 = list(set(l_matching_triplets_r1))

l_matching_triplets_r2 = [elem[1][1] for elem in l_matching_triplets_abc]
l_unique_matching_triples_r2 = list(set(l_matching_triplets_r2))

l_matching_triplets_r3 = [elem[2][1] for elem in l_matching_triplets_abc]
l_unique_matching_triples_r3 = list(set(l_matching_triplets_r3))


# Triplets that share a cluster in ANY of the three runs describe the same state and must end up with
# the same fused id - also when they are only linked indirectly via a chain of other triplets.
# A union-find over the nodes (position of the run in the triplet, cluster id) guarantees this.
d_parent = {}

def _find_root(node):
    """Return the representative node of the group that `node` belongs to (with path halving)."""
    while d_parent[node] != node:
        d_parent[node] = d_parent[d_parent[node]]
        node = d_parent[node]
    return node

for triplet_combo in l_matching_triplets_abc:
    l_nodes = [(run_pos, triplet_combo[run_pos][1]) for run_pos in range(3)]
    for node in l_nodes:
        d_parent.setdefault(node, node)
    # The three clusters of one triplet always belong to the same group
    root = _find_root(l_nodes[0])
    for node in l_nodes[1:]:
        d_parent[_find_root(node)] = root

# Number the groups: each group gets the position (as string) of its first run-1 cluster in
# l_unique_matching_triples_r1. Every group contains at least one run-1 cluster.
d_fused_id_per_root = {}
for n_id, unique_id_r1 in enumerate(l_unique_matching_triples_r1):
    d_fused_id_per_root.setdefault(_find_root((0, unique_id_r1)), str(n_id))

# Same structure as before: [triplet_run_1, triplet_run_2, triplet_run_3, ('fused', fused_id)]
l_matching_triplets_abc_fused = []
for unique_id_r1 in l_unique_matching_triples_r1:
    fused_id = d_fused_id_per_root[_find_root((0, unique_id_r1))]
    for triplet_combo in l_matching_triplets_abc:
        if triplet_combo[0][1] == unique_id_r1:
            l_matching_triplets_abc_fused.append(list(triplet_combo) + [('fused', fused_id)])


# Write the fused ids back to the datapoints. Object dtype, because the ids are strings and NaN marks
# datapoints outside of any stable cluster. Later runs overwrite earlier ones if a datapoint belongs
# to stable clusters of several runs.
df['fused_stable_cluster_ids'] = pd.Series(np.nan, index=df.index, dtype=object)

l_runs = [('Cluster_cont_r1', l_unique_matching_triples_r1),
          ('Cluster_cont_r2', l_unique_matching_triples_r2),
          ('Cluster_cont_r3', l_unique_matching_triples_r3)]

for run_pos, (cluster_col, l_unique_ids) in enumerate(l_runs):
    for unique_id in l_unique_ids:
        fused_id = d_fused_id_per_root[_find_root((run_pos, unique_id))]
        df.loc[df[cluster_col] == unique_id, 'fused_stable_cluster_ids'] = fused_id


df['fused_stable_cluster_ids'].unique()

In [None]:
# Mice to plot: all animals by default; the commented-out alternatives restrict the run to a single mouse.
l_mice = df['Animal_ID'].unique()
#l_mice = [df['Animal_ID'].unique()[3]]
#l_mice = ['175_F4-11']

# Name of the sub-folder that the plotting cell below inserts into its save path.
folder = 'Plots_states_over_raw_data_all_mice_ceiling_reduced3_fused_runs'


# True: figures are saved to disk and closed; False: figures are shown inline instead.
SAVE = True

In [None]:
# Behaviors that are drawn in the 'behavior' row; all other labels are hidden.
l_selected_behaviors = ['Immobility', 'Grooming', 'Rearing', 'Flight', 'StretchAttend', 'TailRattling', 'HeadDips']

# plt.savefig() does not create missing directories, so make sure the target folder exists.
output_dir = '/home/ds/DCL/Defensive_states/{}'.format(folder)
if SAVE:
    os.makedirs(output_dir, exist_ok=True)

for mouse in l_mice:
    l_sessions = df.loc[df['Animal_ID'] == mouse, 'Session'].unique()
    #l_sessions = ['CD1']

    for session in l_sessions:
        # Select the corresponding session data
        df_session = df.loc[(df['Animal_ID'] == mouse) & (df['Session'] == session)].copy()

        # df was cleaned from NaNs already, i.e. bins without data are missing entirely. Re-create them
        # by a right merge onto the full range 1..last_bin so that gaps show up as breaks in the traces.
        last_bin = int(df_session['Bin'].max())
        df_bins = pd.DataFrame(data={'Bin': np.arange(1, last_bin + 1)})
        df_merged = pd.merge(df_session, df_bins, on='Bin', how='right')

        # Hue column that keeps only the relevant behaviors (everything else becomes NaN)
        df_merged['selected_behaviors'] = df_merged['behaviors'].where(df_merged['behaviors'].isin(l_selected_behaviors))

        # Fixed y-value for the scatterplots of behaviors and states
        df_merged['plain_one'] = 1

        fig = plt.figure(figsize=(20, 8), facecolor='white')
        gs = fig.add_gridspec(14, 1)

        ax1 = fig.add_subplot(gs[0, 0])
        sns.scatterplot(data=df_merged, x='Bin', y='plain_one', hue='selected_behaviors', marker="|", palette='colorblind', legend=False, s=100, ax=ax1)
        ax1.set_title('{} during {}'.format(mouse, session))
        ax1.set_xlim(0, last_bin)
        ax1.set_ylabel('behavior', rotation=0)
        ax1.set_xlabel('')
        ax1.yaxis.set_label_coords(-0.05, 0.3)

        # State row: fused ids of the stable clusters (NaN = bin does not belong to a stable cluster)
        ax2 = fig.add_subplot(gs[1, 0], sharex=ax1)
        sns.scatterplot(data=df_merged, x='Bin', y='plain_one', hue='fused_stable_cluster_ids', marker="|", palette='Spectral', legend=False, s=100, ax=ax2)
        ax2.set_ylabel('state', rotation=0)
        ax2.set_xlabel('')
        ax2.yaxis.set_label_coords(-0.05, 0.3)

        # The traces are plotted against 'Bin' (not the 0-based row index of df_merged), so that they
        # line up with the behavior / state rows above, which also use 'Bin' as x.
        ax3 = fig.add_subplot(gs[2:6, 0], sharex=ax1)
        ax3.plot(df_merged['Bin'], df_merged['norm_HeartRate'], color='darkorange')
        ax3.set_ylabel('HeartRate', rotation=0)
        ax3.yaxis.set_label_coords(-0.05, 0.5)

        ax4 = fig.add_subplot(gs[6:10, 0], sharex=ax1)
        ax4.plot(df_merged['Bin'], df_merged['norm_Motion'])
        ax4.set_ylabel('Motion', rotation=0)
        ax4.yaxis.set_label_coords(-0.05, 0.5)

        # NOTE(review): ax5 does not share its x-axis with ax1 - presumably on purpose, because the tick
        # removal on ax1/ax2 below would otherwise also remove the time ticks here. Confirm before changing.
        ax5 = fig.add_subplot(gs[10:14, 0])
        ax5.plot(df_merged['Bin'], df_merged['norm_Temperature'], color='k')
        ax5.set_ylabel('Temperature', rotation=0)
        ax5.set_xlabel('time')
        ax5.set_xlim(0, last_bin)
        ax5.yaxis.set_label_coords(-0.05, 0.5)

        # Event rows: no frame and no ticks at all
        for ax in [ax1, ax2]:
            for side in ['top', 'left', 'right', 'bottom']:
                ax.spines[side].set_visible(False)
            ax.get_xaxis().set_ticks([])
            ax.get_yaxis().set_ticks([])

        # Upper traces: keep only the left spine
        for ax in [ax3, ax4]:
            for side in ['top', 'right', 'bottom']:
                ax.spines[side].set_visible(False)

        # Lowest trace: keep the left and bottom spine
        for side in ['top', 'right']:
            ax5.spines[side].set_visible(False)

        plt.tight_layout()

        if SAVE:
            fig.savefig('{}/{}_{}.png'.format(output_dir, mouse, session), dpi=300)
            # Close the figure to avoid accumulating open figures while looping over all sessions
            plt.close(fig)
        else:
            plt.show()

## Check the distribution of the detected stable clusters

Do the mice display a larger variety of states in OF and EPM, compared to CD1 and CD2?

In [None]:
# Label on the x-axis of the later plot -> sessions that are pooled for this label.
# The order of this list defines the order of the rows per mouse.
l_session_groups = [('OF', ['OF']),
                    ('EPM', ['EPM']),
                    ('CD1', ['CD1']),
                    ('CD2', ['CD2']),
                    ('Explorative', ['OF', 'EPM']),
                    ('CondFlight', ['CD1', 'CD2'])]

d_clusters_per_mouse = {'Cluster_count': [],
                        'Session': [],
                        'Animal_ID': []}

for mouse in df['Animal_ID'].unique():
    for session_label, l_pooled_sessions in l_session_groups:
        fused_ids = df.loc[(df['Animal_ID'] == mouse) & (df['Session'].isin(l_pooled_sessions)), 'fused_stable_cluster_ids']

        if fused_ids.shape[0] > 0:
            # nunique() ignores NaN, so this is the number of distinct stable clusters - also when
            # every bin of the selection belongs to a stable cluster (no NaN present at all).
            n_clusters = fused_ids.nunique()
        else:
            # No data of this mouse for these sessions: record "missing" rather than a count
            n_clusters = np.nan

        d_clusters_per_mouse['Cluster_count'].append(n_clusters)
        d_clusters_per_mouse['Session'].append(session_label)
        d_clusters_per_mouse['Animal_ID'].append(mouse)

df_clusters_per_mouse = pd.DataFrame(data=d_clusters_per_mouse)

In [None]:
# Distribution of the number of distinct stable states per mouse, split by session:
# boxes summarise the distribution, the black dots are the individual mice.
fig, ax = plt.subplots(figsize=(10, 6), facecolor='white')

sns.boxplot(data=df_clusters_per_mouse, x='Session', y='Cluster_count', ax=ax)
sns.stripplot(data=df_clusters_per_mouse, x='Session', y='Cluster_count', color='k', ax=ax)
ax.set_title('Different states found in each session [per mouse]')

plt.show()

## Special stuff added for Nina's Institute PR:

In [None]:
# Show which columns are available in df
df.columns

In [None]:
# Column of df holding the cluster ids that the following cells analyse
cluster_column = 'Cluster_cont_r2'

In [None]:
def get_percentage_of_bins_per_behavior(df_temp):
    """Return the percentage of rows (bins) of `df_temp` that carry each behavior label.

    Parameters
    ----------
    df_temp : pandas.DataFrame
        Must contain a 'behaviors' column with one label per row.

    Returns
    -------
    dict
        Maps each reported behavior to the percentage (0-100) of rows with that label.
        All values are NaN if `df_temp` has no rows, since a share of nothing is undefined.
    """
    # Key in the returned dict -> label in the 'behaviors' column.
    # Note that the key 'No Score' differs in capitalisation from the label 'No score'.
    d_key_to_label = {'Immobility': 'Immobility',
                      'StretchAttend': 'StretchAttend',
                      'Grooming': 'Grooming',
                      'TailRattling': 'TailRattling',
                      'Flight': 'Flight',
                      'Rearing': 'Rearing',
                      'HeadDips': 'HeadDips',
                      'No Score': 'No score',
                      'Remaining': 'Remaining',
                      'multiple': 'multiple'}

    n_bins = df_temp.shape[0]
    if n_bins == 0:
        return {key: float('nan') for key in d_key_to_label}

    # Count every label once instead of filtering the frame separately for each behavior
    label_counts = df_temp['behaviors'].value_counts()

    dict_perc_behaviors = {key: int(label_counts.get(label, 0)) / n_bins * 100
                           for key, label in d_key_to_label.items()}
    return dict_perc_behaviors


# Only datapoints that are not flagged for exclusion enter the enrichment analysis
df_included = df.loc[df['Exclude'] == False]

# All cluster ids of the selected run, without the id -1 (if present)
l_clusters = [cluster_id for cluster_id in df_included[cluster_column].unique() if cluster_id != -1]

# Baseline: share of each behavior in the entire (included) dataset. It does not depend on the
# cluster and is therefore computed only once.
dict_all_data = get_percentage_of_bins_per_behavior(df_included)

l_enriched_rows = []

for cluster_id in l_clusters:
    df_cluster = df_included.loc[df_included[cluster_column] == cluster_id]
    dict_cluster = get_percentage_of_bins_per_behavior(df_cluster)

    # Enrichment = share of a behavior within the cluster relative to its share in the entire data
    # (1 = as frequent as overall). NaN if the behavior does not occur in the data at all.
    dict_enriched = {'Cluster_ID': cluster_id}
    for behavior, perc_all_data in dict_all_data.items():
        if perc_all_data > 0:
            dict_enriched[behavior] = dict_cluster[behavior] / perc_all_data
        else:
            dict_enriched[behavior] = np.nan
    dict_enriched['Cluster_size'] = df_cluster.shape[0]

    l_enriched_rows.append(dict_enriched)

# One row per cluster; the column order (behaviors first, 'Cluster_size' last) is relied upon by
# the positional column selection of the heatmap cell.
l_columns = ['Cluster_ID'] + list(dict_all_data.keys()) + ['Cluster_size']
df_all_clusters = pd.DataFrame(l_enriched_rows, columns=l_columns).set_index('Cluster_ID', drop=True)
df_all_clusters.head()


In [None]:
# Which behavior should be used to sort the heatmaps?
# Must be one of the behavior columns of df_all_clusters.

sort_behavior = 'Immobility'

In [None]:
df_heatmap = df_all_clusters.copy()

# All displayed behaviors except 'Immobility' are square-root transformed
l_sqrt_behaviors = ['StretchAttend', 'Grooming', 'TailRattling', 'Flight', 'Rearing', 'HeadDips']
for behavior in l_sqrt_behaviors:
    df_heatmap[behavior] = np.sqrt(df_heatmap[behavior])

# One single-column heatmap per behavior (the first 7 columns of df_heatmap, in this order),
# each with its own horizontal colorbar underneath and its own colour scale.
l_panel_behaviors = ['Immobility'] + l_sqrt_behaviors
df_heatmap_sorted = df_heatmap.sort_values(by = sort_behavior, ascending=False)

fig = plt.figure(figsize=(20,15), facecolor='w')
gs = fig.add_gridspec(15,7)

plt.subplots_adjust(hspace=0.05)

# Colorbar axes in the bottom grid row, created before the heatmap axes
l_cbar_axes = [fig.add_subplot(gs[14, panel_idx]) for panel_idx in range(len(l_panel_behaviors))]

for panel_idx, behavior in enumerate(l_panel_behaviors):
    ax_heatmap = fig.add_subplot(gs[0:13, panel_idx])
    sns.heatmap(df_heatmap_sorted.iloc[:, panel_idx:panel_idx + 1],
                center=1,
                cbar_kws={"orientation": "horizontal"},
                cmap='vlag',
                vmin=0,
                vmax=df_heatmap[behavior].max(),
                cbar_ax=l_cbar_axes[panel_idx],
                ax=ax_heatmap)

    if panel_idx == 0:
        # Only the leftmost panel keeps the cluster ids on its y-axis
        ax_heatmap.tick_params(bottom=False, labelbottom=False, labeltop=True, left='Full')
    else:
        ax_heatmap.set_ylabel('')
        ax_heatmap.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False, labeltop=True)

# To save the figure: plt.savefig('Heatmap_clusters_sorted_by_immobility.png', dpi=300)
plt.suptitle('Square-root transformed')
plt.show()

### Use all clusters of that run and sort them (don't check for matching triplets):

In [None]:
# Cluster ids ordered by decreasing enrichment of the selected behavior
l_sorted = df_all_clusters.sort_values(by=[sort_behavior], ascending=False).index.tolist()

# Inspect at most the 25 top-ranked clusters (slicing returns everything if there are fewer)
l_clusters = l_sorted[:25]

df_inspect = df.loc[df[cluster_column].isin(l_clusters)].copy()

### Use all clusters of that run and sort them AND check for matching triplets

In [None]:
# Cluster ids ordered by decreasing enrichment of the selected behavior
l_sorted = df_all_clusters.sort_values(by=[sort_behavior], ascending=False).index.tolist()

# Ids of the clusters in the second position of each matching triplet
l_matching_clusters_run_2 = [triplet[1][1] for triplet in l_matching_triplets_abc]

# Keep the sort order, but drop every cluster that is not part of a matching triplet
l_matchings_sorted = []
for cluster_id in l_sorted:
    if cluster_id in l_matching_clusters_run_2:
        l_matchings_sorted.append(cluster_id)

# Inspect at most the 25 top-ranked clusters (slicing returns everything if there are fewer)
l_clusters = l_matchings_sorted[:25]

df_inspect = df.loc[df[cluster_column].isin(l_clusters)].copy()

### Plot only selected clusters:

In [None]:
# Run 1: hand-picked cluster ids.
# NOTE: the next cell assigns l_clusters again, so when all cells are run in order only the
# "Run 2" selection takes effect.
l_clusters = [44, 63]

In [None]:
# Run 2: hand-picked cluster ids (overrides the "Run 1" selection of the previous cell)
l_clusters = [120, 43]

In [None]:
# Datapoints that belong to one of the selected clusters
df_inspect = df.loc[df[cluster_column].isin(l_clusters)].copy()

In [None]:
# Number of different mice that contribute datapoints to each selected cluster
l_n_mice = [len(df.loc[df[cluster_column] == cluster_id, 'Animal_ID'].unique())
            for cluster_id in l_clusters]

fig, ax = plt.subplots(figsize=(15, 8), facecolor='w')

sns.barplot(x=l_clusters, y=l_n_mice, order=l_clusters, ax=ax)
ax.set_ylabel('Number of mice')
ax.set_xlabel('Cluster_ID')
plt.setp(ax.get_xticklabels(), rotation='vertical')

plt.show()

In [None]:
# X (the cluster column) is also read by the pie chart functions further down
X = cluster_column
norm = 'norm_'

# (column of df_inspect, panel title) in plotting order; the 3x3 grid is filled row by row and
# its last slot stays empty.
l_panels = [(norm + 'HeartRate', 'Heart Rate'),
            (norm + 'HR_High_Amp', 'Amplitude of high HR frequency band'),
            (norm + 'HR_CoV_10s', 'HR coefficient of variation [10s window]'),
            ('norm_Motion', 'Motion'),
            ('norm_Speed', 'Speed'),
            ('norm_AreaExplored_sqrt', 'AreaExplored - sqrt. transformed'),
            (norm + 'Temperature_s1', 'Temperature tail segment 1'),
            (norm + 'Temperature_s3', 'Temperature tail segment 3')]

fig = plt.figure(figsize=(28, 30), facecolor='w')
gs = fig.add_gridspec(3,3)
plt.subplots_adjust(hspace=0.4)

for panel_idx, (measure_column, panel_title) in enumerate(l_panels):
    ax = fig.add_subplot(gs[panel_idx // 3, panel_idx % 3])
    sns.boxplot(data=df_inspect, x=X, y=measure_column, ax=ax, order=l_clusters)
    ax.set_title(panel_title, fontsize=20)
    ax.set_ylim(0,1)
    ax.set_xlabel('Cluster ID')
    ax.set_ylabel('normalized measure')
    plt.setp(ax.get_xticklabels(), rotation='vertical')

# To save the figure: plt.savefig('5_Immobility_cluster_boxplots.png')
plt.show()

In [None]:
def calculate_pie_chart_infos(cluster_id):
    """Return wedge sizes and labels for the behavior pie chart of one cluster.

    Reads the module-level DataFrame `df_inspect` and the cluster column name `X`.

    Parameters
    ----------
    cluster_id
        Id of the cluster (value in column `X` of `df_inspect`).

    Returns
    -------
    (sizes, labels) : tuple of two lists
        Percentage of bins (rounded to 2 decimals) and label of every behavior that accounts for
        more than 0.5 % of the bins of the cluster. Both lists are empty if the cluster has no bins.
    """
    # Label of the wedge -> label in the 'behaviors' column (the order defines the wedge order)
    d_label_to_behavior = {'Immobility': 'Immobility',
                           'Stretch Attend': 'StretchAttend',
                           'No score': 'No score',
                           'Grooming': 'Grooming',
                           'Remaining': 'Remaining',
                           'Tail Rattling': 'TailRattling',
                           'Flight': 'Flight',
                           'Rearing': 'Rearing',
                           'HeadDips': 'HeadDips',
                           'Multiple': 'multiple'}

    behaviors = df_inspect.loc[df_inspect[X] == cluster_id, 'behaviors']
    total_bins = behaviors.shape[0]
    if total_bins == 0:
        # Nothing to show (and nothing to divide by) for a cluster without datapoints
        return [], []

    labels = []
    sizes = []
    for label, behavior in d_label_to_behavior.items():
        perc = round(int((behaviors == behavior).sum()) / total_bins * 100, 2)
        # Very small wedges are left out to keep the pie chart readable
        if perc > 0.5:
            labels.append(label)
            sizes.append(perc)
    return sizes, labels

def calculate_pie_chart_infos_average():
    """Return wedge sizes and labels for the behavior pie chart of the entire DataFrame `df`.

    Reads the module-level DataFrame `df` (all rows, without any filtering).

    Returns
    -------
    (sizes, labels) : tuple of two lists
        Percentage of bins (rounded to 2 decimals) and label of every behavior that accounts for
        more than 0.5 % of all bins. Both lists are empty if `df` has no rows.
    """
    # Label of the wedge -> label in the 'behaviors' column (the order defines the wedge order)
    d_label_to_behavior = {'Immobility': 'Immobility',
                           'Stretch Attend': 'StretchAttend',
                           'No score': 'No score',
                           'Grooming': 'Grooming',
                           'Remaining': 'Remaining',
                           'Tail Rattling': 'TailRattling',
                           'Flight': 'Flight',
                           'Rearing': 'Rearing',
                           'HeadDips': 'HeadDips',
                           'Multiple': 'multiple'}

    # Only the 'behaviors' column is read, so there is no need to copy the entire DataFrame
    behaviors = df['behaviors']
    total_bins = behaviors.shape[0]
    if total_bins == 0:
        return [], []

    labels = []
    sizes = []
    for label, behavior in d_label_to_behavior.items():
        perc = round(int((behaviors == behavior).sum()) / total_bins * 100, 2)
        # Very small wedges are left out to keep the pie chart readable
        if perc > 0.5:
            labels.append(label)
            sizes.append(perc)
    return sizes, labels
        
# One pie chart per selected cluster plus a final one for the entire DataFrame, four per row.
# The grid is derived from the number of clusters, so the cell works for any length of l_clusters.
n_columns = 4
n_panels = len(l_clusters) + 1
n_rows = (n_panels + n_columns - 1) // n_columns   # ceiling division

# Same height per row as the former fixed (22, 55) figure with 6 rows
fig = plt.figure(figsize=(22, 55 / 6 * n_rows), facecolor='w')
gs = fig.add_gridspec(n_rows, n_columns)
plt.subplots_adjust(hspace=1)

for panel_idx, cluster_id in enumerate(l_clusters):
    ax = fig.add_subplot(gs[panel_idx // n_columns, panel_idx % n_columns])
    sizes, labels = calculate_pie_chart_infos(cluster_id)
    ax.pie(sizes, labels=labels, autopct='%1.1f%%',
           shadow=False, startangle=90, rotatelabels=True)
    ax.axis('equal')
    ax.set_title('Cluster ' + str(cluster_id) + '\n' + str(df_all_clusters.loc[cluster_id, 'Cluster_size']) + ' bins', pad=80, fontsize=24)

# Reference: behavior composition of the entire DataFrame, in the slot after the last cluster
panel_idx = len(l_clusters)
ax = fig.add_subplot(gs[panel_idx // n_columns, panel_idx % n_columns])
sizes, labels = calculate_pie_chart_infos_average()
ax.pie(sizes, labels=labels, autopct='%1.1f%%',
       shadow=False, startangle=90, rotatelabels=True)
ax.axis('equal')
ax.set_title('Entire DataFrame \n' + str(df.shape[0]) + ' bins', pad=90, fontsize=24)

# To save the figure: plt.savefig('10_Immobility_cluster_piecharts_with_bins.png', dpi=300)
plt.show()

## Normalization of time across sessions:

In [None]:
# Inspect the current state of the DataFrame before the time normalization below.
df

In [None]:
# Spot check: largest 'Times' value among all rows of the 'OF' session.
df.loc[df['Session'] == 'OF', 'Times'].max()

In [None]:
# Largest 'Times' value per session (taken over all rows of that session);
# used further down as the divisor for the time normalization.
max_session_times = {
    session_name: df.loc[df['Session'] == session_name, 'Times'].max()
    for session_name in df['Session'].unique()
}

In [None]:
# Show the per-session maxima computed in the previous cell.
max_session_times

In [None]:
# Normalize every recording time by the largest 'Times' value of its session
# (max_session_times), so that 'norm_Times' is comparable across sessions.
#
# The divisor depends only on the session, so one vectorized lookup replaces
# the former loop over every (mouse, session) pair. That loop divided the
# complete 'Times' column on each iteration and relied on index alignment
# inside a masked .loc assignment, which breaks on a non-unique index.
# Sessions without an entry in max_session_times yield NaN.
l_mice = df['Animal_ID'].unique()  # not needed for the normalization; kept in case other cells use it
df['norm_Times'] = df['Times'] / df['Session'].map(max_session_times)

In [None]:
# Spot check: all rows whose 'Times' value is exactly 5.75 (exact float comparison),
# presumably to eyeball the freshly computed 'norm_Times' values.
df.loc[df['Times'] == 5.75]

In [None]:
# Show the cluster IDs that the plotting cell below filters on and uses as plot order.
l_clusters

In [None]:
# Distribution of the normalized recording time ('norm_Times') for each cluster
# listed in l_clusters: a horizontal boxplot, optionally overlaid with the
# individual datapoints.
fig = plt.figure(figsize=(10, 3), facecolor='w')
gs = fig.add_gridspec(1,1)

# Set to False to draw the boxplots without the individual datapoints on top.
stripplot = True

ax1 = fig.add_subplot(gs[0,0])

# Filter once and reuse the result for both layers.
df_selected_clusters = df.loc[df[cluster_column].isin(l_clusters)]

# The palettes list two colours, i.e. this assumes two clusters in l_clusters (TODO confirm).
sns.boxplot(data=df_selected_clusters, x='norm_Times', y=cluster_column, orient='h', order=l_clusters, ax=ax1, palette=['magenta', 'darkgreen'], fliersize=0)

# The flag used to be ignored; the overlay is now only drawn when it is set.
if stripplot:
    sns.stripplot(data=df_selected_clusters, x='norm_Times', y=cluster_column, orient='h', order=l_clusters, ax=ax1, palette=['black', 'black'], alpha=0.4)

ax1.set_ylabel('Cluster ID')
ax1.set_xlim(0, 1)
ax1.set_xlabel('normalized recording time')
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)

plt.savefig('Cont_run-2_normalized_recording_times_all_data.png', dpi=600)

plt.show()

In [None]:
# Show the cluster IDs again before they are used for the per-measure boxplots.
l_clusters

In [None]:
# List the available columns (the next cell selects measure columns by name).
df.columns

In [None]:
# 'code' and 'run' together select the cluster-label column 'Cluster_<code>_r<run>'.
code = 'cont'
run = '2'

measure_columns = ['norm_HeartRate', 'norm_HR_CoV_10s', 'norm_Motion', 'norm_AreaExplored_sqrt', 'norm_Temperature_s1']

# Build one long-format frame: for every cluster in l_clusters and every
# measure, the rows of that cluster with the measure stored in 'Data', the
# measure name in 'Measure' and a descriptive cluster label in 'Cluster_Run'.
l_dfs_individual_clusters = []
for cluster_id in l_clusters:
    df_cluster = df.loc[df['Cluster_{}_r{}'.format(code, run)] == cluster_id]
    # The label is identical for all measures of a cluster, so build it once.
    cluster_label = 'Cluster-{}_Run-{} ({} datapoints from {} mice)'.format(
        str(cluster_id), run, df_cluster.shape[0], df_cluster['Animal_ID'].unique().shape[0])
    l_dfs_individual_clusters.append(pd.concat([
        df_cluster[['Data_idx', 'Animal_ID', measure]]
        .rename(columns={measure: 'Data'})
        .assign(Measure=measure, Cluster_Run=cluster_label)
        for measure in measure_columns
    ]))

df_for_boxplots = pd.concat(l_dfs_individual_clusters)

# One group of boxes per measure, one box per cluster within each group.
fig = plt.figure(figsize=(20,5), facecolor='white')
gs = fig.add_gridspec(1, 1)
ax = fig.add_subplot(gs[0,0])

sns.boxplot(data=df_for_boxplots, x="Measure", y="Data", hue="Cluster_Run",
            palette=['magenta', 'darkgreen'], ax=ax)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_legend().remove()

ax.set_ylabel('normalized measure')
ax.set_ylim(0, 1)

#plt.savefig('Cont_run-2_boxplots_120-magenta_43_green.png', dpi=600)
plt.show()

    


In [None]:
# Show which cluster-label column the pie-chart helper below filters on.
cluster_column

In [None]:
# (pie label, value stored in the 'behaviors' column), in the order in which the
# slices are reported. 'Struggle' and 'OpenRearing' were commented out in the
# original dictionaries and are therefore not counted.
_PIE_BEHAVIORS = (
    ('Immobility', 'Immobility'),
    ('Stretch Attend', 'StretchAttend'),
    ('No score', 'No score'),
    ('Grooming', 'Grooming'),
    ('Remaining', 'Remaining'),
    ('Tail Rattling', 'TailRattling'),
    ('Flight', 'Flight'),
    ('Rearing', 'Rearing'),
    ('HeadDips', 'HeadDips'),
    ('Multiple', 'multiple'),
)


def _behavior_pie_slices(behaviors, min_percent):
    """Return (sizes, labels) of all behaviors covering more than min_percent % of the rows."""
    total_bins = behaviors.shape[0]
    if total_bins == 0:
        # Nothing selected: return empty slices instead of dividing by zero.
        return [], []
    counts = behaviors.value_counts()
    sizes = []
    labels = []
    for label, raw_value in _PIE_BEHAVIORS:
        # int() keeps the result a plain Python float, as in the original computation.
        percentage = round(int(counts.get(raw_value, 0)) / total_bins * 100, 2)
        if percentage > min_percent:
            labels.append(label)
            sizes.append(percentage)
    return sizes, labels


def calculate_pie_chart_infos(cluster_id, data=None, column=None, min_percent=1):
    """Compute the pie-chart slices of the behavior distribution within one cluster.

    Parameters
    ----------
    cluster_id
        Value of the cluster-label column that selects the rows of the cluster.
    data : DataFrame, optional
        Frame with a 'behaviors' column and the cluster-label column.
        Defaults to the notebook-level ``df_inspect``.
    column : str, optional
        Name of the cluster-label column. Defaults to the notebook-level
        ``cluster_column``.
    min_percent : float, optional
        Behaviors whose share is not strictly greater than this percentage are
        dropped from the chart (default 1).

    Returns
    -------
    (sizes, labels) : tuple of lists
        Percentages (rounded to two decimals) and the matching pie labels.
        Both lists are empty when no row belongs to the cluster.
    """
    if data is None:
        data = df_inspect
    if column is None:
        column = cluster_column
    return _behavior_pie_slices(data.loc[data[column] == cluster_id, 'behaviors'], min_percent)


def calculate_pie_chart_infos_average(data=None, min_percent=0.5):
    """Compute the pie-chart slices of the behavior distribution over all rows.

    Parameters
    ----------
    data : DataFrame, optional
        Frame with a 'behaviors' column. Defaults to the notebook-level ``df``.
    min_percent : float, optional
        Behaviors whose share is not strictly greater than this percentage are
        dropped from the chart (default 0.5).

    Returns
    -------
    (sizes, labels) : tuple of lists
        Percentages (rounded to two decimals) and the matching pie labels.
        Both lists are empty when ``data`` has no rows.
    """
    if data is None:
        data = df
    return _behavior_pie_slices(data['behaviors'], min_percent)
        
# Pie charts of the behavior distribution for the first two clusters in
# l_clusters, followed by the distribution over the entire DataFrame.
fig = plt.figure(figsize=(22, 10), facecolor='w')
gs = fig.add_gridspec(1,3)
plt.subplots_adjust(hspace=1)

# Styling shared by all three pies.
pie_kwargs = dict(autopct='%1.1f%%', shadow=False, startangle=90,
                  rotatelabels=True, textprops={'fontsize': 16})

# The grid column doubles as the position in l_clusters, so no separate counter is needed.
for column, cluster in enumerate(l_clusters[:2]):
    ax = fig.add_subplot(gs[0, column])
    sizes, labels = calculate_pie_chart_infos(cluster)
    ax.pie(sizes, labels=labels, **pie_kwargs)
    ax.axis('equal')
    ax.set_title('Cluster #' + str(cluster) + '\n' + str(df_all_clusters.loc[cluster, 'Cluster_size']) + ' bins',
                 pad=10, fontsize=24)

# Right-most panel: average over the entire DataFrame.
ax = fig.add_subplot(gs[0, 2])
sizes, labels = calculate_pie_chart_infos_average()
ax.pie(sizes, labels=labels, **pie_kwargs)
ax.axis('equal')
ax.set_title('Entire DataFrame \n' + str(df.shape[0]) + ' bins', pad=10, fontsize=24)

plt.savefig('Cont_run-2_pie_charts_120-43.png', dpi=300)
plt.show()

In [None]:
# List the available columns (the UMAP and cluster columns are referenced by name below).
df.columns

In [None]:
# Take element [1][1] of every entry in l_matching_triplets_abc; presumably this is
# the run-2 cluster ID of each matched triplet (TODO confirm the triplet layout).
clusters_run_2 = [triplet[1][1] for triplet in l_matching_triplets_abc]

In [None]:
# Columns holding the two UMAP coordinates and the cluster labels to plot.
X = 'UMAP_cont_r2_1'
Y = 'UMAP_cont_r2_2'
hue = 'Cluster_cont_r2'

palette = 'viridis'

size = 2

fig = plt.figure(figsize=(20, 15), facecolor='w')
gs = fig.add_gridspec(1,1)
ax = fig.add_subplot(gs[0,0])

# Scatter layers, drawn back to front:
#   1. all datapoints (no hue mapping is assigned, the palette is passed as before),
#   2. datapoints of the clusters in clusters_run_2 in gold,
#   3. cluster 120 in magenta,
#   4. cluster 43 in dark green.
scatter_layers = [
    (df, {'palette': palette, 'alpha': 0.3, 's': size}),
    (df.loc[df[hue].isin(clusters_run_2)], {'color': 'gold', 'alpha': 0.3, 's': size}),
    (df.loc[df[hue] == 120], {'color': 'magenta', 's': size + 1}),
    (df.loc[df[hue] == 43], {'color': 'darkgreen', 's': size + 1}),
]
for layer_data, layer_style in scatter_layers:
    sns.scatterplot(x=X, y=Y, data=layer_data, legend=False, ax=ax, **layer_style)

ax.set_xlim(-15, 15)
ax.set_ylim(-15, 15)

# Hide the right and top spines
for spine_name in ('right', 'top'):
    ax.spines[spine_name].set_visible(False)

# Only show ticks on the left and bottom spines
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

ax.set_xlabel('UMAP dimension one')
ax.set_ylabel('UMAP dimension two')

plt.savefig('Cont_run-2_magenta-120_green-43_only_stable_clusters.png', dpi=600)
plt.show()

In [None]:
# Measure columns per encoding: the interval-binned columns carry the suffix
# '_intervals_100'; cutting the name at that marker gives the continuous column name.
interval_suffix = '_intervals_100'
interval_columns = [column for column in df.columns if column.endswith(interval_suffix)]
d_measure_columns = {'intv_100': interval_columns,
                     'cont': [column[:column.index(interval_suffix)] for column in interval_columns]}

# Long-format frame with one block per (run, cluster) pair and measure:
# the measure values go to 'Data', its name to 'Measure', the pair label to 'Cluster_Run'.
l_dfs_individual_clusters = []
for run, cluster_id in l_run_cluster_combis:
    df_cluster = df.loc[df['Cluster_{}_r{}'.format(code, run)] == cluster_id]
    cluster_label = 'Cluster-{}_Run-{}'.format(str(cluster_id), run)
    l_dfs_individual_clusters.append(pd.concat([
        df_cluster[['Data_idx', measure]]
        .rename(columns={measure: 'Data'})
        .assign(Measure=measure, Cluster_Run=cluster_label)
        for measure in d_measure_columns[code]
    ]))

df_for_boxplots = pd.concat(l_dfs_individual_clusters)

plt.figure(figsize=(20,5), facecolor='white')

sns.boxplot(data=df_for_boxplots, x="Measure", y="Data", hue="Cluster_Run", palette="Set3")

plt.show()

In [None]:
# Largest value in the 'Cluster_cont_r3' column.
df['Cluster_cont_r3'].max()