In [1]:
import os
import sys

In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import umap
import hdbscan

In [5]:
import tqdm

In [6]:
from sklearn.preprocessing import MinMaxScaler, Normalizer

In [7]:
import joblib
from joblib import Parallel, delayed

In [8]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [9]:
from scipy import signal

# Load the UMAP models fitted on the control dataset

In [10]:
# Paths to the two serialized UMAP models (speed-based and wavelet-based, per the file names).
# Plain string literals: nothing is interpolated, so the f-string prefix was dropped.
fn_umap_speeds = '../../results/umap_model_29072023_v1.joblib'
fn_umap_wavelets = '../../results/umap_wavelets_model_31072023.joblib'

In [11]:
# Deserialize both models from disk, speeds first, then wavelets.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
model_umap_speeds, model_umap_wavelets = (
    joblib.load(model_path) for model_path in (fn_umap_speeds, fn_umap_wavelets)
)

# Load the drugs data

## Load metadata

In [15]:
# Metadata table for the recordings, read from a single pickle file.
# NOTE(review): read_pickle runs pickle deserialization — trusted files only.
df_meta = pd.read_pickle(
    '../../data/amphioxus_metadata_final500.pickle'
)

In [16]:
# Derive a 'filename' key (the video name up to the first '.avi') so the metadata
# can be matched against the 'filename' column of the feature dataset.
df_meta['filename'] = df_meta['filename_video'].map(
    lambda video_name: video_name.partition('.avi')[0]
)

In [17]:
# Inspect the metadata columns (including the derived 'filename' key).
df_meta.columns

Index(['path_to_video', 'filename_video', 'date', 'time', 'light', 'drugs',
       'duration', 'age', 'stim_on', 'stim_off', 'stim_RGB', 'dlc_result_file',
       'filename'],
      dtype='object')

In [18]:
# Distinct values stored in the 'light' column.
df_meta.light.unique()

array(['None', 'Light'], dtype=object)

## Load feature data

In [19]:
# Feature table, stored under the 'features' key of the HDF5 file.
df = pd.read_hdf(
    '../../results/featureset_v4_29072023.h5',
    key='features',
)

In [20]:
# Threshold on mouth speed: keep rows where 'speed_MOUTH' is missing or below 20.
df = df.loc[df['speed_MOUTH'].isna() | df['speed_MOUTH'].lt(20)]

In [21]:
# Attach metadata to every feature row; the left join keeps feature rows
# whose 'filename' has no match in the metadata table.
df_merged = pd.merge(df, df_meta, how='left', on='filename')

In [22]:
# Drug-treated subset: age above 50, a drug applied, and no light stimulus.
df_drugs = df_merged.loc[
    df_merged['age'].gt(50)
    & df_merged['drugs'].ne('none')
    & df_merged['light'].eq('None')
]

In [23]:
# Light-stimulated subset: no drug applied and 'light' equal to 'Light'.
df_light = df_merged.loc[
    df_merged['drugs'].eq('none') & df_merged['light'].eq('Light')
]

In [24]:
# Young subset: age below 50, no drug applied, no light stimulus.
# NOTE(review): rows with age == 50 fall in neither this subset nor the `age > 50`
# drug subset — confirm that is intended.
df_young = df_merged.loc[
    df_merged['age'].lt(50)
    & df_merged['drugs'].eq('none')
    & df_merged['light'].eq('None')
]

In [25]:
# Number of distinct filenames in the merged table (a NaN filename would count as one value).
df_merged['filename'].nunique(dropna=False)

497

In [26]:
# Number of distinct filenames in the drug subset (a NaN filename would count as one value).
df_drugs['filename'].nunique(dropna=False)

224

In [27]:
# Number of distinct filenames in the light subset (a NaN filename would count as one value).
df_light['filename'].nunique(dropna=False)

37

In [28]:
# Number of distinct filenames in the young subset (a NaN filename would count as one value).
df_young['filename'].nunique(dropna=False)

12

In [29]:
# Every column whose name contains 'speed' is used as an input feature.
cols_speed = df_merged.filter(like='speed').columns.tolist()
feats_to_use = cols_speed  # alias of the same list object, not a copy

### Drugs

In [30]:
# Distinct drug labels present in the drug subset.
df_drugs.drugs.unique()

array(['SNAP-5114', '2-AminoEthylHydrogenSulfate', 'Dihydrokainic acid'],
      dtype=object)

In [31]:
# Number of distinct filenames per drug.
df_drugs.groupby('drugs')['filename'].nunique()

drugs
2-AminoEthylHydrogenSulfate     62
Dihydrokainic acid             109
SNAP-5114                       53
Name: filename, dtype: int64

#### using speeds

In [33]:
# Restrict the drug subset to the speed feature columns.
df_drugs_in_speeds = df_drugs.loc[:, feats_to_use]

In [34]:
# Replace missing speed values with the sentinel -1.
df_drugs_in_speeds = df_drugs_in_speeds.fillna(value=-1)

#### using curvatures

In [35]:
# Select the columns whose name contains 'curv' from the drug subset and display the frame.
df_drugs_curv = df_drugs.filter(like='curv')
df_drugs_curv

Unnamed: 0,curv_NT,curv_DH1,curv_DH2,curv_DNP,curv_DTP1,curv_DTP2,curv_DTP3,curv_DTP4,curv_DTP5,curv_DTP6,curv_DTP7,curv_DTP8,curv_DTP9,curv_DTP10,curv_TT
153340,0.013380,0.009219,0.009076,0.005172,0.002113,-0.000195,-0.003139,0.004987,0.006574,-0.000283,-0.002878,-0.008611,-0.008452,-0.003661,-0.015665
153341,0.008837,0.009898,0.009605,0.008805,0.008939,-0.003009,-0.002129,0.000417,0.002854,0.006816,0.006886,0.007832,0.009816,0.003130,-0.002718
153342,0.010357,0.008353,0.008875,0.008603,0.008995,-0.001119,0.000541,0.001855,0.002350,0.004407,0.010736,0.015750,0.023295,0.028004,0.046040
153343,0.009862,0.008674,0.009495,0.009537,0.009932,-0.000706,-0.000209,0.000348,0.000953,0.003805,0.005585,0.006664,0.008826,0.002393,-0.009292
153344,0.009487,0.008736,0.009922,0.009537,0.010057,-0.001396,-0.000692,0.000020,0.001600,0.005250,0.012083,0.017239,0.025667,0.030416,0.051598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4846183,0.021333,0.027936,0.046264,0.031860,0.035355,0.010524,0.001607,0.030915,0.024820,0.023326,0.013283,-0.018896,-0.015744,-0.053510,-0.057289
4846184,0.029519,0.034929,0.056841,0.035523,0.037640,0.003005,-0.011638,0.021358,0.027889,0.026526,0.021663,-0.017021,-0.003190,-0.017222,-0.058838
4846185,0.019415,0.022215,0.025758,0.033281,0.051620,0.046796,0.013669,0.026086,0.008567,-0.002281,-0.014983,-0.030599,-0.026739,-0.046361,-0.137057
4846186,0.026801,0.027433,0.044836,0.034619,0.036923,0.017941,0.002590,0.025001,0.013343,0.008279,0.001342,-0.020221,-0.011377,-0.016699,-0.056658


In [36]:
# PCA over the curvature columns of the drug subset. No n_components is passed,
# so scikit-learn's default applies. `pca` stays fitted so its explained variance
# can be inspected afterwards.
pca = PCA()
pca_drugs_curv = pca.fit_transform(df_drugs_curv)

In [None]:
# Cumulative explained variance as a function of the number of principal components kept.
cum_var = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots()
# The x axis starts at 1 so each point reads "variance explained by the first k components"
# (plotting against the default 0-based index shifted every point by one component).
ax.plot(np.arange(1, len(cum_var) + 1), cum_var)
ax.set_xlabel('Number of principal components')
ax.set_ylabel('Cumulative explained variance ratio')
ax.set_title('PCA of curvature features (drug subset)');

### Light

In [None]:
# Distinct 'stim_RGB' values present in the light subset.
df_light.stim_RGB.unique()

In [None]:
# Number of distinct filenames per 'stim_RGB' value.
df_light.groupby('stim_RGB')['filename'].nunique()

# Find the UMAP projections

In [None]:
# Previously saved results table, stored under the 'features_with_results' key.
df_results = pd.read_hdf(
    '../../results/UMAP_HDBSCANclustering_19072023_1230.h5',
    key='features_with_results',
)

In [None]:
# Columns whose name contains 'umap_raw', as a NumPy array; the first two columns
# are what the later cells plot under the title 'Control'.
embedding = df_results.filter(like='umap_raw').to_numpy()
embedding.shape

## Drugs

In [None]:
# Will map each drug name to its projected coordinates.
embedding_drugs = dict()

In [None]:
# Project each drug condition with the UMAP model loaded from disk.
# NOTE(review): this cell originally called `loaded_reducer`, which is never defined in
# this notebook, so it fails on Restart & Run All. `model_umap_speeds` is the model
# loaded from `fn_umap_speeds`; confirm it is the reducer that produced the control
# embedding these projections are compared against.
for name, group in df_drugs.groupby('drugs'):
    # Same speed features and the same -1 sentinel for missing values as used
    # for the other speed inputs in this notebook.
    group_in = group[feats_to_use].fillna(-1)
    embedding_drugs[name] = model_umap_speeds.transform(group_in.values)

In [None]:
# Drug names for which a projection was stored.
embedding_drugs.keys()

In [None]:
# Scatter of the control embedding (first panel) followed by one panel per drug.
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(32, 7))

control_ax = axes[0]
control_ax.scatter(embedding[:, 0], embedding[:, 1], s=0.2)
control_ax.set_title('Control')

# Panels 1..3 hold the drug conditions, in dictionary order.
for panel, (drug, coords) in enumerate(embedding_drugs.items(), start=1):
    axes[panel].scatter(coords[:, 0], coords[:, 1], s=0.2)
    axes[panel].set_title(drug)

In [None]:
# 2-D density histograms: control embedding first, then one panel per drug.
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(32, 7))

control_ax = axes[0]
control_ax.hist2d(embedding[:, 0], embedding[:, 1], bins=(150, 150), density=True)
control_ax.set_title('Control')

# Each panel bins its own data range; panels 1..3 follow dictionary order.
for panel, (drug, coords) in enumerate(embedding_drugs.items(), start=1):
    axes[panel].hist2d(coords[:, 0], coords[:, 1], bins=(150, 150), density=True)
    axes[panel].set_title(drug)

## Light

In [None]:
# Will map each 'stim_RGB' value to its projected coordinates.
embedding_light = dict()

In [None]:
# Project each light-stimulus condition with the UMAP model loaded from disk.
# NOTE(review): this cell originally called `loaded_reducer`, which is never defined in
# this notebook, so it fails on Restart & Run All. `model_umap_speeds` is the model
# loaded from `fn_umap_speeds`; confirm it is the reducer that produced the control
# embedding these projections are compared against.
# tqdm wraps the groupby directly (instead of an enumerate) so it can read the number
# of groups and show a real progress total.
for name, group in tqdm.tqdm(df_light.groupby('stim_RGB')):
    # Same speed features and the same -1 sentinel for missing values as used
    # for the other speed inputs in this notebook.
    group_in = group[feats_to_use].fillna(-1)
    embedding_light[name] = model_umap_speeds.transform(group_in.values)

In [None]:
# 'stim_RGB' values for which a projection was stored.
embedding_light.keys()

In [None]:
# Scatter of the control embedding next to each light condition.
# 'v0310000' is left out, as in the original cell (the reason is not recorded here).
light_keys = [key for key in embedding_light if key != 'v0310000']

# Size the grid from the conditions actually plotted. Indexing panels by the raw
# enumerate position while skipping a key left an empty panel, and ran past the
# 4-panel grid when the skipped key was not the last of four.
n_panels = 1 + len(light_keys)
fig, axes = plt.subplots(1, n_panels, figsize=(8 * n_panels, 7), squeeze=False)
axes = axes.ravel()  # squeeze=False always yields a 2-D array, even for one panel

axes[0].scatter(embedding[:, 0], embedding[:, 1], s=0.2)
axes[0].set_title('Control')

for ax, key in zip(axes[1:], light_keys):
    ax.scatter(embedding_light[key][:, 0], embedding_light[key][:, 1], s=0.2)
    ax.set_title(key)

In [None]:
# 2-D density histograms of the control embedding next to each light condition,
# with shared axes across panels.
# 'v0310000' is left out, as in the original cell (the reason is not recorded here).
light_keys = [key for key in embedding_light if key != 'v0310000']

# Size the grid from the conditions actually plotted. Indexing panels by the raw
# enumerate position while skipping a key left an empty panel, and ran past the
# 4-panel grid when the skipped key was not the last of four.
n_panels = 1 + len(light_keys)
fig, axes = plt.subplots(
    1, n_panels, figsize=(8 * n_panels, 7), sharex=True, sharey=True, squeeze=False
)
axes = axes.ravel()  # squeeze=False always yields a 2-D array, even for one panel

axes[0].hist2d(embedding[:, 0], embedding[:, 1], bins=(150, 150), density=True)
axes[0].set_title('Control')

for ax, key in zip(axes[1:], light_keys):
    ax.hist2d(embedding_light[key][:, 0], embedding_light[key][:, 1], bins=(150, 150), density=True)
    ax.set_title(key)

## Age

In [None]:
# Speed features of the young subset, with missing values replaced by the sentinel -1.
df_young_in = df_young[feats_to_use].fillna(-1)

In [None]:
# Project the young-subset speed features with the UMAP model loaded from disk.
# NOTE(review): this cell originally called `loaded_reducer`, which is never defined in
# this notebook, so it fails on Restart & Run All. `model_umap_speeds` is the model
# loaded from `fn_umap_speeds`; confirm it is the reducer that produced the control
# embedding this projection is compared against.
embedding_age = model_umap_speeds.transform(df_young_in.values)

In [None]:
# Side-by-side scatter: control embedding (left) and young-subset projection (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
ax_control, ax_young = axes

ax_control.scatter(embedding[:, 0], embedding[:, 1], s=0.2)
ax_control.set_title('Control')

ax_young.scatter(embedding_age[:, 0], embedding_age[:, 1], s=0.2)
ax_young.set_title('Young larvae')

In [None]:
# Side-by-side 2-D density histograms: control embedding (left) and
# young-subset projection (right). Each panel bins its own data range.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
ax_control, ax_young = axes

ax_control.hist2d(embedding[:, 0], embedding[:, 1], bins=(150, 150), density=True)
ax_control.set_title('Control')

ax_young.hist2d(embedding_age[:, 0], embedding_age[:, 1], bins=(150, 150), density=True)
ax_young.set_title('Young larvae')