## DUSP1 Confirmation Notebook
The purpose of this notebook is to:
1. Confirm successful segmentation.
2. Confirm successful BigFISH spot and cluster detection.
3. Refine spots and clusters through additional filtering (SNR) for gating and final dataframe preparation:  
    a. Test predefined SNR thresholds.  
    b. Test weighted SNR thresholding.  
    c. Filter `df_spots` with an SNR threshold if needed.  

In [None]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import dask.array as da
import os
import sys
import logging


# Silence matplotlib's font-manager logger entirely.
logging.getLogger('matplotlib.font_manager').disabled = True
# Only let WARNING and above through from numba.
numba_logger = logging.getLogger('numba')
numba_logger.setLevel(logging.WARNING)

# Only let WARNING and above through from matplotlib.
matplotlib_logger = logging.getLogger('matplotlib')
matplotlib_logger.setLevel(logging.WARNING)

# Put the directory two levels above the current working directory on the
# import path; this must happen before the `src.Analysis` import that follows.
# NOTE(review): presumably that directory is the repository root — confirm.
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
print(src_path)
sys.path.append(src_path)

from src.Analysis import AnalysisManager, Analysis, SpotDetection_SNRConfirmation, Spot_Cluster_Analysis_WeightedSNR, GR_Confirmation

# Use the log file to search for analyses

In [None]:
# No explicit analysis location is passed.
# NOTE(review): analyses are presumably discovered through the log directory below — confirm against AnalysisManager.
loc = None 
# Directory passed as the log location (the trailing comment keeps an alternative directory at hand).
log_location = r'/Volumes/share/Users/Eric/GR_DUSP1_2025'  #  r'/Volumes/share/Users/Jack/All_Analysis'
# NOTE(review): mac=True presumably selects macOS-style mount paths (/Volumes/...) — confirm.
am = AnalysisManager(location=loc, log_location=log_location, mac=True) 

In [None]:
# List the names of all analyses reported by the AnalysisManager
all_analysis_names = am.list_analysis_names()
# Bare expression so the notebook shows the list as the cell output
all_analysis_names

# DUSP1 Experiment Analysis List

### DUSP1 100nM Dex 3hr Time-sweep
- Replica D: `Analysis_DUSP1_D_JacksRunAll_2025-02-05`
- Replica E: `Analysis_DUSP1_E_ERonRunAll_2025-02-06`
- Replica F: `Analysis_DUSP1_F_ERonReRun_2025-02-08`
- Replica M: `Analysis_DUSP1_M_ERonRunAll_2025-02-06`
- Replica N: `Analysis_DUSP1_N_JacksRunAll_2025-02-06`

### DUSP1 75min Concentration-sweep
- Replica G: `Analysis_DUSP1_G_ERonReRun_2025-02-08`
- Replica H: `Analysis_DUSP1_H_ERonRunAll_2025-02-06`
- Replica I: `Analysis_DUSP1_I_JacksRunAll_2025-02-06`

### DUSP1 0.3, 1, 10nM Dex 3hr Time-sweep
- Replica J: `Analysis_DUSP1_J_ERonRunAll_2025-02-06`
- Replica K: `Analysis_DUSP1_K_ERonReRun_2025-02-08`
- Replica L: `Analysis_DUSP1_L_JacksRunAll_2025-02-06`

### DUSP1 TPL
- Replica O: `Analysis_DUSP1_O_JacksRunAll_2025-02-06`
- Replica P: `Analysis_DUSP1_P_ERonReRun_2025-02-08`

In [None]:

# # DUSP1 100nM Dex 3hr Time-sweep Replica 1
# am.select_analysis('DUSP1_D_JacksRunAll')
# # DUSP1 100nM Dex 3hr Time-sweep Replica 2
# am.select_analysis('DUSP1_E_ERonRunAll')
# # DUSP1 100nM Dex 3hr Time-sweep Replica 3
# am.select_analysis('DUSP1_F_JacksRunAll')
# # DUSP1 100nM Dex 3hr Time-sweep Replica 4
# am.select_analysis('DUSP1_M_ERonRunAll')
# # DUSP1 100nM Dex 3hr Time-sweep Replica 5
# am.select_analysis('DUSP1_N_JacksRunAll')

# # DUSP1 75min Concentration-sweep Replica 1
# am.select_analysis('DUSP1_G_JacksRunAll')
# # DUSP1 75min Concentration-sweep Replica 2
# am.select_analysis('DUSP1_H_ERonRunAll')
# # DUSP1 75min Concentration-sweep Replica 3
# am.select_analysis('DUSP1_I_JacksRunAll')

# # DUSP1 0.3, 1, 10nM Dex 3hr Time-sweep Replica 1
# am.select_analysis('DUSP1_J_ERonRunAll')
# # DUSP1 0.3, 1, 10nM Dex 3hr Time-sweep Replica 2
# am.select_analysis('DUSP1_K_ERonReRun')
# # DUSP1 0.3, 1, 10nM Dex 3hr Time-sweep Replica 3
# am.select_analysis('DUSP1_L_JacksRunAll')

# DUSP1 Dex Tpl replica 1
# am.select_analysis('DUSP1_O_JacksRunAll')
# DUSP1 Dex Tpl replica 2
# am.select_analysis('DUSP1_P_ERonReRun')

## Analysis/confirmation

In [None]:
# Create the spot/cluster confirmation helper around the AnalysisManager
SD = Spot_Cluster_Analysis_WeightedSNR(am)
# Load the data; the cells below read SD.spots, SD.clusters, SD.cellprops and SD.cellspots.
# NOTE(review): presumably this loads the analysis selected on `am` above — confirm.
SD.get_data()

In [None]:
# Show the distinct NAS_location values present in the cell-properties table
SD.cellprops['NAS_location'].unique()

In [None]:
# Display Segmentation, BF_spotdetection, SNR thresholding (basic and weighted), Summary Stats and plots
# NOTE(review): newFOV/newCell presumably ask for a fresh field of view and cell on each call — confirm.
SD.display(newFOV=True, newCell=True)

In [None]:
# Close the AnalysisManager; the remaining cells only use the tables already held by SD.
# NOTE(review): presumably this releases the underlying analysis files — confirm.
am.close()

## Validate DUSP1 spots, clusters, cellspots, and cellprops dataframe agreement

In [None]:
# Create a unique cell id for every row of the cell-properties table
SD.cellprops['unique_cell_id'] = np.arange(len(SD.cellprops))

# Columns that together identify a cell in the spots, clusters and cellprops tables
cell_key_columns = ['NAS_location', 'cell_label', 'fov']
cell_id_lookup = SD.cellprops[cell_key_columns + ['unique_cell_id']]

# Attach the unique cell id to every spot and every cluster.
# Any 'unique_cell_id' column left over from a previous run of this cell is
# dropped first: merging with it still present would produce
# 'unique_cell_id_x' / 'unique_cell_id_y' instead of a single column and
# break every later lookup of 'unique_cell_id'.
# how='left' keeps spots/clusters without a matching cell (their id is NaN).
SD.spots = (SD.spots
            .drop(columns='unique_cell_id', errors='ignore')
            .merge(cell_id_lookup, on=cell_key_columns, how='left'))
SD.clusters = (SD.clusters
               .drop(columns='unique_cell_id', errors='ignore')
               .merge(cell_id_lookup, on=cell_key_columns, how='left'))

In [None]:
# Display the spots table (it now carries the merged 'unique_cell_id' column)
SD.spots

In [None]:
# Function to get the second largest value or default to 0
def second_largest(series):
    """Return the second-largest distinct non-NaN value of *series*.

    Duplicates count once, so ``[5, 5, 3]`` yields ``3``. When fewer than two
    distinct non-NaN values exist, ``0`` is returned instead.
    """
    distinct = series.dropna().unique()
    if len(distinct) >= 2:
        # Ascending sort: the runner-up sits one position before the end.
        ascending = np.sort(distinct)
        return ascending[-2]
    return 0


def measure_DUSP1(spots, clusters, props) -> pd.DataFrame:
    """Summarise spot and cluster detections into one row per cell.

    Clusters with ``is_nuc == 1`` are counted as transcription sites (TS) and
    clusters with ``is_nuc == 0`` as foci.

    Args:
        spots: Spot table with ``unique_cell_id``, ``is_nuc`` and ``replica``
            columns.
        clusters: Cluster table with ``unique_cell_id``, ``is_nuc`` and
            ``nb_spots`` columns.
        props: Cell table with ``unique_cell_id``, ``nuc_area``, ``cyto_area``,
            ``nuc_intensity_mean-0``, ``cyto_intensity_mean-0``, ``time`` and
            ``Dex_Conc`` columns.

    Returns:
        A DataFrame with one row per row of ``props``, ordered by
        ``unique_cell_id``. Cells without any matching spot or cluster get 0
        in the count/size columns and NaN in ``replica``.
    """
    by_cell = 'unique_cell_id'

    # Bring all three tables into the same cell order.
    spots = spots.sort_values(by=by_cell)
    clusters = clusters.sort_values(by=by_cell)
    props = props.sort_values(by=by_cell)
    cell_ids = props[by_cell]

    def per_cell(aggregated, missing=0):
        # Lay a per-cell aggregate out in cell order, padding absent cells.
        return aggregated.reindex(cell_ids, fill_value=missing).values

    # Nuclear clusters are the transcription sites; the others are foci.
    ts_groups = clusters[clusters['is_nuc'] == 1].groupby(by_cell)
    foci_groups = clusters[clusters['is_nuc'] == 0].groupby(by_cell)
    spot_groups = spots.groupby(by_cell)

    # Insertion order below defines the column order of the result.
    measurements = {
        'num_ts': per_cell(ts_groups.size()),
        'num_spots_ts': per_cell(ts_groups['nb_spots'].sum()),
        'largest_ts': per_cell(ts_groups['nb_spots'].max()),
        'second_largest_ts': per_cell(ts_groups['nb_spots'].apply(second_largest)),
        'num_foci': per_cell(foci_groups.size()),
        'num_spots_foci': per_cell(foci_groups['nb_spots'].sum()),
        'num_spots': per_cell(spot_groups.size()),
        'num_nuc_spots': per_cell(spots[spots['is_nuc'] == 1].groupby(by_cell).size()),
        'num_cyto_spots': per_cell(spots[spots['is_nuc'] == 0].groupby(by_cell).size()),
        # Cell-level properties are already in cell order after the sort.
        'nuc_area_px': props['nuc_area'].values,
        'cyto_area_px': props['cyto_area'].values,
        'avg_nuc_int': props['nuc_intensity_mean-0'].values,
        'avg_cyto_int': props['cyto_intensity_mean-0'].values,
        'time': props['time'].values,
        'Dex_conc': props['Dex_Conc'].values,
        # Replica is read from the spots, so cells without spots get NaN.
        'replica': per_cell(spot_groups['replica'].first(), missing=np.nan),
    }

    results = pd.DataFrame(columns=['cell_id', *measurements])
    # Assigning the Series first gives the result the index of the sorted props.
    results['cell_id'] = cell_ids
    for column, values in measurements.items():
        results[column] = values

    return results

In [None]:
# Build the per-cell summary table from the spot, cluster and cell-property tables
cell_spots = measure_DUSP1(SD.spots, SD.clusters, SD.cellprops)

In [None]:
# Ensure num_spots = num_nuc_spots + num_cyto_spots for all rows.
# measure_DUSP1 counts nuclear spots as is_nuc == 1 and cytoplasmic spots as is_nuc == 0,
# so this fails if any spot assigned to a cell has a different (or missing) is_nuc value.
assert (cell_spots['num_spots'] == cell_spots['num_nuc_spots'] + cell_spots['num_cyto_spots']).all(), "Mismatch in spot counts"

In [None]:
# Display the per-cell summary table
cell_spots

In [None]:
# Attach the unique cell id to the cellspots table; cellspots stores the cell
# label in its 'cell_id' column, hence the separate left_on/right_on keys.
SD.cellspots = SD.cellspots.merge(SD.cellprops[['NAS_location', 'cell_label', 'fov', 'unique_cell_id']], 
                                    left_on=['NAS_location', 'cell_id', 'fov'], 
                                    right_on=['NAS_location', 'cell_label', 'fov'], 
                                    how='left')


# Index both tables by the unique cell id so they can be compared row for row.
# The column checks skip the step when a table has already been indexed.
if 'cell_id' in cell_spots.columns:
    cell_spots = cell_spots.set_index('cell_id')

if 'unique_cell_id' in SD.cellspots.columns:
    SD.cellspots = SD.cellspots.set_index('unique_cell_id')

aligned_nb_rna = SD.cellspots['nb_rna']

# Spots belonging to transcription sites are subtracted before the comparison
# with nb_rna ("corrected" count in the table below).
aligned_num_spots = cell_spots['num_spots'] - cell_spots['num_spots_ts']

# Restrict/reorder to the cells present in SD.cellspots so both series line up positionally
aligned_num_spots = aligned_num_spots.loc[SD.cellspots.index]

# Positions (not labels) of the cells whose counts differ by more than the 5% relative tolerance
not_close_indices = np.where(~np.isclose(aligned_nb_rna, aligned_num_spots, rtol = 0.05))[0]
print("Indices where nb_rna and num_spots are not close:", len(not_close_indices))

# Print the offending cells BEFORE asserting: with the assertion first, the
# table could only ever be reached when there was nothing to show.
mismatched_cells = cell_spots.loc[aligned_nb_rna.index].iloc[not_close_indices]

print(f"{'cell_id':<10} {'my counting':<30} {'bigfish counting':<30} {'corrected':<30} {'ts spots':<30} {'foci spots':<30}")
for cell_id, my, bf, cr, ts, foci in zip(aligned_nb_rna.iloc[not_close_indices].index, mismatched_cells['num_spots'],
                                         aligned_nb_rna.iloc[not_close_indices], aligned_num_spots.iloc[not_close_indices],
                                         mismatched_cells['num_spots_ts'], mismatched_cells['num_spots_foci']):
    print(f"{cell_id:<10} {my:<30} {bf:<30} {cr:<30} {ts:<30} {foci:<30}")

assert len(not_close_indices) == 0, "Mismatch in nb_rna and num_spots counts"