In [None]:
%load_ext autoreload 
%autoreload 2
%matplotlib notebook
from matplotlib import pyplot as plt

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


import pathlib
import pandas as pd
import numpy as np


import anndata as ad
from spatial_wrangling import spatial_subset

from functools import partial
from multiprocessing import Pool

# Subsetting MERSCOPE spot data based on region annotations

### The region annotations in mapped MERSCOPE data are very nice, but a few tasks require the spot data to be subsetted based on spatial locations. For example:

1. regional comparison to bulk RNASeq
2. regional (re)segmentation of cells


### Here I have several files, each with the spot data from 1 MERSCOPE section, and I also have the mapped cell data (with `clean_region_label` columns) for each of those sections.  I'll organize them a bit...

In [None]:
# Inputs for this cell, gathered in one place so they are easy to retarget.
SPOT_ROOT = pathlib.Path("/home/imaging_mfish/surveyNAS05/scratch/mouse_brain/brain_1_TH_section_spots/")
CELL_FILE_PREFIX = "/home/imaging_mfish/surveyNAS05/scratch/mouse_brain/20220201_mouse_WholeBrain_10Xv3_4018_processed_2022040_section_"
# Spot directory names contain this marker immediately followed by the section
# number, e.g. "...60988217_..." for section 17.
SECTION_MARKER = "609882"
SECTION_NUMBERS = range(17, 48)


def section_id_from_dirname(dirname, marker=SECTION_MARKER):
    """Return the section number encoded in a spot directory name.

    The section number is the text that follows the first occurrence of
    ``marker`` up to the next "_" (or the next marker / end of the name).

    Returns ``None`` when ``marker`` does not occur in ``dirname`` so that
    unrelated entries in the spot root (stray files, other folders) are
    skipped instead of raising ``IndexError``.
    """
    pieces = dirname.split(marker)
    if len(pieces) < 2:
        return None
    return pieces[1].split("_")[0]


# Sorted so that, if two directories encode the same section, the one that is
# picked is the same on every run (glob order is not guaranteed).
spot_dirs = sorted(SPOT_ROOT.glob("*"))

# Index the directories by section once instead of rescanning the whole
# listing for every section number; the first directory per section wins.
spot_dir_by_section = {}
for sdir in spot_dirs:
    section = section_id_from_dirname(sdir.name)
    if section is not None:
        spot_dir_by_section.setdefault(section, sdir)

spotfiles = []
cellfiles = []
slicenumbers = []

for ss in map(str, SECTION_NUMBERS):
    spot_dir = spot_dir_by_section.get(ss)
    if spot_dir is None:
        # no spot directory for this section
        continue
    spotfile = spot_dir.joinpath("detected_transcripts.csv")
    if not spotfile.exists():
        # directory is there but the transcripts were never written
        continue

    # now that we have the spot file, use the matching cell data from that section.
    # NOTE(review): the cell file is not checked for existence here; a missing
    # one will only surface when the file is actually opened downstream.
    cellfile = pathlib.Path(CELL_FILE_PREFIX + ss + ".h5ad")
    cellfiles.append(cellfile)
    spotfiles.append(spotfile)
    slicenumbers.append(ss)
                 

### Now we have, for each section:
A cell file, a spot file and a section number.  

We'll run this with a `multiprocessing` `Pool` next, using "TH" as the `clean_region_label` to extract the spot data within the area defined by those cells.

In [None]:
          

def for_mp(input_list, region="TH"):
    """Run ``spatial_subset`` for one section and write the result next to the spot file.

    Takes all per-section inputs as a single sequence so the function can be
    handed directly to ``Pool.map``.

    Parameters
    ----------
    input_list : sequence
        ``(cell_file, spot_file, section_number)``. ``spot_file`` must be a
        path object exposing ``.parent`` and ``section_number`` must be a
        string, because it is concatenated into the output file name.
    region : str, default "TH"
        Region label passed to ``spatial_subset`` as its third positional
        argument and embedded in the output file name. To use another region
        with ``Pool.map``, bind it with ``functools.partial(for_mp, region=...)``.

    The output is written to
    ``<spot_file directory>/detected_spots_in_<region>_section_<section_number>.csv``.
    """
    cell_file, spot_file, section_number = input_list[0], input_list[1], input_list[2]
    output_path = spot_file.parent.joinpath(
        "detected_spots_in_" + region + "_section_" + section_number + ".csv"
    )
    spatial_subset(cell_file, spot_file, region, output_path=output_path)

# Fan the sections out over a small worker pool; each job is one
# (cell file, spot file, section number) tuple handed whole to for_mp.
with Pool(4) as pool:  # on a survey workstation, this could go up to 6 safely
    section_jobs = list(zip(cellfiles, spotfiles, slicenumbers))
    pool.map(for_mp, section_jobs)