# Cell Sectioner and ID
#### Note: Use the picasso kernel only

What is this used for?
- This code sections the entire data field into small sectors and assigns a unique ID to each localization based on the sector it falls in. 
- The sector IDs will be used in the later steps for dimensionality reduction analysis.  

Workflow
- Define source folder location and files.
- Define output folder location and files.
- Load the lamin dataset first.
    - Extract the pixel range from this data.
    - This will be used to build the sector grids, grid unit size of 26nm (breaking each pixel into 5 by 5 units)
    - Assign IDs to the sector grids.
- Read all the data in a loop.
    - Assign the sector ID for them too.
    - Save files in new folder.
- Append a group column in every single localization file defining which Sector ID the particular localization belongs to. This Sector ID can then be used in the later stages to perform analysis such as UMAP. 
- Save the files in a different directory with the same name as they had, but with the extra group column that will define the Sector ID. 

In [None]:
# Import Dependencies.

import os.path as _ospath
import os as _os
import h5py as _h5py
import yaml as _yaml
import numpy as _np
from picasso import lib as _lib
from picasso import io as _io
import math

In [None]:
# Define the folder location and the file extension inside the folder.

# Root folder of the cell being processed; the input files are read from
# its 'Masked' subfolder.
folder = ''  # Folder name for specific cell.
folder = _ospath.join(folder, 'Masked')
file_extn = '.hdf5'

# Keep only the directory entries whose name ends with the chosen extension.
file_names = list(filter(lambda name: name.endswith(file_extn), _os.listdir(folder)))

In [None]:
# Define the output folder.

# Results go to a 'Sectored' folder that sits next to the input folder.
parent_folder, working_folder = _ospath.split(folder)
output_folder = _ospath.join(parent_folder, 'Sectored')

# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate exists() test.
_os.makedirs(output_folder, exist_ok=True)

In [None]:
# Extract pixel information and define sectoring parameters from the Lamin file.

# Load the first file whose name contains 'Lamin'; its coordinate range
# defines the extent of the sector grid used for every other file.
for file in file_names:
    if 'Lamin' in file:
        fpath = _ospath.join(folder, file)
        locs, info = _io.load_locs(fpath)
        break
else:
    # Without this guard the cell would fail with a NameError below or,
    # in a notebook session, silently reuse `locs` from an earlier run.
    raise FileNotFoundError(
        f"No file with 'Lamin' in its name and extension {file_extn!r} "
        f"was found in {folder!r}."
    )

sector_division = 1 # Define the number of sections we want to cut each pixel along either dimension.

# Bounding box of the reference localizations, snapped outward to whole
# coordinates and padded by one extra unit on every side.
x_min = math.floor(locs['x'].min()) - 1
y_min = math.floor(locs['y'].min()) - 1
x_max = math.ceil(locs['x'].max()) + 1
y_max = math.ceil(locs['y'].max()) + 1
x_pixels = x_max - x_min
y_pixels = y_max - y_min

# Sector edge length as a fraction of one pixel.
# NOTE: the value is rounded to one decimal, so a division whose reciprocal
# is not a multiple of 0.1 (e.g. 3 -> 0.3, 4 -> 0.2) does not tile a pixel
# into exactly sector_division parts.
sector_size = round((1/sector_division), 1)

# Optional shift of the grid origin along each axis.
x_offset = 0.0
y_offset = 0.0

In [None]:
# Functions used in the script.

def create_grid(x_min, x_max, y_min, y_max, sector_size, x_offset, y_offset):
    """
    Build a regular grid of square sectors and give each sector a unique ID.

    Parameters:
        x_min: Lower bound of the grid along x.
        x_max: Upper bound of the grid along x.
        y_min: Lower bound of the grid along y.
        y_max: Upper bound of the grid along y.
        sector_size: Edge length of one sector, in the same units as the bounds.
        x_offset: Shift applied to every x edge.
        y_offset: Shift applied to every y edge.

    Returns:
        grid: Dictionary keyed by (x_index, y_index) whose values hold the
            sector "id" (counting from 1, y index varying fastest) and the
            sector's "x_center" / "y_center".
        x_edges: Sector edges along x, rounded to one decimal.
        y_edges: Sector edges along y, rounded to one decimal.
    """
    half_step = sector_size / 2

    def _axis(lower, upper, offset):
        # Extending the stop value by half a sector makes arange include
        # the final edge despite its exclusive upper bound.
        edges = _np.arange(lower + offset, upper + half_step + offset, sector_size)
        edges = _np.round(edges, 1)
        centers = _np.round((edges[:-1] + edges[1:]) / 2, 1)
        return edges, centers

    x_edges, x_centers = _axis(x_min, x_max, x_offset)
    y_edges, y_centers = _axis(y_min, y_max, y_offset)

    # IDs start at 1 (NOT 0) and increase along y first, then along x.
    sectors_per_column = len(y_centers)
    grid = {
        (col, row): {
            "id": col * sectors_per_column + row + 1,
            "x_center": cx,
            "y_center": cy,
        }
        for col, cx in enumerate(x_centers)
        for row, cy in enumerate(y_centers)
    }
    return grid, x_edges, y_edges

def save_locs_withSuffix(path, locs, info, suffix=''):
    """
    Write localizations and their metadata next to `path`, tagged with a suffix.

    The localizations go to '<base>_<suffix><ext>' (HDF5, dataset "locs")
    and the metadata to '<base>_<suffix>.yaml', where <base> and <ext> come
    from splitting `path` at its extension.

    Parameters:
        path: Target path whose base name and extension are reused.
        locs: Localizations to store.
        info: Metadata saved alongside the localizations.
        suffix: Text appended (after an underscore) to the base name.
    """
    # Run Picasso's sanity pass before anything is written.
    # NOTE(review): presumably this drops invalid localizations - confirm
    # against the Picasso documentation.
    checked_locs = _lib.ensure_sanity(locs, info)

    root, extension = _ospath.splitext(path)
    tagged_root = root + '_' + suffix

    with _h5py.File(tagged_root + extension, "w") as h5_out:
        h5_out.create_dataset("locs", data=checked_locs)
    _io.save_info(tagged_root + '.yaml', info, default_flow_style=False)

def assign_sector_ids(locs, x_edges, y_edges, grid):
    """
    Tag every localization with the ID and position of the sector it falls in.

    Parameters:
        locs: Input recarray with 'x' and 'y' coordinates.
        x_edges: Bin edges for the x-axis.
        y_edges: Bin edges for the y-axis.
        grid: Grid dictionary keyed by (x_index, y_index); each value holds
            "id", "x_center" and "y_center".

    Returns:
        A recarray with three extra int32 columns: 'sector_id',
        'x_pixel_pos' and 'y_pixel_pos'. The position columns hold the
        sector centre with its fractional part dropped. Localizations that
        fall outside the grid get -1 in all three columns; use 'sector_id'
        to detect them.
    """
    # digitize numbers bins from 1 (0 means "below the first edge"), so
    # subtracting 1 gives 0-based sector indices and -1 for points that lie
    # below the grid.
    x_indices = _np.digitize(locs['x'], x_edges) - 1
    y_indices = _np.digitize(locs['y'], y_edges) - 1

    # Fallback for index pairs that are not part of the grid. The centres
    # must be integers here: the columns are stored as int32 and NaN has no
    # int32 representation, so a NaN fallback breaks the conversion below.
    outside = {"id": -1, "x_center": -1, "y_center": -1}

    # Look up the sector record of every localization.
    cells = [grid.get((xi, yi), outside) for xi, yi in zip(x_indices, y_indices)]

    grid_ids = _np.array([cell["id"] for cell in cells], dtype=_np.int32)
    # Converting to int32 drops the fractional part of the sector centres.
    x_centers = _np.array([cell["x_center"] for cell in cells], dtype=_np.int32)
    y_centers = _np.array([cell["y_center"] for cell in cells], dtype=_np.int32)

    # Add the grid IDs and sector positions to the recarray
    locs_sector_id = _lib.append_to_rec(locs, grid_ids, 'sector_id')
    locs_sector_id = _lib.append_to_rec(locs_sector_id, x_centers, 'x_pixel_pos')
    locs_sector_id = _lib.append_to_rec(locs_sector_id, y_centers, 'y_pixel_pos')

    return locs_sector_id

In [None]:
# Create the grid to work on.

# Build the sector grid once; it is shared by every file processed below.
grid, x_edges, y_edges = create_grid(
    x_min=x_min,
    x_max=x_max,
    y_min=y_min,
    y_max=y_max,
    sector_size=sector_size,
    x_offset=x_offset,
    y_offset=y_offset,
)

In [None]:
# Assign sector IDs to all the localizations in all the files.

for file in file_names:
    # The protein name is whatever precedes the first dot in the file name.
    protein_name = file.partition('.')[0]
    fpath = _ospath.join(folder, file)
    output_path = _ospath.join(output_folder, file)

    # Load, tag with sector IDs, and save under the same name plus a suffix.
    locs, info = _io.load_locs(fpath)
    locs_sector_id = assign_sector_ids(locs, x_edges, y_edges, grid)
    save_locs_withSuffix(output_path, locs_sector_id, info, suffix='sector_id')
    print('Done with assigning sector IDs for', protein_name)

In [None]:
# Save the sectoring information

# columns_to_drop = ['x_center', 'y_center']

# grid_info = grid.copy()

# for column in columns_to_drop:
#     grid_info.pop(column, None)

# Parameters needed to rebuild or interpret the sector grid later on.
sector_info = {
    'x_min': x_min,
    'x_max': x_max,
    'y_min': y_min,
    'y_max': y_max,
    # Count the sectors that were actually created. Deriving the number
    # from sector_division disagrees with the grid whenever the rounded
    # sector_size does not split a pixel into exactly sector_division parts.
    'number_of_sectors': len(grid),
    'sector_size': sector_size,
    'x_offset': x_offset,
    'y_offset': y_offset,
    # 'grid_info' : grid_info # If needed, we will export later. 
}

# Save the sector information
with open(_ospath.join(output_folder, 'sector_info.yaml'), 'w') as f:
    _yaml.dump(sector_info, f)