# File Cleaner

#### Note: Use the picasso kernel only

What is this used for?
- Some imaging channels show imagers sticking at one site for hundreds of frames. This code removes binding events that last longer than the allowed (max_bright_time).

How does this work? 
- Link localizations of all the files.
- Find localizations with link groups that are longer than the (max_bright_time) allowed.
    - One can also append a column to the localizations list with the length of the link group. This column can be used in the next step to filter out the long events.
- Remove the localizations that fall in that link group.

Workflow

1. Define the cell folder. 
2. Define link localization parameters.
    - r_max
    - max_dark_time
3. Loop through each file and link localizations and filter. 
    - Link localizations (return link group for each index)
    - Count the number of each link group and make another list with the length information. 
    - Append both lists to the localizations recarray. 
    - Filter the localizations recarray with the thresholds.
4. Save the cleaned files in a separate folder. 
    - Make a new `Cleaned` directory inside the input folder.
    - Save all files with a suffix.
    - Save a text file with the amount of data lost. This will give us a record of how much we are losing. 

In [None]:
# Import Dependencies

import numpy as _np
import os.path as _ospath
import os as _os
import pandas as _pd
from picasso import lib as _lib
from picasso import io as _io
from picasso import postprocess as _postprocess
import epi_paint_picasso_utilis as eppu

In [None]:
# Define the folder location and the file extension inside the folder
#
# `folder` is a placeholder and must be set before running this cell:
# os.listdir fails on the empty default path.

folder = '' # <<< Set your folder path here
file_extn = '.hdf5'
# Bare file names (not full paths) of every entry directly inside `folder`
# whose name ends with `file_extn`; sub-directories are not searched.
file_names = [f for f in _os.listdir(folder) if f.endswith(file_extn)]

In [None]:
# Define the linking parameters
# Both values are forwarded unchanged to picasso's get_link_groups.

# Maximum distance for two localizations to be linked
# (presumably in camera pixels -- TODO confirm against picasso).
r_max = 1
# Maximum gap allowed inside one link group
# (presumably in frames -- TODO confirm against picasso).
max_dark_time = 10

# Clean up parameters

# Localizations whose link group holds more than this many localizations
# are removed (the comparison is `<= max_bright_time`).
max_bright_time = 15

In [None]:
# Define the output folder
#
# The cleaned files go into a `Cleaned` sub-folder located inside the input
# folder: splitting `folder` and joining both halves back reproduces the
# input path before 'Cleaned' is appended.

parent_folder, working_folder = _ospath.split(folder)
output_folder = _ospath.join(parent_folder, working_folder, 'Cleaned')
# exist_ok avoids the check-then-create race and makes re-running the
# notebook a no-op; it still raises if the path exists but is not a directory.
_os.makedirs(output_folder, exist_ok=True)

In [None]:
# ---------------------------------------------------------
# Linking functions
# ---------------------------------------------------------

def locs_per_link_group(link_group):
    """Return, for each entry, how many entries share its link-group id.

    Parameters
    ----------
    link_group : array_like
        Link-group id of every localization.

    Returns
    -------
    numpy.ndarray
        int32 array with the same shape as ``link_group``; element ``i`` is
        the number of times ``link_group[i]`` occurs in ``link_group``.
        Empty input yields an empty int32 array.
    """
    link_group = _np.asarray(link_group)
    # `inverse` maps every entry to the index of its id among the unique ids,
    # so indexing `counts` with it broadcasts each group's size back onto its
    # members without a Python-level lookup per element.
    _, inverse, counts = _np.unique(
        link_group, return_inverse=True, return_counts=True
    )
    # Reshape explicitly: the shape of `inverse` differs across numpy versions.
    return counts[inverse.reshape(link_group.shape)].astype(_np.int32)

# ---------------------------------------------------------
# Main linking function (fully fixed)
# ---------------------------------------------------------

def link_custom(locs, r_max, max_dark_time):
    """Return the link-group size of every localization in ``locs``.

    Parameters
    ----------
    locs : structured array or DataFrame
        Localizations; converted with ``eppu.ensure_numpy_structured``.
        Non-empty input must provide the fields ``frame``, ``x`` and ``y``;
        an optional ``group`` field is passed on to the linker.
    r_max, max_dark_time
        Forwarded unchanged to ``picasso.postprocess.get_link_groups``.

    Returns
    -------
    numpy.ndarray
        int32 array of length ``len(locs)``. Element ``i`` is the number of
        localizations in the link group of ``locs[i]``, i.e. the result is
        aligned with the order of the input. ``locs`` itself is not modified.
    """
    # Always convert DataFrame -> structured array
    locs = eppu.ensure_numpy_structured(locs)

    n_locs = len(locs)
    if n_locs == 0:
        # no localizations, hence no link groups
        return _np.zeros(0, dtype=_np.int32)

    # Linking is done on a frame-sorted copy. Using argsort instead of an
    # in-place sort leaves the caller's array untouched and lets the counts be
    # mapped back to the input order below.
    frame_order = _np.argsort(locs, order="frame", kind="mergesort")
    sorted_locs = locs[frame_order]

    # group field handling
    if "group" in sorted_locs.dtype.names:
        group = sorted_locs["group"]
    else:
        group = _np.zeros(n_locs, dtype=_np.int32)

    # compute link groups
    link_group = _postprocess.get_link_groups(
        sorted_locs["frame"],
        sorted_locs["x"],
        sorted_locs["y"],
        r_max,
        max_dark_time,
        group,
    )

    # Count how many localizations each link group holds (frame-sorted order)
    counts_sorted = locs_per_link_group(link_group)

    # Undo the sort so that counts[i] belongs to locs[i]
    counts = _np.empty_like(counts_sorted)
    counts[frame_order] = counts_sorted
    return counts


# ---------------------------------------------------------
# Cleaning wrapper
# ---------------------------------------------------------

def locs_cleaner(locs, r_max, max_dark_time, max_bright_time):
    """Drop localizations that belong to overly long link groups.

    Parameters
    ----------
    locs : structured array or DataFrame
        Localizations; converted with ``eppu.ensure_numpy_structured``.
    r_max, max_dark_time
        Linking parameters, forwarded to ``link_custom``.
    max_bright_time : int
        Largest link-group size that is kept.

    Returns
    -------
    numpy structured array
        The localizations, sorted by frame, whose link group holds at most
        ``max_bright_time`` localizations. The input is not modified.

    Raises
    ------
    ValueError
        If ``link_custom`` does not return one value per localization.
    """
    # Ensure consistent structured array. The helper lives in the eppu
    # module; the bare name is not defined in this notebook.
    locs = eppu.ensure_numpy_structured(locs)

    if len(locs) > 0:
        # Work on a frame-sorted copy, using the same ordering link_custom
        # applies before linking. This keeps the mask below aligned with
        # `locs` whether link_custom reports sizes in frame-sorted or in
        # input order, and leaves the caller's data untouched.
        locs = _np.sort(locs, order="frame", kind="mergesort")

    link_group_n = link_custom(locs, r_max, max_dark_time)

    if len(locs) != len(link_group_n):
        raise ValueError("Number of events are not matching.")

    # Filter events by max on-time frames
    keep = _np.asarray(link_group_n) <= max_bright_time

    return locs[keep]

In [None]:
# Data cleanup
#
# For every localization file: load it, remove the localizations that belong
# to link groups longer than `max_bright_time`, report the fraction removed
# and save the result into `output_folder`.

for file in file_names:
    fpath = _ospath.join(folder, file)
    locs, info = _io.load_locs(fpath)
    # Channel name = file name up to the first dot
    protein_name = file.split('.')[0]
    print('Locs loaded for {} channel.'.format(protein_name))
    locs_cleaned = locs_cleaner(locs, r_max, max_dark_time, max_bright_time)
    n_locs = len(locs)
    # A file without localizations loses nothing; guarding here avoids a
    # ZeroDivisionError that would abort the whole batch.
    if n_locs:
        percent_data_removed = 100 - (len(locs_cleaned)/n_locs)*100
    else:
        percent_data_removed = 0.0
    print('Percent data removed for {} channel is {}%'.format(protein_name, percent_data_removed))
    output_path = _ospath.join(output_folder, file)
    eppu.save_locs_withSuffix(output_path, locs_cleaned, info, suffix='cleaned')