# File Cleaner

#### Note: Use the picasso kernel only.

What is this used for?
- Some imaging channels show imagers sticking at one site for hundreds of frames. This code removes binding events that last longer than the allowed (max_bright_time). 

How does this work? 
- Link localizations of all the files.
- Find localizations with link groups that are longer than the (max_bright_time) allowed.
    - One can also append a column to the localizations list with the length of the link group. This can be used in the next step to remove them. 
- Remove the localizations that fall in that link group.

Workflow

1. Define folder path and the files in the folder. 
2. Define link localization parameters.
    - r_max
    - max_dark_time
3. Loop through each file and link localizations and filter. 
    - Link localizations (return link group for each index)
    - Count the number of each link group and make another list with the length information. 
    - Append both lists to the localizations recarray. 
    - Filter the localizations recarray with the thresholds.
4. Save the cleaned files in a separate folder. 
    - Make a new directory beside the previous directory.
    - Save all files with a suffix.
    - Save a text file with the amount of data lost. This will give us a record of how much we are losing. 

In [None]:
# Import Dependencies

import numpy as _np
import os.path as _ospath
import os as _os
import h5py as _h5py
from picasso import lib as _lib
from picasso import io as _io
from picasso import postprocess as _postprocess


In [None]:
# Define the folder location and the file extension inside the folder

folder = '' # Folder Location. This can also be the whole field of view, to reduce analysis time. 
file_extn = '.hdf5'
# os.listdir returns entries in arbitrary order; sort them so the files are
# processed (and reported) in the same order on every run.
file_names = sorted(f for f in _os.listdir(folder) if f.endswith(file_extn))

In [None]:
# Define the output folder

# Splitting and re-joining the input path yields a 'Cleaned' sub-folder
# located inside `folder`.
parent_folder, working_folder = _ospath.split(folder)
output_folder = _ospath.join(parent_folder, working_folder, 'Cleaned')
# exist_ok=True makes re-running the cell harmless and avoids the
# check-then-create race of a separate exists() test.
_os.makedirs(output_folder, exist_ok=True)

In [None]:
# Functions needed for linking

def locs_per_link_group(link_group):
    """
    Calculates the number of frames/locs per link group in a link_group
    numpy array.

    Parameters
    ----------
    link_group : numpy 1d array
        Link group number for each localization (locs). May be empty.

    Returns
    -------
    link_group_n : numpy 1d array
        For each localization, the number of localizations that share its
        link group (dtype int32, same shape as ``link_group``).
    """
    link_group = _np.asarray(link_group)

    # `inverse` maps every element to the position of its value among the
    # unique values, so counts[inverse] broadcasts each group's size back to
    # its members in one vectorized step. Unlike np.vectorize(dict.get), this
    # also works for an empty input.
    _, inverse, counts = _np.unique(
        link_group, return_inverse=True, return_counts=True
    )
    link_group_n = counts[inverse].reshape(link_group.shape).astype(_np.int32)
    return link_group_n

def save_locs_withSuffix(path, locs, info, suffix=''):
    """
    Save localizations and their metadata next to ``path`` with a suffix.

    The output names are derived from ``path`` by inserting ``'_' + suffix``
    before the extension: the localizations keep the extension of ``path``
    and the metadata file gets a ``.yaml`` extension.

    Parameters
    ----------
    path : str
        Base output path, including the localization file extension.
    locs : numpy recarray
        Localizations to save; passed through ``picasso.lib.ensure_sanity``
        before writing.
    info : list
        Metadata handed to ``picasso.io.save_info``.
    suffix : str, optional
        Text inserted (after an underscore) before the extension.
    """
    checked_locs = _lib.ensure_sanity(locs, info)
    stem, locs_ext = _ospath.splitext(path)
    tagged_stem = stem + '_' + suffix
    # The localizations are stored in a dataset named "locs".
    with _h5py.File(tagged_stem + locs_ext, "w") as h5_out:
        h5_out.create_dataset("locs", data=checked_locs)
    _io.save_info(tagged_stem + '.yaml', info, default_flow_style=False)

def link_custom(
    locs,
    r_max,
    max_dark_time,
):
    """
    Link localizations and return the size of each one's link group.

    Parameters
    ----------
    locs : numpy recarray
        Localizations with a ``frame`` field. NOTE: a non-empty ``locs`` is
        sorted IN PLACE by frame, so the returned array is aligned with
        ``locs`` as it is ordered after this call.
    r_max : float
        Radius/distance between two consecutive events to link.
    max_dark_time : int
        Maximum number of dark frames between two events to link.

    Returns
    -------
    link_group_n : numpy 1d array
        For each localization, the number of localizations in its link group
        (dtype int32). Empty when ``locs`` is empty.
    """
    # Nothing to link: return an empty result instead of falling through to
    # code that needs a link_group (previously an UnboundLocalError).
    if len(locs) == 0:
        return _np.zeros(0, dtype=_np.int32)

    # Stable in-place sort by frame; callers index `locs` with a mask built
    # from the result, so the order of `locs` and the result must match.
    locs.sort(kind="mergesort", order="frame")

    # Without a "group" field, treat all localizations as one group.
    if hasattr(locs, "group"):
        group = locs.group
    else:
        group = _np.zeros(len(locs), dtype=_np.int32)

    link_group = _postprocess.get_link_groups(locs, r_max, max_dark_time, group)
    link_group_n = locs_per_link_group(link_group)
    return link_group_n

def locs_cleaner(locs, r_max, max_dark_time, max_bright_time):
    """
    Drop localizations that belong to overly long link groups.

    Parameters
    ----------
    locs : numpy recarray
        Localizations to clean. ``link_custom`` sorts this array in place by
        frame, and the filtering below is applied to that sorted array.
    r_max : float
        Radius/distance between two consecutive events to link.
    max_dark_time : int
        Maximum number of dark frames between two events to link.
    max_bright_time : int
        Largest link-group size that is kept.

    Returns
    -------
    locs_cleaned : numpy recarray
        The localizations whose link group has at most ``max_bright_time``
        members.

    Raises
    ------
    ValueError
        If the number of link-group sizes differs from the number of
        localizations.
    """
    events_per_group = link_custom(locs, r_max, max_dark_time)

    # One group size per localization is required for the mask to line up.
    if len(events_per_group) != len(locs):
        raise ValueError("Number of events are not matching.")

    keep = _np.array(events_per_group) <= max_bright_time
    return locs[keep]


In [None]:
# Define the linking parameters

r_max = 1 # Maximum radius/distance between two consecutive events for them to be linked (presumably in camera pixels -- TODO confirm).
max_dark_time = 10 # Maximum number of dark frames allowed between two events for them to still be linked.

In [None]:
# Clean up parameters

max_bright_time = 15 # Discard events whose link group contains more than this number of localizations (frames).

In [None]:
# Data cleanup

# For every input file: load the localizations, remove the over-long binding
# events, report how much data was removed and save the cleaned result to the
# output folder as '<name>_cleaned<ext>' plus '<name>_cleaned.yaml'.
for file in file_names:
    fpath = _ospath.join(folder, file)
    locs, info = _io.load_locs(fpath)
    # Text before the first '.' in the file name is used as the channel label.
    protein_name = file.split('.')[0]
    print('Locs loaded for {} channel.'.format(protein_name))
    locs_cleaned = locs_cleaner(locs, r_max, max_dark_time, max_bright_time)
    # A file without localizations loses nothing; avoid dividing by zero.
    if len(locs) > 0:
        percent_data_removed = 100 - (len(locs_cleaned)/len(locs))*100
    else:
        percent_data_removed = 0.0
    print('Percent data removed for {} channel is {}%'.format(protein_name, percent_data_removed))
    output_path = _ospath.join(output_folder, file)
    save_locs_withSuffix(output_path, locs_cleaned, info, suffix='cleaned')