# Quick lipofuscin, autofluorescent spot and fiducial removal

In [11]:
#general analysis packages
import glob
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

def find_probable_noise(df1, df2, radius=0.5, max_neighbors=3):
    """
    Flags dots in df1 that have too many dots of df2 close by.

    A radius based neighbor search is run for every dot in df1 against the dots in df2.
    A dot is flagged as probable noise when more than max_neighbors dots of df2 lie
    within a euclidean pixel distance <= radius of it. When df1 and df2 are the same
    set of dots, every dot is also returned as its own neighbor (distance 0), so the
    count includes the dot itself.

    Parameters
    ----------
    df1 = first set of dots (the dots being tested), needs "x" and "y" columns
    df2 = second set of dots (the dots being searched), needs "x" and "y" columns
    radius = search radius
    max_neighbors = a dot is flagged when it has more than this many neighbors

    Returns
    -------
    1D integer array with the 0-based row positions (in df1) of the flagged dots,
    or None when no dot is flagged or when either set of dots is empty.
    """

    #no dots means nothing can colocalize; sklearn also refuses to fit/query empty input
    if len(df1) == 0 or len(df2) == 0:
        return None

    #using sklearn nearest neighbor algorithm to find nearby dots
    neigh = NearestNeighbors(radius=radius, metric="euclidean", n_jobs=1)
    neigh.fit(df2[["x","y"]])

    #only the number of neighbors matters, so distances are neither computed nor sorted
    neighbors = neigh.radius_neighbors(df1[["x","y"]], radius, return_distance=False)

    #count the neighbors of each dot and keep the positions that exceed the threshold
    counts = np.array([len(hits) for hits in neighbors], dtype=int)
    flagged = np.flatnonzero(counts > max_neighbors)
    if flagged.size == 0:
        return None
    return flagged

def remove_noise_across_channel_encoding(path, radius = 0.5, total_hybs = 20, total_rounds = 5):
    """
    Removes probable noise dots from every csv file matched by path.

    Hybs 0..total_hybs-1 are split into total_rounds consecutive, equally sized
    barcoding rounds. For each round, the dots of that round are searched against
    themselves with find_probable_noise and the flagged dots are dropped. The
    remaining dots of all rounds are concatenated (round by round) and written next
    to the input file as "noise_removed_<input file name>".

    Rows whose hyb value does not belong to any round are not written to the output.

    Parameters
    ----------
    path = glob pattern of the csv files to clean, each needs "hyb", "x" and "y" columns
    radius = search radius passed on to find_probable_noise
    total_hybs = total number of hybs
    total_rounds = number of barcoding rounds, must divide total_hybs evenly

    Raises
    ------
    ValueError if total_hybs is not a multiple of total_rounds.
    """
    #an uneven split would leave the trailing hybs out of every round,
    #and their dots would silently disappear from the output
    if total_hybs % total_rounds != 0:
        raise ValueError(
            f"total_hybs ({total_hybs}) must be a multiple of total_rounds ({total_rounds})"
        )

    #consecutive blocks of hybs, one block per barcoding round
    all_hybs = np.arange(0,total_hybs,1)
    hybs_per_round = int(total_hybs // total_rounds)
    rounds = [all_hybs[k*hybs_per_round:(k+1)*hybs_per_round] for k in range(total_rounds)]

    for file in tqdm(glob.glob(path)):
        file = Path(file)
        output_path = str(file.parent / f"noise_removed_{file.name}")
        df = pd.read_csv(file)
        cleaned = []
        for barcode_round in rounds:
            #index is reset so the positions returned by find_probable_noise match the row labels
            df_hyb = df[df.hyb.isin(barcode_round)].reset_index(drop=True)
            #dots flagged within the same barcoding round are probably just noise;
            #a round without dots has nothing to search
            remove = None if df_hyb.empty else find_probable_noise(df_hyb, df_hyb, radius=radius)
            if remove is not None:
                df_hyb = df_hyb.drop(remove)
            cleaned.append(df_hyb)
        pd.concat(cleaned).reset_index(drop=True).to_csv(output_path)

In [12]:
#glob pattern: every file named locations_z_* inside each subdirectory of .../dots_detected/Channel_All
path = "/groups/CaiLab/personal/Lex/raw/230608_4k_inv_5bs/pyfish_tools/output/dots_detected/Channel_All/*/locations_z_*"
#20 hybs split into 5 barcoding rounds (4 hybs each); a noise_removed_<name> copy is written next to each matched file
remove_noise_across_channel_encoding(path, radius = 0.5, total_hybs = 20, total_rounds = 5)

100%|██████████| 523/523 [1:00:14<00:00,  6.91s/it]
