## Benchmarks for the FlowSOM algorithm: searching for improvements

In [1]:
import timeit
from collections.abc import Callable
from memory_profiler import memory_usage

# imports
import numpy as np
from flowio import FlowData
from sklearn.metrics import v_measure_score

import flowsom as fs

### code from docker to filter out NaN values

In [2]:


def read_labelled_fcs(path):
    """
    Load a labelled FCS file and split it into a feature matrix and integer labels
    The last channel is treated as the label column; events whose label is NaN are dropped
    @param path: path to the fcs file, handed straight to FlowData
    @return: tuple (X, y) where X holds every channel except the last one for the
             labelled events and y holds their labels cast to np.int32
    """
    fcs = FlowData(path)
    # one row per event, one column per channel
    events = np.reshape(fcs.events, (-1, fcs.channel_count))

    # TODO: support more files
    # the label lives in the last column; NaN there marks an unlabelled event
    keep = ~np.isnan(events[:, -1])
    X, y = events[keep, :-1], events[keep, -1]

    # when no label equals 0, shift everything down by one so that labels
    # follow the 0-based convention sklearn clustering algorithms usually output
    if not np.any(y == 0):
        y = y - 1

    return X, y.astype(np.int32)


def score_fcs_file(path, flowsom_func, dimensions, cols_to_use, seed) -> float:
    """
    Cluster a labelled FCS file with the given FlowSOM implementation and score the result
    @param path: path to the labelled fcs file
    @param flowsom_func: callable invoked as
        flowsom_func(X, n_clusters=..., xdim=10, ydim=10, cols_to_use=..., seed=...);
        the object it returns must expose a metacluster_labels attribute
    @param dimensions: lower bound on the number of metaclusters that is requested
    @param cols_to_use: columns to use; None is forwarded unchanged to flowsom_func
    @param seed: random seed forwarded to flowsom_func
    @return: v-measure score of the predicted metacluster labels against the true labels
    """
    # read in fcs file
    X, y = read_labelled_fcs(path)

    # finding the best number of clusters is not part of this test
    # here we use labelled data to find the number of unique labels
    n_labels = np.unique(y).shape[0]

    # len(None) would raise a TypeError, so a missing column selection is counted
    # as every column of X
    # NOTE(review): assumes the implementation treats cols_to_use=None as "all columns" -- confirm
    n_cols = X.shape[1] if cols_to_use is None else len(cols_to_use)
    n_clusters = max(n_labels, dimensions, n_cols)

    # cluster data and predict labels
    fsom = flowsom_func(
        X,
        n_clusters=n_clusters,
        xdim=10,
        ydim=10,
        cols_to_use=cols_to_use,
        seed=seed,
    )
    y_pred = fsom.metacluster_labels

    # because the v_measure_score is independent of the absolute values of the labels
    # we don't need to make sure the predicted label values have the same value as the true labels
    # the v_measure_score will be the same regardless, as it only depends on homogeneity and completeness
    # alternatively, a lookup table from the cluster centers can be used to have a consistent label value mapping
    # https://stackoverflow.com/questions/44888415/how-to-set-k-means-clustering-labels-from-highest-to-lowest-with-python
    v_measure = v_measure_score(y, y_pred)
    print(f"V-measure score: {v_measure}")
    return v_measure

In [3]:
# load the Levine 13-dimensional benchmark file and show its channel metadata

levine_path = "../data/accuracy_benches/Levine_13dim.fcs"
ff = fs.io.read_FCS(levine_path)
ff.uns["meta"]["channels"]

Unnamed: 0_level_0,$PnN,$PnS,$PnB,$PnE,$PnR
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,CD45,,32,0,1024
2,CD45RA,,32,0,1024
3,CD19,,32,0,1024
4,CD11b,,32,0,1024
5,CD4,,32,0,1024
6,CD8,,32,0,1024
7,CD34,,32,0,1024
8,CD20,,32,0,1024
9,CD33,,32,0,1024
10,CD123,,32,0,1024


In [4]:
# show the `var` table of the loaded file (see the output below: one row per channel)
ff.var

Unnamed: 0,n,channel,marker,$PnB,$PnE,$PnR
CD45,1,CD45,,32,0,1024
CD45RA,2,CD45RA,,32,0,1024
CD19,3,CD19,,32,0,1024
CD11b,4,CD11b,,32,0,1024
CD4,5,CD4,,32,0,1024
CD8,6,CD8,,32,0,1024
CD34,7,CD34,,32,0,1024
CD20,8,CD20,,32,0,1024
CD33,9,CD33,,32,0,1024
CD123,10,CD123,,32,0,1024


### Build the list of marker column indices to use

In [5]:
# column indices 0..12, i.e. the first 13 columns, passed on as cols_to_use
cols = [*range(13)]
cols

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

### testing flowsom

In [6]:
# score the packaged fs.FlowSOM on the Levine 13-dimensional file
score_fcs_file(
    "../data/accuracy_benches/Levine_13dim.fcs",
    flowsom_func=fs.FlowSOM,
    dimensions=10,
    cols_to_use=cols,
    seed=42,
)

[32m2024-05-26 13:15:05.119[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m84[0m - [34m[1mReading input.[0m
[32m2024-05-26 13:15:05.138[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m86[0m - [34m[1mFitting model: clustering and metaclustering.[0m
[32m2024-05-26 13:15:08.666[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m88[0m - [34m[1mUpdating derived values.[0m


V-measure score: 0.8838722722422866


0.8838722722422866

### Make benchmarking function for time as well as peak memory

In [7]:
def bench_file(path:str, flowsom_implementation, dimensions:int, cols_to_use:np.ndarray=None, seed:int=None):
    """
    Benchmark a file with the given implementation of flowsom, this includes
    v-measure score, execution time and peak memory usage
    The implementation is run twice: once under timeit for the timing and the labels,
    and once more under memory_usage for the peak memory figure
    @param path: path to the fcs file
    @param flowsom_implementation: implementation of flowsom to use, invoked as
        flowsom_implementation(X, n_clusters=..., xdim=10, ydim=10, cols_to_use=..., seed=...);
        the object it returns must expose a metacluster_labels attribute
    @param dimensions: lower bound on the number of metaclusters that is requested
    @param cols_to_use: columns to use; None (the default) is forwarded unchanged
    @param seed: random seed to use
    @return: tuple (v_measure, exec_time, peak_memory) with the time in seconds and
             the memory in MiB
    """
    # read in fcs file
    X, y = read_labelled_fcs(path)

    # finding the best number of clusters is not part of this test
    # here we use labelled data to find the number of unique labels
    n_labels = np.unique(y).shape[0]

    # the default cols_to_use=None used to crash on len(None); a missing column
    # selection is now counted as every column of X
    # NOTE(review): assumes the implementation treats cols_to_use=None as "all columns" -- confirm
    n_cols = X.shape[1] if cols_to_use is None else len(cols_to_use)
    n_clusters = max(n_labels, dimensions, n_cols)

    def run_flowsom():
        # single definition of the benchmarked call, shared by the timing and the
        # memory run so both always measure exactly the same configuration
        return flowsom_implementation(
            X,
            n_clusters=n_clusters,
            xdim=10,
            ydim=10,
            cols_to_use=cols_to_use,
            seed=seed,
        )

    # cluster data and predict labels
    # timeit discards the return value of the timed callable, so the fitted model
    # is handed out through a list
    fsom = []
    exec_time = timeit.timeit(lambda: fsom.append(run_flowsom()), number=1)
    y_pred = fsom[0].metacluster_labels

    # Measure peak memory usage on a second, untimed run
    peak_memory = max(memory_usage(proc=run_flowsom, interval=0.1))

    # because the v_measure_score is independent of the absolute values of the labels
    # we don't need to make sure the predicted label values have the same value as the true labels
    # the v_measure_score will be the same regardless, as it only depends on homogeneity and completeness
    # alternatively, a lookup table from the cluster centers can be used to have a consistent label value mapping
    # https://stackoverflow.com/questions/44888415/how-to-set-k-means-clustering-labels-from-highest-to-lowest-with-python
    v_measure = v_measure_score(y, y_pred)
    print(f"V-measure score: {v_measure}")
    print(f'Execution time: {exec_time}s')
    print(f"Peak memory usage: {peak_memory:.2f} MiB")
    return (v_measure,exec_time,peak_memory)


In [8]:
# benchmark the packaged fs.FlowSOM (score, time, peak memory) on the Levine 13-dimensional file
bench_file(
    "../data/accuracy_benches/Levine_13dim.fcs",
    flowsom_implementation=fs.FlowSOM,
    dimensions=10,
    cols_to_use=cols,
    seed=42,
)

[32m2024-05-26 13:15:09.021[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m84[0m - [34m[1mReading input.[0m
[32m2024-05-26 13:15:09.043[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m86[0m - [34m[1mFitting model: clustering and metaclustering.[0m
[32m2024-05-26 13:15:11.254[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m88[0m - [34m[1mUpdating derived values.[0m
[32m2024-05-26 13:15:11.588[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m84[0m - [34m[1mReading input.[0m
[32m2024-05-26 13:15:11.611[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m86[0m - [34m[1mFitting model: clustering and metaclustering.[0m
[32m2024-05-26 13:15:13.854[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m88[0m - [34m[1mUpdating derived values.[0m


V-measure score: 0.8838722722422866
Execution time: 2.549564637000003s
Peak memory usage: 567.23 MiB


(0.8838722722422866, 2.549564637000003, 567.2265625)

### now test different improvements