Skip to content

Commit

Permalink
Disable inflection point search for small datasets in flexiplex-filter
Browse files Browse the repository at this point in the history
There are often times when it is desirable to have pipelines which work automatically on smaller datasets, especially
'example' or demo ones.

In these cases, inflection point searching is often not desirable due to the size of the datasets. In the cases where
it _is_ useful, it is strongly recommended that users first review the data manually and configure flexiplex-filter using
the -u and -l parameters. Hence, this commit disables the flexiplex-filter inflection point discovery automatically for
cases where there are under 50 unique counts, unless the -u and -l parameters are manually specified.
  • Loading branch information
olliecheng committed Mar 14, 2024
1 parent 729f78e commit 921bb6a
Showing 1 changed file with 26 additions and 3 deletions.
29 changes: 26 additions & 3 deletions scripts/flexiplex_filter/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import logging as log
import sys

from typing import Optional


def find_bounds(df, min_=None, max_=None):
"""
Expand Down Expand Up @@ -35,6 +37,11 @@ def find_bounds(df, min_=None, max_=None):
# select the index of the row with low_cnt as the count
max_ = int(df[df[col] == max_cnt].iloc[[0]].index[0])

# if max is 0, search the whole space
elif max_ == 0:
max_ = len(df) - 1
log.info(f"Setting lower bound to value {max_}, the last read")

if min_ is None:
log.debug("Setting upper bound to the count of read rank #50")
min_ = 50
Expand Down Expand Up @@ -274,7 +281,7 @@ def n_derivative(df, x, y, window_size=9):
)


def find_inflection(df, bounds, output_count=False):
def find_inflection(df, bounds, output_count: Optional[int]=False):
"""
Searches for the inflection point. This effectively minimises the
numerical derivative as the curve is always monotonically decreasing.
Expand Down Expand Up @@ -393,7 +400,7 @@ def parse_args():
metavar="<r>",
type=int,
required=False,
help="highest rank to search",
help="highest rank to search. set to 0 to search to the end",
)

group_inf_d = parser.add_argument_group(
Expand Down Expand Up @@ -454,7 +461,7 @@ def cli():
level=log.DEBUG if args.verbose else log.INFO,
)

log.info("FLEXIPLEX-FILTER 1.01")
log.info("FLEXIPLEX-FILTER 1.02")
if args.filename == sys.stdin:
log.info("No filename given... getting reads from stdin...")

Expand Down Expand Up @@ -482,8 +489,24 @@ def cli():

add_rank_index(df)

REQUIRED_UNIQUE_COUNTS = 50
unique_counts = len(df["count"].unique())

if args.no_inflection:
log.debug("--no-inflection was given, skipping inflection discovery")
df_filt = df
elif (( not args.max_rank ) and ( not args.min_rank ) and ( unique_counts < REQUIRED_UNIQUE_COUNTS ) and ( not args.graph )):
# when there are less than REQUIRED_UNIQUE_COUNTS unique counts, we will skip inflection point finding
# this is so that flexiplex-filter can still work for smaller 'example' datasets without requiring
# manual configuration.
#
# users who want to run flexiplex-filter on small datasets are highly recommended to specify the --graph argument,
# visualise the dataset, and then manually determine the maximum and minimum ranks for the search.

log.info(f"Number of unique counts is too small ({unique_counts}, threshold {REQUIRED_UNIQUE_COUNTS}), skipping inflection discovery by default.")
log.info("To force inflection discovery, manually inspect the data and then specify the -u and -l parameters.")
log.info("To show the graph, pass the parameters: --graph -l 1 -u 0")

df_filt = df
else:
rank = args.use_predetermined_rank
Expand Down

0 comments on commit 921bb6a

Please sign in to comment.