-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmain.py
535 lines (426 loc) · 15.2 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
import pandas as pd
import numpy as np
import argparse
import logging as log
import sys
from typing import Optional
def find_bounds(df, min_=None, max_=None):
"""
Find appropriate bounds to search for the inflection point within, with a
manual override as well.
Defaults to using [50, c_{0.95}] where c_{x} refers to the xth quantile.
Parameters:
df (dataframe): dataframe to search for; must contain the `rank` index
and `count` column
min_ (int): manual override for the lowest rank to be contained
max_ (int): manual override for the highest rank to be contained
Returns:
(int, int): the lower and upper rank bounds
"""
# column to be used when determining quantiles
col = "count"
quantile = 0.95
if max_ is None:
log.debug("Setting lower bound to the 0.95 quantile")
# interpolation = "lower" will guarantee that the result is valid
max_cnt = df[col].quantile(quantile, interpolation="lower")
# select the index of the row with low_cnt as the count
max_ = int(df[df[col] == max_cnt].iloc[[0]].index[0])
# if max is 0, search the whole space
elif max_ == 0:
max_ = len(df) - 1
log.info(f"Setting lower bound to value {max_}, the last read")
if min_ is None:
log.debug("Setting upper bound to the count of read rank #50")
min_ = 50
min_row = df[df.index == min_].iloc[[0]]
max_row = df[df.index == max_].iloc[[0]]
log.debug(
"Bounds interval: ranks [%d, %d] -> counts [%d, %d]",
min_,
max_,
# the counts should be flipped in magnitude i.e.
# the min row -> max count
max_row[col],
min_row[col],
)
if not min_ < max_:
raise ValueError(
"The lower bound must be greater than the upper bound. By default, u = 50"
)
return (min_, max_)
def filter_whitelist(df, whitelist_file):
"""
Filters out barcodes which are not contained within the whitelist.
Whitelist must be plaintext and cannot be gzipped.
Parameters:
df (DataFrame):
the dataframe with contains the barcodes in
a column named `barcode`
whitelist_file (file)
an open file context of whitelist barcodes
Returns:
DataFrame with all invalid barcodes filtered out
"""
log.debug("Reading whitelist...")
whitelist = set(x.rstrip() for x in whitelist_file.readlines())
log.debug("Read whitelist")
rows_orig = len(df)
filtered = df[df["barcode"].isin(whitelist)]
rows_filt = len(filtered)
diff = rows_orig - rows_filt
percent = diff / rows_orig * 100
log.info(
"Filtered with whitelist, removed %d out of %d barcodes (%d%% of all barcodes)",
diff,
rows_orig,
percent,
)
return filtered
def show_graph(df, bounds, rank):
"""
Draws a graph of the results.
Parameters:
df (DataFrame): the data to read from
bounds (int, int):
the lower and upper rank bounds, to show on the graph as
a shaded pink region
rank (int):
the rank of the discovered result, to show on the graph
as a dotted line
Returns:
None. Displays a graph visually.
"""
import matplotlib.pyplot as plt
count = df.at[rank, "count"]
df.drop_duplicates(subset="count", keep="first", inplace=True)
# draw vertical box region for the bounds
plt.axvspan(
xmin=bounds[0],
xmax=bounds[1],
color="mistyrose",
zorder=1,
)
# draw lines around the discovered inflection point
plt.axvline(x=rank, color="r", linestyle="dashed", zorder=2)
plt.axhline(y=count, color="r", linestyle="dashed", zorder=3)
# label rank and count of inflection point
plt.text(rank * 1.2, 0.7, "rank = " + str(rank))
plt.text(0.7, count * 1.2, "count = " + str(count))
# draw points
plt.scatter(
df.index,
df["count"],
facecolors="none",
edgecolors="black",
s=10,
zorder=10,
)
plt.xscale("log", base=10)
plt.yscale("log", base=10)
plt.ylabel("count")
plt.xlabel(
"rank\n\n"
"The light pink shading represents the search bounds of the inflection point.\n"
"The dotted red lines represent the rank and count of the discovered point."
)
# required for the bottom text to be readable
plt.subplots_adjust(bottom=0.3)
log.info(
"\nNote that in cases where multiple reads have the same count, only "
"the lowest-ranked read is preserved. This is done for performance "
"reasons."
)
plt.show()
def read_counts(file):
"""
Read barcodes and counts from a file and add necessary columns
Parameters:
file (file): a file context which contains barcode and count data
Returns:
DataFrame with the `barcode`, `count` columns and `rank` index
"""
df = pd.read_csv(file, sep="\t", names=("barcode", "count"))
# convert count col to ints
df["count"] = df["count"].astype(int)
return df
def add_rank_index(df):
"""
Sorts the data and adds a rank index where a rank of #1 corresponds
to the highest count.
Parameters:
df (DataFrame): the data
Returns:
The original dataframe, but sorted and with a rank index
"""
df.sort_values(by="count")
# sort by count and add to a rank column
df["rank"] = df["count"].rank(ascending=False, method="first").astype(int)
# convert rank column to index
df.set_index("rank", inplace=True)
return df
def write_df(df, file):
"""
Writes barcode and count information to a file context
Parameters:
df (DataFrame): the data to write; must contain a `barcode` and `count`
file (file): a file-like context to write the output to
(can be stdout)
Returns:
None
"""
df.to_csv(
file, sep="\t", columns=["barcode", "count"], header=False, index=False
)
def n_derivative(df, x, y, window_size=9):
"""
Calculate the numerical derivative of two columns of a dataframe.
A rolling window is used to smooth the derivative as well.
Parameters:
df (DataFrame): the dataframe to smooth
x (str): the column of df for the x values
y (str): the column of df for the y values
window_size (int): the size of the rolling window; must be odd
Returns:
np.array with length len(df), with the numerical derivative
"""
# cap window size, so that it is always smaller than the
# length of the input data
window_size = min(window_size, len(df) - (2 if (len(df) % 2) else 1))
# calculates the rolling mean of the numerical derivative
half_size = window_size // 2
convolution = np.ones(window_size)
# window_size must be odd
if not window_size % 2:
raise ValueError("window_size must be odd")
diff = np.diff(df[y], 1) / np.diff(df[x], 1)
diff_orig = diff.copy()
# rolling average of these differences
rolling = np.convolve(diff, convolution, "valid") / window_size
return np.concatenate(
(
diff_orig[:half_size],
rolling,
(
diff_orig[-half_size:] if half_size != 0 else []
), # if half_size is 0, this would just double the array
[np.NaN],
)
)
def find_inflection(df, bounds, output_count: Optional[int]=False):
"""
Searches for the inflection point. This effectively minimises the
numerical derivative as the curve is always monotonically decreasing.
Parameters:
df (DataFrame): data to search from
bounds (int, int): min and max rank to use while searching
output_count (bool): whether to print extra information
Returns:
int which represents the rank of the inflection
"""
# remove ranks not within the bounds
df_bounded = df[(df.index >= bounds[0]) & (df.index <= bounds[1])].copy()
# remove duplicates - they affect the numerical derivative calculations
# make sure that we keep the smaller ranks (keep = "first")
df_bounded.drop_duplicates(subset="count", keep="first", inplace=True)
# use log10 of both count and rank for derivative calculations
df_bounded["count_log10"] = np.log10(df_bounded["count"])
df_bounded["rank_log10"] = np.log10(df_bounded.index)
df_bounded["diff"] = n_derivative(
df_bounded, x="rank_log10", y="count_log10"
)
smallest = df_bounded.nsmallest(output_count or 1, "diff")
if output_count:
log.info(
(
"\n%d smallest derivatives:\n%s"
"\n\n"
"Once an appropriate rank cutoff <r> has been determined, use "
"the leftmost column (`rank`) as the parameter to:\n"
" --use-predetermined-rank <r>\n"
"This will produce a list of all valid barcodes using the "
"barcodes from ranks 1 to <r>.\n"
),
output_count,
smallest.to_string(),
)
return smallest["diff"].idxmin()
def parse_args():
"""
Set up help and parse command line arguments
"""
# fmt: off
parser = argparse.ArgumentParser(
prog="flexiplex-filter",
description=(
"finds the inflection point when demultiplexing using flexiplex"
),
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
help=(
"output verbose and debugging information, and also "
"display more potential inflection points"
),
)
parser.add_argument(
"filename",
nargs="?",
type=argparse.FileType("r"),
default=sys.stdin,
help=(
"input file, typically called flexiplex_barcodes_counts.txt. "
"defaults to stdin if not given"
),
)
parser.add_argument(
"-o",
"--outfile",
metavar="<file>",
type=argparse.FileType("w"),
default=sys.stdout,
help=(
"output file, defaults to stdout if not given "
"(ignored if --dry-run is active)"
),
)
parser.add_argument(
"--dry-run",
action="store_true",
help=(
"only output discovered inflection points, "
"without performing the actual filtering"
),
)
group_inf = parser.add_argument_group("filter by inflection point")
group_inf.add_argument(
"--no-inflection",
action="store_true",
help="do not search for an inflection point",
)
group_inf.add_argument(
"-l",
"--min-rank",
metavar="<r>",
type=int,
required=False,
help="lowest rank to search"
)
group_inf.add_argument(
"-u",
"--max-rank",
metavar="<r>",
type=int,
required=False,
help="highest rank to search. set to 0 to search to the end",
)
group_inf_d = parser.add_argument_group(
"fine-tune/visualise an inflection point"
)
group_inf_d.add_argument(
"-g",
"--graph",
action="store_true",
help=(
"show a graph with the inflection point marked, requires "
"matplotlib. will also enable --dry-run."
),
)
group_inf_d.add_argument(
"--list-points",
type=int,
metavar="<n>",
required=False,
help="show multiple potential points. will also enable --dry-run.",
)
group_inf_d.add_argument(
"--use-predetermined-rank",
metavar="<r>",
type=int,
required=False,
help=(
"use predetermined inflection point. this will disable searching, "
"but will still filter for all ranks <= r"
),
)
group_wl = parser.add_argument_group("filter by whitelist file")
group_wl.add_argument(
"-w",
"--whitelist",
metavar="<file>",
type=argparse.FileType("r"),
help=(
"a whitelist file for known chemistry barcodes. "
"if not given, this program will not perform whitelist filtering."
),
)
# fmt: on
args = parser.parse_args()
return args
def cli():
# set up parser
args = parse_args()
# configure logging
log.basicConfig(
stream=sys.stderr,
format="%(message)s",
level=log.DEBUG if args.verbose else log.INFO,
)
log.info("FLEXIPLEX-FILTER 1.02")
if args.filename == sys.stdin:
log.info("No filename given... getting reads from stdin...")
if args.list_points and not args.dry_run:
args.dry_run = True
log.info("Setting --dry-run as --list-points was given")
elif args.graph and not args.dry_run:
args.dry_run = True
log.info("Setting --dry-run as --graph was given")
# force a --list-points if not already given
if args.verbose or args.graph:
if not args.list_points:
log.info("--list-points not given, setting it to 10")
args.list_points = 10
df = read_counts(args.filename)
# was a whitelist given?
if args.whitelist:
log.debug("Found whitelist file, filtering")
df = filter_whitelist(df, args.whitelist)
else:
log.debug("No whitelist file given, skipping")
add_rank_index(df)
REQUIRED_UNIQUE_COUNTS = 50
unique_counts = len(df["count"].unique())
if args.no_inflection:
log.debug("--no-inflection was given, skipping inflection discovery")
df_filt = df
elif (( not args.max_rank ) and ( not args.min_rank ) and ( unique_counts < REQUIRED_UNIQUE_COUNTS ) and ( not args.graph )):
# when there are less than REQUIRED_UNIQUE_COUNTS unique counts, we will skip inflection point finding
# this is so that flexiplex-filter can still work for smaller 'example' datasets without requiring
# manual configuration.
#
# users who want to run flexiplex-filter on small datasets are highly recommended to specify the --graph argument,
# visualise the dataset, and then manually determine the maximum and minimum ranks for the search.
log.info(f"Number of unique counts is too small ({unique_counts}, threshold {REQUIRED_UNIQUE_COUNTS}), skipping inflection discovery by default.")
log.info("To force inflection discovery, manually inspect the data and then specify the -u and -l parameters.")
log.info("To show the graph, pass the parameters: --graph -l 1 -u 0")
df_filt = df
else:
rank = args.use_predetermined_rank
if rank:
log.info("Using predetermined rank, not initiating discovery")
bounds = (0, 0)
else:
bounds = find_bounds(df, max_=args.max_rank, min_=args.min_rank)
rank = find_inflection(df, bounds, output_count=args.list_points)
log.info("Rank of inflection point: %d", rank)
if args.graph:
show_graph(df, bounds, rank)
if args.dry_run:
sys.exit(0)
log.debug("Filtering...")
df_filt = df[df.index <= rank]
write_df(df_filt, args.outfile)
if __name__ == "__main__":
cli()