Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bin/physlr-make
Original file line number Diff line number Diff line change
Expand Up @@ -979,7 +979,7 @@ min_path_size=200

# Map the draft assembly to the backbone graph and output BED.
%.backbone.map-split.$(draft).n10.bed: %.backbone.path $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.split.tsv $(draft).k$k-w$w.physlr.tsv
$(time) $(python) $(bin)/physlr map-split -V$V -n10 $^ >$@
$(time) $(python) $(bin)/physlr map --mx-type split --map-pos 10 -V$V -n10 $^ >$@

# Map the draft assembly to the backbone graph and output BED.
%.overlap.m$m.mol.backbone.map.$(draft).n10.bed: %.overlap.m$m.mol.backbone.path %.tsv $(draft).k$k-w$w.physlr.tsv
Expand Down
136 changes: 49 additions & 87 deletions physlr/physlr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1750,23 +1750,13 @@ def index_minimizers_in_backbones(backbones, bxtomxs):
mxtopos = {}
for tid, path in enumerate(progress(backbones)):
for pos, u in enumerate(path):
if u not in bxtomxs:
u = u.rsplit("_", 1)[0]
if u not in bxtomxs:
u = u.rsplit("_", 1)[0]
for mx in bxtomxs[u]:
mxtopos.setdefault(mx, set()).add((tid, pos))
print(
int(timeit.default_timer() - t0),
"Indexed", len(mxtopos), "minimizers", file=sys.stderr)
return mxtopos

@staticmethod
def index_minimizers_in_backbones_split(backbones, bxtomxs):
"Index the positions of the minimizers in the backbones."
mxtopos = {}
for tid, path in enumerate(progress(backbones)):
for pos, u in enumerate(path):
if Physlr.args.mx_type == "unsplit":
if u not in bxtomxs:
u = u.rsplit("_", 1)[0]
if u not in bxtomxs:
u = u.rsplit("_", 1)[0]
elif Physlr.args.mx_type == "split":
pass
for mx in bxtomxs[u]:
mxtopos.setdefault(mx, set()).add((tid, pos))
print(
Expand Down Expand Up @@ -1806,27 +1796,6 @@ def map_indexing(self):

return query_mxs, mxtopos, backbones

def map_indexing_split(self):
"Load data structures and indexes required for mapping."
if len(self.args.FILES) < 3:
sys.exit("physlr map: error: at least three file arguments are required")
path_filenames = [self.args.FILES[0]]
target_filenames = [self.args.FILES[1]]
query_filenames = self.args.FILES[2:]

# Index the positions of the minimizers in the backbone.
moltomxs = Physlr.read_minimizers(target_filenames)
query_mxs = moltomxs if target_filenames == query_filenames else \
Physlr.read_minimizers_list(query_filenames)

# Index the positions of the markers in the backbone.
backbones = Physlr.read_paths(path_filenames)
backbones = [backbone for backbone in backbones
if len(backbone) >= self.args.min_component_size]
mxtopos = Physlr.index_minimizers_in_backbones_split(backbones, moltomxs)

return query_mxs, mxtopos, backbones

def physlr_map_mkt(self):
"""
Map sequences to a physical map.
Expand Down Expand Up @@ -1907,6 +1876,15 @@ def physlr_map(self):
Map sequences to a physical map.
Usage: physlr map TPATHS.path TMARKERS.tsv QMARKERS.tsv... >MAP.bed
"""
if self.args.map_pos < 0:
print("--map-pos cannot be negative", file=sys.stderr)
print("See physlr --help for more information", file=sys.stderr)
sys.exit(1)

if self.args.mx_type not in ["unsplit", "split"]:
print("Invalid --mx-type argument", file=sys.stderr)
print("See physlr --help for more information", file=sys.stderr)
sys.exit(1)

query_mxs, mxtopos, _backbones = self.map_indexing()

Expand All @@ -1928,54 +1906,24 @@ def physlr_map(self):
for (tid, tpos), score in tidpos_to_n.items():
if score >= self.args.n:
mapped = True
orientation = Physlr.determine_orientation(
tidpos_to_qpos.get((tid, tpos - 1), None),
tidpos_to_qpos.get((tid, tpos + 0), None),
tidpos_to_qpos.get((tid, tpos + 1), None))
print(tid, tpos, tpos + 1, qid, score, orientation, sep="\t")
if mapped:
num_mapped += 1
print(
int(timeit.default_timer() - t0),
"Mapped", num_mapped, "sequences of", len(query_mxs),
f"({round(100 * num_mapped / len(query_mxs), 2)}%)", file=sys.stderr)

def physlr_map_split(self):
"""
Map sequences to a physical map.
Usage: physlr map TPATHS.path TMARKERS.tsv QMARKERS.tsv... >MAP.bed
"""

query_mxs, mxtopos, _backbones = self.map_indexing_split()

# Map the query sequences to the physical map.
num_mapped = 0
for qid, mxs in progress(query_mxs.items()):
tidpos_to_qpos = {}
for qpos, mx in enumerate(mxs):
for tidpos in mxtopos.get(mx, ()):
tidpos_to_qpos.setdefault(tidpos, []).append(qpos)
for tidpos, qpos in tidpos_to_qpos.items():
tidpos_to_qpos[tidpos] = statistics.median_low(qpos)
# Count the number of minimizers mapped to each target position.
tidpos_to_n = Counter(pos for mx in mxs for pos in mxtopos.get(mx, ()))

mapped = False
for (tid, tpos), score in tidpos_to_n.items():
if score >= self.args.n:
mapped = True
before = [tidpos_to_qpos.get((tid, tpos - i), None)
for i in range(10)
if tidpos_to_qpos.get((tid, tpos - i), None) is not None]
before_median = statistics.median_low(before)
after = [tidpos_to_qpos.get((tid, tpos + i), None)
for i in range(10)
if tidpos_to_qpos.get((tid, tpos + i), None) is not None]
after_median = statistics.median_high(after)
orientation = Physlr.determine_orientation(
before_median,
tidpos_to_qpos.get((tid, tpos + 0), None),
after_median)
if self.args.map_pos == 1:
orientation = Physlr.determine_orientation(
tidpos_to_qpos.get((tid, tpos - 1), None),
tidpos_to_qpos.get((tid, tpos + 0), None),
tidpos_to_qpos.get((tid, tpos + 1), None))
else:
before = [tidpos_to_qpos.get((tid, tpos - i), None)
for i in range(self.args.map_pos)
if tidpos_to_qpos.get((tid, tpos - i), None) is not None]
before_median = statistics.median_low(before)
after = [tidpos_to_qpos.get((tid, tpos + i), None)
for i in range(self.args.map_pos)
if tidpos_to_qpos.get((tid, tpos + i), None) is not None]
after_median = statistics.median_high(after)
orientation = Physlr.determine_orientation(
before_median,
tidpos_to_qpos.get((tid, tpos + 0), None),
after_median)
print(tid, tpos, tpos + 1, qid, score, orientation, sep="\t")
if mapped:
num_mapped += 1
Expand All @@ -1990,6 +1938,11 @@ def physlr_map_paf(self):
Usage: physlr map TGRAPH.path TMARKERS.tsv QMARKERS.tsv... >MAP.paf
"""

if self.args.mx_type not in ["unsplit", "split"]:
print("Invalid --mx-type argument", file=sys.stderr)
print("See physlr --help for more information", file=sys.stderr)
sys.exit(1)

query_mxs, mxtopos, backbones = self.map_indexing()

# Map the query sequences to the physical map.
Expand Down Expand Up @@ -2724,8 +2677,17 @@ def parse_arguments():
help="ARCS scaffold pairing distance estimation file.")
argparser.add_argument(
"--dist-type", action="store", dest="dist_type", type=str, default="avg",
choices=["min", "avg", "max"],
help="ARCS scaffold pairing distance type."
"Acceptable values are: min, avg, max")
"Accepted values are: min, avg, or max [avg].")
argparser.add_argument(
"--mx-type", action="store", dest="mx_type", type=str, default="unsplit",
choices=["unsplit", "split"],
help="Type of minimizers used to map sequence to backbone graph."
"Accepted values are: unsplit or split [unsplit].")
argparser.add_argument(
"--map-pos", action="store", dest="map_pos", type=int, default=1,
help="Number of positions to use during the orientation process [1].")
return argparser.parse_args()

def __init__(self):
Expand Down