diff --git a/.pylintrc b/.pylintrc index 1b122fda..702d7b16 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,5 +1,5 @@ [basic] -good-names = bx, c, e, g, g0, i, j, k, mx, n, q0, q1, q2, q3, q4, t0, u, un, us, vn, v, vs, w, ws, x, xs, y, z, hh, ht, th, tt +good-names = bx, c, e, g, g0, i, j, k, mx, n, q0, q1, q2, q3, q4, t0, u, un, us, vn, v, vs, w, ws, x, xs, y, z, hh, ht, th, tt, m max-branches=25 max-locals=50 max-module-lines=5000 diff --git a/bin/physlr-make b/bin/physlr-make index ad7a5a64..77f1ad5f 100755 --- a/bin/physlr-make +++ b/bin/physlr-make @@ -1054,7 +1054,7 @@ g=$(shell awk '{x += $$2} END{print x}' $(name)/$(ref).fa.fai) # Estimate the number of molecules per barcode. %.physlr.overlap.n20.countmol.tsv: %.physlr.overlap.tsv - $(python) $(bin)/physlr count-molecules -n20 -V$V $< >$@ + $(python) $(bin)/physlr count-molecules -m20 -V$V $< >$@ # Remove barcodes with more than one molecule. %.physlr.overlap.molecules.M2.tsv: %.physlr.overlap.molecules.tsv diff --git a/data/test.k1-w1.n1-2.c2-x.physlr.overlap.tsv b/data/test.k1-w1.n1-2.c2-x.physlr.overlap.tsv index dc1d9f95..10cf52b3 100644 --- a/data/test.k1-w1.n1-2.c2-x.physlr.overlap.tsv +++ b/data/test.k1-w1.n1-2.c2-x.physlr.overlap.tsv @@ -1,4 +1,4 @@ -U n +U m A 100 B 100 C 100 @@ -11,7 +11,7 @@ H 100 J 100 K 5 -U V n +U V m A B 100 A C 100 B C 100 diff --git a/physlr/physlr.py b/physlr/physlr.py index f2d06a74..1c2d6e2c 100644 --- a/physlr/physlr.py +++ b/physlr/physlr.py @@ -151,19 +151,19 @@ def read_paths(filenames): @staticmethod def write_tsv(g, fout): "Write a graph in TSV format." - if "m" in next(iter(g.nodes.values())): - print("U\tn\tm", file=fout) + if "mol" in next(iter(g.nodes.values())): + print("U\tm\tmol", file=fout) else: - print("U\tn", file=fout) + print("U\tm", file=fout) for u, prop in g.nodes.items(): - if "m" in prop: - print(u, prop["n"], prop["m"], sep="\t", file=fout) + if "mol" in prop: + print(u, prop["m"], prop["mol"], sep="\t", file=fout) else: - print(u, prop["n"], sep="\t", file=fout) - print("\nU\tV\tn", file=fout) + print(u, prop["m"], sep="\t", file=fout) + print("\nU\tV\tm", file=fout) for e, prop in g.edges.items(): u, v = sorted(e) - print(u, v, prop["n"], sep="\t", file=fout) + print(u, v, prop["m"], sep="\t", file=fout) @staticmethod def write_graph(g, fout, graph_format): @@ -185,7 +185,7 @@ def read_tsv(g, filename): line = fin.readline() if Physlr.args.verbose >= 2: progressbar.update(len(line)) - if line not in ["U\tn\n", "U\tn\tm\n"]: + if line not in ["U\tm\n", "U\tm\tmol\n"]: print("Unexpected header:", line, file=sys.stderr) sys.exit(1) reading_vertices = True @@ -196,7 +196,7 @@ def read_tsv(g, filename): line = fin.readline() if Physlr.args.verbose >= 2: progressbar.update(len(line)) - if line == "U\tV\tn\n": + if line == "U\tV\tm\n": reading_vertices = False else: print("Unexpected header:", line, file=sys.stderr) @@ -211,15 +211,15 @@ def read_tsv(g, filename): xs = line.split() if reading_vertices: if len(xs) == 2: - g.add_node(xs[0], n=int(xs[1])) + g.add_node(xs[0], m=int(xs[1])) elif len(xs) == 3: - g.add_node(xs[0], n=int(xs[1]), m=int(xs[2])) + g.add_node(xs[0], m=int(xs[1]), mol=int(xs[2])) else: print("Unexpected row:", line, file=sys.stderr) sys.exit(1) else: if len(xs) == 3: - g.add_edge(xs[0], xs[1], n=int(xs[2])) + g.add_edge(xs[0], xs[1], m=int(xs[2])) else: print("Unexpected row:", line, file=sys.stderr) sys.exit(1) @@ -232,9 +232,9 @@ def read_graphviz(g, filename): "Read a GraphViz file." graph = nx.drawing.nx_agraph.read_dot(filename) for vprop in graph.nodes().values(): - vprop["n"] = int(vprop["n"]) + vprop["m"] = int(vprop["m"]) for _, _, eprop in graph.edges.data(): - eprop["n"] = int(eprop["n"]) + eprop["m"] = int(eprop["m"]) return nx.algorithms.operators.binary.compose(g, graph) # Complement nucleotides. @@ -304,14 +304,14 @@ def remove_singletons(g): return len(singletons) @staticmethod - def filter_edges(g, arg_n): - "Remove edges with n < arg_n." - if arg_n == 0: + def filter_edges(g, arg_m): + "Remove edges with m < arg_m." + if arg_m == 0: return - edges = [(u, v) for u, v, n in progress(g.edges(data="n")) if n < arg_n] + edges = [(u, v) for u, v, m in progress(g.edges(data="m")) if m < arg_m] print( int(timeit.default_timer() - t0), - "Removed", len(edges), "edges with fewer than", arg_n, + "Removed", len(edges), "edges with fewer than", arg_m, "common minimizers of", g.number_of_edges(), f"({round(100 * len(edges) / g.number_of_edges(), 2)}%)", file=sys.stderr) g.remove_edges_from(edges) @@ -322,9 +322,9 @@ def filter_edges(g, arg_n): "Removed", num_singletons, "isolated vertices.", file=sys.stderr) @staticmethod - def keep_best_edges(g, bestn): + def keep_best_edges(g, bestm): """Keep the best edges of each vertex.""" - if bestn is None: + if bestm is None: return num_edges = g.number_of_edges() num_removed = 0 @@ -332,8 +332,8 @@ def keep_best_edges(g, bestn): us.sort(key=g.degree, reverse=True) for u in progress(us): vs = list(g[u]) - vs.sort(key=lambda v, u=u: g[u][v]["n"], reverse=True) - for v in vs[bestn:]: + vs.sort(key=lambda v, u=u: g[u][v]["m"], reverse=True) + for v in vs[bestm:]: g.remove_edge(u, v) num_removed += 1 print( @@ -532,7 +532,7 @@ def split_junctions_of_tree(prune_junctions, gin, keep_largest=0): junctions = Physlr.detect_junctions_of_tree(gin, prune_junctions) g = gin.copy() for junction in junctions: - edges = list(g.edges(junction, data="n")) + edges = list(g.edges(junction, data="m")) edges.sort(key=lambda e: e[2], reverse=True) if Physlr.args.verbose >= 3: print("Junction:", junction, "Edges:", *edges, file=sys.stderr) @@ -559,8 +559,8 @@ def determine_backbones_of_trees(g, prune_junctions): removed_count += rem_count for subcomponent in nx.connected_components(gcomponents): gsubcomponent = g.subgraph(subcomponent) - u, v, _ = Physlr.diameter_of_tree(gsubcomponent, weight="n") - path = nx.shortest_path(gsubcomponent, u, v, weight="n") + u, v, _ = Physlr.diameter_of_tree(gsubcomponent, weight="m") + path = nx.shortest_path(gsubcomponent, u, v, weight="m") paths.append(path) paths.sort(key=len, reverse=True) print( @@ -687,7 +687,7 @@ def physlr_cut_chimera(g): def determine_pruned_mst(g): """Return the pruned maximum spanning tree of the graph.""" # having tested kruskal and prim, we found the former is faster in our case - gmst = nx.maximum_spanning_tree(g, algorithm="kruskal", weight="n") + gmst = nx.maximum_spanning_tree(g, algorithm="kruskal", weight="m") print( int(timeit.default_timer() - t0), "Determined the maximum spanning tree.", file=sys.stderr, flush=True) @@ -891,9 +891,9 @@ def remove_small_components(g, min_component_size): def physlr_filter(self): "Filter a graph." g = self.read_graph(self.args.FILES) - Physlr.filter_edges(g, self.args.n) + Physlr.filter_edges(g, self.args.m) if self.args.M is not None: - vertices = [u for u, prop in g.nodes().items() if prop["m"] >= self.args.M] + vertices = [u for u, prop in g.nodes().items() if prop["mol"] >= self.args.M] g.remove_nodes_from(vertices) print( int(timeit.default_timer() - t0), @@ -905,8 +905,8 @@ def physlr_filter(self): def physlr_best_edges(self): """Keep the best edges of each vertex.""" g = self.read_graph(self.args.FILES) - Physlr.filter_edges(g, self.args.n) - Physlr.keep_best_edges(g, self.args.bestn) + Physlr.filter_edges(g, self.args.m) + Physlr.keep_best_edges(g, self.args.bestm) self.write_graph(g, sys.stdout, self.args.graph_format) print(int(timeit.default_timer() - t0), "Wrote graph", file=sys.stderr) @@ -928,24 +928,24 @@ def physlr_flesh_backbone(self): for neighbour in neighbours: if neighbour in backbone: continue - (max_n, max_index) = (float("-inf"), float("-inf")) + (max_m, max_index) = (float("-inf"), float("-inf")) for i, k in enumerate(backbone): if not g.has_edge(neighbour, k): continue - n = g[neighbour][k]["n"] - if n > max_n: - (max_n, max_index) = (n, i) + m = g[neighbour][k]["m"] + if m > max_m: + (max_m, max_index) = (m, i) # Put R or L of node with most shared minimizers? if max_index == 0: insert_index = max_index elif max_index == len(backbone) - 1: insert_index = max_index - 1 else: - r_n = g[neighbour][backbone[max_index+1]]['n'] \ + r_m = g[neighbour][backbone[max_index+1]]["m"] \ if g.has_edge(neighbour, backbone[max_index+1]) else 0 - l_n = g[neighbour][backbone[max_index-1]]['n'] \ + l_m = g[neighbour][backbone[max_index-1]]["m"] \ if g.has_edge(neighbour, backbone[max_index-1]) else 0 - if l_n > r_n: + if l_m > r_m: insert_index = max_index - 1 else: insert_index = max_index @@ -1021,8 +1021,8 @@ def physlr_count_minimizers(self): def physlr_intersect(self): "Print the minimizers in the intersection of each pair of barcodes." - if self.args.n == 0: - self.args.n = 1 + if self.args.m == 0: + self.args.m = 1 bxtomxs = self.read_minimizers(self.args.FILES) mxtobxs = self.construct_minimizers_to_barcodes(bxtomxs) if self.args.v: @@ -1031,7 +1031,7 @@ def physlr_intersect(self): pairs = {(u, v) for bxs in mxtobxs.values() for u, v in itertools.combinations(bxs, 2)} for u, v in pairs: common = bxtomxs[u] & bxtomxs[v] - if len(common) >= self.args.n: + if len(common) >= self.args.m: print(u, v, "", sep="\t", end="") print(*common) @@ -1103,17 +1103,17 @@ def physlr_filter_overlap(self): def physlr_degree(self): "Print the degree of each vertex." g = self.read_graph(self.args.FILES) - Physlr.filter_edges(g, self.args.n) - print("U\tn\tDegree") + Physlr.filter_edges(g, self.args.m) + print("U\tm\tDegree") for u, prop in progress(g.nodes.items()): - print(u, prop["n"], g.degree(u), sep="\t") + print(u, prop["m"], g.degree(u), sep="\t") print(int(timeit.default_timer() - t0), "Wrote degrees of vertices", file=sys.stderr) def physlr_common_neighbours(self): """Count the number of common neighbours for each edge.""" g = self.read_graph(self.args.FILES) for u, v in g.edges: - g[u][v]["n"] = len(list(nx.common_neighbors(g, u, v))) + g[u][v]["m"] = len(list(nx.common_neighbors(g, u, v))) print(int(timeit.default_timer() - t0), "Counted common neighbours.", file=sys.stderr, flush=True) self.write_graph(g, sys.stdout, self.args.graph_format) @@ -1228,7 +1228,7 @@ def physlr_biconnected_components(self): def physlr_tiling_graph(self): "Determine the minimum-tiling-path-induced subgraph." g = self.read_graph(self.args.FILES) - Physlr.filter_edges(g, self.args.n) + Physlr.filter_edges(g, self.args.m) backbones = self.determine_backbones(g) tiling = {u for path in backbones for u in nx.shortest_path(g, path[0], path[-1])} subgraph = g.subgraph(tiling) @@ -1237,7 +1237,7 @@ def physlr_tiling_graph(self): def physlr_count_molecules(self): "Estimate the nubmer of molecules per barcode." g = self.read_graph(self.args.FILES) - Physlr.filter_edges(g, self.args.n) + Physlr.filter_edges(g, self.args.m) print( int(timeit.default_timer() - t0), "Separating barcodes into molecules", file=sys.stderr) @@ -1245,7 +1245,7 @@ def physlr_count_molecules(self): for u, prop in progress(g.nodes.items()): subgraph = g.subgraph(g.neighbors(u)) # Ignore K3 (triangle) components. - prop["m"] = sum( + prop["mol"] = sum( 1 for component in nx.biconnected_components(subgraph) if len(component) >= 4) self.write_graph(g, sys.stdout, self.args.graph_format) @@ -1621,7 +1621,7 @@ def physlr_molecules(self): self.args.strategy.replace("+", " + "), file=sys.stderr) - Physlr.filter_edges(gin, self.args.n) + Physlr.filter_edges(gin, self.args.m) # Partition the neighbouring vertices of each barcode into molecules. if self.args.threads == 1: @@ -1640,10 +1640,10 @@ def physlr_molecules(self): # Add vertices. gout = nx.Graph() for u, vs in sorted(molecules.items()): - n = gin.nodes[u]["n"] + m = gin.nodes[u]["m"] nmolecules = 1 + max(vs.values()) if vs else 0 for i in range(nmolecules): - gout.add_node(f"{u}_{i}", n=n) + gout.add_node(f"{u}_{i}", m=m) print( int(timeit.default_timer() - t0), @@ -1659,7 +1659,7 @@ def physlr_molecules(self): continue u_molecule = molecules[u][v] v_molecule = molecules[v][u] - gout.add_edge(f"{u}_{u_molecule}", f"{v}_{v_molecule}", n=prop["n"]) + gout.add_edge(f"{u}_{u_molecule}", f"{v}_{v_molecule}", m=prop["m"]) print(int(timeit.default_timer() - t0), "Separated molecules", file=sys.stderr) num_singletons = Physlr.remove_singletons(gout) @@ -1698,7 +1698,7 @@ def subgraph_stats_process(u): def physlr_subgraphs_stats(self): "Retrieve subgraphs' stats." gin = self.read_graph(self.args.FILES) - Physlr.filter_edges(gin, self.args.n) + Physlr.filter_edges(gin, self.args.m) print( int(timeit.default_timer() - t0), "Computing statistics of the subgraphs...", file=sys.stderr) @@ -2007,7 +2007,7 @@ def physlr_annotate_graph(self): print(f'"{u}"') for e, prop in g.edges.items(): u, v = sorted(e) - print(f'"{u}" -- "{v}" [label={prop["n"]}]') + print(f'"{u}" -- "{v}" [label={prop["m"]}]') print("}") print(int(timeit.default_timer() - t0), "Wrote graph", file=sys.stderr) @@ -2526,11 +2526,14 @@ def parse_arguments(): "-M", "--max-molecules", action="store", dest="M", type=int, help="remove barcodes with M or more molecules [None]") argparser.add_argument( - "-n", "--min-n", action="store", dest="n", type=int, default=0, + "-n", action="store", dest="n", type=int, default=0, + help="remove mappings with score less than n [0]") + argparser.add_argument( + "-m", "--min-m", action="store", dest="m", type=int, default=0, help="remove edges with fewer than n shared minimizers [0]") argparser.add_argument( - "--bestn", action="store", dest="bestn", type=int, default=None, - help="Keep the best n edges of each vertex [None]") + "--bestm", action="store", dest="bestm", type=int, default=None, + help="Keep the best m edges of each vertex [None]") argparser.add_argument( "--min-length", action="store", dest="min_length", type=int, default=0, help="remove sequences with length less than N bp [0]") diff --git a/src/Makefile b/src/Makefile index 1da4ee9d..d69c47d6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -60,8 +60,8 @@ check-physlr-makebf: all ./physlr-indexlr -t16 -k8 -w1 -r tiny.bf data/stLFR.tiny.fq | diff -q - data/stLFR.tiny.filtered.physlr.tsv.good check-physlr-overlap: all - ./physlr-overlap -t4 -n1 data/tiny.overlap.input.tsv | sort | diff -q - data/tiny.physlr.overlap.n1.sorted.tsv - ./physlr-overlap -t4 -n2 data/tiny.overlap.input.tsv | sort | diff -q - data/tiny.physlr.overlap.n2.sorted.tsv + ./physlr-overlap -t4 -m1 data/tiny.overlap.input.tsv | sort | diff -q - data/tiny.physlr.overlap.n1.sorted.tsv + ./physlr-overlap -t4 -m2 data/tiny.overlap.input.tsv | sort | diff -q - data/tiny.physlr.overlap.n2.sorted.tsv check-physlr-molecules: all ./physlr-molecules -s bc data/tiny.mol.input.tsv | diff -q - data/tiny.mol.tsv.good diff --git a/src/data/tiny.mol.input.tsv b/src/data/tiny.mol.input.tsv index a228bc88..ee58cbbc 100644 --- a/src/data/tiny.mol.input.tsv +++ b/src/data/tiny.mol.input.tsv @@ -1,4 +1,4 @@ -U n +U m 0 100 1 100 2 100 @@ -8,7 +8,7 @@ U n 6 100 7 100 -U V n +U V m 0 1 100 0 2 100 0 3 100 diff --git a/src/data/tiny.mol.tsv.good b/src/data/tiny.mol.tsv.good index b6d45d3d..25dba426 100644 --- a/src/data/tiny.mol.tsv.good +++ b/src/data/tiny.mol.tsv.good @@ -1,4 +1,4 @@ -U n +U m 0_0 100 1_0 100 2_0 100 @@ -10,7 +10,7 @@ U n 6_0 100 7_0 100 -U V n +U V m 0_0 1_0 100 0_0 2_0 100 0_0 3_0 100 diff --git a/src/data/tiny.physlr.overlap.n1.sorted.tsv b/src/data/tiny.physlr.overlap.n1.sorted.tsv index aa072a8f..71b10af0 100644 --- a/src/data/tiny.physlr.overlap.n1.sorted.tsv +++ b/src/data/tiny.physlr.overlap.n1.sorted.tsv @@ -14,5 +14,5 @@ C 543_288_93 1 C A 1 C AAAGTAGCATCTTAGG-1 1 C B 1 -U n -U V n +U m +U V m diff --git a/src/data/tiny.physlr.overlap.n2.sorted.tsv b/src/data/tiny.physlr.overlap.n2.sorted.tsv index 6a2f3f8d..ff38fd36 100644 --- a/src/data/tiny.physlr.overlap.n2.sorted.tsv +++ b/src/data/tiny.physlr.overlap.n2.sorted.tsv @@ -8,5 +8,5 @@ A 1 AAAGTAGCATCTTAGG-1 5 B 1 C 2 -U n -U V n +U m +U V m diff --git a/src/physlr-molecules.cc b/src/physlr-molecules.cc index 728f1584..2ba9d5f7 100644 --- a/src/physlr-molecules.cc +++ b/src/physlr-molecules.cc @@ -109,14 +109,14 @@ printUsage(const std::string& progname) void printGraph(const graph_t& g) { - std::cout << "U\tn" << std::endl; + std::cout << "U\tm" << std::endl; auto vertexItRange = boost::vertices(g); for (auto vertexIt = vertexItRange.first; vertexIt != vertexItRange.second; ++vertexIt) { auto& node1 = g[*vertexIt].name; auto& weight = g[*vertexIt].weight; std::cout << node1 << "\t" << weight << "\n"; } - std::cout << "\nU\tV\tn" << std::endl; + std::cout << "\nU\tV\tm" << std::endl; auto edgeItRange = boost::edges(g); for (auto edgeIt = edgeItRange.first; edgeIt != edgeItRange.second; ++edgeIt) { auto& weight = g[*edgeIt].weight; @@ -140,7 +140,7 @@ readTSV(graph_t& g, const std::vector& infiles, bool verbose) infile == "-" ? "/dev/stdin" : infile; std::ifstream infileStream(infile); for (std::string line; std::getline(infileStream, line);) { - if (line == "U\tn") { + if (line == "U\tm") { continue; } if (line.empty()) { @@ -170,7 +170,7 @@ readTSV(graph_t& g, const std::vector& infiles, bool verbose) #endif } for (std::string line; std::getline(infileStream, line);) { - if (line == "U\tV\tn") { + if (line == "U\tV\tm") { continue; } if (line.empty()) { diff --git a/src/physlr-overlap.cc b/src/physlr-overlap.cc index fb4e7b2d..4d35365b 100644 --- a/src/physlr-overlap.cc +++ b/src/physlr-overlap.cc @@ -25,7 +25,7 @@ * execution of the program. */ namespace opt { -static unsigned minN = 10; +static unsigned minM = 10; static unsigned threads = 1; } // namespace opt @@ -51,7 +51,7 @@ printHelpDialog() static const char dialog[] = "Usage: physlr-overlap [OPTION]... [MINIMIZERS.tsv]\n" "Read a sketch of linked reads and find overlapping barcodes.\n" - " -n, --min-n=INT Remove edges with fewer than n shared markers [10].\n" + " -m, --min-m=INT Remove edges with fewer than m shared markers [10].\n" " -t, --threads threads [1]\n" " -v, --version Print version\n" "Report bugs to ."; @@ -107,13 +107,13 @@ main(int argc, char* argv[]) int c; // long form arguments - static struct option long_options[] = { { "min-n", required_argument, nullptr, 'n' }, + static struct option long_options[] = { { "min-m", required_argument, nullptr, 'n' }, { "threads", required_argument, nullptr, 't' }, { "version", no_argument, nullptr, 'v' }, { nullptr, 0, nullptr, 0 } }; int i = 0; - while ((c = getopt_long(argc, argv, "n:t:v:", long_options, &i)) != -1) { + while ((c = getopt_long(argc, argv, "m:t:v:", long_options, &i)) != -1) { switch (c) { case 't': { std::stringstream convert(optarg); @@ -123,10 +123,10 @@ main(int argc, char* argv[]) } break; } - case 'n': { + case 'm': { std::stringstream convert(optarg); - if (!(convert >> opt::minN)) { - std::cerr << "Error - Invalid parameters! n: " << optarg << std::endl; + if (!(convert >> opt::minM)) { + std::cerr << "Error - Invalid parameters! m: " << optarg << std::endl; return 0; } break; @@ -221,7 +221,7 @@ main(int argc, char* argv[]) #endif std::cerr << "Memory usage: " << double(memory_usage()) / double(1048576) << "GB" << std::endl; - std::cout << "U\tn\n"; + std::cout << "U\tm\n"; std::string bufferString; // print out vertexes + counts for (const auto& itr : barcodes) { @@ -232,7 +232,7 @@ main(int argc, char* argv[]) bufferString += "\n"; std::cout << bufferString; } - std::cout << "\nU\tV\tn" << std::endl; + std::cout << "\nU\tV\tm" << std::endl; // store into 2d matrix / hash table using SimMat = tsl::robin_map; @@ -264,8 +264,8 @@ main(int argc, char* argv[]) #pragma omp atomic #endif edgeCount += 1; - // filter by n - if (opt::minN <= itr.second) { + // filter by m + if (opt::minM <= itr.second) { #if _OPENMP #pragma omp atomic #endif