diff --git a/ChangeLog b/ChangeLog index 7df985629e..43886103f6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -22,9 +22,10 @@ Release date: TBA Closes #3826 -* Improved the Similarity checker performance. +* Improved the Similarity checker performance. Fix issue with ``--min-similarity-lines`` used with ``--jobs``. Close #4120 + Close #4118 * Don't emit ``no-member`` error if guarded behind if statement. diff --git a/pylint/lint/parallel.py b/pylint/lint/parallel.py index e22e81afb3..ad6830721d 100644 --- a/pylint/lint/parallel.py +++ b/pylint/lint/parallel.py @@ -160,7 +160,7 @@ def check_parallel(linter, jobs, files, arguments=None): pool.join() _merge_mapreduce_data(linter, all_mapreduce_data) - linter.stats = _merge_stats(all_stats) + linter.stats = _merge_stats([linter.stats] + all_stats) # Insert stats data to local checkers. for checker in linter.get_checkers(): diff --git a/tests/test_check_parallel.py b/tests/test_check_parallel.py index 5818431445..931e5be308 100644 --- a/tests/test_check_parallel.py +++ b/tests/test_check_parallel.py @@ -67,6 +67,68 @@ def process_module(self, _astroid): self.data.append(record) +class ParallelTestChecker(BaseChecker): + """A checker that does need to consolidate data. + + To simulate the need to consolidate data, this checker only + reports a message for pairs of files. + + On non-parallel builds: it works on all the files in a single run. + + On parallel builds: lint.parallel calls ``open`` once per file. + + So if files are treated by separate processes, no messages will be + raised from the individual process, all messages will be raised + from reduce_map_data. + """ + + __implements__ = (pylint.interfaces.IRawChecker,) + + name = "parallel-checker" + test_data = "parallel" + msgs = { + "R9999": ( + "Test %s", + "parallel-test-check", + "Some helpful text.", + ) + } + + def __init__(self, linter, *args, **kwargs): + super().__init__(linter, *args, **kwargs) + self.data = [] + self.linter = linter + self.stats = None + + def open(self): + """init the checkers: reset statistics information""" + self.stats = self.linter.add_stats() + self.data = [] + + def close(self): + for _ in self.data[1::2]: # Work on pairs of files, see class docstring. + self.add_message("R9999", args=("From process_module, two files seen.",)) + + def get_map_data(self): + return self.data + + def reduce_map_data(self, linter, data): + recombined = type(self)(linter) + recombined.open() + aggregated = [] + for d in data: + aggregated.extend(d) + for _ in aggregated[1::2]: # Work on pairs of files, see class docstring. + self.add_message("R9999", args=("From reduce_map_data",)) + recombined.close() + + def process_module(self, _astroid): + """Called once per stream/file/astroid object""" + # record the number of invocations with the data object + record = self.test_data + str(len(self.data)) + self.data.append(record) + + class ExtraSequentialTestChecker(SequentialTestChecker): """A checker that does not need to consolidate data across run invocations""" @@ -74,6 +136,13 @@ class ExtraSequentialTestChecker(SequentialTestChecker): test_data = "extra-sequential" +class ExtraParallelTestChecker(ParallelTestChecker): + """A checker that does need to consolidate data across run invocations""" + + name = "extra-parallel-checker" + test_data = "extra-parallel" + + class ThirdSequentialTestChecker(SequentialTestChecker): """A checker that does not need to consolidate data across run invocations""" @@ -81,6 +150,13 @@ class ThirdSequentialTestChecker(SequentialTestChecker): test_data = "third-sequential" +class ThirdParallelTestChecker(ParallelTestChecker): + """A checker that does need to consolidate data across run invocations""" + + name = "third-parallel-checker" + test_data = "third-parallel" + + class TestCheckParallelFramework: """Tests the check_parallel() function's framework""" @@ -402,3 +478,69 @@ def test_compare_workers_to_single_proc(self, num_files, num_jobs, num_checkers) assert ( stats_check_parallel == expected_stats ), "The lint is returning unexpected results, has something changed?" + + @pytest.mark.parametrize( + "num_files,num_jobs,num_checkers", + [ + (2, 2, 1), + (2, 2, 2), + (2, 2, 3), + (3, 2, 1), + (3, 2, 2), + (3, 2, 3), + (3, 1, 1), + (3, 1, 2), + (3, 1, 3), + (3, 5, 1), + (3, 5, 2), + (3, 5, 3), + (10, 2, 1), + (10, 2, 2), + (10, 2, 3), + (2, 10, 1), + (2, 10, 2), + (2, 10, 3), + ], + ) + def test_map_reduce(self, num_files, num_jobs, num_checkers): + """Compares the 3 key parameters for check_parallel() produces the same results + + The intent here is to validate the reduce step: no stats should be lost. + + Checks regression of https://github.com/PyCQA/pylint/issues/4118 + """ + + # define the stats we expect to get back from the runs, these should only vary + # with the number of files. + file_infos = _gen_file_datas(num_files) + + # Loop for single-proc and mult-proc so we can ensure the same linter-config + for do_single_proc in range(2): + linter = PyLinter(reporter=Reporter()) + + # Assign between 1 and 3 checkers to the linter, they should not change the + # results of the lint + linter.register_checker(ParallelTestChecker(linter)) + if num_checkers > 1: + linter.register_checker(ExtraParallelTestChecker(linter)) + if num_checkers > 2: + linter.register_checker(ThirdParallelTestChecker(linter)) + + if do_single_proc: + # establish the baseline + assert ( + linter.config.jobs == 1 + ), "jobs>1 are ignored when calling _check_files" + linter._check_files(linter.get_ast, file_infos) + stats_single_proc = linter.stats + else: + check_parallel( + linter, + jobs=num_jobs, + files=file_infos, + arguments=None, + ) + stats_check_parallel = linter.stats + assert ( + stats_single_proc["by_msg"] == stats_check_parallel["by_msg"] + ), "Single-proc and check_parallel() should return the same thing"