From 52d1b459cafa3ec742d7f3aab82bb9b29c61954e Mon Sep 17 00:00:00 2001 From: bachmann Date: Sat, 17 Jan 2015 13:36:20 -0500 Subject: [PATCH] Parse pylint report such that duplicate code violations will flag all involved files, not just one --- .../tests/fixtures/git_diff_code_dupe.txt | 21 ++++ diff_cover/tests/fixtures/pylint_dupe.txt | 102 ++++++++++++++++++ .../pylint_dupe_violations_report.txt | 12 +++ diff_cover/tests/test_integration.py | 7 ++ diff_cover/tests/test_violations_reporter.py | 2 +- diff_cover/violations_reporter.py | 62 ++++++++--- 6 files changed, 192 insertions(+), 14 deletions(-) create mode 100644 diff_cover/tests/fixtures/git_diff_code_dupe.txt create mode 100644 diff_cover/tests/fixtures/pylint_dupe.txt create mode 100644 diff_cover/tests/fixtures/pylint_dupe_violations_report.txt diff --git a/diff_cover/tests/fixtures/git_diff_code_dupe.txt b/diff_cover/tests/fixtures/git_diff_code_dupe.txt new file mode 100644 index 00000000..605fdc85 --- /dev/null +++ b/diff_cover/tests/fixtures/git_diff_code_dupe.txt @@ -0,0 +1,21 @@ +diff --git a/fileone.py b/fileone.py +index e69de29..3952627 100644 +--- a/fileone.py ++++ b/fileone.py +@@ -0,0 +1,16 @@ ++""" ++Fileone ++""" ++def selection_sort(to_sort): ++ """ ++ The greatest sorting algorithm? ++ """ ++ new_list = [] ++ final_size = len(to_sort) ++ while len(new_list) < final_size: ++ candidate_index = 0 ++ for index in xrange(len(to_sort)): ++ if to_sort[index] <= to_sort[candidate_index]: ++ candidate_index = index ++ new_list.append(to_sort.pop(candidate_index)) ++ return new_list \ No newline at end of file diff --git a/diff_cover/tests/fixtures/pylint_dupe.txt b/diff_cover/tests/fixtures/pylint_dupe.txt new file mode 100644 index 00000000..9b15f458 --- /dev/null +++ b/diff_cover/tests/fixtures/pylint_dupe.txt @@ -0,0 +1,102 @@ +************* Module filetwo +filetwo.py:1: [R0801(duplicate-code), ] Similar lines in 2 files +==fileone:3 +==filetwo:3 +def selection_sort(to_sort): + """ + The greatest sorting algorithm? + """ + new_list = [] + final_size = len(to_sort) + while len(new_list) < final_size: + candidate_index = 0 + for index in xrange(len(to_sort)): + if to_sort[index] <= to_sort[candidate_index]: + candidate_index = index + new_list.append(to_sort.pop(candidate_index)) + return new_list + + +Report +====== +21 statements analysed. 
+ +Statistics by type +------------------ + ++---------+-------+-----------+-----------+------------+---------+ +|type |number |old number |difference |%documented |%badname | ++=========+=======+===========+===========+============+=========+ +|module |2 |2 |= |100.00 |0.00 | ++---------+-------+-----------+-----------+------------+---------+ +|class |0 |0 |= |0 |0 | ++---------+-------+-----------+-----------+------------+---------+ +|method |0 |0 |= |0 |0 | ++---------+-------+-----------+-----------+------------+---------+ +|function |2 |2 |= |100.00 |0.00 | ++---------+-------+-----------+-----------+------------+---------+ + + + +Raw metrics +----------- + ++----------+-------+------+---------+-----------+ +|type |number |% |previous |difference | ++==========+=======+======+=========+===========+ +|code |20 |52.63 |20 |= | ++----------+-------+------+---------+-----------+ +|docstring |12 |31.58 |12 |= | ++----------+-------+------+---------+-----------+ +|comment |0 |0.00 |0 |= | ++----------+-------+------+---------+-----------+ +|empty |6 |15.79 |6 |= | ++----------+-------+------+---------+-----------+ + + + +Duplication +----------- + ++-------------------------+-------+---------+-----------+ +| |now |previous |difference | ++=========================+=======+=========+===========+ +|nb duplicated lines |13 |13 |= | ++-------------------------+-------+---------+-----------+ +|percent duplicated lines |40.625 |40.625 |= | ++-------------------------+-------+---------+-----------+ + + + +Messages by category +-------------------- + ++-----------+-------+---------+-----------+ +|type |number |previous |difference | ++===========+=======+=========+===========+ +|convention |0 |0 |= | ++-----------+-------+---------+-----------+ +|refactor |1 |1 |= | ++-----------+-------+---------+-----------+ +|warning |0 |0 |= | ++-----------+-------+---------+-----------+ +|error |0 |0 |= | ++-----------+-------+---------+-----------+ + + + +Messages +-------- + ++---------------+------------+ +|message id |occurrences | ++===============+============+ +|duplicate-code |1 | ++---------------+------------+ + + + +Global evaluation +----------------- +Your code has been rated at 9.52/10 (previous run: 9.52/10, +0.00) + diff --git a/diff_cover/tests/fixtures/pylint_dupe_violations_report.txt b/diff_cover/tests/fixtures/pylint_dupe_violations_report.txt new file mode 100644 index 00000000..a06b554e --- /dev/null +++ b/diff_cover/tests/fixtures/pylint_dupe_violations_report.txt @@ -0,0 +1,12 @@ +------------- +Diff Quality +Quality Report: pylint +Diff: origin/master...HEAD, staged, and unstaged changes +------------- +fileone.py (93.8%): + 3: R0801: (duplicate-code), : Similar lines in 2 files +------------- +Total: 16 lines +Violations: 1 line +% Quality: 93% +------------- \ No newline at end of file diff --git a/diff_cover/tests/test_integration.py b/diff_cover/tests/test_integration.py index 26566055..55c2f9cf 100644 --- a/diff_cover/tests/test_integration.py +++ b/diff_cover/tests/test_integration.py @@ -449,6 +449,13 @@ def test_pre_generated_pylint_report(self): ['diff-quality', '--violations=pylint', 'pylint_report.txt'] ) + def test_pylint_report_with_dup_code_violation(self): + self._check_console_report( + 'git_diff_code_dupe.txt', + 'pylint_dupe_violations_report.txt', + ['diff-quality', '--violations=pylint', 'pylint_dupe.txt'] + ) + def _call_quality_expecting_error(self, tool_name, expected_error): """ Makes calls to diff_quality that should fail to ensure diff --git 
a/diff_cover/tests/test_violations_reporter.py b/diff_cover/tests/test_violations_reporter.py index 59ff3378..6e63ef4a 100644 --- a/diff_cover/tests/test_violations_reporter.py +++ b/diff_cover/tests/test_violations_reporter.py @@ -736,7 +736,7 @@ def test_quality(self): file1.py:149: [C0324, Foo.__dict__] Comma not followed by a space self.peer_grading._find_corresponding_module_for_location(Location('i4x','a','b','c','d')) file1.py:162: [R0801] Similar lines in 2 files - ==external_auth.views:1 + ==file1:162 ==student.views:4 import json import logging diff --git a/diff_cover/violations_reporter.py b/diff_cover/violations_reporter.py index 0663815b..e0f45573 100644 --- a/diff_cover/violations_reporter.py +++ b/diff_cover/violations_reporter.py @@ -398,14 +398,14 @@ class PyflakesQualityReporter(BaseQualityReporter): class Flake8QualityReporter(BaseQualityReporter): """ Report Flake8 violations. - + Flake8 warning/error codes: E***/W***: pep8 errors and warnings F***: pyflakes codes C9**: mccabe complexity plugin N8**: pep8-naming plugin T000: flake8-todo plugin - + http://flake8.readthedocs.org/en/latest/warnings.html """ COMMAND = 'flake8' @@ -422,11 +422,14 @@ class PylintQualityReporter(BaseQualityReporter): LEGACY_OPTIONS = ['-f', 'parseable', '--reports=no', '--include-ids=y'] OPTIONS = MODERN_OPTIONS EXTENSIONS = ['py'] + DUPE_CODE_VIOLATION = 'R0801' # Match lines of the form: # path/to/file.py:123: [C0111] Missing docstring # path/to/file.py:456: [C0111, Foo.bar] Missing docstring VIOLATION_REGEX = re.compile(r'^([^:]+):(\d+): \[(\w+),? ?([^\]]*)] (.*)$') + MULTI_LINE_VIOLATION_REGEX = re.compile(r'==(\w|.+):(.*)') + DUPE_CODE_MESSAGE_REGEX = re.compile(r'Similar lines in (\d+) files') def _run_command(self, src_path): try: @@ -439,32 +442,65 @@ def _run_command(self, src_path): else: raise + def _process_dupe_code_violation(self, lines, current_line, message): + """ + The duplicate code violation is a multi line error. This pulls out + all the relevant files + """ + src_paths = [] + message_match = self.DUPE_CODE_MESSAGE_REGEX.match(message) + if message_match: + for _ in range(int(message_match.group(1))): + current_line += 1 + match = self.MULTI_LINE_VIOLATION_REGEX.match( + lines[current_line] + ) + src_path, l_number = match.groups() + src_paths.append(('%s.py' % src_path, l_number)) + return src_paths + def _parse_output(self, output, src_path=None): """ See base class docstring. """ violations_dict = defaultdict(list) - for line in output.split('\n'): + output_lines = output.split('\n') + + for output_line_number, line in enumerate(output_lines): match = self.VIOLATION_REGEX.match(line) # Ignore any line that isn't matched # (for example, snippets from the source code) if match is not None: - pylint_src_path, line_number, pylint_code, function_name, message = match.groups() + (pylint_src_path, + line_number, + pylint_code, + function_name, + message) = match.groups() + if pylint_code == self.DUPE_CODE_VIOLATION: + files_involved = self._process_dupe_code_violation( + output_lines, + output_line_number, + message + ) + else: + files_involved = [(pylint_src_path, line_number)] - # If we're looking for a particular source file, - # ignore any other source files. - if src_path is None or src_path == pylint_src_path: + for violation in files_involved: + pylint_src_path, line_number = violation + # If we're looking for a particular source file, + # ignore any other source files. 
+ if src_path is None or src_path == pylint_src_path: - if function_name: - error_str = u"{0}: {1}: {2}".format(pylint_code, function_name, message) - else: - error_str = u"{0}: {1}".format(pylint_code, message) + if function_name: + error_str = u"{0}: {1}: {2}".format(pylint_code, function_name, message) + else: + error_str = u"{0}: {1}".format(pylint_code, message) - violation = Violation(int(line_number), error_str) - violations_dict[pylint_src_path].append(violation) + violation = Violation(int(line_number), error_str) + violations_dict[pylint_src_path].append(violation) return violations_dict
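
Note: for illustration only, a minimal standalone sketch (Python) of the parsing behaviour this patch adds. It reuses the regexes and the R0801 constant from the PylintQualityReporter changes above and runs them over an abridged copy of the pylint_dupe.txt fixture; the '%s.py' mapping mirrors the assumption made in _process_dupe_code_violation that pylint reports plain module names that map back to .py files.

    import re

    # Regexes and constant copied from the PylintQualityReporter changes above.
    VIOLATION_REGEX = re.compile(r'^([^:]+):(\d+): \[(\w+),? ?([^\]]*)] (.*)$')
    MULTI_LINE_VIOLATION_REGEX = re.compile(r'==(\w|.+):(.*)')
    DUPE_CODE_MESSAGE_REGEX = re.compile(r'Similar lines in (\d+) files')
    DUPE_CODE_VIOLATION = 'R0801'

    # Abridged from the pylint_dupe.txt fixture: the R0801 header plus the
    # "==module:line" rows that name every file involved in the duplication.
    SAMPLE_OUTPUT = (
        "filetwo.py:1: [R0801(duplicate-code), ] Similar lines in 2 files\n"
        "==fileone:3\n"
        "==filetwo:3\n"
        "def selection_sort(to_sort):"
    )

    lines = SAMPLE_OUTPUT.split('\n')
    for index, line in enumerate(lines):
        match = VIOLATION_REGEX.match(line)
        if match is None:
            continue  # source snippets are ignored, just as in _parse_output
        path, line_number, code, _function_name, message = match.groups()
        if code == DUPE_CODE_VIOLATION:
            # Multi-line violation: read one "==module:line" row per involved
            # file, as _process_dupe_code_violation does.
            count = int(DUPE_CODE_MESSAGE_REGEX.match(message).group(1))
            involved = []
            for offset in range(1, count + 1):
                module, dupe_line = MULTI_LINE_VIOLATION_REGEX.match(
                    lines[index + offset]
                ).groups()
                involved.append(('%s.py' % module, dupe_line))
        else:
            involved = [(path, line_number)]
        print(involved)
        # prints: [('fileone.py', '3'), ('filetwo.py', '3')]

Running the sketch prints one (path, line) pair per involved module, which is why the expected report fixture (pylint_dupe_violations_report.txt) attributes the violation to fileone.py at line 3 even though pylint itself only reported filetwo.py:1.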