indentation_kit.py: fix char_count calculation in IndentationInfo.from_content.

elifarley · elifarley · commit c59bb6ff3bf7 · 2024-10-24T13:06:10.000-03:00
from_content now analyses the deltas between the sorted unique space counts first, and only falls back to the GCD when no single delta clearly dominates
diff --git a/src/cedarscript_editor/cedarscript_editor.py b/src/cedarscript_editor/cedarscript_editor.py
@@ -146,9 +146,8 @@ def _update_command(self, cmd: UpdateCommand):
                 content_range = restrict_search_range_for_marker(
                     region, action, lines, RangeSpec.EMPTY, identifier_finder
                 )
-                content = content_range.read(lines)
-                content = IndentationInfo.from_content(lines).update_min_indent_level(content).shift_indentation(
-                    content, search_range.indent, relindent_level
+                content = IndentationInfo.shift_indentation(
+                    content_range.read(lines), lines, search_range.indent, relindent_level
                 )
                 content = (region, content)
             case _:
@@ -158,9 +157,8 @@ def _update_command(self, cmd: UpdateCommand):
                         #     region, action, lines, RangeSpec.EMPTY, identifier_finder
                         # )
                         # TODO Are the 3 lines above needed?
-                        content = move_src_range.read(lines)
-                        content = IndentationInfo.from_content(lines).update_min_indent_level(content).shift_indentation(
-                            content, search_range.indent, relindent_level
+                        content = IndentationInfo.shift_indentation(
+                            move_src_range.read(lines), lines, search_range.indent, relindent_level
                         )
                     case DeleteClause():
                         pass
diff --git a/src/text_manipulation/indentation_kit.py b/src/text_manipulation/indentation_kit.py
@@ -110,6 +110,54 @@ class IndentationInfo(NamedTuple):
     def default(cls) -> 'IndentationInfo':
         return cls(4, ' ', 0)
 
+    @classmethod
+    def shift_indentation(cls,
+        content: Sequence[str], target_lines: Sequence[str], target_reference_indentation_count: int,
+        relindent_level: int | None
+    ) -> list[str]:
+        """
+        Returns 'content' with shifted indentation based on a relative indent level and a reference indentation count.
+
+        This method adjusts the indentation of each non-empty line in the input sequence.
+        It calculates the difference between the target base indentation and the minimum
+        indentation found in the content, then applies this shift to all lines.
+
+        Args:
+            content (Sequence[str]): A sequence of strings representing the lines to be adjusted.
+            target_lines (Sequence[str]): Lines of the target text; only their detected char_count is used.
+            target_reference_indentation_count (int): The target base indentation count to adjust to.
+            relindent_level (int|None): Extra levels added on top of the reference count; None counts as 0.
+
+        Returns:
+            list[str]: A new list of strings with adjusted indentation.
+
+        Note:
+            - Empty lines and lines with only whitespace are preserved as-is.
+            - The IndentationInfo is detected from 'content', but its char_count
+              is replaced by the char_count detected from 'target_lines'.
+            - This method is useful for uniformly adjusting indentation across all lines.
+
+        Example:
+            >>> content = ["    def example():", "        print('Hello')"]
+            >>> IndentationInfo.shift_indentation(content, content, 8, None)
+            ['        def example():', "            print('Hello')"]
+        """
+        context_indent_char_count = cls.from_content(target_lines).char_count
+        return (cls.
+            from_content(content).
+            _replace(char_count=context_indent_char_count).
+            _shift_indentation(
+                content, target_lines, target_reference_indentation_count, relindent_level
+            )
+        )
+
+    def _shift_indentation(self,
+        content: Sequence[str], target_lines: Sequence[str], target_base_indentation_count: int, relindent_level: int | None
+    ) -> list[str]:
+        target_base_indentation_count += self.char_count * (relindent_level or 0)
+        raw_line_adjuster = self._shift_indentation_fun(target_base_indentation_count)
+        return [raw_line_adjuster(line) for line in content]
+
     @classmethod
     def from_content(cls, content: str | Sequence[str]) -> 'IndentationInfo':
         """
@@ -132,14 +180,16 @@ def from_content(cls, content: str | Sequence[str]) -> 'IndentationInfo':
               character count by analyzing patterns and using GCD.
         """
         # TODO Always send str?
-        lines = [x.lstrip() for x in content.splitlines() if x.strip()] if isinstance(content, str) else content
+        lines = [x for x in content.splitlines() if x.strip()] if isinstance(content, str) else content
 
         indentations = [extract_indentation(line) for line in lines if line.strip()]
+        has_zero_indent = any((i == '' for i in indentations))
+        indentations = [indent for indent in indentations if indent]
 
         if not indentations:
             return cls(4, ' ', 0, True, "No indentation found. Assuming 4 spaces (PEP 8).")
 
-        indent_chars = Counter(indent[0] for indent in indentations if indent)
+        indent_chars = Counter(indent[0] for indent in indentations)
         dominant_char = ' ' if indent_chars.get(' ', 0) >= indent_chars.get('\t', 0) else '\t'
 
         indent_lengths = [len(indent) for indent in indentations]
@@ -148,20 +198,26 @@ def from_content(cls, content: str | Sequence[str]) -> 'IndentationInfo':
             char_count = 1
         else:
             # For spaces, determine the most likely char_count
-            space_counts = [sc for sc in indent_lengths if sc % 2 == 0 and sc > 0]
+            space_counts = [sc for sc in indent_lengths if sc % 2 == 0]
             if not space_counts:
                 char_count = 2  # Default to 2 if no even space counts
             else:
-                # Sort top 5 space counts and find the largest GCD
-                sorted_counts = sorted([c[0] for c in Counter(space_counts).most_common(5)], reverse=True)
-                char_count = sorted_counts[0]
-                for i in range(1, len(sorted_counts)):
-                    new_gcd = gcd(char_count, sorted_counts[i])
-                    if new_gcd <= 1:
-                        break
-                    char_count = new_gcd
-
-        min_indent_chars = min(indent_lengths) if indent_lengths else 0
+                unique_space_counts = sorted(set(space_counts))
+                deltas = sorted([b - a for a, b in zip(unique_space_counts, unique_space_counts[1:])], reverse=True)
+                deltas = deltas or unique_space_counts  # one distinct width => no deltas: use that width itself
+                most_common_deltas = Counter(deltas).most_common(5)
+                if most_common_deltas[0][1] / len(deltas) > .6:  # a single delta clearly dominates
+                    char_count = most_common_deltas[0][0]
+                else:
+                    char_count = deltas[0]
+                    # find the largest GCD
+                    for i in range(1, len(most_common_deltas)):
+                        new_gcd = gcd(char_count, most_common_deltas[i][0])
+                        if new_gcd <= 1:
+                            break
+                        char_count = new_gcd
+
+        min_indent_chars = 0 if has_zero_indent else min(indent_lengths) if indent_lengths else 0
         min_indent_level = min_indent_chars // char_count
 
         consistency = all(len(indent) % char_count == 0 for indent in indentations if indent)
@@ -217,42 +273,6 @@ def level_to_chars(self, level: int) -> str:
         """
         return level * self.char_count * self.char
 
-    # TODO Revise
-    def shift_indentation(
-            self, lines: Sequence[str], target_base_indentation_count: int, relindent_level: int | None
-    ) -> list[str]:
-        """
-        Shift the indentation of a sequence of lines based on a target base indentation count.
-
-        This method adjusts the indentation of each non-empty line in the input sequence.
-        It calculates the difference between the target base indentation and the minimum
-        indentation found in the content, then applies this shift to all lines.
-
-        Args:
-            lines (Sequence[str]): A sequence of strings representing the lines to be adjusted.
-            target_base_indentation_count (int): The target base indentation count to adjust to.
-            relindent_level (int|None):
-
-        Returns:
-            list[str]: A new list of strings with adjusted indentation.
-
-        Note:
-            - Empty lines and lines with only whitespace are preserved as-is.
-            - The method uses the IndentationInfo of the instance to determine
-              the indentation character and count.
-            - This method is useful for uniformly adjusting indentation across all lines.
-
-        Example:
-            >>> info = IndentationInfo(4, ' ', 1, True)
-            >>> lines = ["    def example():", "        print('Hello')"]
-            >>> info.shift_indentation(lines, 8)
-            ['        def example():', '            print('Hello')']
-        """
-        target_base_indentation_count += self.char_count * (relindent_level or 0)
-        raw_line_adjuster = self._shift_indentation_fun(target_base_indentation_count)
-        # Return the transformed lines
-        return [raw_line_adjuster(line) for line in lines]
-
     def _shift_indentation_fun(self, target_base_indentation_count: int):
         # Calculate the indentation difference
         level_difference = self.level_difference(target_base_indentation_count)
diff --git a/tests/corpus/refactor.baseconv/baseconv.py b/tests/corpus/refactor.baseconv/baseconv.py
@@ -0,0 +1,115 @@
+# RemovedInDjango50Warning
+# Copyright (c) 2010 Guilherme Gondim. All rights reserved.
+# Copyright (c) 2009 Simon Willison. All rights reserved.
+# Copyright (c) 2002 Drew Perttula. All rights reserved.
+#
+# License:
+#   Python Software Foundation License version 2
+#
+# See the file "LICENSE" for terms & conditions for usage, and a DISCLAIMER OF
+# ALL WARRANTIES.
+#
+# This Baseconv distribution contains no GNU General Public Licensed (GPLed)
+# code so it may be used in proprietary projects just like prior ``baseconv``
+# distributions.
+#
+# All trademarks referenced herein are property of their respective holders.
+#
+
+"""
+Convert numbers from base 10 integers to base X strings and back again.
+
+Sample usage::
+
+  >>> base20 = BaseConverter('0123456789abcdefghij')
+  >>> base20.encode(1234)
+  '31e'
+  >>> base20.decode('31e')
+  1234
+  >>> base20.encode(-1234)
+  '-31e'
+  >>> base20.decode('-31e')
+  -1234
+  >>> base11 = BaseConverter('0123456789-', sign='$')
+  >>> base11.encode(-1234)
+  '$-22'
+  >>> base11.decode('$-22')
+  -1234
+
+"""
+import warnings
+
+from django.utils.deprecation import RemovedInDjango50Warning
+
+warnings.warn(
+    "The django.utils.baseconv module is deprecated.",
+    category=RemovedInDjango50Warning,
+    stacklevel=2,
+)
+
+BASE2_ALPHABET = "01"
+BASE16_ALPHABET = "0123456789ABCDEF"
+BASE56_ALPHABET = "23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnpqrstuvwxyz"
+BASE36_ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyz"
+BASE62_ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+BASE64_ALPHABET = BASE62_ALPHABET + "-_"
+
+
+class BaseConverter:
+    decimal_digits = "0123456789"
+
+    def __init__(self, digits, sign="-"):
+        self.sign = sign
+        self.digits = digits
+        if sign in self.digits:
+            raise ValueError("Sign character found in converter base digits.")
+
+    def __repr__(self):
+        return "<%s: base%s (%s)>" % (
+            self.__class__.__name__,
+            len(self.digits),
+            self.digits,
+        )
+
+    def encode(self, i):
+        neg, value = self.convert(i, self.decimal_digits, self.digits, "-")
+        if neg:
+            return self.sign + value
+        return value
+
+    def decode(self, s):
+        neg, value = self.convert(s, self.digits, self.decimal_digits, self.sign)
+        if neg:
+            value = "-" + value
+        return int(value)
+
+    def convert(self, number, from_digits, to_digits, sign):
+        if str(number)[0] == sign:
+            number = str(number)[1:]
+            neg = 1
+        else:
+            neg = 0
+
+        # make an integer out of the number
+        x = 0
+        for digit in str(number):
+            x = x * len(from_digits) + from_digits.index(digit)
+
+        # create the result in base 'len(to_digits)'
+        if x == 0:
+            res = to_digits[0]
+        else:
+            res = ""
+            while x > 0:
+                digit = x % len(to_digits)
+                res = to_digits[digit] + res
+                x = int(x // len(to_digits))
+        return neg, res
+
+
+base2 = BaseConverter(BASE2_ALPHABET)
+base16 = BaseConverter(BASE16_ALPHABET)
+base36 = BaseConverter(BASE36_ALPHABET)
+base56 = BaseConverter(BASE56_ALPHABET)
+base62 = BaseConverter(BASE62_ALPHABET)
+base64 = BaseConverter(BASE64_ALPHABET, sign="$")
diff --git a/tests/corpus/refactor.baseconv/chat.xml b/tests/corpus/refactor.baseconv/chat.xml
@@ -0,0 +1,33 @@
+<no-train>
+```CEDARScript
+-- Move the `convert` method from the `BaseConverter` class, placing it at the top level, just before the line where its class starts
+UPDATE CLASS "BaseConverter"
+FROM FILE "baseconv.py"
+MOVE FUNCTION "convert"
+INSERT BEFORE CLASS "BaseConverter"
+RELATIVE INDENTATION 0;
+
+-- Update the copied function to remove references to `self`
+UPDATE FUNCTION "convert"
+FROM FILE "baseconv.py"
+REPLACE LINE "def convert(self, number, from_digits, to_digits, sign):"
+WITH CONTENT '''
+@0:def convert(number, from_digits, to_digits, sign):
+''';
+
+-- Update ALL call sites of the method `convert` to call the new top-level function with the same name
+UPDATE FUNCTION "encode"
+FROM FILE "baseconv.py"
+REPLACE LINE "neg, value = self.convert(i, self.decimal_digits, self.digits, \"-\")"
+WITH CONTENT '''
+@0:neg, value = convert(i, self.decimal_digits, self.digits, "-")
+''';
+UPDATE FUNCTION "decode"
+FROM FILE "baseconv.py"
+REPLACE LINE "neg, value = self.convert(s, self.digits, self.decimal_digits, self.sign)"
+WITH CONTENT '''
+@0:neg, value = convert(s, self.digits, self.decimal_digits, self.sign)
+''';
+
+```
+</no-train>
diff --git a/tests/corpus/refactor.baseconv/expected.baseconv.py b/tests/corpus/refactor.baseconv/expected.baseconv.py