neuralmagic · horheynm · Mar 15, 2024 · Mar 18, 2024 · Mar 20, 2024 · Mar 20, 2024
diff --git a/src/sparseml/transformers/finetune/data/base.py b/src/sparseml/transformers/finetune/data/base.py
@@ -41,6 +41,7 @@ class TextGenerationDataset(RegistryMixin):
     """
 
     PROMPT_KEY = "prompt"
+    MASK_KEY = "mask"
 
     def __init__(
         self,

diff --git a/src/sparseml/transformers/finetune/data/custom.py b/src/sparseml/transformers/finetune/data/custom.py
@@ -107,7 +107,7 @@ def get_remove_columns_from_dataset(
             remove_columns.remove(self.text_column)
         if self.PROMPT_KEY in remove_columns:
             remove_columns.remove(self.PROMPT_KEY)
-        if "mask" in remove_columns:
-            remove_columns.remove("mask")
+        if self.MASK_KEY in remove_columns:
+            remove_columns.remove(self.MASK_KEY)
 
         return list(remove_columns)
diff --git a/src/sparseml/transformers/utils/helpers.py b/src/sparseml/transformers/utils/helpers.py
@@ -559,47 +559,53 @@ def fetch_recipe_path(target: str):
     return recipe_path
 
 
-def generate_mask(string: str, prompt: str, censor: str) -> str:
+def generate_mask(string: str, response: str, prompt: Optional[str] = None) -> str:
     """
-    Generate a mask based on provided prompt and censor strings to obscure
-    characters in the input string.
+    Generate a mask based on provided prompt and response strings to obscure
+    characters in the input string. The prompt is masked and the response is
+    kept; in the mask, 0 means remove and 1 means keep.
+    By default, non-response wrapped strings will be matched with 0.
 
     Args:
     :param string: The input string to be masked.
-    :param prompt: The prompt string to identify characters to keep visible.
-    :param censor: The censor string to identify characters to obscure.
+    :param prompt: Optional prompt string to identify characters to obscure.
+    :param response: The response string to identify characters to keep visible.
 
     Returns:
         str: A string representing the mask where '1' indicates visible
         characters and '0' indicates obscured characters.
 
     """
+    if prompt is None:
+        prompt = ""
+
     mask = ["1"] * len(string)
     is_prompt = True
     counter = 0
     for i, char in enumerate(string):
-        if not is_prompt:
+        if is_prompt:
             mask[i] = "0"
 
         if counter > 0:
             if not is_prompt and char == prompt[counter]:
                 counter += 1
-            elif is_prompt and char == censor[counter]:
+            elif is_prompt and char == response[counter]:
                 counter += 1
             else:
                 counter = 0
 
-        if counter == len(prompt) and not is_prompt:
-            mask[i - counter + 1 : i + 1] = ["1"] * counter
+        if len(prompt) > 0 and counter == len(prompt) and not is_prompt:
+            mask[i - counter + 1 : i + 1] = ["0"] * counter
+
             counter = 0
             is_prompt = True
 
-        if counter == len(censor) and is_prompt:
-            mask[i - counter + 1 : i + 1] = ["0"] * counter
+        if counter == len(response) and is_prompt:
+            mask[i - counter + 1 : i + 1] = ["1"] * counter
+
             counter = 0
             is_prompt = False
 
-        if prompt.startswith(char) or censor.startswith(char):
+        if prompt.startswith(char) or response.startswith(char):
             counter = 1
-
     return "".join(mask)
diff --git a/tests/sparseml/transformers/utils/test_helpers.py b/tests/sparseml/transformers/utils/test_helpers.py
@@ -170,26 +170,43 @@ def test_save_zoo_directory(tmp_path, stub):
 
 
 @pytest.mark.parametrize(
-    "string, prompt, censor, expected_mask",
+    "string, response, prompt, expected_mask",
     [
-        ("[foo]hello\n\n[bar]world", "[foo]", "[bar]", "1111111111110000000000"),
+        (
+            ("[foo]hello\n\n" "[bar]world"),
+            "[bar]",
+            "[foo]",
+            ("000000000000" "1111111111"),
+        ),
         (
             (
                 "[Instruction]python is\n\n"  # 24
                 "[Response]great\n\n"  # 17
                 "[Instruction]What about Java"  # 28
                 "[Response]Meh"  # 13
             ),
-            "[Instruction]",
             "[Response]",
+            "[Instruction]",
             (
-                "111111111111111111111111"  # 24
-                "00000000000000000"  # 17
-                "1111111111111111111111111111"  # 28
-                "0000000000000"  # 13
+                "000000000000000000000000"  # 24
+                "11111111111111111"  # 17
+                "0000000000000000000000000000"  # 28
+                "1111111111111"  # 13
             ),
         ),
+        (
+            ("[foo]hello\n\n" "[bar]world"),
+            "[bar]",
+            None,
+            ("000000000000" "1111111111"),
+        ),
+        (
+            ("hello\n\n" "[bar]world"),
+            "[bar]",
+            None,
+            ("0000000" "1111111111"),
+        ),
     ],
 )
-def test_generate_mask(string, prompt, censor, expected_mask):
-    assert generate_mask(string, prompt, censor) == expected_mask
+def test_generate_mask(string, response, prompt, expected_mask):
+    assert generate_mask(string, response, prompt) == expected_mask