fix(zip-dataset): handle nested model (#72)

CaptchaAgent · Oct 31, 2023 · 9e0b003 · 9e0b003
1 parent c26f6ea
commit 9e0b003
Show file tree

Hide file tree

Showing 4 changed files with 85 additions and 56 deletions.
diff --git a/automation/assets_manager.py b/automation/assets_manager.py
@@ -110,7 +110,7 @@ def download_datasets(self, issue_url):
         return td
 
     def get_download_links(self, issue_url: str):
-        prefix = "https://github.com/CaptchaAgent/hcaptcha-whistleblower/releases/download/automation-archive/"
+        prefix = "hcaptcha-whistleblower/releases/download/automation-archive/"
 
         res = self.client.get(issue_url)
         soup = BeautifulSoup(res.text, "html.parser")
@@ -139,11 +139,10 @@ def merge(self, fd: Path, td: Path):
 def run():
     # the largest animal https://github.com/QIN2DIM/hcaptcha-challenger/issues/797
     # red panda https://github.com/QIN2DIM/hcaptcha-challenger/issues/896
-    #
-    sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/797"
-    sources = "the largest animal"
-    sources = "please click on the largest animal"
-    # sources = "the smallest animal"
+    sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/826"  # the smallest animal
+    # sources = "the largest animal"
+    # sources = "please click on the largest animal"
+    sources = "the smallest animal"
     # sources = "natural landscape"
     am = AssetsManager.from_sources(sources)
     am.execute()

diff --git a/automation/auto_labeling.py b/automation/auto_labeling.py
@@ -193,7 +193,7 @@ def run():
 
     for card in flow_card:
         # Filter out the task cards we care about
-        if "fff" not in card["joined_dirs"]:
+        if "f1-bird" not in card["joined_dirs"]:
             continue
         # Generating a dataclass from serialized data
         dl = DataLake(

diff --git a/automation/flow_card.py b/automation/flow_card.py
@@ -76,4 +76,19 @@
         ],
         "joined_dirs": ["natural_landscape"],
     },
+    {
+        "positive_labels": ["starfish"],
+        "negative_labels": ["panda", "dog", "cow", "elephant", "guinea pig", "dolphins",
+                            "bird", "goat", "lion", "bear", ""],
+        "joined_dirs": ["the_smallest_animal", "f1-star"],
+    },
+    {
+        "positive_labels": ["bird"],
+        "negative_labels": ["panda", "dog", "cow", "dolphins", "goat", "lion", "bear", "giraffe"],
+        "joined_dirs": ["the_smallest_animal", "f1-bird"],
+        "substack": {
+            "nested_smallest_bird": {"yes": ["bird"], "bad": ["panda", "dog", "cow", "dolphins", "goat", "lion", "bear", "giraffe"]},
+        },
+    },
+
 ]
diff --git a/automation/zip_dataset.py b/automation/zip_dataset.py
@@ -53,64 +53,80 @@ def zip_dataset(prompt: str):
     return task_name
 
 
+def parse_stander_model(modelhub, task_name):
+    """Work out the next archive name for a standard (non-nested) model.
+
+    The task name, with underscores turned into spaces, is looked up in
+    ``modelhub.label_alias``. When the matching archive name (with
+    ``.onnx`` removed) ends in a run of digits, that number is incremented
+    by one and appended to ``task_name``.
+
+    Returns an empty string when there is no alias for the label or the
+    archive name carries no trailing digits.
+    """
+    archive = modelhub.label_alias.get(task_name.replace("_", " "))
+    if not archive:
+        return ""
+
+    stem = archive.replace(".onnx", "")
+
+    # Walk back from the end to find where the trailing digit run begins.
+    cut = len(stem)
+    while cut > 0 and stem[cut - 1].isdigit():
+        cut -= 1
+    version = stem[cut:]
+
+    if not version:
+        return ""
+    return f"{task_name}{int(version) + 1}"
+
+
+def parse_nested_model(modelhub, task_name, nested_prompt):
+    """Derive the next archive name for a nested model.
+
+    Looks up ``nested_prompt`` in ``modelhub.nested_categories`` and picks the
+    first listed model whose name is purely numeric once ``".onnx"`` and
+    ``task_name`` have been removed from it. That number plus one is
+    appended to ``task_name``.
+
+    Args:
+        modelhub: Object exposing a ``nested_categories`` mapping from prompt
+            to a list of model file names.
+        task_name: Task identifier, used as the archive-name prefix.
+        nested_prompt: Prompt paired with the nested task.
+
+    Returns:
+        The next archive name, or an empty string when the prompt has no
+        models or none of them carries a version for ``task_name``.
+
+    Raises:
+        TypeError: If the mapping for ``nested_prompt`` is non-empty but is
+            not a list.
+    """
+    nested_models = modelhub.nested_categories.get(nested_prompt, [])
+    if not nested_models:
+        return ""
+
+    # Validate before touching the value: iterating a str here would walk it
+    # character by character and hide the real problem.
+    if not isinstance(nested_models, list):
+        raise TypeError(
+            f"NestedTypeError ({nested_prompt}) 的模型映射列表应该是个 List[str] 类型，但实际上是 {type(nested_models)}"
+        )
+
+    # Show every model currently registered under this prompt.
+    for model_name in nested_models:
+        print(f"{nested_prompt} => {model_name}")
+
+    for model_name in nested_models:
+        version = model_name.replace(".onnx", "").replace(task_name, "")
+        # Only the first entry that reduces to a bare number is considered.
+        if version.isdigit():
+            return f"{task_name}{int(version) + 1}"
+
+    return ""
+
+
 def print_quick_start_info(task_name: str, nested_prompt: str = ""):
     """
     task_name: like natural_landscape, nested_largest_tiger
     """
     diagnose_task(task_name)
-
     if task_name.startswith("nested_") and not nested_prompt:
         raise ValueError("生成嵌套类型模版需要提供其配对的提示词")
 
     install(upgrade=True)
     modelhub = ModelHub.from_github_repo()
     modelhub.parse_objects()
 
-    label = task_name.replace("_", " ")
-
-    onnx_archive_name = ""
-
+    # 常规模型
     if not nested_prompt:
-        if onnx_archive := modelhub.label_alias.get(label):
-            oan = onnx_archive.replace(".onnx", "")
-            v = ""
-            for char in reversed(oan):
-                if char.isdigit():
-                    v = char + v
-                else:
-                    break
-            if v and v.isdigit():
-                v = int(v) + 1
-                onnx_archive_name = f"{task_name}{str(v)}"
-        else:
-            onnx_archive_name = f"{task_name}2309"
-
+        onnx_archive_name = parse_stander_model(modelhub, task_name)
+    # 嵌套模型
     else:
-        for i in modelhub.nested_categories.get(nested_prompt, []):
-            print(f"{nested_prompt} => {i}")
-
-        if nested_models := modelhub.nested_categories.get(nested_prompt, []):
-            if not isinstance(nested_models, list):
-                if nested_models:
-                    raise TypeError(
-                        f"NestedTypeError ({nested_prompt}) 的模型映射列表应该是个 List[str] 类型，但实际上是 {type(nested_models)}"
-                    )
-                nested_models = []
-            v = ""
-            for i, model_name in enumerate(nested_models):
-                filter_chars = [".onnx", task_name]
-                for fc in filter_chars:
-                    model_name = model_name.replace(fc, "")
-                if not model_name.isdigit():
-                    continue
-                else:
-                    v = model_name
-                    break
-            if v and v.isdigit():
-                v = int(v) + 1
-                onnx_archive_name = f"{task_name}{str(v)}"
-        else:
-            onnx_archive_name = f"{task_name}2309"
+        onnx_archive_name = parse_nested_model(modelhub, task_name, nested_prompt)
+
+    if not onnx_archive_name:
+        onnx_archive_name = f"{task_name}2309"
+    onnx_archive_name = onnx_archive_name.replace(".onnx", "")
 
     _t = CELL_TEMPLATE.format(
         github_token=os.getenv("GITHUB_TOKEN", ""),
@@ -123,22 +139,21 @@ def print_quick_start_info(task_name: str, nested_prompt: str = ""):
 
 
 def run():
-    prompt = "nested_largest_squirrel"
+    prompt = "nested_smallest_bird"
 
     # 生成嵌套类型模版需要提供其配对的提示词
     # the smallest animal
     # please click on the largest animal
-    nested_prompt = "please click on the largest animal"
+    # the largest animal
+    nested_prompt = "the smallest animal"
 
     # 压缩数据集
     tn = zip_dataset(prompt=prompt)
 
     # 打印配置模版
     print_quick_start_info(task_name=tn, nested_prompt=nested_prompt)
 
-    import webbrowser
-
-    webbrowser.open(NOTEBOOK)
+    print(f"Open In Colab -> {NOTEBOOK}")
 
 
 if __name__ == "__main__":