perf(automation): Enhance expandability (#71)

CaptchaAgent · Oct 30, 2023 · 5a65824 · 5a65824
1 parent 96b295e
commit 5a65824
Show file tree

Hide file tree

Showing 7 changed files with 143 additions and 12 deletions.
diff --git a/automation/assets_manager.py b/automation/assets_manager.py
@@ -110,7 +110,7 @@ def download_datasets(self, issue_url):
         return td
 
     def get_download_links(self, issue_url: str):
-        prefix = "https://github.com/captcha-challenger/hcaptcha-whistleblower/releases/download/automation-archive/"
+        prefix = "https://github.com/CaptchaAgent/hcaptcha-whistleblower/releases/download/automation-archive/"
 
         res = self.client.get(issue_url)
         soup = BeautifulSoup(res.text, "html.parser")
@@ -138,7 +138,13 @@ def merge(self, fd: Path, td: Path):
 
 def run():
     # the largest animal https://github.com/QIN2DIM/hcaptcha-challenger/issues/797
+    # red panda https://github.com/QIN2DIM/hcaptcha-challenger/issues/896
+    #
+    sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/797"
     sources = "the largest animal"
+    sources = "please click on the largest animal"
+    # sources = "the smallest animal"
+    # sources = "natural landscape"
     am = AssetsManager.from_sources(sources)
     am.execute()
 

diff --git a/automation/auto_labeling.py b/automation/auto_labeling.py
@@ -193,7 +193,7 @@ def run():
 
     for card in flow_card:
         # Filter out the task cards we care about
-        if "the_largest_animal" not in card["joined_dirs"]:
+        if "fff" not in card["joined_dirs"]:
             continue
         # Generating a dataclass from serialized data
         dl = DataLake(

diff --git a/automation/datasets_downloader.py b/automation/datasets_downloader.py
@@ -14,7 +14,7 @@
 collected = []
 per_times = 60
 tmp_dir = Path(__file__).parent.joinpath("tmp_dir")
-sitekey = SiteKey.epic
+sitekey = SiteKey.discord
 
 
 async def collete_datasets(context: ASyncContext):

diff --git a/automation/flow_card.py b/automation/flow_card.py
@@ -48,4 +48,32 @@
             "nested_largest_fox": {"yes": ["fox"], "bad": ["crab", "bird", "dragonfly", "ant"]},
         },
     },
+    {
+        "positive_labels": ["red panda"],
+        "negative_labels": ["cactus", "door", "guinea pig", "meerkat"],
+        "joined_dirs": ["red_panda"],
+    },
+    {
+        "positive_labels": ["tiger", "squirrel"],
+        "negative_labels": ["dog", "bat", "raccoon", "ant", "ladybug"],
+        "joined_dirs": ["please_click_on_the_largest_animal", "fff"],
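+        # = ↑↑ = As in the regular case, first run multi-class classification over the whole dataset
+        # = ↓↓ = then move the dataset a second time according to the specific yes/bad mapping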
+        "substack": {
+            "nested_largest_tiger": {"yes": ["tiger"], "bad": ["dog", "bat", "raccoon"]},
+            "nested_largest_squirrel": {"yes": ["squirrel"], "bad": ["ant", "ladybug"]},
+        },
+    },
+    {
+        "positive_labels": ["natural landscape", "Mountain", "forest"],
+        "negative_labels": [
+            "chess",
+            "laptop",
+            "helicopter",
+            "meerkat",
+            "roller coaster",
+            "Recreational facilities",
+        ],
+        "joined_dirs": ["natural_landscape"],
+    },
 ]
diff --git a/automation/mini_workflow.py b/automation/mini_workflow.py
@@ -92,8 +92,7 @@ def upgrade_objects(aid_):
         # "nested_smallest_turtle": "nested_smallest_turtle2309",
         # "nested_largest_dog": "nested_largest_dog2309",
         # "bicycle": "bicycle2309",
-        "nested_largest_fox": "nested_largest_fox2309",
-
+        # "nested_largest_fox": "nested_largest_fox2309",
     }
     # fmt:on
 

diff --git a/automation/roboflow_resnet.ipynb b/automation/roboflow_resnet.ipynb
@@ -87,6 +87,7 @@
     "import hcaptcha_challenger as solver\n",
     "\n",
     "os.environ[\"GITHUB_TOKEN\"] = GITHUB_TOKEN\n",
+    "onnx_archive_name = onnx_archive_name.replace(\".onnx\", \"\")\n",
     "\n",
     "solver.diagnose_task(task_name)"
    ],

diff --git a/automation/zip_dataset.py b/automation/zip_dataset.py
@@ -7,16 +7,26 @@
 
 import os
 import shutil
-import webbrowser
 import zipfile
 from pathlib import Path
 
-import hcaptcha_challenger as solver
+from hcaptcha_challenger import prompt2task, ModelHub, diagnose_task, install
+
+# Text block printed by print_quick_start_info(): four assignments framed by
+# marker lines, with the brace fields filled in through str.format().
+# Presumably meant to be pasted into the notebook's configuration cell — confirm.
+CELL_TEMPLATE = """
+>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+GITHUB_TOKEN = "{github_token}"
+task_name = "{task_name}"
+onnx_archive_name = "{onnx_archive_name}"
+NESTED_PROMPT = "{nested_prompt}"
+<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+"""
+
+# Colab URL that run() opens in the browser after printing the template.
+# NOTE(review): the URL uses the `captcha-challenger` owner; confirm it still
+# resolves if the repository has moved to a different owner.
+NOTEBOOK = "https://colab.research.google.com/github/captcha-challenger/hcaptcha-model-factory/blob/main/automation/roboflow_resnet.ipynb"
 
 
 def zip_dataset(prompt: str):
     prompt = prompt.replace("_", " ")
-    task_name = solver.prompt2task(prompt)
+    task_name = prompt2task(prompt)
 
     project_dir = Path(__file__).parent.parent
     images_dir = project_dir.joinpath("database2309", task_name)
@@ -40,9 +50,96 @@ def zip_dataset(prompt: str):
                     zip_file.write(os.path.join(root, file), f"bad/{file}")
 
     print(f">> OUTPUT - {zip_path=}")
+    return task_name
+
+
+def print_quick_start_info(task_name: str, nested_prompt: str = ""):
+    """Print a filled-in CELL_TEMPLATE for the given task.
+
+    The archive name written into the template is ``task_name`` followed by a
+    version number: one more than the highest version already registered in
+    the model hub for this task, or ``2309`` when no version can be found.
+
+    Args:
+        task_name: Task identifier, like ``natural_landscape`` or
+            ``nested_largest_tiger``.
+        nested_prompt: The prompt a nested task is paired with. Required when
+            ``task_name`` starts with ``nested_``.
+
+    Raises:
+        ValueError: ``task_name`` is a nested task but ``nested_prompt`` is empty.
+        TypeError: The hub's model mapping for ``nested_prompt`` is not a list.
+    """
+    diagnose_task(task_name)
+
+    if task_name.startswith("nested_") and not nested_prompt:
+        raise ValueError("生成嵌套类型模版需要提供其配对的提示词")
+
+    install(upgrade=True)
+    modelhub = ModelHub.from_github_repo()
+    modelhub.parse_objects()
+
+    # Every version number already published for this task.
+    known_versions = []
+
+    if not nested_prompt:
+        label = task_name.replace("_", " ")
+        if onnx_archive := modelhub.label_alias.get(label):
+            stem = onnx_archive.replace(".onnx", "")
+            # The version is the run of digits at the end of the archive stem.
+            digits = stem[len(stem.rstrip("0123456789")):]
+            if digits:
+                known_versions.append(int(digits))
+    else:
+        nested_models = modelhub.nested_categories.get(nested_prompt, [])
+        for model_name in nested_models:
+            print(f"{nested_prompt} => {model_name}")
+
+        if nested_models and not isinstance(nested_models, list):
+            raise TypeError(
+                f"NestedTypeError ({nested_prompt}) 的模型映射列表应该是个 List[str] 类型，但实际上是 {type(nested_models)}"
+            )
+
+        for model_name in nested_models:
+            stem = model_name.replace(".onnx", "")
+            if not stem.startswith(task_name):
+                # Model of another task that shares the same prompt.
+                continue
+            suffix = stem[len(task_name):]
+            if suffix.isdecimal():
+                known_versions.append(int(suffix))
+
+    # Fall back to the initial version whenever nothing usable was found, so
+    # the template never carries an empty archive name.
+    if known_versions:
+        onnx_archive_name = f"{task_name}{max(known_versions) + 1}"
+    else:
+        onnx_archive_name = f"{task_name}2309"
+
+    _t = CELL_TEMPLATE.format(
+        github_token=os.getenv("GITHUB_TOKEN", ""),
+        task_name=task_name,
+        onnx_archive_name=onnx_archive_name,
+        nested_prompt=nested_prompt,
+    )
+
+    print(_t)
+
+
+def run():
+    """Zip one task's dataset, print its template, then open the notebook."""
+    import webbrowser
+
+    dataset_prompt = "nested_largest_squirrel"
+
+    # A nested task must be given the prompt it is paired with, e.g.
+    #   the smallest animal
+    #   please click on the largest animal
+    paired_prompt = "please click on the largest animal"
+
+    # Compress the dataset; zip_dataset hands back the resolved task name.
+    resolved_task = zip_dataset(prompt=dataset_prompt)
+
+    # Print the configuration template.
+    print_quick_start_info(task_name=resolved_task, nested_prompt=paired_prompt)
+
+    webbrowser.open(NOTEBOOK)
 
 
-zip_dataset(prompt="nested_largest_dog")
-webbrowser.open(
-    "https://colab.research.google.com/github/captcha-challenger/hcaptcha-model-factory/blob/main/automation/roboflow_resnet.ipynb"
-)
+if __name__ == "__main__":
+    run()