ffe

CaptchaAgent · Nov 2, 2023 · fa3358a · fa3358a
1 parent 92e22d4
commit fa3358a
Show file tree

Hide file tree

Showing 7 changed files with 203 additions and 97 deletions.
diff --git a/automation/annotator.py b/automation/annotator.py
@@ -49,7 +49,7 @@ def from_modelhub(cls, modelhub: ModelHub):
     def to_yaml(self, path: Path | None = None):
         path = path or Path("objects-tmp.yaml")
         with open(path, "w", encoding="utf8") as file:
-            yaml.safe_dump(self.__dict__, file, sort_keys=False, allow_unicode=True)
+            yaml.safe_dump(self.__dict__, file, sort_keys=False, allow_unicode=True,)
         return path
 
     @staticmethod
@@ -209,5 +209,21 @@ def rolling_upgrade(asset_id=None, matched_label: str = ""):
         logger.warning(err)
 
 
+def find_asset_id(name_prefix: str):
+    """Print the name and id of release assets whose name starts with ``name_prefix``.
+
+    If a workflow is interrupted before the rolling upgrade, this helper lets
+    you recover a resource's asset_id from the model-name prefix.
+
+    Every release titled "ONNX ModelHub" is scanned; within each such release
+    only the first matching asset is printed. Nothing is returned.
+    """
+    target_title = "ONNX ModelHub"
+
+    for release in Annotator.repo.get_releases():
+        if release.title == target_title:
+            # Stop at the first asset with the requested prefix in this release.
+            hit = next(
+                (item for item in release.get_assets() if item.name.startswith(name_prefix)), None
+            )
+            if hit is not None:
+                print(hit.name, hit.id)
+
+
 if __name__ == "__main__":
+    find_asset_id("nested_smallest_bird2310")
     rolling_upgrade()
diff --git a/automation/assets_manager.py b/automation/assets_manager.py
@@ -139,11 +139,15 @@ def merge(self, fd: Path, td: Path):
 def run():
     # the largest animal https://github.com/QIN2DIM/hcaptcha-challenger/issues/797
     # red panda https://github.com/QIN2DIM/hcaptcha-challenger/issues/896
-    sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/826"  # the smallest animal
+    # sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/826"  # the smallest animal
+    sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/896"  # red panda
+    sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/897"  # keyboard
     # sources = "the largest animal"
     # sources = "please click on the largest animal"
     sources = "the smallest animal"
     # sources = "natural landscape"
+    # sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/857"  # flamingo
+    # sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/853"
     am = AssetsManager.from_sources(sources)
     am.execute()
 

diff --git a/automation/auto_labeling.py b/automation/auto_labeling.py
@@ -12,18 +12,14 @@
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
-from typing import Tuple, List, Dict
+from typing import Tuple, List, Dict, NoReturn
 
 import hcaptcha_challenger as solver
 from PIL import Image
 from hcaptcha_challenger import DataLake, ModelHub, ZeroShotImageClassifier, register_pipline
 from tqdm import tqdm
 
-from flow_card import flow_card
-
-logging.basicConfig(
-    level=logging.INFO, stream=sys.stdout, format="%(asctime)s - %(levelname)s - %(message)s"
-)
+from flow_card import flow_card, flow_card_nested_animal
 
 solver.install(upgrade=True)
 
@@ -181,8 +177,45 @@ def execute(self, model, substack: Dict[str, Dict[str, List[str]]] = None, **kwa
                 stk.transform(base_dir=self.output_dir)
 
 
-def run():
-    images_dir = Path(__file__).parent.parent.joinpath("database2309")
+def check_card(pending_card: list) -> NoReturn | bool:
+    """Validate task cards before a labeling run.
+
+    Each card must provide ``positive_labels``, ``negative_labels`` and
+    ``joined_dirs`` as non-empty lists of strings longer than one character.
+    ``negative_labels`` must additionally hold at least two entries.
+
+    All problems are logged before failing, so a single run reports every
+    broken card instead of stopping at the first one.
+
+    :param pending_card: list of card dicts to validate.
+    :return: ``True`` when every card is well-formed.
+    :raises ValueError: when at least one card is malformed.
+    """
+    require_keys = ["positive_labels", "negative_labels", "joined_dirs"]
+    ok = True
+    for i, card in enumerate(pending_card):
+        for rk in require_keys:
+            if rk not in card or not isinstance(card[rk], list):
+                logging.error(f"card 缺少必要的键值对 - {i=} key={rk} {card=}")
+                ok = False
+                # There is no list to inspect; iterating here used to crash
+                # with KeyError/TypeError instead of reporting the problem.
+                continue
+
+            if len(card[rk]) == 0:
+                logging.error(f"card 的必要信息不能为空 - {i=} key={rk} {require_keys=}")
+                ok = False
+            elif rk == "negative_labels" and len(card[rk]) < 2:
+                # Only the negative side needs two or more candidates: cards
+                # legitimately carry a single positive label and a single
+                # joined dir, and must not be rejected for it.
+                logging.error(f"CLIP 模型在二分类任务上的准确率可能会比较糟糕 - {i=} ")
+                ok = False
+
+            for label in card[rk]:
+                if not isinstance(label, str):
+                    logging.error(
+                        f"card 的 require_keys 的值必须是 List[str] 类型 - {i=} {label=} {type(label)=}"
+                    )
+                    ok = False
+                    # len() is meaningless (or raises) for non-string labels.
+                    continue
+                if len(label) <= 1:
+                    logging.error(f"这可能是一个异常输入 - {i=} {label=}")
+                    ok = False
+
+    if not ok:
+        raise ValueError("card 存在致命的写法问题，请逐一排查后再启动程序！")
+    return True
+
+
+def run(suffix_filter: str, cards: list, base_dirname: str = "database2309"):
+    if not suffix_filter:
+        return
+
+    logging.info("正在检查运行配置")
+    check_card(cards)
+    logging.info("运行配置准确无误！")
+
+    images_dir = Path(__file__).parent.parent.joinpath(base_dirname)
 
     modelhub = ModelHub.from_github_repo()
     modelhub.parse_objects()
@@ -191,9 +224,9 @@ def run():
     # the NVIDIA graphics card is available
     model = register_pipline(modelhub, fmt="transformers")
 
-    for card in flow_card:
+    for card in cards:
         # Filter out the task cards we care about
-        if "f1-bird" not in card["joined_dirs"]:
+        if suffix_filter not in card["joined_dirs"]:
             continue
         # Generating a dataclass from serialized data
         dl = DataLake(
@@ -210,4 +243,7 @@ def run():
 
 
 if __name__ == "__main__":
-    run()
+    logging.info(f"Loading {len(flow_card)=}")
+    logging.info(f"Loading {len(flow_card_nested_animal)=}")
+
+    run("s1_bird", cards=flow_card_nested_animal)
diff --git a/automation/check_yolo_det_model.py b/automation/check_yolo_det_model.py
@@ -10,58 +10,95 @@
 import sys
 from pathlib import Path
 
+import cv2
 import onnxruntime
 from hcaptcha_challenger import install, YOLOv8
-from hcaptcha_challenger.components.yolo_mocker import CcYOLO
 from hcaptcha_challenger.onnx.modelhub import request_resource
+from tqdm import tqdm
 
 install(upgrade=True)
+model_url = "https://github.com/QIN2DIM/hcaptcha-challenger/releases/download/model/"
 
+this_dir = Path(__file__).parent
+models_dir = this_dir.joinpath("tmp_models")
+models_dir.mkdir(exist_ok=True)
 
-class CbYOLO(CcYOLO):
-    def __init__(self, model_name: str, images_absolute_dir: Path, this_dir: Path, classes=None):
-        super().__init__(model_name, images_absolute_dir, this_dir)
-        self.classes = classes
-
-    def get_model(self) -> YOLOv8 | None:
-        classes = self.modelhub.ashes_of_war.get(self.model_name)
-        if not classes:
-            if not self.classes:
-                raise AttributeError(f"Model name not found - {self.model_name=}")
-            print(f">> Match model - {self.model_name=}")
-            model_path = Path(self.model_name)
-            if not model_path.exists():
-                request_resource(self.model_url + self.model_name, model_path)
-            try:
-                session = onnxruntime.InferenceSession(
-                    model_path, providers=onnxruntime.get_available_providers()
-                )
-                detector = YOLOv8.from_pluggable_model(session, self.classes)
-            except Exception as err:
-                print(err)
-                shutil.rmtree(model_path, ignore_errors=True)
-            else:
-                return detector
+
+def load_model(model_path, classes):
+    """Create a YOLOv8 detector backed by an ONNX Runtime session.
+
+    :param model_path: location of the ONNX model file to load.
+    :param classes: class names handed to ``YOLOv8.from_pluggable_model``.
+    :return: the detector built from the session and ``classes``.
+    """
+    # Offer every execution provider that this onnxruntime build reports.
+    providers = onnxruntime.get_available_providers()
+    session = onnxruntime.InferenceSession(model_path, providers=providers)
+    return YOLOv8.from_pluggable_model(session, classes)
+
+
+def execute(input_dir: Path | str, model_name: str, model_path: Path, classes: list):
+    """Run a detection model over a directory of images and save annotated copies.
+
+    The model is downloaded to ``model_path`` when it does not exist yet.
+    Images with at least one detection are written to
+    ``<this_dir>/yolo_mocker/<input_dir.name>`` with one rectangle drawn on
+    them; images without detections are copied unchanged into the ``miss``
+    sub-directory. Images that already have an output file are skipped.
+
+    :param input_dir: directory holding the images to process.
+    :param model_name: file name of the model, appended to ``model_url``.
+    :param model_path: local path of the model file.
+    :param classes: class names passed on to ``load_model``.
+    :return: the directory that received the annotated images.
+    """
+
+    def draw():
+        # Results are ordered ascending by their last field (presumably the
+        # confidence score), so the final entry is the one that gets drawn.
+        _, top_left, bottom_right, _ = sorted(results, key=lambda x: x[-1])[-1]
+        canvas = cv2.imread(str(image_path))
+        cv2.rectangle(
+            canvas,
+            (int(top_left[0]), int(top_left[1])),
+            (int(bottom_right[0]), int(bottom_right[1])),
+            (87, 241, 126),
+            2,
+        )
+        cv2.imwrite(str(output_path), canvas)
+
+    input_dir = Path(input_dir).absolute()
+
+    # Fetch the model on first use only.
+    if not model_path.exists():
+        request_resource(model_url + model_name, model_path)
+    detector = load_model(model_path, classes)
+
+    output_dir = this_dir.joinpath("yolo_mocker", input_dir.name)
+    output_miss_dir = output_dir.joinpath("miss")
+    for folder in (output_dir, output_miss_dir):
+        folder.mkdir(parents=True, exist_ok=True)
+
+    # Regular files only, and only those without an existing output file.
+    pending_image_paths = [
+        input_dir.joinpath(name)
+        for name in os.listdir(input_dir)
+        if input_dir.joinpath(name).is_file() and not output_dir.joinpath(name).exists()
+    ]
+
+    total = len(pending_image_paths)
+    handle = 0
+    miss = 0
+
+    with tqdm(total=total, desc="Labeling | ") as progress:
+        for image_path in pending_image_paths:
+            results = detector(image_path, shape_type="bounding_box")
+            progress.update(1)
+            if results:
+                output_path = output_dir.joinpath(image_path.name)
+                draw()
+                handle += 1
+            else:
+                # Keep an untouched copy of every image the model missed.
+                shutil.copyfile(image_path, output_miss_dir.joinpath(image_path.name))
+                miss += 1
+    print(f">> Statistic - {total=} {handle=} {miss=}")
+
+    return output_dir
 
 
 def run():
+    images_dir = r"zip_dir/click_on_the_turtle_s_head_default"
+
     # model_name = "burl_head_of_the_lion_2309_yolov8s.onnx"
     model_name = "head_of_the_animal_turtle_2309_yolov8s.onnx"
+    # model_name = "head_of_the_meerkat_2311_yolov8n.onnx"
     classes = ["animal-head"]
-    images_dir = r"zip_dir/click_on_the_turtle_s_head_default"
-
-    this_dir = Path(__file__).parent
-    output_dir = this_dir.joinpath("yolo_mocker")
-
-    if isinstance(images_dir, str):
-        images_dir = Path(images_dir)
-    images_dir = images_dir.absolute()
 
-    ccy = CbYOLO(model_name, images_dir, output_dir, classes)
-    ccy.spawn()
+    output_dir = execute(
+        input_dir=images_dir,
+        model_name=model_name,
+        model_path=models_dir.joinpath(model_name),
+        classes=classes,
+    )
 
     if "win32" in sys.platform:
-        os.startfile(ccy.output_dir)
+        os.startfile(output_dir)
 
 
 if __name__ == "__main__":

diff --git a/automation/datasets_downloader.py b/automation/datasets_downloader.py
@@ -14,7 +14,7 @@
 collected = []
 per_times = 60
 tmp_dir = Path(__file__).parent.joinpath("tmp_dir")
-sitekey = SiteKey.discord
+sitekey = SiteKey.epic
 
 
 async def collete_datasets(context: ASyncContext):

diff --git a/automation/flow_card.py b/automation/flow_card.py
@@ -4,6 +4,12 @@
 # GitHub     : https://github.com/QIN2DIM
 # Description:
 # Run `assets_manager.py` to get test data from GitHub issues
+import logging
+import sys
+
+logging.basicConfig(
+    level=logging.INFO, stream=sys.stdout, format="%(asctime)s - %(levelname)s - %(message)s"
+)
 
 flow_card = [
     {
@@ -26,44 +32,11 @@
         "negative_labels": ["bicycle", "off-road vehicle"],
         "joined_dirs": ["sedan_car"],
     },
-    {
-        "positive_labels": ["turtle"],
-        "negative_labels": ["horse", "bear", "giraffe", "dolphins"],
-        "joined_dirs": ["please_click_on_the_smallest_animal", "nested_smallest_turtle"],
-    },
-    {
-        "positive_labels": ["dog"],
-        "negative_labels": ["frog", "hedgehog", "squirrel", "hummingbird"],
-        "joined_dirs": ["please_click_on_the_largest_animal", "nested_largest_dog"],
-    },
-    # multi classification for nested prompt
-    {
-        "positive_labels": ["dog", "fox"],
-        "negative_labels": ["crab", "bird", "dragonfly", "ant"],
-        "joined_dirs": ["the_largest_animal"],
-        # = ↑↑ = 和常规情况一样，先对整体数据集进行多目标分类
-        # = ↓↓ = 再根据具体的 yes/bad 映射关系进行数据集二次移动
-        "substack": {
-            "nested_largest_dog": {"yes": ["dog"], "bad": ["crab", "bird", "dragonfly", "ant"]},
-            "nested_largest_fox": {"yes": ["fox"], "bad": ["crab", "bird", "dragonfly", "ant"]},
-        },
-    },
     {
         "positive_labels": ["red panda"],
-        "negative_labels": ["cactus", "door", "guinea pig", "meerkat"],
+        "negative_labels": ["cactus", "door", "guinea pig", "meerkat", "bird"],
         "joined_dirs": ["red_panda"],
     },
-    {
-        "positive_labels": ["tiger", "squirrel"],
-        "negative_labels": ["dog", "bat", "raccoon", "ant", "ladybug"],
-        "joined_dirs": ["please_click_on_the_largest_animal", "fff"],
-        # = ↑↑ = 和常规情况一样，先对整体数据集进行多目标分类
-        # = ↓↓ = 再根据具体的 yes/bad 映射关系进行数据集二次移动
-        "substack": {
-            "nested_largest_tiger": {"yes": ["tiger"], "bad": ["dog", "bat", "raccoon"]},
-            "nested_largest_squirrel": {"yes": ["squirrel"], "bad": ["ant", "ladybug"]},
-        },
-    },
     {
         "positive_labels": ["natural landscape", "Mountain", "forest"],
         "negative_labels": [
@@ -77,18 +50,50 @@
         "joined_dirs": ["natural_landscape"],
     },
     {
-        "positive_labels": ["starfish"],
-        "negative_labels": ["panda", "dog", "cow", "elephant", "guinea pig", "dolphins",
-                            "bird", "goat", "lion", "bear", ""],
-        "joined_dirs": ["the_smallest_animal", "f1-star"],
+        "positive_labels": ["keyboard"],
+        "negative_labels": ["panda", "goat", "headphones", "bird", "trunk"],
+        "joined_dirs": ["keyboard"],
+    },
+]
+
+flow_card_nested_animal = [
+    {
+        "positive_labels": ["panda"],
+        "negative_labels": ["raccoon", "dog", "meerkat", "koala"],
+        "joined_dirs": ["the_largest_animal", "l1_panda"],
+        "substack": {
+            "nested_largest_panda": {
+                "yes": ["panda"],
+                "bad": ["raccoon", "dog", "meerkat", "koala"],
+            }
+        },
+    },
+    {
+        "positive_labels": ["horse"],
+        "negative_labels": ["elephant", "whale"],
+        "joined_dirs": ["the_smallest_animal", "s1_horse"],
+        "substack": {"nested_smallest_horse": {"yes": ["horse"], "bad": ["elephant", "whale"]}},
     },
     {
         "positive_labels": ["bird"],
-        "negative_labels": ["panda", "dog", "cow", "dolphins", "goat", "lion", "bear", "giraffe"],
-        "joined_dirs": ["the_smallest_animal", "f1-bird"],
+        "negative_labels": ["ladybug", "butterfly", "dragonfly", "bees", "crab", "frog", "ant"],
+        "joined_dirs": ["the_largest_animal", "l1_bird"],
         "substack": {
-            "nested_smallest_bird": {"yes": ["bird"], "bad": ["panda", "dog", "cow", "dolphins", "goat", "lion", "bear", "giraffe"]},
+            "nested_largest_bird": {
+                "yes": ["bird"],
+                "bad": ["ladybug", "butterfly", "dragonfly", "bees", "crab", "frog", "ant"],
+            }
+        },
+    },
+    {
+        "positive_labels": ["bird"],
+        "negative_labels": ["panda", "giraffe", "dolphins", "lion"],
+        "joined_dirs": ["the_smallest_animal", "s1_bird"],
+        "substack": {
+            "nested_smallest_bird": {
+                "yes": ["bird"],
+                "bad": ["panda", "giraffe", "dolphins", "lion"],
+            }
         },
     },
-
 ]