feat(auto-labeling): positive labels group (#68)

CaptchaAgent · Oct 23, 2023 · 0b3a8d5 · 0b3a8d5
1 parent ba31efe
commit 0b3a8d5
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 36 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+*.png *.pt binary
+*.yaml text eol=crlf
diff --git a/automation/assets_manager.py b/automation/assets_manager.py
@@ -137,7 +137,7 @@ def merge(self, fd: Path, td: Path):
 
 
 def run():
-    sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/855"
+    sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/851"
     am = AssetsManager.from_sources(sources)
     am.execute()
 

diff --git a/automation/auto_labeling.py b/automation/auto_labeling.py
@@ -24,12 +24,13 @@
 
 @dataclass
 class AutoLabeling:
-    positive_label: str = field(default=str)
+    positive_labels: List[str] = field(default_factory=list)
     candidate_labels: List[str] = field(default_factory=list)
     images_dir: Path = field(default=Path)
     pending_tasks: List[Path] = field(default_factory=list)
 
-    checkpoint = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
+    checkpoint = "laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K"
+    # checkpoint = "QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_224to336"
 
     output_dir: Path = None
 
@@ -45,7 +46,7 @@ def load_zero_shot_model(self):
         return detector
 
     @classmethod
-    def from_prompt(cls, positive_label: str, candidate_labels: List[str], images_dir: Path):
+    def from_prompt(cls, positive_labels: List[str], candidate_labels: List[str], images_dir: Path):
         images_dir.mkdir(parents=True, exist_ok=True)
 
         pending_tasks: List[Path] = []
@@ -55,7 +56,7 @@ def from_prompt(cls, positive_label: str, candidate_labels: List[str], images_di
                 pending_tasks.append(image_path)
 
         return cls(
-            positive_label=positive_label,
+            positive_labels=positive_labels,
             candidate_labels=candidate_labels,
             images_dir=images_dir,
             pending_tasks=pending_tasks,
@@ -109,7 +110,7 @@ def execute(self, limit: int | str = None):
 
                 # Move positive cases to yes/
                 # Move negative cases to bad/
-                if predictions[0]["label"] == self.positive_label:
+                if predictions[0]["label"] in self.positive_labels:
                     output_path = yes_dir.joinpath(image_path.name)
                 else:
                     output_path = bad_dir.joinpath(image_path.name)
@@ -120,24 +121,31 @@ def execute(self, limit: int | str = None):
 
 @dataclass
 class DataGroup:
-    positive: str
+    positive_labels: List[str] | str
     joined_dirs: List[str]
     negative_labels: List[str]
 
     def __post_init__(self):
-        self.positive = self.positive.replace("_", " ")
+        if isinstance(self.positive_labels, str):
+            self.positive_labels = [self.positive_labels]
 
     @property
     def input_dir(self):
         return db_dir.joinpath(*self.joined_dirs).absolute()
 
     def auto_labeling(self, **kwargs):
-        positive_label = split_prompt_message(label_cleaning(self.positive), "en")
-        candidate_labels = [positive_label]
+        pls = []
+        for pl in self.positive_labels:
+            pl = pl.replace("_", " ")
+            pl = split_prompt_message(label_cleaning(pl), "en")
+            pls.append(pl)
+
+        candidate_labels = pls.copy()
+
         if isinstance(self.negative_labels, list) and len(self.negative_labels) != 0:
             candidate_labels.extend(self.negative_labels)
 
-        al = AutoLabeling.from_prompt(positive_label, candidate_labels, self.input_dir)
+        al = AutoLabeling.from_prompt(pls, candidate_labels, self.input_dir)
         al.execute(limit=kwargs.get("limit"))
 
         return al
@@ -147,30 +155,12 @@ def edit_in_the_common_cases():
     # prompt to negative labels
     # input_dir = /[Project_dir]/database2309/*[joined_dirs]
 
-    # nox = DataGroup(
-    #     positive="plant",
-    #     joined_dirs=["plant"],
-    #     negative_labels=["phone", "playground", "laptop", "chess", "helicopter", "icecream"],
-    # ).auto_labeling(limit="all")
-
-    # nox = DataGroup(
-    #     positive="natural_landscape",
-    #     joined_dirs=["natural_landscape"],
-    #     negative_labels=["laptop", "helicopter", "chess", "playground"]
-    # ).auto_labeling(limit="all")
-
-    nox = DataGroup(
-        positive="electronic device",
-        joined_dirs=["electronic_device"],
-        negative_labels=[
-            "helicopter",
-            "chess",
-            "playground",
-            "natural landscape",
-            "plant",
-            "somthing can be eaten",
-        ],
-    ).auto_labeling(limit="all")
+    dg = DataGroup(
+        positive_labels=["helicopter", "excavator"],
+        joined_dirs=["motorized_machine"],
+        negative_labels=["laptop", "chess", "plant", "natural landscape", "mountain"],
+    )
+    nox = dg.auto_labeling(limit=1)
 
     if "win32" in sys.platform and nox.output_dir:
         os.startfile(nox.output_dir)

diff --git a/automation/zip_dataset.py b/automation/zip_dataset.py
@@ -42,7 +42,7 @@ def zip_dataset(prompt: str):
     print(f">> OUTPUT - {zip_path=}")
 
 
-zip_dataset(prompt="electronic_device")
+zip_dataset(prompt="motorized_machine")
 webbrowser.open(
     "https://colab.research.google.com/github/captcha-challenger/hcaptcha-model-factory/blob/main/automation/roboflow_resnet.ipynb"
 )
diff --git a/server/onnx-flow.yaml b/server/onnx-flow.yaml
@@ -0,0 +1,20 @@
+# https://clip-as-service.jina.ai/user-guides/server/#yaml-config
+jtype: Flow
+version: '1'
+with:
+  port: 51000
+  monitoring: true
+  port_monitoring: 9000
+executors:
+  - name: clip_o
+    uses:
+      jtype: CLIPEncoder
+      with:
+#        name: 'ViT-H-14::laion2b-s32b-b79k'
+#        model_path: "custom-model"
+        device:
+      metas:
+        py_modules:
+          - clip_server.executors.clip_onnx
+    monitoring: true
+    port_monitoring: 9091
diff --git a/server/readme.md b/server/readme.md
@@ -0,0 +1,11 @@
+## Install
+
+```bash
+pip install clip-server
+```
+
+## Start the server
+
+```bash
+python -m clip_server server/onnx-flow.yaml
+```