Skip to content

Commit

Permalink
feat(auto-labeling): positive labels group (#68)
Browse files Browse the repository at this point in the history
  • Loading branch information
QIN2DIM committed Oct 23, 2023
1 parent ba31efe commit 0b3a8d5
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 36 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Images and model weights are binary: no text diff, no merge, no EOL conversion.
# Each line takes exactly ONE pattern followed by attributes; a second pattern
# on the same line would be parsed as an (invalid) attribute name.
*.png binary
*.pt binary
# YAML files are text and are checked out with CRLF line endings.
*.yaml text eol=crlf
2 changes: 1 addition & 1 deletion automation/assets_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def merge(self, fd: Path, td: Path):


def run():
sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/855"
sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/851"
am = AssetsManager.from_sources(sources)
am.execute()

Expand Down
58 changes: 24 additions & 34 deletions automation/auto_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@

@dataclass
class AutoLabeling:
positive_label: str = field(default=str)
positive_labels: List[str] = field(default_factory=list)
candidate_labels: List[str] = field(default_factory=list)
images_dir: Path = field(default=Path)
pending_tasks: List[Path] = field(default_factory=list)

checkpoint = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
checkpoint = "laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K"
# checkpoint = "QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_224to336"

output_dir: Path = None

Expand All @@ -45,7 +46,7 @@ def load_zero_shot_model(self):
return detector

@classmethod
def from_prompt(cls, positive_label: str, candidate_labels: List[str], images_dir: Path):
def from_prompt(cls, positive_labels: List[str], candidate_labels: List[str], images_dir: Path):
images_dir.mkdir(parents=True, exist_ok=True)

pending_tasks: List[Path] = []
Expand All @@ -55,7 +56,7 @@ def from_prompt(cls, positive_label: str, candidate_labels: List[str], images_di
pending_tasks.append(image_path)

return cls(
positive_label=positive_label,
positive_labels=positive_labels,
candidate_labels=candidate_labels,
images_dir=images_dir,
pending_tasks=pending_tasks,
Expand Down Expand Up @@ -109,7 +110,7 @@ def execute(self, limit: int | str = None):

# Move positive cases to yes/
# Move negative cases to bad/
if predictions[0]["label"] == self.positive_label:
if predictions[0]["label"] in self.positive_labels:
output_path = yes_dir.joinpath(image_path.name)
else:
output_path = bad_dir.joinpath(image_path.name)
Expand All @@ -120,24 +121,31 @@ def execute(self, limit: int | str = None):

@dataclass
class DataGroup:
positive: str
positive_labels: List[str] | str
joined_dirs: List[str]
negative_labels: List[str]

def __post_init__(self):
self.positive = self.positive.replace("_", " ")
if isinstance(self.positive_labels, str):
self.positive_labels = [self.positive_labels]

@property
def input_dir(self):
return db_dir.joinpath(*self.joined_dirs).absolute()

def auto_labeling(self, **kwargs):
positive_label = split_prompt_message(label_cleaning(self.positive), "en")
candidate_labels = [positive_label]
pls = []
for pl in self.positive_labels:
pl = pl.replace("_", " ")
pl = split_prompt_message(label_cleaning(pl), "en")
pls.append(pl)

candidate_labels = pls.copy()

if isinstance(self.negative_labels, list) and len(self.negative_labels) != 0:
candidate_labels.extend(self.negative_labels)

al = AutoLabeling.from_prompt(positive_label, candidate_labels, self.input_dir)
al = AutoLabeling.from_prompt(pls, candidate_labels, self.input_dir)
al.execute(limit=kwargs.get("limit"))

return al
Expand All @@ -147,30 +155,12 @@ def edit_in_the_common_cases():
# prompt to negative labels
# input_dir = /[Project_dir]/database2309/*[joined_dirs]

# nox = DataGroup(
# positive="plant",
# joined_dirs=["plant"],
# negative_labels=["phone", "playground", "laptop", "chess", "helicopter", "icecream"],
# ).auto_labeling(limit="all")

# nox = DataGroup(
# positive="natural_landscape",
# joined_dirs=["natural_landscape"],
# negative_labels=["laptop", "helicopter", "chess", "playground"]
# ).auto_labeling(limit="all")

nox = DataGroup(
positive="electronic device",
joined_dirs=["electronic_device"],
negative_labels=[
"helicopter",
"chess",
"playground",
"natural landscape",
"plant",
"somthing can be eaten",
],
).auto_labeling(limit="all")
dg = DataGroup(
positive_labels=["helicopter", "excavator"],
joined_dirs=["motorized_machine"],
negative_labels=["laptop", "chess", "plant", "natural landscape", "mountain"],
)
nox = dg.auto_labeling(limit=1)

if "win32" in sys.platform and nox.output_dir:
os.startfile(nox.output_dir)
Expand Down
2 changes: 1 addition & 1 deletion automation/zip_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def zip_dataset(prompt: str):
print(f">> OUTPUT - {zip_path=}")


zip_dataset(prompt="electronic_device")
zip_dataset(prompt="motorized_machine")
webbrowser.open(
"https://colab.research.google.com/github/captcha-challenger/hcaptcha-model-factory/blob/main/automation/roboflow_resnet.ipynb"
)
20 changes: 20 additions & 0 deletions server/onnx-flow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# https://clip-as-service.jina.ai/user-guides/server/#yaml-config
jtype: Flow
version: '1'
with:
port: 51000
monitoring: true
port_monitoring: 9000
executors:
- name: clip_o
uses:
jtype: CLIPEncoder
with:
# name: 'ViT-H-14::laion2b-s32b-b79k'
# model_path: "custom-model"
device:
metas:
py_modules:
- clip_server.executors.clip_onnx
monitoring: true
port_monitoring: 9091
11 changes: 11 additions & 0 deletions server/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
## Install

```bash
pip install clip-server
```

## Start the server

```bash
python -m clip_server server/onnx-flow.yaml
```

0 comments on commit 0b3a8d5

Please sign in to comment.