Skip to content

Commit

Permalink
perf(automation): Enhance expandability (#71)
Browse files Browse the repository at this point in the history
  • Loading branch information
QIN2DIM committed Oct 30, 2023
1 parent 96b295e commit 5a65824
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 12 deletions.
8 changes: 7 additions & 1 deletion automation/assets_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def download_datasets(self, issue_url):
return td

def get_download_links(self, issue_url: str):
prefix = "https://github.com/captcha-challenger/hcaptcha-whistleblower/releases/download/automation-archive/"
prefix = "https://github.com/CaptchaAgent/hcaptcha-whistleblower/releases/download/automation-archive/"

res = self.client.get(issue_url)
soup = BeautifulSoup(res.text, "html.parser")
Expand Down Expand Up @@ -138,7 +138,13 @@ def merge(self, fd: Path, td: Path):

def run():
    """Build an ``AssetsManager`` from one hard-coded source and execute it.

    The source is selected by editing this function: exactly one ``sources``
    assignment is left active and the alternatives are kept as comments.
    """
    # Reference issues:
    #   the largest animal  https://github.com/QIN2DIM/hcaptcha-challenger/issues/797
    #   red panda           https://github.com/QIN2DIM/hcaptcha-challenger/issues/896
    #
    # Alternatives (previously live assignments that were immediately
    # overwritten, so they never took effect):
    # sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/797"
    # sources = "the largest animal"
    # sources = "the smallest animal"
    # sources = "natural landscape"
    sources = "please click on the largest animal"

    am = AssetsManager.from_sources(sources)
    am.execute()

Expand Down
2 changes: 1 addition & 1 deletion automation/auto_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def run():

for card in flow_card:
# Filter out the task cards we care about
if "the_largest_animal" not in card["joined_dirs"]:
if "fff" not in card["joined_dirs"]:
continue
# Generating a dataclass from serialized data
dl = DataLake(
Expand Down
2 changes: 1 addition & 1 deletion automation/datasets_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
collected = []
per_times = 60
tmp_dir = Path(__file__).parent.joinpath("tmp_dir")
sitekey = SiteKey.epic
sitekey = SiteKey.discord


async def collete_datasets(context: ASyncContext):
Expand Down
28 changes: 28 additions & 0 deletions automation/flow_card.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,32 @@
"nested_largest_fox": {"yes": ["fox"], "bad": ["crab", "bird", "dragonfly", "ant"]},
},
},
{
"positive_labels": ["red panda"],
"negative_labels": ["cactus", "door", "guinea pig", "meerkat"],
"joined_dirs": ["red_panda"],
},
{
"positive_labels": ["tiger", "squirrel"],
"negative_labels": ["dog", "bat", "raccoon", "ant", "ladybug"],
"joined_dirs": ["please_click_on_the_largest_animal", "fff"],
# = ↑↑ = 和常规情况一样,先对整体数据集进行多目标分类
# = ↓↓ = 再根据具体的 yes/bad 映射关系进行数据集二次移动
"substack": {
"nested_largest_tiger": {"yes": ["tiger"], "bad": ["dog", "bat", "raccoon"]},
"nested_largest_squirrel": {"yes": ["squirrel"], "bad": ["ant", "ladybug"]},
},
},
{
"positive_labels": ["natural landscape", "Mountain", "forest"],
"negative_labels": [
"chess",
"laptop",
"helicopter",
"meerkat",
"roller coaster",
"Recreational facilities",
],
"joined_dirs": ["natural_landscape"],
},
]
3 changes: 1 addition & 2 deletions automation/mini_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,7 @@ def upgrade_objects(aid_):
# "nested_smallest_turtle": "nested_smallest_turtle2309",
# "nested_largest_dog": "nested_largest_dog2309",
# "bicycle": "bicycle2309",
"nested_largest_fox": "nested_largest_fox2309",

# "nested_largest_fox": "nested_largest_fox2309",
}
# fmt:on

Expand Down
1 change: 1 addition & 0 deletions automation/roboflow_resnet.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
"import hcaptcha_challenger as solver\n",
"\n",
"os.environ[\"GITHUB_TOKEN\"] = GITHUB_TOKEN\n",
"onnx_archive_name = onnx_archive_name.replace(\".onnx\", \"\")\n",
"\n",
"solver.diagnose_task(task_name)"
],
Expand Down
111 changes: 104 additions & 7 deletions automation/zip_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,26 @@

import os
import shutil
import webbrowser
import zipfile
from pathlib import Path

import hcaptcha_challenger as solver
from hcaptcha_challenger import prompt2task, ModelHub, diagnose_task, install

# Text block printed by print_quick_start_info(): the four assignments between
# the ">>>" / "<<<" rulers are filled in via str.format() with the keys
# github_token, task_name, onnx_archive_name and nested_prompt.
# NOTE(review): presumably meant to be pasted into the notebook's config cell —
# confirm against roboflow_resnet.ipynb.
CELL_TEMPLATE = """
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
GITHUB_TOKEN = "{github_token}"
task_name = "{task_name}"
onnx_archive_name = "{onnx_archive_name}"
NESTED_PROMPT = "{nested_prompt}"
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
"""

# Colab URL opened in the default browser by run().
# NOTE(review): the path still uses the "captcha-challenger" organisation name;
# verify it resolves, or update it if the organisation was renamed.
NOTEBOOK = "https://colab.research.google.com/github/captcha-challenger/hcaptcha-model-factory/blob/main/automation/roboflow_resnet.ipynb"


def zip_dataset(prompt: str):
prompt = prompt.replace("_", " ")
task_name = solver.prompt2task(prompt)
task_name = prompt2task(prompt)

project_dir = Path(__file__).parent.parent
images_dir = project_dir.joinpath("database2309", task_name)
Expand All @@ -40,9 +50,96 @@ def zip_dataset(prompt: str):
zip_file.write(os.path.join(root, file), f"bad/{file}")

print(f">> OUTPUT - {zip_path=}")
return task_name


def _trailing_version(archive_name: str) -> str:
    """Return the run of ASCII digits that ends *archive_name* ("" if none).

    A ".onnx" suffix is ignored, so "bicycle2309.onnx" yields "2309".
    """
    stem = archive_name.replace(".onnx", "")
    return stem[len(stem.rstrip("0123456789")):]


def _next_archive_name(task_name: str, version: str, default_version: str = "2309") -> str:
    """Return *task_name* suffixed with ``version + 1``.

    Falls back to *default_version* when *version* is empty or not a decimal
    number, so the result is never just the bare task name or an empty string.
    """
    if version.isdecimal():
        return f"{task_name}{int(version) + 1}"
    return f"{task_name}{default_version}"


def print_quick_start_info(task_name: str, nested_prompt: str = ""):
    """Print the filled-in ``CELL_TEMPLATE`` for training *task_name*.

    The archive name written into the template is ``task_name`` followed by
    the next version number: the version of the currently registered model
    plus one, or ``2309`` when no versioned model is registered yet.

    Args:
        task_name: e.g. ``natural_landscape`` or ``nested_largest_tiger``.
        nested_prompt: the prompt paired with a nested task, e.g.
            ``please click on the largest animal``. Required when
            *task_name* starts with ``nested_``.

    Raises:
        ValueError: *task_name* is a nested task but *nested_prompt* is empty.
        TypeError: the model hub maps *nested_prompt* to something that is
            not a list.
    """
    diagnose_task(task_name)

    if task_name.startswith("nested_") and not nested_prompt:
        raise ValueError("生成嵌套类型模版需要提供其配对的提示词")

    install(upgrade=True)
    modelhub = ModelHub.from_github_repo()
    modelhub.parse_objects()

    version = ""
    if not nested_prompt:
        # Regular task: the label alias (task name with spaces) maps to the
        # registered archive; its trailing digits are the current version.
        label = task_name.replace("_", " ")
        version = _trailing_version(modelhub.label_alias.get(label) or "")
    else:
        nested_models = modelhub.nested_categories.get(nested_prompt) or []
        if not isinstance(nested_models, list):
            raise TypeError(
                f"NestedTypeError ({nested_prompt}) 的模型映射列表应该是个 List[str] 类型,但实际上是 {type(nested_models)}"
            )

        for model_name in nested_models:
            print(f"{nested_prompt} => {model_name}")

        # The first model that is exactly `task_name` + digits gives the
        # current version; models of other tasks under this prompt are skipped.
        for model_name in nested_models:
            candidate = model_name.replace(".onnx", "").replace(task_name, "")
            if candidate.isdecimal():
                version = candidate
                break

    onnx_archive_name = _next_archive_name(task_name, version)

    # NOTE(review): the GITHUB_TOKEN value is printed to stdout in clear text;
    # avoid running this where console output is captured or shared.
    print(
        CELL_TEMPLATE.format(
            github_token=os.getenv("GITHUB_TOKEN", ""),
            task_name=task_name,
            onnx_archive_name=onnx_archive_name,
            nested_prompt=nested_prompt,
        )
    )


def run():
    """Zip one dataset, print its config template, then open the notebook."""
    import webbrowser

    # A nested task needs the prompt it is paired with, for example:
    #   the smallest animal
    #   please click on the largest animal
    paired_prompt = "please click on the largest animal"

    # Compress the dataset; zip_dataset() returns the resolved task name.
    resolved_task = zip_dataset(prompt="nested_largest_squirrel")

    # Print the configuration template for that task.
    print_quick_start_info(task_name=resolved_task, nested_prompt=paired_prompt)

    webbrowser.open(NOTEBOOK)


zip_dataset(prompt="nested_largest_dog")
webbrowser.open(
"https://colab.research.google.com/github/captcha-challenger/hcaptcha-model-factory/blob/main/automation/roboflow_resnet.ipynb"
)
if __name__ == "__main__":
run()

0 comments on commit 5a65824

Please sign in to comment.