Skip to content

Commit

Permalink
fix(zip-dataset): handle nested model (#72)
Browse files Browse the repository at this point in the history
  • Loading branch information
QIN2DIM committed Oct 31, 2023
1 parent c26f6ea commit 9e0b003
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 56 deletions.
11 changes: 5 additions & 6 deletions automation/assets_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def download_datasets(self, issue_url):
return td

def get_download_links(self, issue_url: str):
prefix = "https://github.com/CaptchaAgent/hcaptcha-whistleblower/releases/download/automation-archive/"
prefix = "hcaptcha-whistleblower/releases/download/automation-archive/"

res = self.client.get(issue_url)
soup = BeautifulSoup(res.text, "html.parser")
Expand Down Expand Up @@ -139,11 +139,10 @@ def merge(self, fd: Path, td: Path):
def run():
# the largest animal https://github.com/QIN2DIM/hcaptcha-challenger/issues/797
# red panda https://github.com/QIN2DIM/hcaptcha-challenger/issues/896
#
sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/797"
sources = "the largest animal"
sources = "please click on the largest animal"
# sources = "the smallest animal"
sources = "https://github.com/QIN2DIM/hcaptcha-challenger/issues/826" # the smallest animal
# sources = "the largest animal"
# sources = "please click on the largest animal"
sources = "the smallest animal"
# sources = "natural landscape"
am = AssetsManager.from_sources(sources)
am.execute()
Expand Down
2 changes: 1 addition & 1 deletion automation/auto_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def run():

for card in flow_card:
# Filter out the task cards we care about
if "fff" not in card["joined_dirs"]:
if "f1-bird" not in card["joined_dirs"]:
continue
# Generating a dataclass from serialized data
dl = DataLake(
Expand Down
15 changes: 15 additions & 0 deletions automation/flow_card.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,19 @@
],
"joined_dirs": ["natural_landscape"],
},
{
"positive_labels": ["starfish"],
"negative_labels": ["panda", "dog", "cow", "elephant", "guinea pig", "dolphins",
"bird", "goat", "lion", "bear", ""],
"joined_dirs": ["the_smallest_animal", "f1-star"],
},
{
"positive_labels": ["bird"],
"negative_labels": ["panda", "dog", "cow", "dolphins", "goat", "lion", "bear", "giraffe"],
"joined_dirs": ["the_smallest_animal", "f1-bird"],
"substack": {
"nested_smallest_bird": {"yes": ["bird"], "bad": ["panda", "dog", "cow", "dolphins", "goat", "lion", "bear", "giraffe"]},
},
},

]
113 changes: 64 additions & 49 deletions automation/zip_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,64 +53,80 @@ def zip_dataset(prompt: str):
return task_name


def parse_stander_model(modelhub, task_name):
    """Suggest the next archive name for a standard (non-nested) model.

    The label is derived from ``task_name`` by replacing underscores with
    spaces and looked up in ``modelhub.label_alias``. If the registered
    archive name ends in a run of ASCII digits, that run is treated as the
    current version and the returned name is ``task_name`` followed by
    ``version + 1``.

    Args:
        modelhub: Object exposing a ``label_alias`` mapping of label to
            registered archive file name.
        task_name: Task identifier such as ``natural_landscape``.

    Returns:
        The suggested archive name without the ``.onnx`` extension, or an
        empty string when the label is not registered or its archive name
        carries no trailing version number.
    """
    label = task_name.replace("_", " ")

    onnx_archive = modelhub.label_alias.get(label)
    if not onnx_archive:
        return ""

    stem = onnx_archive.replace(".onnx", "")

    # Only ASCII digits are stripped: ``str.isdigit`` also accepts characters
    # such as superscripts that ``int()`` cannot parse, which would raise
    # ValueError instead of falling back to "no version found".
    prefix = stem.rstrip("0123456789")
    version = stem[len(prefix):]
    if not version:
        return ""

    return f"{task_name}{int(version) + 1}"


def parse_nested_model(modelhub, task_name, nested_prompt):
    """Suggest the next archive name for a nested model of ``task_name``.

    The models registered under ``nested_prompt`` in
    ``modelhub.nested_categories`` are printed, then scanned for entries that
    reduce to a pure version number once ``.onnx`` and ``task_name`` are
    removed. The suggestion is ``task_name`` followed by the highest such
    version plus one, so it cannot collide with a version that is already
    registered regardless of how the list is ordered.

    Args:
        modelhub: Object exposing a ``nested_categories`` mapping of prompt
            to a list of model file names.
        task_name: Nested task identifier such as ``nested_largest_tiger``.
        nested_prompt: Prompt the nested model is paired with.

    Returns:
        The suggested archive name without the ``.onnx`` extension, or an
        empty string when no versioned model of ``task_name`` is registered
        under ``nested_prompt``.

    Raises:
        TypeError: If the value registered for ``nested_prompt`` is
            non-empty but not a list.
    """
    nested_models = modelhub.nested_categories.get(nested_prompt) or []

    # Validate the shape before iterating, so a malformed mapping fails with
    # the descriptive error instead of being partially printed first.
    if not isinstance(nested_models, list):
        raise TypeError(
            f"NestedTypeError ({nested_prompt}) 的模型映射列表应该是个 List[str] 类型,但实际上是 {type(nested_models)}"
        )

    for model_name in nested_models:
        print(f"{nested_prompt} => {model_name}")

    print(nested_prompt in modelhub.nested_categories)

    versions = []
    for model_name in nested_models:
        if not isinstance(model_name, str):
            continue
        suffix = model_name.replace(".onnx", "").replace(task_name, "")
        # Require ASCII decimals: ``str.isdigit`` alone also accepts
        # characters (e.g. superscripts) that ``int()`` rejects.
        if suffix.isascii() and suffix.isdecimal():
            versions.append(int(suffix))

    if not versions:
        return ""

    return f"{task_name}{max(versions) + 1}"


def print_quick_start_info(task_name: str, nested_prompt: str = ""):
"""
task_name: like natural_landscape, nested_largest_tiger
"""
diagnose_task(task_name)

if task_name.startswith("nested_") and not nested_prompt:
raise ValueError("生成嵌套类型模版需要提供其配对的提示词")

install(upgrade=True)
modelhub = ModelHub.from_github_repo()
modelhub.parse_objects()

label = task_name.replace("_", " ")

onnx_archive_name = ""

# 常规模型
if not nested_prompt:
if onnx_archive := modelhub.label_alias.get(label):
oan = onnx_archive.replace(".onnx", "")
v = ""
for char in reversed(oan):
if char.isdigit():
v = char + v
else:
break
if v and v.isdigit():
v = int(v) + 1
onnx_archive_name = f"{task_name}{str(v)}"
else:
onnx_archive_name = f"{task_name}2309"

onnx_archive_name = parse_stander_model(modelhub, task_name)
# 嵌套模型
else:
for i in modelhub.nested_categories.get(nested_prompt, []):
print(f"{nested_prompt} => {i}")

if nested_models := modelhub.nested_categories.get(nested_prompt, []):
if not isinstance(nested_models, list):
if nested_models:
raise TypeError(
f"NestedTypeError ({nested_prompt}) 的模型映射列表应该是个 List[str] 类型,但实际上是 {type(nested_models)}"
)
nested_models = []
v = ""
for i, model_name in enumerate(nested_models):
filter_chars = [".onnx", task_name]
for fc in filter_chars:
model_name = model_name.replace(fc, "")
if not model_name.isdigit():
continue
else:
v = model_name
break
if v and v.isdigit():
v = int(v) + 1
onnx_archive_name = f"{task_name}{str(v)}"
else:
onnx_archive_name = f"{task_name}2309"
onnx_archive_name = parse_nested_model(modelhub, task_name, nested_prompt)

if not onnx_archive_name:
onnx_archive_name = f"{task_name}2309"
onnx_archive_name = onnx_archive_name.replace(".onnx", "")

_t = CELL_TEMPLATE.format(
github_token=os.getenv("GITHUB_TOKEN", ""),
Expand All @@ -123,22 +139,21 @@ def print_quick_start_info(task_name: str, nested_prompt: str = ""):


# Script entry point: compresses the dataset for `prompt`, prints the
# quick-start configuration template for the resulting task name, then
# points the user at the notebook.
# NOTE(review): this span comes from a diff view with indentation stripped and
# appears to interleave removed and added lines (duplicate assignments, two
# notebook actions) -- confirm against the real file which lines survive.
def run():
# If both assignments are present, the later value is the one used below.
prompt = "nested_largest_squirrel"
prompt = "nested_smallest_bird"

# Generating a nested-type template requires its paired prompt.
# the smallest animal
# please click on the largest animal
nested_prompt = "please click on the largest animal"
# the largest animal
nested_prompt = "the smallest animal"

# Compress the dataset; the return value is passed on as the task name.
tn = zip_dataset(prompt=prompt)

# Print the configuration template.
print_quick_start_info(task_name=tn, nested_prompt=nested_prompt)

import webbrowser

# NOTEBOOK is defined outside this view; presumably the Colab notebook URL -- verify.
webbrowser.open(NOTEBOOK)
print(f"Open In Colab -> {NOTEBOOK}")


if __name__ == "__main__":
Expand Down

0 comments on commit 9e0b003

Please sign in to comment.