In [12]:
import os
import subprocess
from pathlib import Path

MODEL = "o4-mini"
TARGET = "geometry"
TEST = "float-line"
import subprocess

def run_command(command: str):
    print(f"\n▶ Running: {command}\n")
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

    output_lines = []  # 로그 저장용

    for line in process.stdout:
        print(line, end='')         # 실시간 출력
        output_lines.append(line)   # 로그 저장

    process.wait()
    if process.returncode != 0:
        error_message = (
            f"❌ Command failed: {command}\n"
            f"🔻 Combined Output:\n{''.join(output_lines)}"
        )
        raise RuntimeError(error_message)


In [16]:
data = ['```json\n{\n  "libraries": "from common import *\\nimport numpy as np\\nimport random",\n  "main_code": "def main(input_grid):\\n    sky_color=Color.BLUE\\n    out=np.tile(input_grid,5)\\n    # find the dark silhouette\\n    comps=find_connected_components(input_grid,background=sky_color,monochromatic=True)\\n    sil=[c for c in comps if np.any(c==Color.MAROON)][0]\\n    top_row,left_col,width,height=bounding_box(sil,background=sky_color)\\n    # add mist plume in frames 2–4\\n    for i in range(5):\\n        seg_off=i*9\\n        if i==1:\\n            r=top_row-1; c0=seg_off+left_col; out[r,c0:seg_off+left_col+width]=Color.PINK\\n        elif i==2:\\n            c0=seg_off+max(0,left_col-1); c1=seg_off+min(9,left_col+width+1)\\n            out[top_row-1,c0:c1]=Color.PINK; out[top_row-2,c0:c1]=Color.PINK\\n        elif i==3:\\n            mid=seg_off+left_col+width//2; out[top_row-2,mid]=Color.PINK\\n    return out",\n  "generate_input_code": "def generate_input():\\n    # 9×9 sky\\n    grid=np.full((9,9),Color.BLUE)\\n    # draw a centered triangular silhouette\\n    h=random.randint(2,4)\\n    for d in range(h):\\n        w=2*(h-1-d)+1; start=4-w//2; row=8-d\\n        grid[row,start:start+w]=Color.MAROON\\n    # scatter a few cloud blocks above silhouette\\n    for _ in range(random.randint(2,5)):\\n        sz=random.randint(2,3)\\n        cloud=np.full((sz,sz),Color.GREY,dtype=int)\\n        max_row=8-h-sz\\n        x=random.randint(0,max(0,max_row)); y=random.randint(0,9-sz)\\n        blit_sprite(grid,cloud,x,y)\\n    return grid",\n  "total_code": "from common import *\\nimport numpy as np\\nimport random\\n\\n# concepts:\\n# fluid buoyancy, temporal stacking, looping repetition, wind‑driven drift\\n# description:\\n# Input = 9×9 sky (blue) with scattered grey clouds above a fixed maroon triangular silhouette at bottom.\\n# Output = 9×45: five side‑by‑side 9×9 frames showing mist plume life cycle:\\n# Frame1: clear; Frame2: pink mist peeks above silhouette; Frame3: plume rises two pixels and widens; Frame4: plume thins; Frame5: clear (loop).\\n\\ndef main(input_grid):\\n    sky_color=Color.BLUE\\n    out=np.tile(input_grid,5)\\n    # find the dark silhouette\\n    comps=find_connected_components(input_grid,background=sky_color,monochromatic=True)\\n    sil=[c for c in comps if np.any(c==Color.MAROON)][0]\\n    top_row,left_col,width,height=bounding_box(sil,background=sky_color)\\n    # add mist plume in frames 2–4\\n    for i in range(5):\\n        seg_off=i*9\\n        if i==1:\\n            r=top_row-1; c0=seg_off+left_col\\n            out[r,c0:seg_off+left_col+width]=Color.PINK\\n        elif i==2:\\n            c0=seg_off+max(0,left_col-1); c1=seg_off+min(9,left_col+width+1)\\n            out[top_row-1,c0:c1]=Color.PINK; out[top_row-2,c0:c1]=Color.PINK\\n        elif i==3:\\n            mid=seg_off+left_col+width//2\\n            out[top_row-2,mid]=Color.PINK\\n    return out\\n\\ndef generate_input():\\n    # 9×9 sky\\n    grid=np.full((9,9),Color.BLUE)\\n    # draw a centered triangular silhouette\\n    h=random.randint(2,4)\\n    for d in range(h):\\n        w=2*(h-1-d)+1; start=4-w//2; row=8-d\\n        grid[row,start:start+w]=Color.MAROON\\n    # scatter a few cloud blocks above silhouette\\n    for _ in range(random.randint(2,5)):\\n        sz=random.randint(2,3)\\n        cloud=np.full((sz,sz),Color.GREY,dtype=int)\\n        max_row=8-h-sz\\n        x=random.randint(0,max(0,max_row)); y=random.randint(0,9-sz)\\n        blit_sprite(grid,cloud,x,y)\\n    return grid"\n}\n```', '```json\n{\n  "libraries": "from common import *\\nimport numpy as np\\nimport random",\n  "main_code": "# concepts: fluid buoyancy, temporal stacking, looping repetition, wind‑driven drift\\n# description: expand a 9×9 sky+silhouette snapshot into five side‑by‑side frames showing a mist plume popping up, rising/spreading, thinning, then vanishing\\n\\ndef main(input_grid):\\n    n,_=input_grid.shape\\n    out=np.zeros((n,5*n),int)\\n    for f in range(5): out[:,f*n:(f+1)*n]=input_grid\\n    x,y,w,h=bounding_box(input_grid,background=Color.BLACK)\\n    apex=x;mid=y+w//2;col=Color.GREY\\n    if apex>0: out[apex-1,1*n+mid]=col\\n    if apex>1:\\n        out[apex-2,2*n+mid]=col\\n        for d in(-1,0,1):\\n            c=mid+d\\n            if 0<=c<n: out[apex-1,2*n+c]=col\\n        out[apex-2,3*n+mid]=col\\n    return out",\n  "generate_input_code": "def generate_input():\\n    n=9;m=9\\n    grid=np.zeros((n,m),int)\\n    grid[:,:]=Color.BLUE\\n    # scatter purple tint in sky\\n    for _ in range(random.randint(n*m//4,n*m//2)):\\n        i,j=random.randint(0,n-1),random.randint(0,m-1)\\n        if grid[i,j]==Color.BLUE: grid[i,j]=Color.MAROON\\n    # scatter clouds\\n    for _ in range(random.randint(4,8)):\\n        i,j=random.randint(0,n-1),random.randint(0,m-1)\\n        if grid[i,j]!=Color.BLACK: grid[i,j]=Color.GREY\\n    # draw triangular silhouette at bottom\\n    h=random.randint(2,4)\\n    center=random.randint(h-1,n-h)\\n    for i in range(h):\\n        r=n-h+i\\n        for dx in range(-i,i+1): grid[r,center+dx]=Color.BLACK\\n    return grid",\n  "total_code": "from common import *\\nimport numpy as np\\nimport random\\n\\n# concepts: fluid buoyancy, temporal stacking, looping repetition, wind‑driven drift\\n# description: expand a 9×9 sky+silhouette snapshot into five side‑by‑side frames showing a mist plume popping up, rising/spreading, thinning, then vanishing\\n\\ndef main(input_grid):\\n    n,_=input_grid.shape\\n    out=np.zeros((n,5*n),int)\\n    for f in range(5): out[:,f*n:(f+1)*n]=input_grid\\n    x,y,w,h=bounding_box(input_grid,background=Color.BLACK)\\n    apex=x;mid=y+w//2;col=Color.GREY\\n    if apex>0: out[apex-1,1*n+mid]=col\\n    if apex>1:\\n        out[apex-2,2*n+mid]=col\\n        for d in(-1,0,1):\\n            c=mid+d\\n            if 0<=c<n: out[apex-1,2*n+c]=col\\n        out[apex-2,3*n+mid]=col\\n    return out\\n\\ndef generate_input():\\n    n=9;m=9\\n    grid=np.zeros((n,m),int)\\n    grid[:,:]=Color.BLUE\\n    # scatter purple tint in sky\\n    for _ in range(random.randint(n*m//4,n*m//2)):\\n        i,j=random.randint(0,n-1),random.randint(0,m-1)\\n        if grid[i,j]==Color.BLUE: grid[i,j]=Color.MAROON\\n    # scatter clouds\\n    for _ in range(random.randint(4,8)):\\n        i,j=random.randint(0,n-1),random.randint(0,m-1)\\n        if grid[i,j]!=Color.BLACK: grid[i,j]=Color.GREY\\n    # draw triangular silhouette at bottom\\n    h=random.randint(2,4)\\n    center=random.randint(h-1,n-h)\\n    for i in range(h):\\n        r=n-h+i\\n        for dx in range(-i,i+1): grid[r,center+dx]=Color.BLACK\\n    return grid"\n}\n```']

In [None]:
data = ['```json\n{\n  "libraries": "from common import *\\nimport numpy as np\\nimport random",\n  "main_code": "def main(input_grid):\\n    core = np.argwhere(input_grid == core_color)[0]\\n    cx, cy = core\\n    facets = list(map(tuple, np.argwhere(input_grid == Color.RED)))\\n    # compute unit dirs and body length\\n    dirs = []\\n    for fx, fy in facets:\\n        dx, dy = fx-cx, fy-cy\\n        step = max(abs(dx), abs(dy))\\n        dirs.append((dx//step, dy//step))\\n        body_len = step-1\\n    # simulate 2 cycles (8 quarter‑steps)\\n    highlights = []\\n    rot = 0\\n    for phase in range(1, 9):\\n        if phase & 1:\\n            for dx, dy in dirs:\\n                # rotate dir by rot*90° CW\\n                rdx, rdy = dx, dy\\n                for _ in range(rot): rdx, rdy = rdy, -rdx\\n                # extended tip at body_len+2\\n                tx, ty = cx + rdx*(body_len+2), cy + rdy*(body_len+2)\\n                highlights.append((tx, ty))\\n        else:\\n            rot = (rot + 1) & 3\\n    out = input_grid.copy()\\n    for x, y in highlights:\\n        out[x, y] = Color.GREY\\n    return out",\n  "generate_input_code": "def generate_input():\\n    # random odd size to center\\n    n = random.randint(5, 29) | 1\\n    grid = np.full((n, n), Color.BLACK)\\n    cx = cy = n//2\\n    grid[cx, cy] = core_color\\n    # random body length and spikes\\n    max_len = n//2 - 2\\n    body_len = random.randint(1, max_len)\\n    choices = [(1,0),(-1,0),(0,1),(0,-1),(1,1),(1,-1),(-1,1),(-1,-1)]\\n    k = random.randint(3, len(choices))\\n    dirs = random.sample(choices, k)\\n    for dx, dy in dirs:\\n        # draw body\\n        for i in range(1, body_len+1):\\n            grid[cx+dx*i, cy+dy*i] = body_color\\n        # facet\\n        fx, fy = cx+dx*(body_len+1), cy+dy*(body_len+1)\\n        grid[fx, fy] = Color.RED\\n    return grid",\n  "total_code": "from common import *\\nimport numpy as np\\nimport random\\n\\n# concepts: rotational kinematics, radial oscillation, periodic highlights, facet exposure, central symmetry\\n# description:\\n# Input: black grid with a single yellow core at center, surrounded by a ring of teal triangular spikes\\n# each of length L and a red facet at tip. Output: overlay all white‑highlighted tips (grey) from each\\n# extension phase over two full rotation+oscillation cycles (8 quarter‑steps), keep final red facets.\\n\\ncore_color = Color.YELLOW\\nbody_color = Color.TEAL\\n\\n" + "def main(input_grid):\\n    core = np.argwhere(input_grid == core_color)[0]\\n    cx, cy = core\\n    facets = list(map(tuple, np.argwhere(input_grid == Color.RED)))\\n    dirs = []\\n    for fx, fy in facets:\\n        dx, dy = fx-cx, fy-cy\\n        step = max(abs(dx), abs(dy))\\n        dirs.append((dx//step, dy//step))\\n        body_len = step-1\\n    highlights = []\\n    rot = 0\\n    for phase in range(1, 9):\\n        if phase & 1:\\n            for dx, dy in dirs:\\n                rdx, rdy = dx, dy\\n                for _ in range(rot): rdx, rdy = rdy, -rdx\\n                tx, ty = cx + rdx*(body_len+2), cy + rdy*(body_len+2)\\n                highlights.append((tx, ty))\\n        else:\\n            rot = (rot + 1) & 3\\n    out = input_grid.copy()\\n    for x, y in highlights:\\n        out[x, y] = Color.GREY\\n    return out\\n\\n" + "def generate_input():\\n    n = random.randint(5, 29) | 1\\n    grid = np.full((n, n), Color.BLACK)\\n    cx = cy = n//2\\n    grid[cx, cy] = core_color\\n    max_len = n//2 - 2\\n    body_len = random.randint(1, max_len)\\n    choices = [(1,0),(-1,0),(0,1),(0,-1),(1,1),(1,-1),(-1,1),(-1,-1)]\\n    k = random.randint(3, len(choices))\\n    dirs = random.sample(choices, k)\\n    for dx, dy in dirs:\\n        for i in range(1, body_len+1):\\n            grid[cx+dx*i, cy+dy*i] = body_color\\n        fx, fy = cx+dx*(body_len+1), cy+dy*(body_len+1)\\n        grid[fx, fy] = Color.RED\\n    return grid"\n}\n```', '```json\n{\n  "libraries": "from common import *\\nimport numpy as np\\nimport random",\n  "main_code": "def main(input_grid):\\n    # after two full cycles the pattern returns to its start\\n    return input_grid.copy()",\n  "generate_input_code": "def generate_input():\\n    # grid size between 5 and 29 (odd so there\'s a center cell)\\n    n = random.randrange(5,30,2)\\n    grid = np.full((n,n), Color.BLACK)\\n    cx, cy = n//2, n//2\\n    grid[cx,cy] = Color.WHITE  # fixed white core\\n    # choose 4 or 8 spikes evenly spaced around\\n    dirs = random.choice([4,8])\\n    if dirs==4:\\n        vecs = [(1,0),(-1,0),(0,1),(0,-1)]\\n    else:\\n        vecs = [(1,0),(-1,0),(0,1),(0,-1),(1,1),(1,-1),(-1,1),(-1,-1)]\\n    for dx,dy in vecs:\\n        # draw the black spike\\n        x1,y1 = cx+dx, cy+dy\\n        grid[x1,y1] = Color.BLACK\\n        # put the red facet one cell further\\n        x2,y2 = cx+2*dx, cy+2*dy\\n        if 0<=x2<n and 0<=y2<n:\\n            grid[x2,y2] = Color.RED\\n    return grid",\n  "total_code": "from common import *\\nimport numpy as np\\nimport random\\n\\n# concepts:\\n# rotational kinematics, radial oscillation, periodic highlights, facet exposure, central symmetry\\n# description:\\n# Input: a black grid with a single white core pixel at center surrounded by a ring of identical black spikes, each ending in one red facet.\\n# Output: after simulating two full cycles of 90° rotations and in‑and‑out pulsations the pattern returns exactly to its start; we render that final frame.\\n\\ndef main(input_grid):\\n    # after two full cycles the pattern returns to its start\\n    return input_grid.copy()\\n\\n\\ndef generate_input():\\n    # grid size between 5 and 29 (odd so there\'s a center cell)\\n    n = random.randrange(5,30,2)\\n    grid = np.full((n,n), Color.BLACK)\\n    cx, cy = n//2, n//2\\n    grid[cx,cy] = Color.WHITE  # fixed white core\\n    # choose 4 or 8 spikes evenly spaced around\\n    dirs = random.choice([4,8])\\n    if dirs==4:\\n        vecs = [(1,0),(-1,0),(0,1),(0,-1)]\\n    else:\\n        vecs = [(1,0),(-1,0),(0,1),(0,-1),(1,1),(1,-1),(-1,1),(-1,-1)]\\n    for dx,dy in vecs:\\n        # draw the black spike\\n        x1,y1 = cx+dx, cy+dy\\n        grid[x1,y1] = Color.BLACK\\n        # put the red facet one cell further\\n        x2,y2 = cx+2*dx, cy+2*dy\\n        if 0<=x2<n and 0<=y2<n:\\n            grid[x2,y2] = Color.RED\\n    return grid"\n}\n```']

In [17]:
def parse_code_json(json_str):
    import json
    # 마크다운 코드블록 제거
    if json_str.startswith("```json"):
        json_str = json_str[len("```json"):].strip()
    if json_str.endswith("```"):
        json_str = json_str[:json_str.rfind("```")].strip()
    
    # JSON 파싱
    data = json.loads(json_str)
    return data['total_code']

parse_code_jsodn = parse_code_json(data[0])

In [4]:
parse_code_jsodn 

'from common import *\nimport numpy as np\nimport random\n\n# concepts:\n# fluid buoyancy, temporal stacking, looping repetition, wind‑driven drift\n# description:\n# Input = 9×9 sky (blue) with scattered grey clouds above a fixed maroon triangular silhouette at bottom.\n# Output = 9×45: five side‑by‑side 9×9 frames showing mist plume life cycle:\n# Frame1: clear; Frame2: pink mist peeks above silhouette; Frame3: plume rises two pixels and widens; Frame4: plume thins; Frame5: clear (loop).\n\ndef main(input_grid):\n    sky_color=Color.BLUE\n    out=np.tile(input_grid,5)\n    # find the dark silhouette\n    comps=find_connected_components(input_grid,background=sky_color,monochromatic=True)\n    sil=[c for c in comps if np.any(c==Color.MAROON)][0]\n    top_row,left_col,width,height=bounding_box(sil,background=sky_color)\n    # add mist plume in frames 2–4\n    for i in range(5):\n        seg_off=i*9\n        if i==1:\n            r=top_row-1; c0=seg_off+left_col\n            out[r,c0:seg

In [5]:

def generate_descriptions():
    desc_dir = f"[intergrated]generated_descriptions/{TARGET}"
    os.makedirs(desc_dir, exist_ok=True)

    run_command(
        f"python generate_descriptions.py --target {TARGET} --samples 1 --intergrated "
        f"--outdir \"{desc_dir}\" --model {MODEL} --num_generations 2 "
        f"--max_tokens 40000 --batch_size 1 --num_descriptions 75 --rng_offset 777"
    )
    return desc_dir

def generate_code():
    desc_dir = f"{TEST}-{MODEL}-generated_descriptions/{TARGET}"
    code_dir = f"{TEST}-{MODEL}-generated_code/{TARGET}"
    os.makedirs(code_dir, exist_ok=True)

    for filename in Path(desc_dir).glob("*.jsonl"):
        run_command(
            f"python generate_code.py --test {TEST} --outdir \"{code_dir}\" "
            f"--ignore_cache_samples --prompt_model {MODEL} "
            f"--max_tokens 40000 -n 2 -s 4 --nohtml --jsonl \"{filename}\""
        )
    return code_dir

def generate_problems():
    code_dir = f"{TEST}-{MODEL}-generated_code/{TARGET}"
    prob_dir = f"{TEST}-{MODEL}-generated_problems/{TARGET}"
    os.makedirs(prob_dir, exist_ok=True)

    for filename in Path(code_dir).glob("*.jsonl"):
        run_command(
            f"python generate_problems.py --jsonl \"{filename}\" "
            f"--outdir \"{prob_dir}\" --total_timeout 300"
        )
    return prob_dir

def visualize_problems():
    prob_dir = f"{TEST}-{MODEL}-generated_problems/{TARGET}"
    vis_dir = os.path.join(f"{TEST}-{MODEL}-generated_problems", "visualized", TARGET)
    os.makedirs(vis_dir, exist_ok=True)

    for filename in Path(prob_dir).glob("*.jsonl"):
        run_command(
            f"python visualize_problems.py --jsonl \"{filename}\" "
            f"--outdir \"{vis_dir}\""
        )
    return vis_dir

In [8]:
def main():
    # try:
    #     desc_dir = generate_descriptions()
    # except Exception as e:
    #     print(f"❌ An error occurred: {e}")
    #     print("🔄 Retrying the pipeline...")
    
    try:
        generate_code()
    except Exception as e:
        print(f"❌ An error occurred: {e}")
        print("🔄 Retrying the pipeline...")
        
    try:
        prob_dir = generate_problems()
    except Exception as e:
        print(f"❌ An error occurred: {e}")
        print("🔄 Retrying the pipeline...")
        
    try:
        vis_dir = visualize_problems()
    except Exception as e:
        print(f"❌ An error occurred: {e}")
        print("🔄 Retrying the pipeline...")

    print(f"✅ Pipeline completed successfully.\nGenerated visualization: {vis_dir}")


In [7]:
main()


▶ Running: python generate_problems.py --jsonl "float-line-o4-mini-generated_code/geometry/geometry_23.jsonl" --outdir "float-line-o4-mini-generated_problems/geometry" --total_timeout 300

Reading from float-line-o4-mini-generated_code/geometry/geometry_23.jsonl
Saving to float-line-o4-mini-generated_problems/geometry/geometry_23.jsonl

  0%|                                                    | 0/1 [00:00<?, ?it/s]+1 problem with 10 examples
so far, generated 1 problems

100%|████████████████████████████████████████████| 1/1 [00:02<00:00,  2.67s/it]
100%|████████████████████████████████████████████| 1/1 [00:02<00:00,  2.68s/it]
Generated 1 problems
Overall stats: {'non_deterministic': 0, 'non_color_invariant': {'transformation_fail': 0, 'non_well_formed': 0, 'non_color_invariant': 0}, 'identity': 0, 'non_well_formed_output': 0, 'black_output': 0, 'timeout': 0, 'non_well_formed_input': 0, 'duplicate_input': 0, 'total': 0}

▶ Running: python generate_problems.py --jsonl "float-line-o4-m

In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [pandas]2m2/3[0m [pandas]
[1A[2KSuccessfully installed pandas-2.2.3 pytz-2025.2 tzdata-2025.2


In [9]:
for enc in ["cp949", "euc-kr", "ISO-8859-1", "latin1"]:
    try:
        df = pd.read_csv("all_gifs_metadata.csv", encoding=enc)
        print(f"✅ 성공: {enc}, shape={df.shape}")
        break              # 읽기 성공 시 반복 종료
    except UnicodeDecodeError as e:
        print(f"❌ {enc} 실패: {e}")

❌ cp949 실패: 'cp949' codec can't decode byte 0xc9 in position 91924: illegal multibyte sequence
❌ euc-kr 실패: 'euc_kr' codec can't decode byte 0x98 in position 24405: illegal multibyte sequence
✅ 성공: ISO-8859-1, shape=(10196, 13)


In [10]:
import pandas as pd

df = pd.read_csv("all_gifs_metadata.csv",encoding='ISO-8859-1')          # CSV 읽기
dupes = df[df.duplicated(subset=["id"], keep=False)]   # 해당 열에서 중복된 행만 추출

if dupes.empty:
    print("중복 없음 ✅")
else:
    print(f"{len(dupes)}개의 중복 행 발견 👀")
    print(dupes)               

중복 없음 ✅


In [7]:
from hyeonseok_utils.data_collector import *
DATA_DIR = f"./data/GIF/"
MAX_SIZE = 10 * 1024 * 1024 
AVAILABLE_DATA_FORMATS = [".gif", ".webm"]
MAX_SAMPLES =  -1 #  arguments.samples
TARGET = 1
SELECTOR_FILE = f"./hyeonseok_data_batch/uuid_batchs/batch_{TARGET}.txt"
print(SELECTOR_FILE," will load as batch")


# 처리할 데이터를 불러오는 부분
data_path_list, missing_path_list = process_data_list_loader(SELECTOR_FILE, MAX_SAMPLES, DATA_DIR, AVAILABLE_DATA_FORMATS, SPLITOR=',\n', ENCODING='ISO-8859-1')
if len(missing_path_list) > 0:
        raise Exception("missing_path_list exist")

./hyeonseok_data_batch/uuid_batchs/batch_1.txt  will load as batch

Successfully selected 300 files for processing
Missing files: []
Please check the files in the data directory


In [8]:
data_path_list

[PosixPath('/home/hyunseok/BARC/data/GIF/41ec2511-69ba-4c58-93b3-fff16cb86597.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/de72f162-faae-4b07-89a9-e52bb7104260.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/a1232131-127b-467d-9232-b8341a9356d4.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/a0d29d00-c732-4634-b259-fbb43fb24b3a.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/538d7ad1-56de-4dac-9b0f-8d2672227c78.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/73585ad1-0951-468e-9b55-914dfad9a341.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/94ea3d2e-564b-45c8-9024-609cf42cf3d5.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/0d728b53-49fc-44df-b953-b329669d4200.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/1e7d1366-82c9-494c-bbe6-a128f02f61ee.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/0bc8cfd9-c121-4126-be58-711be64ce3e2.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/d0384651-31c0-441e-99a6-8dbd2352ab63.gif'),
 PosixPath('/home/hyunseok/BARC/data/GIF/08714919-b145-4590-90f6-

In [10]:
data_name_list_parser_from_file(
            data_select_list_file_path = SELECTOR_FILE,
            MAX_SAMPLES=MAX_SAMPLES,
            splitor=',',
            ENCODING='ISO-8859-1'
        )


Successfully selected 300 files for processing


['41ec2511-69ba-4c58-93b3-fff16cb86597',
 'de72f162-faae-4b07-89a9-e52bb7104260',
 'a1232131-127b-467d-9232-b8341a9356d4',
 'a0d29d00-c732-4634-b259-fbb43fb24b3a',
 '538d7ad1-56de-4dac-9b0f-8d2672227c78',
 '73585ad1-0951-468e-9b55-914dfad9a341',
 '94ea3d2e-564b-45c8-9024-609cf42cf3d5',
 '0d728b53-49fc-44df-b953-b329669d4200',
 '1e7d1366-82c9-494c-bbe6-a128f02f61ee',
 '0bc8cfd9-c121-4126-be58-711be64ce3e2',
 'd0384651-31c0-441e-99a6-8dbd2352ab63',
 '08714919-b145-4590-90f6-e30db3996b6f',
 '8e43e853-56d7-4e1d-907e-206676b5254d',
 'bb2aaa80-76cc-43f3-84be-a3be7fd9aa4b',
 '782e9e49-91e6-4167-a259-9af72adcc92b',
 'c53f2ecd-1146-45b3-b892-0e779a464fcf',
 '2eb1d7e4-703f-452f-a11a-8535ed80fd72',
 'ac16d782-3718-41e4-9ab9-68467b7c61a5',
 '99a331df-0d1b-4bba-a5b7-c71bc1117e84',
 '80133aba-748f-4173-bd04-d9c694aaf7a5',
 'c8dbea75-f05b-447f-9946-c36522c2a34f',
 '7f24831f-0825-40b0-8120-ccd26d8d9609',
 'a711f89b-1b89-4c6c-98f7-227d6f65dd39',
 'aaeaf910-8151-4c98-ab38-78492f758fff',
 '67893238-888b-

In [None]:
from hyeonseok_utils.csv_key_unique_check import find_value_in_column
import os
path = '/home/hyunseok/BARC/results/success/0a0e563e-ed67-41c8-9a16-36bee66f8e8b.jsonl'
prev_step_id = os.path.splitext(os.path.basename(path))[0]
print(prev_step_id)
# 사용 예시
path  = "/home/hyunseok/BARC/results/metadata/step_descriptions_metadata.csv"
col   = "id"
val   = "746dbe8e-5269-4fdf-a01f-7a5295b42a43"
# val = 0

# 1) 단일 열에서 유일성 검사
result = find_value_in_column(csv_path=path, value=val, column=col)
for rec in  result['records']:
    print(rec['gif-id'])
   
print("유일성 결과:",result['unique'])
print( result['records'][0]['gif-id'])

# # 2) 특정 영역(region)에서 유일성 검사
# #    예: 100~199번째 행, 'user_id'와 'order_id' 두 열만 검사
# region_unique = is_value_unique(
#     csv_path=path,
#     rows=slice(100, 200),
#     cols=["gif-id"],
#     value=val
# )
# print(f"[영역 검사] 지정 영역에서 값 {val} 유일 여부:", region_unique)

0a0e563e-ed67-41c8-9a16-36bee66f8e8b
유일성 결과: False


IndexError: list index out of range

In [5]:
#!/usr/bin/env python3
import os
import glob
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed

# 1. 파라미터 설정
TEST = "float-line"                 # --test 에 들어갈 값
MODEL = "o3-mini"                  # --prompt_model 에 들어갈 값
TARGET = "0"                       # 출력 디렉터리 명에 포함될 값

# 2. 처리할 파일 목록 준비 (.jsonl 파일을 모두 가져오는 예시)
#    필요에 따라 glob 패턴을 바꾸거나, 직접 리스트를 정의하세요.
filenames = glob.glob("./results/success_desc/*.jsonl")

# 3. 출력 디렉터리 생성
output_dir = f"./results/success_code/"
os.makedirs(output_dir, exist_ok=True)

def process_file(filename):
    """
    단일 파일에 대해 subprocess로 generate_code.py를 실행하고,
    (filename, returncode, stdout, stderr)를 리턴합니다.
    """
    cmd = [
        "python", "generate_code.py",
        "--test", TEST,
        "--outdir", output_dir,
        "--ignore_cache_samples",
        "--prompt_model", MODEL,
        "--max_tokens", "40000",
        "-n", "2",
        "-s", "4",
        "--nohtml",
        "--jsonl", filename,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    return filename, result.returncode, result.stdout, result.stderr


In [6]:

def main():
    max_workers = 2  # 동시에 띄울 프로세스 수
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Future 객체와 filename 매핑
        futures = {executor.submit(process_file, fn): fn for fn in filenames}
        
        # 완료되는 순서대로 결과 처리
        for future in as_completed(futures):
            fn = futures[future]
            try:
                filename, code, out, err = future.result()
                if code == 0:
                    print(f"✅ SUCCESS: {filename}")
                else:
                    print(f"❌ FAIL:    {filename} (exit {code})")
                    print(f"--- stdout ---\n{out}")
                    print(f"--- stderr ---\n{err}")
            except Exception as e:
                print(f"❌ EXCEPTION: {fn} -> {e}")

 
main()

✅ SUCCESS: ./results/success_desc/c78f3913-03c7-464d-9989-c0a960a67c70.jsonl
✅ SUCCESS: ./results/success_desc/84d4192b-b044-4fa6-aef2-1e47dec8935e.jsonl
✅ SUCCESS: ./results/success_desc/a960acb8-98e8-4089-b0ff-c1089730fda6.jsonl
✅ SUCCESS: ./results/success_desc/84b021ef-f9e6-4d42-b1e4-2f1f1aeb47de.jsonl
✅ SUCCESS: ./results/success_desc/aad0bff2-7a8b-4ba9-b7c5-542d20344b61.jsonl
✅ SUCCESS: ./results/success_desc/18638a27-2f33-4d88-9aa7-bf0bf33a0266.jsonl
✅ SUCCESS: ./results/success_desc/ebb66ea2-cd60-40d7-9ee5-d44bcc7add0b.jsonl
✅ SUCCESS: ./results/success_desc/f65f5fb5-fa87-46fd-af70-9abbd7cdc67a.jsonl
✅ SUCCESS: ./results/success_desc/62e63c12-a68b-4f6e-98a4-843b0f63ec19.jsonl


KeyboardInterrupt: 

In [10]:
import re

def extract_uuids_from_log(log_path):
    """
    주어진 로그 파일에서 UUID들만 추출해 리스트로 반환합니다.
    """
    uuid_list = []
    # UUID 형식: 8-4-4-4-12 의 36자16진수 문자열
    pattern = re.compile(r'/([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-'
                         r'[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-'
                         r'[0-9a-fA-F]{12})\.gif')

    with open(log_path, 'r', encoding='utf-8') as f:
        for line in f:
            m = pattern.search(line)
            if m:
                uuid_list.append(m.group(1))
    return uuid_list

if __name__ == '__main__':
    log_file = '/home/hyunseok/BARC/results/failure_desc/test.txt'  # 여기에 실제 로그 파일 경로 입력
    uuids = extract_uuids_from_log(log_file)
    print(len(uuids))

300


In [18]:
from hyeonseok_utils.csv_processor import list_file_names_without_ext, load_names_from_csv
# 사용 예시
dir_path    = "/home/hyunseok/BARC/results/success_desc"   # 파일 이름을 뽑을 폴더 경로
csv_path    = "/home/hyunseok/BARC/results/metadata/step_descriptions_metadata.csv"    # 비교할 CSV 파일 경로
column_name = "id"                     # CSV에서 이름이 들어있는 컬럼명

# 1) 디렉토리에서 파일명(확장자 제외) 뽑기
# file_list = set(list_file_names_without_ext(dir_path))

log_file = '/home/hyunseok/BARC/results/failure_desc/failed_1.txt'  # 여기에 실제 로그 파일 경로 입력
file_list = set(extract_uuids_from_log(log_file))

# 2) CSV에서 이름 리스트 불러오기
csv_list = set(load_names_from_csv(csv_path, column_name))

# 3) 교집합 & 차집합 계산
intersection   = file_list & csv_list  # 두 리스트 모두에 있는 이름
only_in_files  = file_list - csv_list  # 파일에는 있지만 CSV에는 없는 이름
only_in_csv    = csv_list - file_list  # CSV에는 있지만 파일에는 없는 이름

# 4) 결과 출력
print("🔹 교집합 (Intersection):", intersection)
print("🔹 파일 전용 (In files only):", only_in_files)
print("🔹 CSV 전용  (In CSV only):", only_in_csv)


🔹 교집합 (Intersection): set()
🔹 파일 전용 (In files only): {'edfec6c0-a206-4f2f-a389-b1566e6aea0e', '1d336bbe-2fa8-4803-a1e9-1cb649ca8319', 'e4ff22fe-89f7-45c4-978f-54294017210d', '35ef97cd-6cd1-41d5-b013-8202b03bf432', '785a98a9-5a03-4767-8c71-ac3f9e9254af', '932897a1-fecc-4aec-93dc-fa32a5fb06f7', '2bd83f48-4aa4-40a6-b317-964159993498', '5582e135-0062-4041-82a0-5c899a710fb0', '69f63ba3-6e0a-4244-a46d-8fa0779533ad', '3abd08c9-ff30-4559-9f44-199bb6841f63', 'cb3ae313-63c5-4629-95cb-6fa98316b386', 'c29d60ce-b28e-455c-aa42-c2cc3f39e285', '064423f0-af6d-46c4-a65f-d6b2b98490d4', '4b256856-2f04-4e33-a5cb-5c5c190361d2', '5fde9edd-ec68-4850-a113-14ea228c5977', '28416d0c-9e12-4e0c-9bf9-30e638bf45ce', 'a5ee2a21-7f4d-4a6f-b752-5c13bb6b71b0', '4d48b5ec-5398-4444-b9a1-90e82c918075', '22aa4dcf-7ec5-4b0e-b08c-b2a9c4cde8fa', 'bbf11c69-562e-4a9b-8b37-452995807e91', '6abb03fe-09d7-4827-8fc6-3490e6178092', '14e5815a-943f-4e7d-8f05-b8a762c10a32', 'ad11980d-23c5-4d6e-b38b-97879428bca3', 'bc0b2c1e-b2b8-48b5-be95-8

In [19]:
from hyeonseok_utils.csv_processor import list_file_names_without_ext, load_names_from_csv
# 사용 예시
dir_path_   = "/home/hyunseok/BARC/results/success_desc"   # 파일 이름을 뽑을 폴더 경로
csv_path   = "/home/hyunseok/BARC/results/metadata/step_descriptions_metadata.csv"    # 비교할 CSV 파일 경로
column_name = "id"                     # CSV에서 이름이 들어있는 컬럼명

# 1) 디렉토리에서 파일명(확장자 제외) 뽑기
# file_list = set(list_file_names_without_ext(dir_path))

log_file = '/home/hyunseok/BARC/results/failure_desc/failed_2.txt'  # 여기에 실제 로그 파일 경로 입력
file_list_ = set(extract_uuids_from_log(log_file))

# 2) CSV에서 이름 리스트 불러오기
csv_list_ = set(load_names_from_csv(csv_path, column_name))

# 3) 교집합 & 차집합 계산
intersection_   = file_list_ & csv_list_  # 두 리스트 모두에 있는 이름
only_in_files_  = file_list_ - csv_list_  # 파일에는 있지만 CSV에는 없는 이름
only_in_csv_    = csv_list_ - file_list_  # CSV에는 있지만 파일에는 없는 이름

# 4) 결과 출력
print("🔹 교집합 (Intersection):", intersection_)
print("🔹 파일 전용 (In files only):", only_in_files_)
print("🔹 CSV 전용  (In CSV only):", only_in_csv_)

🔹 교집합 (Intersection): set()
🔹 파일 전용 (In files only): {'b4f3fbfa-f7e9-4b7a-a65b-a908aa316582', '6e42ff1e-097b-4e67-aafe-42cef1a02a46', '2f3c1933-4cd3-4dbc-927a-9b48c7689c8b', 'a884f09a-1d2a-454d-84a0-842ab37a6971', '9e1ccaee-cbd7-4226-abeb-13561c31567b', '13bbaa32-ad9a-4370-a5b6-0cc441a3bb75', '77970240-bc95-4f85-bec6-e444bd982cda', '42534a07-51d5-4987-bfd6-53bd4d5486ea', 'a5dcde7c-60a9-4480-ba08-aae632f39d18', '6b999721-334f-4d05-844f-b474e66fa05b', '8fde91b4-d36a-4d53-bbfb-3d5310b48c40', 'aad628ad-dd9d-40b8-a201-cb320856dba7', '419ddddd-ba4d-4a48-973f-5c230661b6b1', '2245e66a-cbfe-471a-999d-ceeeb108b455', 'abfff1b5-7695-43ef-8395-5dffa4d9341a', 'c892a0be-b616-4c81-ad40-c1e7cae6794c', '459ef4f2-7638-4b78-91c2-9b25594225f5', '4d4b56db-f05a-4769-8fa4-29a9eb6675bc', '5f255f6e-974b-47d5-88da-3c93a5dfed56', 'a74da1e9-8acd-44e3-9028-b77756d0729f', '9ab74805-a21c-439e-b198-bbfeeae51090', '1fd6fa47-5b01-49a5-b6b4-8d46265c20e9', 'a8093b18-80d4-4a42-8c59-5967fd2b0296', 'ca67e64d-c4f0-494d-8cc5-6

In [None]:
only_in_csv_ + only_in_csv

In [None]:
from hyeonseok_utils.result_recoder import parse_step_code_result
from hyeonseok_utils.generate_metadata_desc import generate_metadata_csv_of_step_descriptions
from hyeonseok_utils.csv_key_unique_check import find_value_in_column

import uuid
from datetime import datetime, timezone
METADATA_CSV_PATH='/home/hyunseok/BARC/results/metadata/step_problem_metadata.csv'
METADATA_PREV_CSV_PATH='/home/hyunseok/BARC/results/metadata/step_code_metadata.csv'
col = "id"
tet = '0a0ca1f6-1545-4c57-b989-48f8489fec0e'
prev_step_id = os.path.splitext(os.path.basename(tet))[0]
result = find_value_in_column(csv_path=METADATA_PREV_CSV_PATH, value=prev_step_id, column=col)
result['records'][0]['gif-id']

In [25]:
import os
from hyeonseok_utils.csv_key_unique_check import find_value_in_column
if __name__ == "__main__":
    base_dir = '/home/hyunseok/BARC/results/success_problem'
    path='/home/hyunseok/BARC/results/metadata/step_problem_metadata.csv'
    val  = '0a6fed1a-46b7-4db9-87c1-8968ccf6328f'

    # 예시1: user_id 열에서 값 찾기
    result = find_value_in_column(path, column="id", value=val)
    print(f"[열 검사] 유일 여부: {result}")
    gif_id = result['records'][0]['id']
    prob_path = os.path.join(base_dir, result['records'][0]['id']+'.jsonl')
    if os.path.exists(prob_path):
        print(prob_path)
 

[열 검사] 유일 여부: {'unique': True, 'records': [{'id': '0a6fed1a-46b7-4db9-87c1-8968ccf6328f', 'step-name': 'description', 'prev-step-id': 'GIF', 'gif-id': '979ee1f8-b760-4d5a-b9c3-7a49d3ff341d', 'gen-model': 'o3-mini', 'result_code': 1, 'result_path': '/home/hyunseok/BARC/results/success_problem/0a6fed1a-46b7-4db9-87c1-8968ccf6328f.jsonl', 'error_message': nan, 'createAt': '2025-05-10T06:30:27+00:00', 'token-usage': '""', 'visualization_path': nan}]}
./data/GIF/979ee1f8-b760-4d5a-b9c3-7a49d3ff341d.gif
