In [1]:
import os
import csv
import base64
import re
import ast
from dotenv import load_dotenv
from groq import Groq

# Load variables from a local .env file into the process environment
# (presumably the Groq API key used later — not visible in this section).
load_dotenv()

def split_top_level_commas(s):
    """
    Split ``s`` on commas that sit outside any square brackets.

    A depth counter goes up on '[' and down on ']'; only a comma seen at
    depth 0 ends the current piece. Every piece is whitespace-stripped before
    it is stored. The text after the last separator is appended only when it
    is non-empty (checked before stripping), so a string that ends with a
    comma does not produce a trailing empty piece.
    """
    pieces = []
    buffer = []
    depth = 0
    for ch in s:
        # A top-level comma closes the current piece and is itself discarded.
        if ch == ',' and depth == 0:
            pieces.append(''.join(buffer).strip())
            buffer = []
            continue
        if ch == '[':
            depth += 1
        elif ch == ']':
            depth -= 1
        buffer.append(ch)
    if buffer:
        pieces.append(''.join(buffer).strip())
    return pieces

def parse_problem_info(input_str):
    """
    Parse one line of annotation.txt into a problem-info dict.

    The line is split on top-level commas (commas inside square brackets do
    not count) into at least four fields:

        <file name>, <problem number>, <type>, <answer ...>

    Everything from the fourth field onwards is re-joined with ',' and kept
    as the answer, so an answer containing top-level commas survives (the
    whitespace around those commas is lost because each field is stripped).

    The file name must start with the pattern
    ``<book> - <page>_section_<n>_conf<score>.jpg``; book, page and section
    number are extracted from it.

    Returns:
        dict with keys ``file``, ``book``, ``page`` (int), ``section_number``
        (int), ``problem_number`` (int), ``type`` (str) and ``answer`` (str).
        ``type`` and ``answer`` are returned as the raw text from the line;
        no conversion is applied to them.

    Raises:
        ValueError: if the line has fewer than four fields, if the problem
            number is not an integer literal, or if the file name does not
            match the expected pattern.
    """
    top_level_parts = split_top_level_commas(input_str)
    if len(top_level_parts) < 4:
        raise ValueError(f"형식 오류: {input_str}")

    file_part, number_part, type_str = top_level_parts[:3]
    problem_number = int(number_part)
    # The remaining fields all belong to the answer.
    answer_str = ','.join(top_level_parts[3:])

    # Pull book title, page and section index out of the file name.
    file_match = re.match(r"(.+?) - (\d+)_section_(\d+)_conf[\d.]+\.jpg", file_part)
    if not file_match:
        raise ValueError(f"파일명 형식 오류: {file_part}")

    book = file_match.group(1)
    page = int(file_match.group(2))
    section_number = int(file_match.group(3))

    return {
        "file": file_part,
        "book": book,
        "page": page,
        "section_number": section_number,
        "problem_number": problem_number,
        "type": type_str,
        "answer": answer_str
    }

def compare_answer(pred, gold):
    """Return True when the model response and the gold answer match as text.

    Both values are converted with ``str()`` and stripped of leading and
    trailing whitespace, then compared for exact equality. No numeric or
    structural normalisation is done, so ``"[3,4]"`` and ``"[3, 4]"`` do not
    match.

    If converting either value raises an ordinary exception, the pair counts
    as a mismatch and False is returned. Interpreter-level signals such as
    KeyboardInterrupt are not swallowed.
    """
    try:
        return str(pred).strip() == str(gold).strip()
    except Exception:
        # Best-effort comparison: a value that cannot be stringified is a miss.
        return False

def load_annotation_dict(annotation_file):
    """Read an annotation.txt file into a ``{file name: info}`` dict.

    Blank lines are skipped, as are lines that do not have at least four
    comma-separated fields with an all-digit second field (for example a
    header row). Every remaining line is parsed with ``parse_problem_info``
    and stored under the base name of its ``file`` value; a later line with
    the same base name replaces the earlier one.

    Args:
        annotation_file: path to the annotation text file (UTF-8, with or
            without a byte-order mark).

    Returns:
        dict mapping file base name to the info dict produced by
        ``parse_problem_info``.

    Raises:
        ValueError: propagated from ``parse_problem_info`` when a line passes
            the quick pre-check but is still malformed.
        OSError: if the file cannot be opened.
    """
    mapping = {}
    # "utf-8-sig" reads plain UTF-8 as well, but drops a leading BOM so that
    # U+FEFF does not end up glued to the first record's file/book name.
    with open(annotation_file, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            # Cheap pre-check to skip non-record lines before full parsing.
            parts = [p.strip() for p in line.split(',', 3)]
            if len(parts) < 4 or not parts[1].isdigit():
                continue
            info = parse_problem_info(line)
            mapping[os.path.basename(info["file"])] = info
    return mapping

In [2]:
# Build the {file name: info} lookup from the annotation file, then echo the
# whole dict as the cell output.
ann1 = load_annotation_dict("../../images/LLM_annotation/annotation.txt")
ann1

{'라이트쎈 중등수학 1-1 - 1_section_00_conf0.61.jpg': {'file': '라이트쎈 중등수학 1-1 - 1_section_00_conf0.61.jpg',
  'book': '라이트쎈 중등수학 1-1',
  'page': 1,
  'section_number': 0,
  'problem_number': 62,
  'type': '1',
  'answer': '2'},
 '라이트쎈 중등수학 1-1 - 1_section_01_conf0.55.jpg': {'file': '라이트쎈 중등수학 1-1 - 1_section_01_conf0.55.jpg',
  'book': '라이트쎈 중등수학 1-1',
  'page': 1,
  'section_number': 1,
  'problem_number': 63,
  'type': '0',
  'answer': '2'},
 '라이트쎈 중등수학 1-1 - 1_section_02_conf0.51.jpg': {'file': '라이트쎈 중등수학 1-1 - 1_section_02_conf0.51.jpg',
  'book': '라이트쎈 중등수학 1-1',
  'page': 1,
  'section_number': 2,
  'problem_number': 61,
  'type': '1',
  'answer': '10'},
 '라이트쎈 중등수학 1-1 - 1_section_03_conf0.46.jpg': {'file': '라이트쎈 중등수학 1-1 - 1_section_03_conf0.46.jpg',
  'book': '라이트쎈 중등수학 1-1',
  'page': 1,
  'section_number': 3,
  'problem_number': 65,
  'type': '0',
  'answer': '[3,4]'},
 '라이트쎈 중등수학 1-1 - 1_section_04_conf0.44.jpg': {'file': '라이트쎈 중등수학 1-1 - 1_section_04_conf0.44.jpg',
  'book': '라이트쎈

In [36]:
import os
import unicodedata
import re

def clean_string(s):
    """Normalise a string for comparison.

    Steps, in order: Unicode NFC normalisation, removal of every whitespace
    run, removal of the zero-width characters U+200B, U+200C, U+200D and
    U+FEFF, and finally lower-casing.
    """
    composed = unicodedata.normalize("NFC", s)
    without_spaces = re.sub(r"\s+", "", composed)
    visible_only = re.sub(r"[\u200b\u200c\u200d\uFEFF]", "", without_spaces)
    return visible_only.lower()

def process_annotations(base_dir, annotation_dict, source_name):
    """Pair every .jpg in ``base_dir`` with its annotation record.

    File names are compared after ``clean_string`` normalisation, so
    differences in Unicode composition, whitespace, zero-width characters and
    letter case do not prevent a match. When several annotation entries
    normalise to the same name, the first one in ``annotation_dict`` order
    wins. Images without a matching annotation are reported with a printed
    warning and skipped.

    Args:
        base_dir: directory to scan (non-recursive) for ``.jpg`` files.
        annotation_dict: mapping whose values are info dicts with the keys
            ``file``, ``book``, ``page``, ``problem_number``, ``type`` and
            ``answer``.
        source_name: label stored in the ``source`` field of every record.

    Returns:
        list of dicts with keys ``source``, ``book``, ``page``,
        ``problem_number``, ``type`` and ``answer``, ordered by file name.
    """
    # Normalise each annotation file name once instead of once per image;
    # setdefault keeps the first entry when two names normalise identically.
    lookup = {}
    for info in annotation_dict.values():
        lookup.setdefault(clean_string(info.get("file", "")), info)

    results = []
    # os.listdir order is arbitrary; sort so repeated runs give the same list.
    for fname in sorted(os.listdir(base_dir)):
        if not fname.lower().endswith(".jpg"):
            continue

        matched = lookup.get(clean_string(fname))
        if matched is None:
            print(f"⚠️ 매칭 실패: {fname} (annotation에 file이 없음)")
            continue

        results.append({
            "source": source_name,
            "book": matched["book"],
            "page": matched["page"],
            "problem_number": matched["problem_number"],
            "type": matched["type"],
            "answer": matched["answer"],
        })
    return results


In [37]:
# Collect the matched annotation records; only the LLM_annotation image
# directory is processed in this cell.
results = []
results.extend(process_annotations("../../images/LLM_annotation", ann1, "LLM_annotation"))