In [None]:
from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset
import os

# Settings: Hugging Face dataset id and the local folder for the JSON dumps.
dataset_name = "HAERAE-HUB/KMMLU"
save_dir = "converted_json"
os.makedirs(save_dir, exist_ok=True)

# 1. Fetch the name of every subset (config) of the dataset.
subsets = get_dataset_config_names(dataset_name)

for subset in subsets:
    # Reset per subset. Without this, a failure while listing the splits would
    # make the error message below reference an unbound name (first subset) or
    # a stale split left over from the previous subset.
    split = None
    try:
        # 2. List the splits available for this subset (e.g. train, validation, test).
        splits = get_dataset_split_names(dataset_name, subset)

        for split in splits:
            print(f"📥 다운로드 중: subset={subset}, split={split}")

            # 3. Load this subset/split.
            dataset = load_dataset(dataset_name, subset, split=split)

            # 4. Output file is named "<subset>-<split>.json" inside save_dir.
            json_name = f"{subset}-{split}.json"
            json_path = os.path.join(save_dir, json_name)

            # 5. Convert to pandas and write JSON Lines (one record per line),
            #    keeping non-ASCII text unescaped.
            df = dataset.to_pandas()
            df.to_json(json_path, orient="records", lines=True, force_ascii=False)

            print(f"✅ 저장 완료: {json_path}")

    except Exception as e:
        # Best-effort: report the failure and move on to the next subset.
        # split is None when the failure happened before any split was reached.
        print(f"❌ 오류 발생: subset={subset}, split={split} → {e}")


In [1]:
import pandas as pd
import os

# Settings
json_dir = "./KMMLU"                  # folder holding the source JSON Lines files
save_dir = "./KMMLU_Typo_Generated"   # folder receiving the typo-injected files
column = "question"                   # column whose text gets typos applied
typo_types = [
    "introduce_typo",
    "drop_jongsung",
    "repeat_char",
    "merge_words",
    "swap_parts",
]
count_range = range(1, 6)             # typo levels to generate (1 through 5)
ratio = None                          # forwarded as-is to the typo functions
max_repeat = 3                        # forwarded to repeat_char_typo_no_space

# Typo type -> 1-based index (the position of the type in typo_types),
# used as the typo number in output file names.
typo_idx_map = {name: idx for idx, name in enumerate(typo_types, start=1)}

from typo_generation_modules import (
    introduce_typo_to_sentence,
    drop_jongsung_sentence,
    repeat_char_typo_no_space,
    merge_words_typo,
    swap_parts_in_sentence,
)

# Only the test-split JSON files are processed. Sorted so the run order is
# deterministic (os.listdir returns entries in arbitrary order).
json_paths = [
    os.path.join(json_dir, f)
    for f in sorted(os.listdir(json_dir))
    if f.endswith("-test.json")
]

# Create the output folder.
os.makedirs(save_dir, exist_ok=True)

# Dispatch table for the typo types that take no per-sentence state.
# Built once here instead of on every level iteration; ratio / max_repeat are
# still looked up at call time, exactly as before.
typo_funcs = {
    "drop_jongsung": lambda text: drop_jongsung_sentence(text, count=1, ratio=ratio),
    "repeat_char": lambda text: repeat_char_typo_no_space(text, count=1, ratio=ratio, max_repeat=max_repeat),
    "merge_words": lambda text: merge_words_typo(text, count=1, ratio=ratio),
    "swap_parts": lambda text: swap_parts_in_sentence(text, count=1, ratio=ratio),
}

# For every source file and typo type, write one file per level. Level N is
# produced by applying one more typo (count=1) on top of the level N-1 text.
for json_path in json_paths:
    df = pd.read_json(json_path, lines=True)
    # splitext removes only the trailing extension; str.replace would also
    # strip ".json" appearing in the middle of a name.
    base_filename = os.path.splitext(os.path.basename(json_path))[0]

    if column not in df.columns:
        # Previously skipped without any message.
        print(f"⚠️ 건너뜀: '{column}' 컬럼이 없습니다 → {json_path}")
        continue

    for typo_type in typo_types:
        typo_idx = typo_idx_map[typo_type]
        # Per-sentence sets passed to introduce_typo_to_sentence as
        # skip_indices and extended with whatever it reports as applied.
        # Only used by the "introduce_typo" type.
        prev_indices = [set() for _ in range(len(df))]
        df_work = None

        for level in count_range:
            if level == 1:
                # Level 1 always starts from the untouched source data.
                df_work = df.copy()
            elif df_work is None:
                # count_range does not start at 1: continue from the previous
                # level's file already on disk.
                prev_name = f"{base_filename}_typo_{typo_idx}_level_{level - 1}.json"
                df_work = pd.read_json(os.path.join(save_dir, prev_name), lines=True)
            # Otherwise df_work still holds the previous level's output, so
            # there is no need to read back the file that was just written.
            texts = df_work[column].astype(str).tolist()

            if typo_type == "introduce_typo":
                new_texts = []
                for text, used in zip(texts, prev_indices):
                    new_text, applied = introduce_typo_to_sentence(
                        text,
                        count=1,
                        ratio=ratio,
                        skip_indices=used,
                    )
                    new_texts.append(new_text)
                    used.update(applied)
            else:
                apply_typo = typo_funcs[typo_type]
                new_texts = [apply_typo(text) for text in texts]

            df_work[column] = new_texts

            save_name = f"{base_filename}_typo_{typo_idx}_level_{level}.json"
            save_path = os.path.join(save_dir, save_name)
            df_work.to_json(save_path, orient="records", lines=True, force_ascii=False)
            print(f"✅ 저장 완료: {save_path}")


✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_1_level_1.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_1_level_2.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_1_level_3.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_1_level_4.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_1_level_5.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_2_level_1.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_2_level_2.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_2_level_3.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_2_level_4.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_2_level_5.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_3_level_1.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_3_level_2.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_3_level_3.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_typo_3_level_4.json
✅ 저장 완료: ./KMMLU_Typo_Generated\Accounting-test_

In [None]:
import os
import json
from collections import defaultdict

# Settings:
#   base_dir  - folder containing the typo-applied files
#   final_dir - folder that receives the merged results
base_dir, final_dir = "KMMLU_typo", "All_KMMLU_Typo_Tests"

# Make sure the output folder exists before anything is written to it.
os.makedirs(name=final_dir, exist_ok=True)

# Extract the typo identifier and the level from a typo file name.
def extract_typo_info(filename):
    """Return ``(typo, level)`` parsed from a typo file name.

    The name is expected to end in ``_typo_<n>_level_<m>.json``; for
    ``"Accounting-test_typo_1_level_5.json"`` the result is ``("typo_1", "5")``.

    The previous implementation returned only the bare index (``"1"``) because
    splitting on ``"_"`` separates ``"typo"`` from its number.

    Args:
        filename: File name (not a full path) of a typo file.

    Returns:
        A ``(typo, level)`` tuple of strings, e.g. ``("typo_1", "5")``.

    Raises:
        ValueError: If the name does not end in ``_typo_<n>_level_<m>``.
    """
    stem = filename[: -len(".json")] if filename.endswith(".json") else filename
    parts = stem.split("_")
    # Expected tail after splitting: [..., "typo", "<n>", "level", "<m>"]
    if len(parts) < 4 or parts[-4] != "typo" or parts[-2] != "level":
        raise ValueError(
            f"Unexpected typo file name (want '*_typo_<n>_level_<m>.json'): {filename!r}"
        )
    typo = f"{parts[-4]}_{parts[-3]}"  # e.g., 'typo_1'
    level = parts[-1]  # e.g., '5'
    return typo, level

# Group the typo files as grouped_files[typo][level] -> list of file names.
# Sorted so the grouping and the key order inside each merged file are
# deterministic (os.listdir returns entries in arbitrary order).
grouped_files = defaultdict(lambda: defaultdict(list))
files = sorted(f for f in os.listdir(base_dir) if f.endswith(".json") and "test" in f)
for filename in files:
    typo, level = extract_typo_info(filename)
    grouped_files[typo][level].append(filename)

# Merge every (typo, level) group into a single JSON file written directly
# into final_dir (no intermediate folders).
for typo_type, files_by_level in grouped_files.items():
    for level, level_files in files_by_level.items():
        combined_data = {}
        for file in level_files:
            full_path = os.path.join(base_dir, file)
            # Each input is JSON Lines: one JSON object per non-blank line.
            with open(full_path, "r", encoding="utf-8") as f:
                parsed_lines = [json.loads(line) for line in f if line.strip()]
            # Everything before the LAST hyphen is the subject name, so names
            # that themselves contain hyphens are kept whole
            # ("Real-Estate-test_..." -> "Real-Estate", not "Real").
            # Named subject_name rather than dataset_name so that, in shared
            # notebook state, it does not overwrite a dataset_name set elsewhere.
            subject_name = file.rsplit("-", 1)[0]  # e.g., Accounting
            combined_data[subject_name] = parsed_lines

        # Output name: KMMLU_<typo>_level_<level>.json, where <typo> and
        # <level> are exactly what extract_typo_info returned.
        save_filename = f"KMMLU_{typo_type}_level_{level}.json"
        save_path = os.path.join(final_dir, save_filename)

        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(combined_data, f, ensure_ascii=False, indent=2)

print("✅ All typo-level test files have been merged and saved.")


✅ All typo-level test files have been merged and saved.


In [6]:
import os
import subprocess
import shutil

# [1] Settings
repo_url = "https://github.com/CAU-AI-Project-Team5/K-Typo.git"
local_repo_path = "."
source_dir = "KMMLU_typo"
target_dir = os.path.join("Datasets", "KMMLU")


def _git(*args, warn=True):
    """Run ``git <args>`` in local_repo_path and return the CompletedProcess.

    Output is captured as text. When ``warn`` is true, a non-zero exit status
    is reported on stdout instead of being silently ignored; the script still
    continues (best-effort, as before).
    """
    proc = subprocess.run(
        ["git", *args], cwd=local_repo_path, capture_output=True, text=True
    )
    if warn and proc.returncode != 0:
        print(f"⚠️ git {' '.join(args)} failed ({proc.returncode}): {proc.stderr.strip()}")
    return proc


# [2] Initialise a repository if this folder is not one yet.
if not os.path.exists(os.path.join(local_repo_path, ".git")):
    _git("init")

# [3] Connect the remote. Compare whole remote names: a substring test would
#     treat e.g. "origin2" as if "origin" were already configured.
remotes = _git("remote").stdout
if "origin" not in remotes.split():
    _git("remote", "add", "origin", repo_url)

# [4] Create the main branch if it does not exist. Compare whole branch names
#     (after stripping git's "* " / "+ " markers): a substring test would
#     treat e.g. "maintenance" as if "main" already existed.
branches = _git("branch").stdout
branch_names = {line.lstrip("*+ ").strip() for line in branches.splitlines()}
if "main" not in branch_names:
    _git("checkout", "-b", "main")

# [5] Create the Datasets/KMMLU folder.
os.makedirs(target_dir, exist_ok=True)

# [6] Copy the typo files (the originals stay in source_dir).
for file in os.listdir(source_dir):
    if file.endswith(".json"):
        shutil.copy(os.path.join(source_dir, file), os.path.join(target_dir, file))

# [7] Git add + commit + push.
# NOTE(review): "git add ." stages everything under local_repo_path, not only
# target_dir — confirm that is intended before pushing.
_git("add", ".")
# A non-zero exit from commit is expected when there is nothing new to commit,
# so it is reported as information rather than as a failure.
commit = _git("commit", "-m", "Move KMMLU_typo files into Datasets/KMMLU", warn=False)
if commit.returncode != 0:
    print("ℹ️ git commit:", commit.stdout.strip() or commit.stderr.strip())
result = _git("push", "-u", "origin", "main", warn=False)

# Print the outcome of the push.
print("stdout:\n", result.stdout)
print("stderr:\n", result.stderr)
print("returncode:", result.returncode)


stdout:
 branch 'main' set up to track 'origin/main'.

stderr:
 Everything up-to-date

returncode: 0
