In [14]:
from datasets import load_dataset
import os

# Settings: which KMMLU subset/split to export, and the folder to write into.
subset = "Electronics-Engineering"
split = "train"
save_dir = "converted_json"

# Step 1: load the requested subset/split of the HAERAE-HUB/KMMLU dataset.
dataset = load_dataset("HAERAE-HUB/KMMLU", subset, split=split)

# Step 2: make sure the output folder exists (no error if it already does).
os.makedirs(save_dir, exist_ok=True)

# Step 3: build the output path as "<subset>-<split>.json" inside save_dir.
json_name = f"{subset}-{split}.json"
json_path = os.path.join(save_dir, json_name)

# Step 4: convert to a DataFrame and write one JSON record per line.
# Note the file is JSON Lines (lines=True) even though it carries a .json
# extension; force_ascii=False writes non-ASCII text unescaped.
df = dataset.to_pandas()
df.to_json(
    json_path,
    orient="records",
    lines=True,
    force_ascii=False,
)

print(f"✅ 저장 완료: {json_path}")


✅ 저장 완료: converted_json\Electronics-Engineering-train.json


In [20]:
import pandas as pd
import os

# Settings for the typo-injection run.

# Input folder (JSON Lines files to read) and output folder (typo variants).
json_dir = "./converted_json"
save_dir = "./modified_json"

# Name of the DataFrame column whose text gets corrupted.
column = "question"

# Typo types to generate; each entry must be a key of `typo_funcs`.
typo_types = [
    "introduce_typo",
    "drop_jongsung",
    "repeat_char",
    "merge_words",
    "swap_parts",
]

# Keyword arguments forwarded to the typo functions.
# NOTE(review): presumably `count` is an absolute number of edits and `ratio`
# a proportional alternative - confirm against typo_generation.
count = 3
ratio = None
max_repeat = 3  # only forwarded to the "repeat_char" typo function

from typo_generation import (
    introduce_typo_to_sentence,
    drop_jongsung_sentence,
    repeat_char_typo_no_space,
    merge_words_typo,
    grammar_error,
    swap_parts_in_sentence,
    typo_dict
)

# Collect the path of every entry in json_dir whose name ends with ".json".
# os.listdir returns entries in arbitrary order, so sort the names to keep the
# processing order (and the printed log) stable across runs and platforms.
json_paths = [
    os.path.join(json_dir, f)
    for f in sorted(os.listdir(json_dir))
    if f.endswith(".json")
]

# Dispatch table: typo-type name -> one-argument callable that takes a sentence
# and returns whatever the underlying typo_generation function returns.
# The wrappers read count / ratio / max_repeat / typo_dict from module scope at
# call time, so changing those settings later affects subsequent calls.
typo_funcs = {
    "introduce_typo": lambda sentence: introduce_typo_to_sentence(
        sentence, count=count, ratio=ratio
    ),
    "drop_jongsung": lambda sentence: drop_jongsung_sentence(
        sentence, count=count, ratio=ratio
    ),
    "repeat_char": lambda sentence: repeat_char_typo_no_space(
        sentence, count=count, ratio=ratio, max_repeat=max_repeat
    ),
    "merge_words": lambda sentence: merge_words_typo(
        sentence, count=count, ratio=ratio
    ),
    "grammar_error": lambda sentence: grammar_error(
        sentence, typo_dict, count=count, ratio=ratio
    ),
    "swap_parts": lambda sentence: swap_parts_in_sentence(
        sentence, count=count, ratio=ratio
    ),
}

# Create the output folder; a no-op if it already exists.
os.makedirs(save_dir, exist_ok=True)

# Fail fast on a misspelled typo type. Without this check the KeyError would
# only surface mid-run, after some output files had already been written.
unknown_types = [t for t in typo_types if t not in typo_funcs]
if unknown_types:
    raise KeyError(f"unknown typo type(s): {unknown_types}")

# For every input file, write one output file per typo type, named
# "<input name without extension>_<typo_type>.json" (JSON Lines content).
# NOTE(review): if typo_generation draws from a random source, the outputs
# differ on every run; consider seeding it in the settings cell - confirm.
for json_path in json_paths:
    df = pd.read_json(json_path, lines=True)
    # Strip only the final extension. str.replace(".json", "") would also
    # delete any ".json" occurring in the middle of the file name.
    base_filename = os.path.splitext(os.path.basename(json_path))[0]

    if column not in df.columns:
        # Report the skip explicitly instead of silently producing nothing.
        print(f"⚠️ 건너뜀: {json_path} ('{column}' 컬럼 없음)")
        continue

    for typo_type in typo_types:
        typo_func = typo_funcs[typo_type]
        df_copy = df.copy()
        # Convert each value to str before corrupting it, but leave missing
        # values untouched: a blanket astype(str) would turn them into the
        # literal text "nan"/"None" and then inject typos into that.
        df_copy[column] = df[column].map(
            lambda value: typo_func(str(value)), na_action="ignore"
        )

        save_name = f"{base_filename}_{typo_type}.json"  # file name first, typo type last
        save_path = os.path.join(save_dir, save_name)

        df_copy.to_json(save_path, orient="records", lines=True, force_ascii=False)
        print(f"✅ 저장 완료: {save_path}")



✅ 저장 완료: ./modified_json\Electronics-Engineering-train_introduce_typo.json
✅ 저장 완료: ./modified_json\Electronics-Engineering-train_drop_jongsung.json
✅ 저장 완료: ./modified_json\Electronics-Engineering-train_repeat_char.json
✅ 저장 완료: ./modified_json\Electronics-Engineering-train_merge_words.json
✅ 저장 완료: ./modified_json\Electronics-Engineering-train_swap_parts.json
