In [1]:
import os
import json
import pandas as pd
from pprint import pprint

In [None]:
import os
import json
import pandas as pd

# Stoichiometry columns that are always listed first, in this order, when present.
_STOICH_PRIORITY = ('Li ratio', 'Ni ratio', 'Co ratio', 'Mn ratio', 'O ratio')

# Fields copied verbatim from a sample's category-1 record.
_STOICH_FIELDS = (
    'Commercial NCM used',
    'Lithium source',
    'Synthesis method',
    'Crystallization method',
    'Crystallization final temperature',
    'Crystallization final duration (hours)',
    'Doping',
    'Coating',
)

# (output column, key inside the first 'Electrolyte' record) pairs.
_ELECTROLYTE_FIELDS = (
    ('Electrolyte salt', 'Salt'),
    ('Electrolyte concentration', 'Concentration'),
    ('Electrolyte solvent', 'Solvent'),
    ('Electrolyte solvent ratio', 'Solvent ratio'),
)

# Accepted spellings of each category-3 key; the first spelling is the column name.
_PARTICLE_FIELDS = (
    ('Particle size', 'ParticleSize'),
    ('Particle shape', 'ParticleShape'),
    ('Particle distribution', 'ParticleDistribution'),
    ('Coating layer characteristics', 'CoatingLayerCharacteristics'),
    ('Crystal structure and lattice characteristics', 'CrystalStructureAndLatticeCharacteristics'),
)


def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _first_dict(value):
    """Return the first record of a list-of-dicts (or the dict itself), else ``{}``."""
    if isinstance(value, dict):
        return value
    if isinstance(value, list) and value and isinstance(value[0], dict):
        return value[0]
    return {}


def _load_paper_categories(folder, hyper_param_method):
    """Load every ``category-<n>-paper_*`` JSON file in *folder*, keyed by ``category-<n>``."""
    loaded = {}
    if hyper_param_method is not None:
        # NOTE(review): the original loader did nothing for hyper-parameter runs
        # (the branch was a bare `pass`); the expected filename pattern is not
        # visible here, so that behavior is kept. TODO: implement once confirmed.
        return loaded

    # os.listdir() order is arbitrary, so the category number is parsed from the
    # filename itself instead of being inferred from the position in the listing.
    for filename in sorted(os.listdir(folder)):
        head, sep, _ = filename.partition("-paper_")
        if not sep or not head.startswith("category-"):
            continue
        number = head[len("category-"):]
        if not (number.isascii() and number.isdigit()):
            continue
        key = f"category-{int(number)}"
        if key in loaded:
            # Several files for one category: keep the first in sorted order.
            continue
        with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
            loaded[key] = json.load(f)
    return loaded


def _iter_crate_capacities(capacity_cat):
    """Yield ``(c_rate_label, capacity)`` pairs from one category-4 record."""
    entries = capacity_cat.get('C-rate and Specific capacity')
    if not isinstance(entries, list):
        return
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        if 'C-rate' in entry:
            yield str(entry['C-rate']), entry.get('Capacity')
        elif 'Other C-rates and performance' in entry:
            nested = entry['Other C-rates and performance']
            if not isinstance(nested, list):
                continue
            for sub in nested:
                if isinstance(sub, dict) and 'C-rate' in sub:
                    yield str(sub['C-rate']), sub.get('Capacity')


def _crate_sort_key(label):
    """Sort numeric C-rate labels by value; non-numeric ones ('None', '1C') go last."""
    try:
        value = float(label)
    except ValueError:
        return (1, 0.0, label)
    if value != value:  # NaN cannot be ordered; treat it as non-numeric
        return (1, 0.0, label)
    return (0, value, label)


def _build_row(paper_id, sample, content, stoich_keys, crate_labels):
    """Flatten all categories of one sample into a single CSV row dict."""
    stoich_cat = _as_dict(_as_dict(content.get('category-1')).get(sample))
    electro_cat = _as_dict(_as_dict(content.get('category-2')).get(sample))
    particle_cat = _as_dict(content.get('category-3'))
    capacity_cat = _first_dict(_as_dict(content.get('category-4')).get(sample))
    electrolyte = _first_dict(electro_cat.get('Electrolyte'))

    row = {'Paper ID': paper_id, 'Sample': sample}

    # 1. Stoichiometry: one column per key seen anywhere; absent elements become 0.
    stoich_info = _as_dict(stoich_cat.get('Stoichiometry information'))
    for key in stoich_keys:
        row[key] = stoich_info.get(key, 0)

    # 2. Fixed synthesis / electrode fields.
    for field in _STOICH_FIELDS:
        row[field] = stoich_cat.get(field)
    ratio_field = 'Active material to Conductive additive to Binder ratio'
    row[ratio_field] = electro_cat.get(ratio_field)
    for column, key in _ELECTROLYTE_FIELDS:
        row[column] = electrolyte.get(key)
    row['Additive'] = electro_cat.get('Additive')
    row['Loading density (mass loading of NCM)'] = electro_cat.get('Loading density (mass loading of NCM)')

    # 3. Particle info: category-3 is keyed by property first, then by sample.
    for aliases in _PARTICLE_FIELDS:
        value = None
        for alias in aliases:
            if alias in particle_cat:
                # The first spelling present wins, even if it has no entry for this sample.
                value = _as_dict(particle_cat[alias]).get(sample)
                break
        row[aliases[0]] = value

    # 4. Test conditions.
    row['Voltage range'] = capacity_cat.get('Voltage range')
    row['Temperature'] = capacity_cat.get('Temperature')

    # 5. Capacity per C-rate: pre-create every column so the order is the same in all rows.
    for label in crate_labels:
        row[f'C-rate {label}'] = None
    for label, capacity in _iter_crate_capacities(capacity_cat):
        row[f'C-rate {label}'] = capacity

    return row


def csv_generator(file_num_list: list, output_folder_path: str, rag_method: str, hyper_param_method: str = None):
    """Merge per-paper category JSON outputs into one CSV and return it as a DataFrame.

    For each paper number the folder
    ``{output_folder_path}/json/{rag_method}/paper_<NNN>_output/`` is scanned for
    files named ``category-<n>-paper_*``. Samples are taken from the keys of
    ``category-1``; one row is produced per sample.

    Args:
        file_num_list: Paper numbers; each is zero-padded to three digits to
            build the folder name.
        output_folder_path: Root folder that contains ``json/`` and receives ``csv/``.
        rag_method: Sub-folder name under ``json/`` and base name of the CSV.
        hyper_param_method: Optional suffix for the CSV file name. When set, no
            JSON is loaded (unchanged from the original implementation), so the
            resulting CSV is empty.

    Returns:
        The DataFrame that was written to
        ``{output_folder_path}/csv/{rag_method}[_{hyper_param_method}].csv``.
        If that file cannot be written because of a ``PermissionError``, a
        timestamped file name is used instead.
    """
    ## [Step 1] Load the JSON files of every paper
    total_data = {}
    for file_num in file_num_list:
        # zfill keeps ids with more than three digits intact (slicing would truncate them).
        folder = f"{output_folder_path}/json/{rag_method}/paper_{str(file_num).zfill(3)}_output/"
        if not os.path.isdir(folder):
            print(f"[경고] 경로 없음: {folder}")
            total_data[f"{file_num}"] = {}
            continue
        total_data[f"{file_num}"] = _load_paper_categories(folder, hyper_param_method)

    ## [Step 2] Scan all samples to collect every stoichiometry key and C-rate label
    stoich_key_set = set()
    crate_label_set = set()
    for paper_id in file_num_list:
        content = total_data.get(str(paper_id))
        if not content:
            continue
        capacity_by_sample = _as_dict(content.get('category-4'))
        for sample, stoich_cat in _as_dict(content.get('category-1')).items():
            stoich_key_set.update(_as_dict(_as_dict(stoich_cat).get('Stoichiometry information')))
            capacity_cat = _first_dict(capacity_by_sample.get(sample))
            crate_label_set.update(label for label, _ in _iter_crate_capacities(capacity_cat))

    # Main elements first in a fixed order, the remaining keys (W, Al, Zr, ...) alphabetically.
    all_stoich_keys = [key for key in _STOICH_PRIORITY if key in stoich_key_set]
    all_stoich_keys += sorted(key for key in stoich_key_set if key not in _STOICH_PRIORITY)

    # Numeric labels ascending; labels that are not numbers are kept and placed last.
    all_crates = sorted(crate_label_set, key=_crate_sort_key)

    ## [Step 3] Build the row data
    all_rows = []
    for paper_id in file_num_list:
        content = total_data.get(str(paper_id))
        if not content:
            continue
        if 'category-1' not in content:
            # Without category-1 there is no sample list for this paper.
            print(f"[오류] Sample 처리 중 에러 (Paper {paper_id}): 'category-1'")
            continue
        for sample in _as_dict(content['category-1']):
            try:
                all_rows.append(_build_row(paper_id, sample, content, all_stoich_keys, all_crates))
            except (AttributeError, IndexError, KeyError, TypeError) as e:
                # Best effort: a malformed sample must not discard the rest of the paper.
                print(f"[오류] Sample 처리 중 에러 (Paper {paper_id}): {e}")

    ## [Step 4] Create the DataFrame and save it
    df = pd.DataFrame(all_rows)

    # Data cleaning: turn en dashes and em dashes into a plain hyphen in all string cells.
    df = df.replace(r'[–—]', '-', regex=True)

    os.makedirs(f"{output_folder_path}/csv", exist_ok=True)
    file_name = f"{rag_method}.csv" if hyper_param_method is None else f"{rag_method}_{hyper_param_method}.csv"
    save_path = f"{output_folder_path}/csv/{file_name}"

    try:
        # utf-8-sig writes a BOM at the start of the file.
        df.to_csv(save_path, index=False, encoding='utf-8-sig')
        print(f"CSV 저장 완료: {save_path}")
    except PermissionError:
        # The target could not be written (e.g. it is open elsewhere): use a timestamped name.
        import time
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        new_file_name = file_name.replace(".csv", f"_{timestamp}.csv")
        new_save_path = f"{output_folder_path}/csv/{new_file_name}"

        df.to_csv(new_save_path, index=False, encoding='utf-8-sig')
        print(f"[주의] 파일이 열려 있어 새 이름으로 저장됨: {new_save_path}")

    return df

In [18]:
# Papers to aggregate (available ids: 11, 16, 22, 35, 39, 40, 41, 42, 44, 56).
file_num_list = [11, 16, 22, 35, 39, 40, 41, 42, 44, 56]
# Root folder holding the json/ inputs; the csv/ output is written below it.
output_folder_path = "./output"
# Name of the sub-folder under json/ and base name of the generated CSV.
rag_method = "relevance-rag"
# No hyper-parameter variant selected; an example value would be "paper56_o1".
hyper_param_method = None

# Left as the last bare expression so the notebook displays the returned DataFrame.
csv_generator(
    file_num_list=file_num_list,
    output_folder_path=output_folder_path,
    rag_method=rag_method,
    hyper_param_method=hyper_param_method,
)

CSV 저장 완료: ./output/csv/relevance-rag.csv


Unnamed: 0,Paper ID,Sample,Li ratio,Ni ratio,Co ratio,Mn ratio,O ratio,V ratio,W ratio,Commercial NCM used,...,C-rate 0.5,C-rate 1.0,C-rate 2.0,C-rate 4.0,C-rate 5.0,C-rate 6.0,C-rate 10.0,C-rate 20.0,C-rate 40.0,C-rate None
0,11,NR0,1.0,0.33,0.33,0.33,2.0,0.0,0.0,no,...,,123.0,,,45.5,,,,,
1,11,NR1,1.0,0.33,0.33,0.33,2.0,0.0,0.0,no,...,,,,,47.0,,,,,
2,11,NR3,1.0,0.33,0.33,0.33,2.0,0.0,0.0,no,...,,,,,75.4,,,,,
3,11,NR5,1.0,0.33,0.33,0.33,2.0,0.0,0.0,no,...,,,,,95.4,,,,,
4,16,NCM,1.01,0.35,0.32,0.32,0.0,0.0,0.0,no,...,175.7,154.8,143.5,,101.7,,71.5,44.5,13.2,
5,16,NCM/C,1.03,0.32,0.33,0.34,0.0,0.0,0.0,no,...,191.2,178.9,163.5,,135.8,,109.4,81.9,54.6,
6,22,Pristine,1.0,0.84,0.1,0.06,2.0,0.0,0.0,no,...,,,,,,,,,,
7,22,V-0.005,1.0,0.84,0.1,0.06,2.0,0.005,0.0,no,...,,,,,,,,,,
8,22,V-0.01,1.0,0.84,0.1,0.06,2.0,0.01,0.0,no,...,,,,,,,,,,
9,22,V-0.02,1.0,0.84,0.1,0.06,2.0,0.02,0.0,no,...,,,,,,,,,,
