In [1]:
import os
import json
import pandas as pd

In [2]:
def csv_generator(file_num_list: list, output_folder_path: str, rag_method: str, hyper_param_method: str=None):
    ## 상위 폴더명 저장
    output_paper_folder_path = []
    for file_num in file_num_list:
        json_file_num = f"00{file_num}"[-3:]
        output_paper_folder_path.append(f"{output_folder_path}/json/{rag_method}/paper_{json_file_num}_output/")

    ## 해당 방법에 따른 데이터셋 불러오기
    total_data = {}
    for i, op in zip(file_num_list, output_paper_folder_path):
        total_data[f"{i}"] = {}
        temp_data = {}
        j = 0
        for filename in os.listdir(op):
            # print(filename)
            if hyper_param_method is None:
                j += 1
                if filename.startswith(f"category-{j}-paper_"):
                    json_file_path = os.path.join(op, filename)    
                    with open(json_file_path, "r", encoding="utf-8") as f:
                        data = json.load(f)
                    temp_data = temp_data | data
            
            elif filename.endswith(f"{hyper_param_method}.json"):
                json_file_path = os.path.join(op, filename)
                with open(json_file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                temp_data = temp_data | data        
        total_data[f"{i}"] = temp_data


    ## json을 csv 형태로 변환
    all_rows = []
    for paper_id in file_num_list:
        try:
            temp_data = total_data[str(paper_id)]
        except KeyError:
            print(f"[경고] Paper ID {paper_id} 없음.")
            continue

        samples = list(temp_data.get('Stoichiometry information', {}).keys())
        content = temp_data.copy()
        all_crates = set()

        # 모든 C-rate 종류 수집
        for sample in samples:
            try:
                c_rate_info = content[sample][0]['C-rate and Specific capacity']
                for entry in c_rate_info:
                    if 'C-rate' in entry:
                        all_crates.add(str(entry['C-rate']))
                    elif 'Other C-rates and performance' in entry:
                        for sub_entry in entry['Other C-rates and performance']:
                            all_crates.add(str(sub_entry['C-rate']))
            except (KeyError, IndexError, TypeError):
                continue

        # 정렬
        all_crates = sorted(
            [c for c in all_crates if c is not None and str(c).lower() != 'none'],
            key=lambda x: float(str(x))
        )

        for sample in samples:
            try:
                stoich = content['Stoichiometry information'][sample]
                electrolyte = content.get('Electrolyte', [{}])[0]
                row = {
                    'Paper ID': paper_id,
                    'Sample': sample,
                    
                    'Li ratio': stoich.get('Li ratio'),
                    'Ni ratio': stoich.get('Ni ratio'),
                    'Co ratio': stoich.get('Co ratio'),
                    'Mn ratio': stoich.get('Mn ratio'),
                    'O ratio': stoich.get('O ratio'),
                    'W ratio': stoich.get('W ratio'),
                    'Commercial NCM used': content.get('Commercial NCM used', {}).get(sample),
                    'Lithium source': content.get('Lithium source', {}),
                    'Synthesis method': content.get('Synthesis method', {}),
                    'Crystallization method': content.get('Crystallization method', {}),
                    'Crystallization final temperature': content.get('Crystallization final temperature', {}),
                    'Crystallization final duration (hours)': content.get('Crystallization final duration (hours)', {}),
                    'Doping': content.get('Doping', {}), 
                    'Coating': content.get('Coating', {}), 
                    'Additional treatment': content.get('Additional treatment', {}), 
                    
                    'Active material to Conductive additive to Binder ratio': content.get('Active material to Conductive additive to Binder ratio'),
                    'Electrolyte salt': electrolyte.get('Salt'),
                    'Electrolyte concentration': electrolyte.get('Concentration'),
                    'Electrolyte solvent': electrolyte.get('Solvent'),
                    'Electrolyte solvent ratio': electrolyte.get('Solvent ratio'),
                    'Additive': content.get('Additive'),
                    'Loading density (mass loading of NCM)': content.get('Loading density (mass loading of NCM)'),

                    'Particle size': content.get('Particle size', {}).get(sample),
                    'Particle shape': content.get('Particle shape', {}).get(sample),
                    'Particle distribution': content.get('Particle distribution', {}).get(sample),
                    'Coating layer characteristics': content.get('Coating layer characteristics', {}).get(sample),
                    'Crystal structure and lattice characteristics': content.get('Crystal structure and lattice characteristics', {}).get(sample),
                    
                    'Voltage range': content.get(sample, [{}])[0].get('Voltage range'),
                    'Temperature': content.get(sample, [{}])[0].get('Temperature'),
                }

                # C-rate 열 미리 생성
                for c in all_crates:
                    row[f'C-rate {c}'] = None

                # C-rate 값 넣기
                c_rate_info = content[sample][0].get('C-rate and Specific capacity', [])
                for entry in c_rate_info:
                    if 'C-rate' in entry:
                        row[f'C-rate {entry["C-rate"]}'] = entry.get('Capacity')
                    elif 'Other C-rates and performance' in entry:
                        for sub_entry in entry['Other C-rates and performance']:
                            row[f'C-rate {sub_entry["C-rate"]}'] = sub_entry.get('Capacity')

                all_rows.append(row)
            except Exception as e:
                print(f"[오류] Sample {sample} (Paper {paper_id}): {e}")

    # 최종 DataFrame 생성
    df = pd.DataFrame(all_rows)
    # 폴더 없으면 생성
    os.makedirs(f"{output_folder_path}/csv", exist_ok=True)
    if hyper_param_method is None:
        df.to_csv(f"{output_folder_path}/csv/{rag_method}.csv", index=False)
    else:
        df.to_csv(f"{output_folder_path}/csv/{rag_method}_{hyper_param_method}.csv", index=False)


In [3]:
file_num_list = [11, 16, 22, 35, 39, 40, 41, 42, 44, 56]  ## 11, 16, 22, 35, 39, 40, 41, 42, 44, 56
output_folder_path = "../output"
rag_method = "multiagent-rag"
hyper_param_method = "6" ## "paper56_o1"

In [4]:
csv_generator(file_num_list, output_folder_path, rag_method, hyper_param_method)

[오류] Sample polymer/γ-Al2O3-coated NCM622 (Paper 44): 'polymer/γ-Al2O3-coated NCM622'
