In [2]:
import os
import time
import re
import pandas as pd
from tqdm import tqdm
from math import inf
import sys

from pymatgen.core import Composition
from pymatgen.io.cif import CifWriter

# 尝试导入 mp-api；未安装则提示安装
try:
    from mp_api.client import MPRester
except Exception as e:
    raise ImportError("未安装 mp-api。请先在上方单元执行：%pip install mp-api pymatgen pandas tqdm") from e


def sanitize_filename(name: str, max_len: int = 120) -> str:
    """Turn an arbitrary string into a file name that is safe on Windows and POSIX.

    Steps, in order:
      1. Replace the characters ``\\ / : " * ? < > |`` with ``_`` and strip
         surrounding whitespace and leading/trailing dots.
      2. If the result is longer than ``max_len``, shorten it. When there is an
         extension and room for it, the extension is kept and the cut is marked
         with a single ``…``; otherwise the name is simply cut at ``max_len``.
      3. Prefix ``_`` when the part before the first dot is a Windows reserved
         device name (``CON``, ``NUL``, ``COM1`` ...), with or without extension.
      4. Fall back to ``_`` if nothing is left, so callers never get an empty name.

    Args:
        name: Proposed file name (not a path; separators are replaced).
        max_len: Maximum number of characters allowed after truncation.

    Returns:
        The sanitized, non-empty file name.
    """
    safe = re.sub(r'[\\/:\"*?<>|]', "_", name).strip().strip(".")

    if len(safe) > max_len:
        base, ext = os.path.splitext(safe)
        keep = max_len - len(ext) - 1  # characters of the base that fit next to "…" + ext
        if ext and keep > 0:
            safe = base[:keep] + "…" + ext
        else:
            # No extension, or the extension alone would overflow max_len:
            # a plain cut is the only way to honour the limit.
            safe = safe[:max_len].rstrip(" .")

    # Windows treats "NUL.txt" and "NUL.tar.gz" like "NUL", so compare only the
    # part before the first dot.
    reserved = {"CON","PRN","AUX","NUL","COM1","COM2","COM3","COM4","COM5","COM6","COM7","COM8","COM9",
                "LPT1","LPT2","LPT3","LPT4","LPT5","LPT6","LPT7","LPT8","LPT9"}
    stem = safe.split(".", 1)[0]
    if stem.upper() in reserved:
        safe = f"_{safe}"

    # Inputs such as "" or "..." are reduced to nothing above.
    if not safe:
        safe = "_"
    return safe


def composition_to_search_keys(comp_str: str):
    """Derive the two search keys used to look a composition up.

    Returns:
        A ``(reduced_formula, chemsys_str)`` tuple, where ``chemsys_str`` is the
        alphabetically sorted element symbols joined with ``-``. If the string
        cannot be parsed (or any step fails), ``(None, None)`` is returned.
    """
    try:
        parsed = Composition(comp_str)
        symbols = [element.symbol for element in parsed.elements]
        symbols.sort()
        return parsed.reduced_formula, "-".join(symbols)
    except Exception:
        # Any parsing problem is reported as "no keys" instead of being raised.
        return None, None


def fetch_best_doc(mpr: MPRester, comp_str: str):
    """Return the summary document with the lowest ``energy_above_hull`` for a composition.

    Lookup strategy:
      1. Search by the reduced formula derived from ``comp_str``.
      2. If that yields nothing, fall back to a chemical-system search. The
         keyword is tried as ``chemsys`` and then as ``chemical_system``; each
         spelling is attempted independently, so an error from one spelling
         does not prevent the other from being tried.

    Note that the chemical-system fallback matches on the element set only, so
    the returned entry may have a different stoichiometry than ``comp_str``.

    Args:
        mpr: An open ``MPRester`` client.
        comp_str: Composition string as found in the input data.

    Returns:
        The document with the smallest non-null ``energy_above_hull``, or
        ``None`` when the composition cannot be parsed, no document matches, or
        no matching document carries an ``energy_above_hull`` value.

    Raises:
        Exception: If every attempted query raised, the first error is
            re-raised so that callers can tell an API/network failure apart
            from "no matching entry" and retry.
    """
    reduced, chemsys = composition_to_search_keys(comp_str)
    fields = ["material_id", "energy_above_hull", "structure", "formula_pretty"]

    docs = []
    errors = []
    any_query_completed = False  # True once a query returns without raising

    # First choice: exact reduced formula.
    if reduced:
        try:
            docs = mpr.summary.search(formula=reduced, fields=fields)
            any_query_completed = True
        except Exception as exc:
            errors.append(exc)
            docs = []

    # Fallback: whole chemical system, trying both keyword spellings.
    if not docs and chemsys:
        for keyword in ("chemsys", "chemical_system"):
            try:
                docs = mpr.summary.search(fields=fields, **{keyword: chemsys})
                any_query_completed = True
            except Exception as exc:
                errors.append(exc)
                docs = []
            if docs:
                break

    if not docs:
        if errors and not any_query_completed:
            # Nothing succeeded at all: this is a failure, not an empty result.
            raise errors[0]
        return None

    # Keep only entries that can be ranked, then pick the most stable one.
    ranked = [d for d in docs if getattr(d, "energy_above_hull", None) is not None]
    if not ranked:
        return None
    return min(ranked, key=lambda d: d.energy_above_hull)


def _download_one_cif(mpr, comp_str, output_dir: str, sleep_sec: float, max_retries: int) -> dict:
    """Look up one composition, write its CIF file, and return the summary row.

    The lookup-and-write step is attempted up to ``max_retries`` times; a pause
    of ``sleep_sec * attempt`` is inserted between failed attempts.
    """
    status = "Unknown Error"
    mp_id, e_hull, saved_filename = None, None, None

    for attempt in range(1, max_retries + 1):
        try:
            doc = fetch_best_doc(mpr, comp_str)
            if not doc:
                status = "Not Found"
            else:
                mp_id = str(doc.material_id)
                e_hull = doc.energy_above_hull

                # File name: "<composition>_<material id>.cif", sanitized twice so
                # the combined name also respects the length limit.
                base_name = f"{sanitize_filename(str(comp_str))}_{mp_id}.cif"
                filepath = os.path.join(output_dir, sanitize_filename(base_name))
                CifWriter(doc.structure).write_file(filepath)
                saved_filename = os.path.basename(filepath)
                status = "Success"
            break  # both "Success" and a confirmed "Not Found" end the retries
        except Exception as e:
            # An exception with an empty message has no first line; fall back to
            # the exception type so the handler itself cannot raise.
            message_lines = str(e).splitlines()
            status = f"Error: {message_lines[0] if message_lines else type(e).__name__}"
            if attempt < max_retries:
                time.sleep(sleep_sec * attempt)  # simple linear back-off

    return {
        "original_composition": comp_str,
        "status": status,
        "material_id": mp_id,
        "e_above_hull": e_hull,
        "saved_filename": saved_filename,
    }


def download_cif_files(api_key: str, input_csv: str, output_dir: str, sleep_sec: float = 0.2, max_retries: int = 3):
    """Download one CIF file per unique room-temperature composition listed in a CSV.

    The CSV is read with its header on the third line. Rows whose
    ``temperature`` lies in ``[15, 35]`` are kept, and each distinct non-null
    ``composition`` is looked up through ``fetch_best_doc``. Every hit is written
    to ``output_dir`` as ``<composition>_<material_id>.cif``. A per-composition
    log is always written to ``<output_dir>/download_summary.csv``, even when no
    composition passes the filter (in which case no API connection is opened).

    Args:
        api_key: Materials Project API key.
        input_csv: Path to the input CSV file.
        output_dir: Directory for the CIF files and the summary (created if missing).
        sleep_sec: Pause between compositions and base unit of the retry back-off.
        max_retries: Maximum number of attempts per composition.

    Returns:
        None. Problems reading the CSV, missing columns, and connection failures
        are reported with ``print`` and end the function early.
    """
    os.makedirs(output_dir, exist_ok=True)
    print(f"输出目录已创建: {os.path.abspath(output_dir)}")

    # Read the CSV (the first two lines are free text; the real header is line 3).
    try:
        df = pd.read_csv(input_csv, header=2)
    except FileNotFoundError:
        print(f"错误: 输入文件 '{input_csv}' 未找到。")
        return
    except Exception as e:
        print(f"读取CSV文件时出错: {e}")
        return

    # Room-temperature filter.
    try:
        rt_df = df[(df['temperature'] >= 15) & (df['temperature'] <= 35)]
        unique_compositions = pd.unique(rt_df['composition'].dropna())
    except Exception as e:
        print(f"数据列缺失或格式异常: {e}")
        return

    print(f"在 '{input_csv}' 中找到 {len(unique_compositions)} 种在室温下的独立化学成分。")

    summary_columns = ["original_composition", "status", "material_id", "e_above_hull", "saved_filename"]
    summary_rows = []

    # Only talk to the API when there is something to download.
    if len(unique_compositions) > 0:
        try:
            mpr = MPRester(api_key)
        except Exception as e:
            print(f"连接到 Materials Project 出错，请检查 API Key 与网络: {e}")
            return

        with mpr:
            with tqdm(total=len(unique_compositions), desc="正在下载CIF文件") as pbar:
                for comp_str in unique_compositions:
                    row = _download_one_cif(mpr, comp_str, output_dir, sleep_sec, max_retries)
                    summary_rows.append(row)

                    pbar.set_postfix(composition=str(comp_str)[:20], status=row["status"])
                    pbar.update(1)
                    time.sleep(sleep_sec)  # rate limiting

    # Explicit columns keep the summary well-formed (and 'status' addressable)
    # even when there are no rows.
    summary_df = pd.DataFrame(summary_rows, columns=summary_columns)
    summary_path = os.path.join(output_dir, "download_summary.csv")
    summary_df.to_csv(summary_path, index=False)

    success_count = int((summary_df['status'] == 'Success').sum())
    failed_count = len(unique_compositions) - success_count

    print("\n" + "=" * 50)
    print("处理完成！")
    print(f"成功下载: {success_count} 个结构的CIF文件")
    print(f"未找到或出错: {failed_count} 个化学成分")
    print(f"所有CIF文件已保存在: {os.path.abspath(output_dir)}")
    print(f"详细下载日志已保存在: {os.path.abspath(summary_path)}")
    print("=" * 50)


# Run this cell to start the download.
# The Materials Project API key is taken from the MP_API_KEY environment variable;
# if it is not set, the key is requested interactively. A key must never be stored
# in the notebook itself: anyone who can read the file can use it.
# NOTE(review): a key was previously hard-coded here and should be revoked/regenerated.
API_KEY = os.getenv("MP_API_KEY")
if not API_KEY:
    from getpass import getpass
    API_KEY = getpass("Materials Project API key (MP_API_KEY is not set): ")
INPUT_CSV = "LiIonDatabase.csv"
OUTPUT_DIR = "downloaded_cifs"
SLEEP_SEC = 0.2
MAX_RETRIES = 3

download_cif_files(api_key=API_KEY,
                   input_csv=INPUT_CSV,
                   output_dir=OUTPUT_DIR,
                   sleep_sec=SLEEP_SEC,
                   max_retries=MAX_RETRIES)

输出目录已创建: d:\学习相关文档\大三上\大数据与机器智能\大作业\mattergen\downloaded_cifs
在 'LiIonDatabase.csv' 中找到 422 种在室温下的独立化学成分。


  docs = mpr.summary.search(formula=reduced, fields=fields)
Retrieving SummaryDoc documents: 0it [00:00, ?it/s]
  docs = mpr.summary.search(chemsys=chemsys, fields=fields)
Retrieving SummaryDoc documents: 100%|██████████| 7/7 [00:00<?, ?it/s]
Retrieving SummaryDoc documents: 0it [00:00, ?it/s]38s/it, composition=Li2OHBr, status=Success]
Retrieving SummaryDoc documents: 100%|██████████| 5/5 [00:00<?, ?it/s]
Retrieving SummaryDoc documents: 0it [00:00, ?it/s]94s/it, composition=Li2.4OH0.6Cl, status=Success]
Retrieving SummaryDoc documents: 100%|██████████| 5/5 [00:00<?, ?it/s]
Retrieving SummaryDoc documents: 100%|██████████| 1/1 [00:00<?, ?it/s]=Li2.1OH0.9Cl, status=Success]
Retrieving SummaryDoc documents: 0it [00:00, ?it/s]33s/it, composition=Li2OHCl, status=Success]     
Retrieving SummaryDoc documents: 100%|██████████| 5/5 [00:00<?, ?it/s]
Retrieving SummaryDoc documents: 100%|██████████| 1/1 [00:00<?, ?it/s]=Li2.7OH0.3Cl, status=Success]
Retrieving SummaryDoc documents: 100%|██████


处理完成！
成功下载: 167 个结构的CIF文件
未找到或出错: 255 个化学成分
所有CIF文件已保存在: d:\学习相关文档\大三上\大数据与机器智能\大作业\mattergen\downloaded_cifs
详细下载日志已保存在: d:\学习相关文档\大三上\大数据与机器智能\大作业\mattergen\downloaded_cifs\download_summary.csv





In [3]:
from data_process import CrystalDataProcessor

# Inputs and outputs of the preprocessing run, gathered in one place.
PREPROCESS_KWARGS = {
    "cif_dir": "results/generated_crystals_cif.zip",
    "conductivity_file": "extracted_conductivity.csv",
    "output_dir": "processed_data",
    "cutoff_radius": 8.0,
}

processor = CrystalDataProcessor(**PREPROCESS_KWARGS)
# Left as the last expression so the value returned by run() is shown as the cell output.
processor.run()

2025-09-07 11:37:08,880 - data_process - INFO - Starting preprocessing pipeline
2025-09-07 11:37:08,880 - data_process - INFO - Extracting CIF files from results/generated_crystals_cif.zip
2025-09-07 11:37:08,892 - data_process - INFO - Found 337 CIF files to process
2025-09-07 11:37:08,892 - data_process - INFO - Loaded conductivity data for 321 materials
2025-09-07 11:37:08,897 - data_process - INFO - Renamed column 'ID' to 'material_id'
2025-09-07 11:37:08,897 - data_process - INFO - Renamed column 'Ionic conductivity (S cm-1)' to 'conductivity'
2025-09-07 11:37:08,899 - data_process - INFO - Created log-transformed conductivity for 321 entries
  node_positions = torch.tensor(node_positions, dtype=torch.float)
Processing CIF files: 100%|██████████| 337/337 [07:35<00:00,  1.35s/it]
2025-09-07 11:44:44,341 - data_process - INFO - Successfully processed 337 CIF files
2025-09-07 11:44:44,341 - data_process - INFO - Failed to process 0 CIF files
Saving processed data: 100%|██████████| 33

{'status': 'success',
 'total_processed': 337,
 'with_conductivity': 321,
 'without_conductivity': 16,
 'train_size': 257,
 'val_size': 32,
 'test_size': 32,
 'output_dir': 'processed_data'}