In [1]:
# Entries to process, each written as <PDB ID><chain ID>: the first four
# characters are the PDB entry ID and the remaining characters are the chain
# ID (e.g. "6kshD" means chain D of entry 6ksh).
pdb_id_chain_list = ["6kshD","4ctaA","2x14A","2j9cA","3f5mA","3ruvD","5trdA","3c1mC","3v2uC","1f3fC","7nsdA","5dd7A","1i58A","3hy2Y","7uldA","1z0sA","1un9A","7alrA","121pA","2jg1C","6cauA","2py7X","5dghA","2aqxB","3dntA","3wguC","4uxxC","2xanA","6ci7C","3gqkA","4amfA","1d4xA","7cqqA","4lacC","7d8iA","1xdpA","2i1oA","1vl1A","1to6A","1yzyA","1fitA","1s1dA","6a8pB","2bz0A","3zcbA","4edkA","5gufA","1k90A","4crjA","6aazA","6fl4A","8dcdA","1twfB","7tgkD","1rn8A","1j09A","2q16B","7edzC","1wc6B","3tuxA","3vthA","6p1pA","6r5dA","4yvzA","4ru9A","3f2bA","3ercC","6t0vB","7y7pA","6sqzD","1mb9B","8dbjA","6b5kA","6h77A","3wdlB","4ff3A","2f17A","5bsmA","1xdnA","6c02A","5w51E","3amtA","6ig2D","3jqmB","7fggA","6vd0A","6d5kC","7v0fA","6txeA"]

In [4]:
import os
import glob
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.mmcifio import MMCIFIO



# Extract one chain per entry of `pdb_id_chain_list` (defined in an earlier
# cell) from the matching mmCIF file and save it as its own .cif file.
# Each entry is "<PDB ID><chain ID>", e.g. "6kshD" -> chain D of 6ksh.cif.

# Input folder holding the original .cif files.
input_folder = "raw_cif"  # replace with the actual path

# Output folder receiving the single-chain .cif files.
output_folder = "cif_extracted"  # replace with the actual path
os.makedirs(output_folder, exist_ok=True)  # create it if it does not exist yet

# QUIET=True suppresses the parser's construction warnings.
parser = MMCIFParser(QUIET=True)

for item in pdb_id_chain_list:
    # The first 4 characters are the PDB ID (lower-cased to build the file
    # name); the remainder is the chain ID (upper-cased).
    pdb_id = item[:4].lower()    # e.g. "6ksh"
    chain_id = item[4:].upper()  # e.g. "D"

    # Input file path, e.g. "raw_cif/6ksh.cif".
    input_file = os.path.join(input_folder, f"{pdb_id}.cif")
    if not os.path.exists(input_file):
        print(f"文件不存在: {input_file}")
        continue

    # Parse the CIF file. A single unreadable file must not abort the whole
    # batch, so report it and move on to the next entry.
    try:
        structure = parser.get_structure(item, input_file)
    except Exception as e:
        print(f"解析失败: {input_file}, 错误: {e}")
        continue

    # If the requested chain is missing, stripping "all other chains" would
    # leave an empty structure; skip instead of writing a useless file and
    # reporting success.
    if not any(model.has_id(chain_id) for model in structure):
        print(f"未找到链 {chain_id}: {input_file}")
        continue

    # Keep only the requested chain. Iterate over a copy of the chain list
    # because detach_child mutates the model while we walk it.
    for model in structure:
        for chain in list(model.get_chains()):
            if chain.get_id() != chain_id:
                model.detach_child(chain.get_id())

    # Output file name, e.g. "6ksh_chainD.cif".
    output_file = os.path.join(output_folder, f"{pdb_id}_chain{chain_id}.cif")

    # Save the reduced structure as a new CIF file.
    io = MMCIFIO()
    io.set_structure(structure)
    io.save(output_file)

    print(f"提取链 {chain_id} -> {output_file}")


提取链 D -> cif_extracted\6ksh_chainD.cif
提取链 A -> cif_extracted\4cta_chainA.cif
提取链 A -> cif_extracted\2x14_chainA.cif
提取链 A -> cif_extracted\2j9c_chainA.cif
提取链 A -> cif_extracted\3f5m_chainA.cif
提取链 D -> cif_extracted\3ruv_chainD.cif
提取链 A -> cif_extracted\5trd_chainA.cif
提取链 C -> cif_extracted\3c1m_chainC.cif
提取链 C -> cif_extracted\3v2u_chainC.cif
提取链 C -> cif_extracted\1f3f_chainC.cif
提取链 A -> cif_extracted\7nsd_chainA.cif
提取链 A -> cif_extracted\5dd7_chainA.cif
提取链 A -> cif_extracted\1i58_chainA.cif
提取链 Y -> cif_extracted\3hy2_chainY.cif
提取链 A -> cif_extracted\7uld_chainA.cif
提取链 A -> cif_extracted\1z0s_chainA.cif
提取链 A -> cif_extracted\1un9_chainA.cif
提取链 A -> cif_extracted\7alr_chainA.cif
提取链 A -> cif_extracted\121p_chainA.cif
提取链 C -> cif_extracted\2jg1_chainC.cif
提取链 A -> cif_extracted\6cau_chainA.cif
提取链 X -> cif_extracted\2py7_chainX.cif
提取链 A -> cif_extracted\5dgh_chainA.cif
提取链 B -> cif_extracted\2aqx_chainB.cif
提取链 A -> cif_extracted\3dnt_chainA.cif
提取链 C -> cif_extracted\3w

In [5]:
import os
import glob
import csv
from Bio.PDB import MMCIFParser, NeighborSearch
from Bio.PDB.Polypeptide import is_aa

# Label every standard amino-acid residue of each single-chain .cif file as
# ATP-binding ("B") or not ("N") and write one CSV per input file.

# Input folder holding the .cif files.
input_folder = "cif_extracted"  # replace with your actual path
# Output folder receiving the per-file CSV results.
output_folder = "ATP_cif"  # replace with your actual path
os.makedirs(output_folder, exist_ok=True)

# All .cif files in the input folder; sorted so the processing order (and the
# printed log) is reproducible regardless of the platform's directory order.
cif_files = sorted(glob.glob(os.path.join(input_folder, "*.cif")))

# QUIET=True suppresses the parser's construction warnings.
parser = MMCIFParser(QUIET=True)

# Distance cutoff (in Å): a residue with any atom within this distance of any
# ligand atom is labelled as a binding-site residue.
threshold = 4.0

# Residue names treated as the ligand. Only "ATP" is matched, as before.
# NOTE(review): files reported as "no ATP" may contain ATP analogues under
# other residue names — confirm before extending this set.
ligand_resnames = {"ATP"}


def _write_binding_csv(csv_path, prot_id, residues, binding_keys):
    """Write one row per residue, flagged "B" if its key is in binding_keys.

    Args:
        csv_path: Destination CSV file path (overwritten if it exists).
        prot_id: Value written to the "Prot.ID" column of every row.
        residues: Iterable of (chain_id, residue_id, resname, residue_obj)
            tuples, where residue_id is (hetfield, seq_number, icode).
        binding_keys: Set of (chain_id, residue_id) pairs to flag as "B";
            every other residue is flagged "N".
    """
    with open(csv_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Prot.ID", "Chain", "NO", "Residue", "ATP Binding Site"])
        for (chain_id, res_id, resname, _) in residues:
            seq_number = res_id[1]  # only the sequence number is written
            binding_flag = "B" if (chain_id, res_id) in binding_keys else "N"
            writer.writerow([prot_id, chain_id, seq_number, resname, binding_flag])


for cif_file in cif_files:
    # File name without the directory, and its stem, e.g. "1d4x_chainA".
    file_name = os.path.basename(cif_file)
    base_name = os.path.splitext(file_name)[0]
    output_csv = os.path.join(output_folder, base_name + ".csv")

    # Parse the CIF file; report and skip files that cannot be parsed.
    try:
        structure = parser.get_structure(base_name, cif_file)
    except Exception as e:
        print(f"解析失败: {cif_file}, 错误: {e}")
        continue

    # Single pass over the structure collecting ligand atoms, protein atoms
    # and the residue records that become CSV rows.
    atp_atoms = []
    protein_atoms = []
    all_residues = []  # [(chain_id, residue_id, resname, residue_obj), ...]
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.get_resname() in ligand_resnames:
                    atp_atoms.extend(list(residue.get_atoms()))
                if is_aa(residue, standard=True):
                    all_residues.append(
                        (chain.get_id(), residue.get_id(), residue.get_resname(), residue)
                    )
                    protein_atoms.extend(list(residue.get_atoms()))

    # Without a ligand every residue is labelled "N".
    if not atp_atoms:
        print(f"文件 {file_name} 中未找到 ATP 配体。")
        _write_binding_csv(output_csv, base_name, all_residues, set())
        print(f"已输出 CSV（全部N）: {output_csv}")
        continue

    # Residues with at least one atom within `threshold` of a ligand atom,
    # identified by (chain_id, residue_id).
    binding_residues = set()
    if protein_atoms:
        ns = NeighborSearch(protein_atoms)
        for atp_atom in atp_atoms:
            for atom in ns.search(atp_atom.get_coord(), threshold):
                residue = atom.get_parent()
                # Only standard amino acids are reported.
                if is_aa(residue, standard=True):
                    binding_residues.add((residue.get_parent().get_id(), residue.get_id()))
    else:
        # NeighborSearch cannot be built from an empty atom list; there is
        # nothing to label, so only the CSV header is written below.
        print(f"文件 {file_name} 中未找到标准氨基酸残基。")

    _write_binding_csv(output_csv, base_name, all_residues, binding_residues)

    print(f"文件 {file_name} 处理完成，输出: {output_csv}")


文件 121p_chainA.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\121p_chainA.csv
文件 1d4x_chainA.cif 处理完成，输出: ATP_cif\1d4x_chainA.csv
文件 1f3f_chainC.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1f3f_chainC.csv
文件 1fit_chainA.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1fit_chainA.csv
文件 1i58_chainA.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1i58_chainA.csv
文件 1j09_chainA.cif 处理完成，输出: ATP_cif\1j09_chainA.csv
文件 1k90_chainA.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1k90_chainA.csv
文件 1mb9_chainB.cif 处理完成，输出: ATP_cif\1mb9_chainB.csv
文件 1rn8_chainA.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1rn8_chainA.csv
文件 1s1d_chainA.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1s1d_chainA.csv
文件 1to6_chainA.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1to6_chainA.csv
文件 1twf_chainB.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1twf_chainB.csv
文件 1un9_chainA.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1un9_chainA.csv
文件 1vl1_chainA.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1vl1_chainA.csv
文件 1wc6_chainB.cif 中未找到 ATP 配体。
已输出 CSV（全部N）: ATP_cif\1wc6_chainB.csv
文件 1