### Download from the PDB (Protein Data Bank)

In [10]:
# Target structures, one string per entry: a 4-character PDB ID followed by
# the chain ID (e.g. "6kshD" -> entry 6ksh, chain D).
# Entries marked "n/a" have already been removed from this list.
pdb_id_chain_list = [
    "6kshD", "4ctaA", "2x14A", "2j9cA", "3f5mA", "3ruvD", "5trdA", "3c1mC", "3v2uC", "1f3fC",
    "7nsdA", "5dd7A", "1i58A", "3hy2Y", "7uldA", "1z0sA", "1un9A", "7alrA", "121pA", "2jg1C",
    "6cauA", "2py7X", "5dghA", "2aqxB", "3dntA", "3wguC", "4uxxC", "2xanA", "6ci7C", "3gqkA",
    "4amfA", "1d4xA", "7cqqA", "4lacC", "7d8iA", "1xdpA", "2i1oA", "1vl1A", "1to6A", "1yzyA",
    "1fitA", "1s1dA", "6a8pB", "2bz0A", "3zcbA", "4edkA", "5gufA", "1k90A", "4crjA", "6aazA",
    "6fl4A", "8dcdA", "1twfB", "7tgkD", "1rn8A", "1j09A", "2q16B", "7edzC", "1wc6B", "3tuxA",
    "3vthA", "6p1pA", "6r5dA", "4yvzA", "4ru9A", "3f2bA", "3ercC", "6t0vB", "7y7pA", "6sqzD",
    "1mb9B", "8dbjA", "6b5kA", "6h77A", "3wdlB", "4ff3A", "2f17A", "5bsmA", "1xdnA", "6c02A",
    "5w51E", "3amtA", "6ig2D", "3jqmB", "7fggA", "6vd0A", "6d5kC", "7v0fA", "6txeA",
]

# Alternative (disabled): build the same list from the 'pdb_id' column of the
# representatives CSV instead of hard-coding it.
# import pandas as pd
# df = pd.read_csv("../Files/representatives.csv")
# pdb_id_chain_list = df['pdb_id'].tolist()

# Show how many targets are queued.
len(pdb_id_chain_list)

89

In [11]:
import os
import requests

# Download the PDB-format file of every entry in pdb_id_chain_list into raw_pdb/.
os.makedirs("raw_pdb", exist_ok=True)

# Strip the chain letter to get the real PDB ID. Several chains may belong to
# the same entry, so de-duplicate while keeping the original order.
unique_pdb_ids = list(dict.fromkeys(entry[:4].lower() for entry in pdb_id_chain_list))

for pdb_id in unique_pdb_ids:
    filename = os.path.join("raw_pdb", f"{pdb_id}.pdb")

    # Re-running the cell should not fetch files that are already on disk.
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        continue

    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    try:
        # Without a timeout a stalled connection would hang the cell forever.
        r = requests.get(url, timeout=60)
    except requests.exceptions.RequestException as e:
        # One unreachable entry must not abort the remaining downloads.
        print(f"Failed to download {pdb_id}: {e}")
        continue

    if r.status_code == 200:
        with open(filename, "wb") as f:
            f.write(r.content)
        #print(f"Downloaded {pdb_id}")
    else:
        print(f"Failed to download {pdb_id}, status code: {r.status_code}")


### Extract the target chain using Bio.PDB

In [12]:
import os
from Bio.PDB import PDBParser, PDBIO, Select

# Custom selector that keeps only the requested chain
class ChainSelect(Select):
    """Selection filter for PDBIO.save that accepts a single chain ID."""

    def __init__(self, chain_id):
        """Remember the ID of the chain that should be kept."""
        super().__init__()
        self.chain_id = chain_id

    def accept_chain(self, chain):
        """Return True only when the chain's ID equals the target chain ID."""
        return chain.get_id() == self.chain_id

# Extract the requested chain of every downloaded entry into extracted_chains/.
pdb_parser = PDBParser(QUIET=True)  # QUIET=True suppresses the many parser warnings
pdb_io = PDBIO()

# Create the output directory once, before the loop.
os.makedirs("extracted_chains", exist_ok=True)

# pdb_id_chain_list is the "PDB ID + chain" list used by the download step
for pdb_id_chain in pdb_id_chain_list:
    pdb_id = pdb_id_chain[:4].lower()  # e.g. "6ksh"
    # NOTE(review): chain IDs in PDB files are case-sensitive; upper-casing
    # assumes the list never targets a lower-case chain -- confirm.
    chain_id = pdb_id_chain[4:].upper()  # e.g. "D"

    # Path of the raw downloaded file
    raw_pdb_path = os.path.join("raw_pdb", f"{pdb_id}.pdb")
    if not os.path.exists(raw_pdb_path):
        print(f"文件不存在，无法提取链: {raw_pdb_path}")
        continue

    # Parse the structure
    structure = pdb_parser.get_structure(pdb_id, raw_pdb_path)

    # Saving with a selector that matches nothing would write a file with no
    # atoms, which would then be submitted downstream; skip such entries.
    if not any(chain.get_id() == chain_id for chain in structure.get_chains()):
        print(f"链 {chain_id} 不存在于 {raw_pdb_path}，已跳过")
        continue

    # Output file name, e.g. "6ksh_chainD.pdb"
    output_filename = f"{pdb_id}_chain{chain_id}.pdb"
    output_path = os.path.join("extracted_chains", output_filename)

    # Hand the structure to PDBIO and write only the selected chain
    pdb_io.set_structure(structure)
    pdb_io.save(output_path, select=ChainSelect(chain_id))
    #print(f"链 {chain_id} 提取完成，已保存到：{output_path}")


### Upload to ATPbind

In [1]:
import requests
import os
import time

# Submit every extracted chain to the ATPbind web server, throttled in batches.

# Submission endpoint of the server
URL = "https://zhanggroup.org/ATPbind/atpbind.cgi"
# E-mail address that receives the results
# NOTE(review): a personal address is hard-coded here; consider reading it
# from an environment variable before sharing the notebook.
EMAIL = "19803048535@163.com"
# Directory containing the PDB files to submit
PDB_DIR = "extracted_chains"

# Number of submissions between long pauses
BATCH_SIZE = 10
# Long pause in seconds: 15 minutes = 15 * 60 = 900
BATCH_SLEEP_TIME = 15 * 60

# Short pause between consecutive submissions, to avoid bursts
SHORT_SLEEP_TIME = 10

# Collect all .pdb files; sorted because os.listdir returns them in
# arbitrary order and a deterministic order makes a run resumable.
pdb_files = sorted(f for f in os.listdir(PDB_DIR) if f.endswith(".pdb"))

# Number of submission attempts made so far
count = 0
# Files whose submission raised or returned a non-200 status
failed_files = []

# Submit each PDB file in turn
for i, pdb_file in enumerate(pdb_files):
    pdb_path = os.path.join(PDB_DIR, pdb_file)

    # Use the file name without the .pdb suffix as the target name
    protein_id = os.path.splitext(pdb_file)[0]

    print(f"即将提交 {pdb_file} ...", end=" ")

    resp = None
    with open(pdb_path, "rb") as f:
        # "files" maps to <input type="file" name="str_file">
        files = {
            "str_file": (pdb_file, f, "chemical/x-pdb"),
        }
        # "data" carries the remaining form fields
        data = {
            "REPLY-E-MAIL": EMAIL,
            "TARGET-NAME": protein_id
        }

        try:
            resp = requests.post(URL, files=files, data=data, timeout=300)
        except requests.exceptions.RequestException as e:
            print(f"\n请求异常: {e}")

    if resp is None:
        failed_files.append(pdb_file)
    elif resp.status_code == 200:
        print("提交成功")
    else:
        print(f"提交失败, HTTP状态码: {resp.status_code}")
        failed_files.append(pdb_file)

    # Count every attempt, including failed ones, so that an error never
    # bypasses the throttling below and hits the server back-to-back.
    count += 1

    # Nothing follows the last file, so there is nothing to wait for.
    if i == len(pdb_files) - 1:
        break

    if count % BATCH_SIZE == 0:
        # Long pause after each full batch
        print(f"已发送 {count} 个请求，休眠 {BATCH_SLEEP_TIME / 60} 分钟...\n")
        time.sleep(BATCH_SLEEP_TIME)
    else:
        # Otherwise only a short pause between submissions
        time.sleep(SHORT_SLEEP_TIME)

print("\n全部提交完毕！")

# List what needs to be resubmitted instead of leaving it buried in the log.
if failed_files:
    print(f"以下 {len(failed_files)} 个文件提交失败，需要重试: {failed_files}")


即将提交 121p_chainA.pdb ... 提交成功
即将提交 1d4x_chainA.pdb ... 提交成功
即将提交 1f3f_chainC.pdb ... 提交成功
即将提交 1fit_chainA.pdb ... 提交成功
即将提交 1i58_chainA.pdb ... 提交成功
即将提交 1j09_chainA.pdb ... 提交成功
即将提交 1k90_chainA.pdb ... 提交成功
即将提交 1mb9_chainB.pdb ... 提交成功
即将提交 1rn8_chainA.pdb ... 提交成功
即将提交 1s1d_chainA.pdb ... 提交成功
已发送 10 个请求，休眠 15.0 分钟...

即将提交 1to6_chainA.pdb ... 提交成功
即将提交 1twf_chainB.pdb ... 提交成功
即将提交 1un9_chainA.pdb ... 提交成功
即将提交 1vl1_chainA.pdb ... 提交成功
即将提交 1wc6_chainB.pdb ... 提交成功
即将提交 1xdn_chainA.pdb ... 提交成功
即将提交 1xdp_chainA.pdb ... 提交成功
即将提交 1yzy_chainA.pdb ... 提交成功
即将提交 1z0s_chainA.pdb ... 提交成功
即将提交 2aqx_chainB.pdb ... 提交成功
已发送 20 个请求，休眠 15.0 分钟...

即将提交 2bz0_chainA.pdb ... 提交成功
即将提交 2f17_chainA.pdb ... 提交成功
即将提交 2i1o_chainA.pdb ... 提交成功
即将提交 2j9c_chainA.pdb ... 提交成功
即将提交 2jg1_chainC.pdb ... 提交成功
即将提交 2py7_chainX.pdb ... 提交成功
即将提交 2q16_chainB.pdb ... 提交成功
即将提交 2x14_chainA.pdb ... 提交成功
即将提交 2xan_chainA.pdb ... 提交成功
即将提交 3amt_chainA.pdb ... 提交成功
已发送 30 个请求，休眠 15.0 分钟...

即将提交 3c1m_chainC.pdb .