In [1]:
import os
import time
import pandas as pd
from urllib.parse import urlparse
from ftplib import FTP

def download_file(ftp, remote_filepath, local_filepath):
    with open(local_filepath, 'wb') as f:
        ftp.retrbinary(f'RETR {remote_filepath}', f.write)

def find_gpff_links(ftp, directory):
    files = ftp.nlst(directory)
    return [f for f in files if f.endswith('protein.gpff.gz')]

def connect_and_login(url_parts):
    ftp = FTP(url_parts.netloc)
    ftp.login()
    return ftp

def download_with_retry(ftp, gpff_link, filename, max_retries=3, delay=5):
    for i in range(max_retries):
        try:
            download_file(ftp, gpff_link, filename)
            print(f'成功下载文件：{filename}')
            return
        except Exception as e:
            print(f'下载失败：{filename}，错误：{e}')
            if i < max_retries - 1:
                print(f'尝试重新下载，等待{delay}秒...')
                time.sleep(delay)
            else:
                print(f'放弃下载：{filename}')

# 读取CSV文件
df = pd.read_csv('links.csv')

# 遍历表格中的每一行
for index, row in df.iterrows():
    # 获取链接
    url = row['links']
    url_parts = urlparse(url)
    
    # 连接到FTP服务器
    ftp = connect_and_login(url_parts)
    
    # 寻找目标文件链接
    gpff_links = find_gpff_links(ftp, url_parts.path)

    # 下载目标文件
    for gpff_link in gpff_links:
        filename = os.path.basename(gpff_link)
        download_with_retry(ftp, gpff_link, filename)
        time.sleep(1)  # 在下载文件之间添加延迟

    # 断开FTP连接
    ftp.quit()


成功下载文件：GCF_000155695.1_ASM15569v1_protein.gpff.gz
成功下载文件：GCF_000171235.2_ASM17123v2_protein.gpff.gz
成功下载文件：GCF_000172555.1_ASM17255v1_protein.gpff.gz
成功下载文件：GCF_000173075.1_ASM17307v1_protein.gpff.gz
成功下载文件：GCF_000242935.2_ASM24293v3_protein.gpff.gz
成功下载文件：GCF_000378105.1_ASM37810v1_protein.gpff.gz
成功下载文件：GCF_000379365.1_ASM37936v1_protein.gpff.gz
成功下载文件：GCF_000436395.1_MGS154_protein.gpff.gz
成功下载文件：GCF_000526255.1_ASM52625v1_protein.gpff.gz
成功下载文件：GCF_000739615.1_ASM73961v1_protein.gpff.gz
成功下载文件：GCF_000953475.1_MfumSolVChr1_protein.gpff.gz
成功下载文件：GCF_001318295.1_ASM131829v1_protein.gpff.gz
成功下载文件：GCF_001578645.1_ASM157864v1_protein.gpff.gz
成功下载文件：GCF_001580015.1_ASM158001v1_protein.gpff.gz
成功下载文件：GCF_001580045.1_ASM158004v1_protein.gpff.gz
成功下载文件：GCF_001613545.1_ASM161354v1_protein.gpff.gz
成功下载文件：GCF_001647615.1_ASM164761v1_protein.gpff.gz
成功下载文件：GCF_001650175.1_ASM165017v1_protein.gpff.gz
成功下载文件：GCF_001683795.1_ASM168379v1_protein.gpff.gz
成功下载文件：GCF_001746835.1_ASM174683v1_protein.g

In [13]:
from Bio import Entrez

def get_taxon_id_from_gcaid(gcaid):
    Entrez.email = "your.email@example.com"  # 请使用您自己的电子邮件地址

    # 使用Entrez.esearch检索GCAid相关的记录
    handle = Entrez.esearch(db="assembly", term=gcaid)
    record = Entrez.read(handle)
    handle.close()

    # 从检索到的记录中获取Assembly ID
    assembly_id = record["IdList"][0]

    # 使用Entrez.efetch获取Assembly ID相关的记录
    handle = Entrez.efetch(db="assembly", id=assembly_id, rettype="docsum", retmode="xml")
    record = Entrez.read(handle)
    handle.close()

    # 在记录中查找taxon id
    taxon_id = record['DocumentSummarySet']['DocumentSummary'][0]['Taxid']

    return taxon_id

gcaid = "GCA_007117865.1"  # 示例GCAid
taxon_id = get_taxon_id_from_gcaid(gcaid)
print(f"Taxon ID for {gcaid} is {taxon_id}")


Taxon ID for GCA_007117865.1 is 2026784


In [7]:
from Bio import Entrez

def get_protein_names(txid, email):
    Entrez.email = email  # 请使用您自己的电子邮件地址

    # 使用Entrez.esearch检索给定txid的蛋白质记录
    handle = Entrez.esearch(db="protein", term=f"txid{txid}[organism:exp]", retmax=1000)
    record = Entrez.read(handle)
    handle.close()

    protein_ids = record["IdList"]

    protein_names = []
    for protein_id in protein_ids:
        # 使用Entrez.efetch获取蛋白质记录
        handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
        record = handle.read()
        handle.close()

        # 在记录中查找蛋白质名称
        for line in record.split("\n"):
            if "DEFINITION" in line:
                protein_name = line.split("DEFINITION")[1].strip()
                protein_names.append(protein_name)
                break

    return protein_names

txid = 2026784  # 示例txid
email = "u7457260@anu.edu.au"  # 请使用您自己的电子邮件地址
protein_names = get_protein_names(txid, email)
print(f"Protein names for txid {txid}:")

for name in protein_names:
    print(name)


In [4]:
from Bio import Entrez

def get_protein_names(txid, email, retmax=1000):
    Entrez.email = email  # 请使用您自己的电子邮件地址

    # 首先检索给定txid的蛋白质记录总数
    handle = Entrez.esearch(db="protein", term=f"txid{txid}[organism:exp]")
    record = Entrez.read(handle)
    handle.close()

    total_records = int(record["Count"])
    protein_names = []

    for retstart in range(0, total_records, retmax):
        # 使用Entrez.esearch检索给定txid的蛋白质记录
        handle = Entrez.esearch(db="protein", term=f"txid{txid}[organism:exp]", retmax=retmax, retstart=retstart)
        record = Entrez.read(handle)
        handle.close()

        protein_ids = record["IdList"]

        for protein_id in protein_ids:
            # 使用Entrez.efetch获取蛋白质记录
            handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
            record = handle.read()
            handle.close()

            # 在记录中查找蛋白质名称
            for line in record.split("\n"):
                if "DEFINITION" in line:
                    protein_name = line.split("DEFINITION")[1].strip()
                    protein_names.append(protein_name)
                    break

    return protein_names

txid = 2026784  # 示例txid
email = "u7457260@anu.edu.au"  # 请使用您自己的电子邮件地址
retmax = 1000  # 每批次检索的记录数
protein_names = get_protein_names(txid, email, retmax)
print(f"Protein names for txid {txid}:")
for name in protein_names:
    print(name)


KeyboardInterrupt: 

In [2]:
import time
from Bio import Entrez

def get_protein_names(txid, email, batch_size=1000, delay=1):
    Entrez.email = email  # 请使用您自己的电子邮件地址

    # 首先获取与给定txid相关的蛋白质记录的总数
    handle = Entrez.esearch(db="protein", term=f"txid{txid}[organism:exp]", retmax=1)
    record = Entrez.read(handle)
    handle.close()

    total_proteins = int(record["Count"])

    protein_names = []

    # 分批处理检索蛋白质名称
    for start in range(0, total_proteins, batch_size):
        handle = Entrez.esearch(db="protein", term=f"txid{txid}[organism:exp]", retmax=batch_size, retstart=start)
        record = Entrez.read(handle)
        handle.close()

        protein_ids = record["IdList"]

        for protein_id in protein_ids:
            # 使用Entrez.efetch获取蛋白质记录
            handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
            record = handle.read()
            handle.close()

            # 在记录中查找蛋白质名称
            for line in record.split("\n"):
                if "DEFINITION" in line:
                    protein_name = line.split("DEFINITION")[1].strip()
                    protein_names.append(protein_name)
                    break

            # 在每次请求之间添加延迟，防止请求速率过快
            time.sleep(delay)

    return protein_names

txid = 2026784  # 示例txid
email = "shandongduyiheng@163.com"  # 请使用您自己的电子邮件地址
protein_names = get_protein_names(txid, email)
print(f"Protein names for txid {txid}:")
for name in protein_names:
    print(name)

ConnectionAbortedError: [WinError 10053] 你的主机中的软件中止了一个已建立的连接。