# 根据过滤后的16S数据更新BGC数据表

In [1]:
from Bio import SeqIO
from pathlib import Path

def get_fasta_sequence_names(fasta_path):
    """Return the ID of every record in a FASTA file.

    Args:
        fasta_path: Location of the FASTA file, handed to ``SeqIO.parse``.

    Returns:
        A list holding ``record.id`` for each parsed record, in the order
        ``SeqIO.parse`` yields them.
    """
    names = []
    for entry in SeqIO.parse(fasta_path, 'fasta'):
        names.append(entry.id)
    return names

In [2]:
# Collect the record IDs of the filtered 16S FASTA file and show how many
# there are (the count is displayed as the cell's output).
fasta_file = Path('../data/filtered_16SrRNA.fasta')
sequence_names = get_fasta_sequence_names(fasta_file)
len(sequence_names)
    

67171

In [3]:
import pandas as pd
# Load the high-quality BGC table, then print its shape and preview the
# first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row
# index saved by an earlier to_csv call -- confirm whether it should be kept.
tables1_hq_path = Path('../data/tables1_hq.csv')
tables1_hq_df = pd.read_csv(tables1_hq_path)
print(tables1_hq_df.shape)
tables1_hq_df.head()

(188497, 14)


Unnamed: 0,assembly_accession,CheckM completeness,CheckM contamination,16S_ID,assembly_level,Sum,PKSI,PKSother,NRPS,RiPPs,Saccharides,Terpene,PKS-NRP_Hybrids,Others
0,GCF_000022305.1,100.0,0.67,refseq4905,Complete Genome,3,0,0,0,0,0,1,0,2
1,GCF_000025965.1,100.0,0.37,refseq15376,Complete Genome,7,0,0,0,0,0,1,0,6
2,GCF_000023105.1,100.0,0.84,refseq45559,Complete Genome,1,0,0,0,0,0,1,0,0
3,GCF_000012365.1,100.0,0.06,refseq128457,Complete Genome,20,1,0,5,3,0,2,2,7
4,GCF_000024165.1,100.0,0.88,refseq35003,Complete Genome,8,0,0,2,1,0,1,0,4


In [4]:
# Drop every row whose 16S_ID value is not among sequence_names.
has_filtered_16s = tables1_hq_df['16S_ID'].isin(sequence_names)
tables1_hq_df_filtered = tables1_hq_df[has_filtered_16s]
print(tables1_hq_df_filtered.shape)
tables1_hq_df_filtered.head()

(175321, 14)


Unnamed: 0,assembly_accession,CheckM completeness,CheckM contamination,16S_ID,assembly_level,Sum,PKSI,PKSother,NRPS,RiPPs,Saccharides,Terpene,PKS-NRP_Hybrids,Others
0,GCF_000022305.1,100.0,0.67,refseq4905,Complete Genome,3,0,0,0,0,0,1,0,2
1,GCF_000025965.1,100.0,0.37,refseq15376,Complete Genome,7,0,0,0,0,0,1,0,6
2,GCF_000023105.1,100.0,0.84,refseq45559,Complete Genome,1,0,0,0,0,0,1,0,0
3,GCF_000012365.1,100.0,0.06,refseq128457,Complete Genome,20,1,0,5,3,0,2,2,7
4,GCF_000024165.1,100.0,0.88,refseq35003,Complete Genome,8,0,0,2,1,0,1,0,4


In [5]:
# Save the filtered table; index=False leaves the DataFrame row index out of
# the CSV.
tables1_hq_df_filtered.to_csv("../data/TableS1_HQ_filtered16S.csv", index=False)

# 根据过滤后的 Table 更新 16S 序列

In [13]:
# Distinct 16S_ID values that remain in the filtered table; read as a global
# by filter_not_in_dataset.
unique_refseq_ids = tables1_hq_df_filtered["16S_ID"].unique()

In [14]:
from Bio import SeqIO
from pathlib import Path

def read_fasta(fasta_path):
    """Load all records of a FASTA file into a list.

    Args:
        fasta_path: Location of the FASTA file, handed to ``SeqIO.parse``.

    Returns:
        A list of the records produced by ``SeqIO.parse``.
    """
    return list(SeqIO.parse(fasta_path, 'fasta'))

In [15]:
def filter_not_in_dataset(records, allowed_ids=None):
    """Return the records whose ``id`` is one of the retained 16S IDs.

    Args:
        records: Iterable of objects exposing an ``id`` attribute (here, the
            records parsed from the FASTA file).
        allowed_ids: Optional iterable of IDs to keep. When omitted, the
            module-level ``unique_refseq_ids`` is used, which matches the
            original behaviour.

    Returns:
        A list of the matching records, in their input order.
    """
    if allowed_ids is None:
        allowed_ids = unique_refseq_ids
    # Build the lookup once: testing membership directly against the array
    # returned by ``Series.unique()`` scans every element for each record.
    allowed = set(allowed_ids)
    return [record for record in records if record.id in allowed]


In [16]:
# Load every record of the filtered 16S FASTA, then keep only those whose ID
# is still referenced by the filtered table.
records = read_fasta("../data/filtered_16SrRNA.fasta")
keep_records = filter_not_in_dataset(records)


In [17]:
# Number of records retained (displayed as the cell's output).
len(keep_records)

53812

In [18]:
# Write the retained records to a new FASTA file.
# NOTE(review): the file name spells "intergrated"; left unchanged because
# other code may refer to this exact path -- confirm before renaming.
output_file = Path('../data/filtered_intergrated_16SrRNA.fasta')
SeqIO.write(keep_records, output_file, 'fasta')

53812