In [61]:
import pandas as pd

# Load the per-genome BGC table. The file is comma-separated with its column
# names on the first row, which is exactly what pd.read_csv assumes by default.
input_csv_file = "analysis/top3BGCrichedGenera/data/DataS1.8BGC.csv"
input_df = pd.read_csv(input_csv_file)

# (rows, columns) of the raw table
input_df.shape

(175321, 22)

In [63]:
# Keep only genomes whose 'sum' column is at least 20.
# NOTE(review): 'sum' looks like the total of the per-class BGC counts
# (PKSI, NRPS, ...) for each genome -- confirm against the data dictionary.
min_sum = 20
has_enough = input_df['sum'].ge(min_sum)
input_df = input_df.loc[has_enough]

# (rows, columns) remaining after the threshold
input_df.shape

(6062, 22)

In [64]:
# Summarise the 'sum' column for every (phylum, genus) pair:
#   mean_sum     - average over the genomes in the genus
#   sum_std      - sample standard deviation (NaN when the genus has one genome)
#   genome_count - number of genomes with a non-null 'sum'
summary_spec = {
    'mean_sum': 'mean',
    'sum_std': 'std',
    'genome_count': 'count',
}
sum_by_genus = input_df.groupby(['phylum', 'genus'])['sum']

# reset_index turns the (phylum, genus) group keys back into ordinary columns.
grouped = sum_by_genus.agg(**summary_spec).reset_index()

grouped

Unnamed: 0,phylum,genus,mean_sum,sum_std,genome_count
0,p__Actinobacteria,g__Actinoalloteichus,30.500000,15.003333,6
1,p__Actinobacteria,g__Actinocrispum,54.000000,,1
2,p__Actinobacteria,g__Actinokineospora,34.666667,8.094143,12
3,p__Actinobacteria,g__Actinomadura,27.285714,5.529144,7
4,p__Actinobacteria,g__Actinophytocola,38.400000,8.264381,5
...,...,...,...,...,...
126,p__Proteobacteria,g__Serratia,22.000000,,1
127,p__Proteobacteria,g__Sinorhizobium,22.000000,1.000000,3
128,p__Proteobacteria,g__Sorangium,38.400000,6.148170,5
129,p__Proteobacteria,g__Stigmatella,31.666667,6.429101,3


In [65]:
# Genus-level quality filters. Comparisons against NaN are False, so genera
# whose sum_std is NaN (a single genome) are dropped by the first mask as well.
low_spread = grouped['sum_std'].le(15)
well_sampled = grouped['genome_count'].ge(5)
rich_enough = grouped['mean_sum'].ge(10)

# The index is reset *before* sorting, so the index shown in the output is the
# row position prior to ranking, not the rank itself.
filtered_grouped = (
    grouped.loc[low_spread & well_sampled & rich_enough]
    .reset_index(drop=True)
    .sort_values(by='mean_sum', ascending=False)
)

filtered_grouped

Unnamed: 0,phylum,genus,mean_sum,sum_std,genome_count
26,p__Proteobacteria,g__Corallococcus,53.72093,13.570496,43
5,p__Actinobacteria,g__Kibdelosporangium,49.714286,2.429972,7
6,p__Actinobacteria,g__Kitasatospora,38.675676,11.78524,37
33,p__Proteobacteria,g__Sorangium,38.4,6.14817,5
2,p__Actinobacteria,g__Actinophytocola,38.4,8.264381,5
19,p__Actinobacteria,g__Streptomyces,37.660987,12.971623,1702
16,p__Actinobacteria,g__Saccharothrix,35.631579,7.725934,19
0,p__Actinobacteria,g__Actinokineospora,34.666667,8.094143,12
3,p__Actinobacteria,g__Amycolatopsis,33.864583,13.407476,96
18,p__Actinobacteria,g__Streptacidiphilus,32.777778,6.960204,9


In [66]:
# Pick the top genera per phylum from `filtered_grouped`.
#
# `filtered_grouped` is expected to hold one row per (phylum, genus) pair and
# to be ordered by descending 'mean_sum'. Walking the rows in that order, the
# first MAX_PHYLA distinct phyla are kept and, within each of them, the first
# MAX_GENERA_PER_PHYLUM genera. This gives the same selection as a row-by-row
# loop over the frame, but without iterrows or a per-row membership test.
MAX_PHYLA = 6
MAX_GENERA_PER_PHYLUM = 3

# Phyla in order of first appearance, i.e. ranked by their best genus.
top_phyla = filtered_grouped['phylum'].drop_duplicates().head(MAX_PHYLA)

# groupby(...).head(n) keeps the first n rows of every group and preserves the
# original row order, so result_df stays ordered like filtered_grouped.
result_df = (
    filtered_grouped[filtered_grouped['phylum'].isin(top_phyla)]
    .groupby('phylum', sort=False)
    .head(MAX_GENERA_PER_PHYLUM)
    .reset_index(drop=True)
)

# Final mapping: { phylum1: [genus1, genus2, genus3], phylum2: [...], ... }
# Keys appear in order of first occurrence; genera keep their ranking order.
target_phylum_genus = {}
for ph, gen in zip(result_df['phylum'], result_df['genus']):
    target_phylum_genus.setdefault(ph, []).append(gen)

# Every selected (phylum, genus) pair, listed phylum by phylum.
pairs = [(ph, g) for ph, gens in target_phylum_genus.items() for g in gens]

# Save the genus-level summary of the selection and display it for checking.
result_df.to_csv('analysis/top3BGCrichedGenera/data/top_phylum_genus_mean.csv', index=False)
result_df

Unnamed: 0,phylum,genus,mean_sum,sum_std,genome_count
0,p__Proteobacteria,g__Corallococcus,53.72093,13.570496,43
1,p__Actinobacteria,g__Kibdelosporangium,49.714286,2.429972,7
2,p__Actinobacteria,g__Kitasatospora,38.675676,11.78524,37
3,p__Proteobacteria,g__Sorangium,38.4,6.14817,5
4,p__Actinobacteria,g__Actinophytocola,38.4,8.264381,5
5,p__Proteobacteria,g__Myxococcus,32.431818,12.445972,44
6,p__Firmicutes,g__Paenibacillus,28.15,7.419828,40
7,p__Bacteroidetes,g__Chitinophaga,27.684211,5.831453,19
8,p__Cyanobacteria,g__Nostoc,26.705882,5.535174,34
9,p__Firmicutes,g__Brevibacillus,24.0,2.0,5


In [67]:
# 1. Keys of the selected genera.
_df = result_df.loc[:, ['phylum', 'genus']]

# 2. Inner join: keep only the rows of input_df whose (phylum, genus) pair is
#    among the selected ones (an .isin-style filter would be an alternative).
filtered_df = pd.merge(input_df, _df, how='inner', on=['phylum', 'genus'])

# 3. Save the genome-level table for the selected genera.
filtered_df.to_csv('analysis/top3BGCrichedGenera/data/top3genera.csv', index=False)

filtered_df

Unnamed: 0,assembly_accession,CheckM completeness,CheckM contamination,16S_ID,taxid,assembly_level,kindom,phylum,class,order,...,species,sum,PKSI,PKSother,NRPS,RiPPs,Saccharides,Terpene,PKS-NRP_Hybrids,Others
0,GCF_002082585.1,99.96,0.38,refseq136643,68173,Complete Genome,k__Bacteria,p__Actinobacteria,c__Actinomycetia,o__Streptomycetales,...,s__Kitasatospora albolonga,38,0.0,3.0,7.0,6.0,0.0,5.0,4.0,13.0
1,GCF_004364325.1,99.92,2.84,refseq70366,502181,Contig,k__Bacteria,p__Actinobacteria,c__Actinomycetia,o__Pseudonocardiales,...,s__Actinophytocola oryzae,42,6.0,2.0,6.0,5.0,0.0,3.0,5.0,15.0
2,GCF_017876405.1,99.75,1.19,refseq22918,1365924,Contig,k__Bacteria,p__Actinobacteria,c__Actinomycetia,o__Pseudonocardiales,...,s__Kibdelosporangium banguiense,46,3.0,0.0,11.0,8.0,1.0,5.0,3.0,15.0
3,GCF_000520795.1,99.72,0.19,refseq159969,1333863,Contig,k__Bacteria,p__Firmicutes,c__Bacilli,o__Bacillales,...,s__Paenibacillus polymyxa,31,0.0,2.0,19.0,4.0,0.0,0.0,5.0,1.0
4,GCF_000612225.1,99.72,2.14,refseq162225,1444314,Contig,k__Bacteria,p__Firmicutes,c__Bacilli,o__Bacillales,...,s__Paenibacillus ehimensis,27,0.0,1.0,17.0,2.0,0.0,1.0,4.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,GCF_016757455.1,93.90,2.69,refseq185580,2804502,Contig,k__Bacteria,p__Firmicutes,c__Bacilli,o__Bacillales,...,s__Bacillus sp. RHFS18,46,0.0,21.0,19.0,1.0,0.0,1.0,2.0,2.0
296,GCF_014222275.1,93.81,0.91,refseq60516,2681306,Contig,k__Bacteria,p__Cyanobacteria,c__unclassified Cyanobacteria class,o__Nostocales,...,s__Nostoc sp. UCD122,31,1.0,2.0,9.0,9.0,0.0,6.0,4.0,0.0
297,GCF_014698115.1,93.26,3.16,refseq180410,2692844,Contig,k__Bacteria,p__Cyanobacteria,c__unclassified Cyanobacteria class,o__Nostocales,...,s__Nostoc sp. FACHB-973,28,2.0,3.0,8.0,4.0,0.0,4.0,4.0,3.0
298,GCF_000696185.1,93.12,0.62,refseq17595,1348663,Scaffold,k__Bacteria,p__Actinobacteria,c__Actinomycetia,o__Streptomycetales,...,s__Kitasatospora cheerisanensis,30,2.0,1.0,2.0,11.0,0.0,5.0,3.0,6.0
