In [2]:
import pandas as pd

In [3]:
# Read the tab-separated table TableS1.tsv into `data`.
file_path = "../data/TableS1.tsv"
data = pd.read_csv(file_path, delimiter="\t")

In [4]:
# Inspect the column labels at positions 10 and 11.
data.columns[10:12]

Index(['sum', 'sactipeptide'], dtype='object')

# Count the number of unique taxa at each taxonomic rank.

In [5]:
# Taxonomic rank columns, listed from kingdom down to species.
lineage_cols = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]
# Keep only the lineage columns of the full table.
lineage_df = data.loc[:, lineage_cols]

def count_lineages(df, cols=("kingdom", "phylum", "class", "order", "family", "genus", "species")):
    """Count the distinct values in each taxonomic-rank column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing (at least) the columns named in ``cols``.
    cols : sequence of str, optional
        Columns to count. Defaults to the seven lineage ranks
        (kingdom through species).

    Returns
    -------
    pandas.Series
        Number of unique non-null values per column, indexed by column name.

    Raises
    ------
    KeyError
        If any column in ``cols`` is missing from ``df``.
    """
    # Operate on the frame that was passed in. Previously the argument was
    # ignored and a notebook-level variable was read instead, so the result
    # depended on hidden state rather than on ``df``.
    unique_counts = df[list(cols)].nunique()
    return unique_counts

# Display the number of distinct values found at each taxonomic rank.
count_lineages(data)

kingdom        1
phylum        46
class        106
order        252
family       603
genus       3210
species    32927
dtype: int64

# Count the number of genera in each phylum.

In [6]:
def count_genus_per_phylum(df):
    """Return the number of distinct genera per phylum, largest first.

    ``df`` must have 'phylum' and 'genus' columns. The result is a DataFrame
    with columns 'phylum' and 'genus_count', sorted by 'genus_count' in
    descending order.
    """
    genera_per_phylum = df.groupby('phylum')['genus'].nunique()
    # Turn the phylum index into a column and name the count column in one step.
    table = genera_per_phylum.reset_index(name='genus_count')
    return table.sort_values(by='genus_count', ascending=False)

# Show the genus count per phylum, computed from the lineage-only frame.
count_genus_per_phylum(lineage_df)
    

Unnamed: 0,phylum,genus_count
36,p__Proteobacteria,1360
27,p__Firmicutes,630
2,p__Actinobacteria,419
5,p__Bacteroidetes,347
21,p__Cyanobacteria,110
35,p__Planctomycetes,62
18,p__Chloroflexi,38
43,p__Verrucomicrobia,33
1,p__Acidobacteria,24
38,p__Spirochaetes,20


In [7]:
import pandas as pd

def average_bgc_count_at_genus_level(df):
    """Summarise the 'sum' column for every (phylum, genus) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'phylum', 'genus' and 'sum' columns.

    Returns
    -------
    pandas.DataFrame
        One row per (phylum, genus) with columns 'phylum', 'genus',
        'mean_sum', 'std_sum' and 'data_count'. 'std_sum' is NaN for groups
        that contain a single row.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    """
    required_columns = ['phylum', 'genus', 'sum']
    # Fail early with a message naming every missing column. KeyError is kept
    # because that is what the groupby/column lookup would raise anyway.
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(
            f"The DataFrame must contain columns: {required_columns}; missing: {missing}"
        )

    # Named aggregation yields the final column names directly, so no
    # positional renaming step is needed afterwards.
    grouped = (
        df.groupby(['phylum', 'genus'])['sum']
        .agg(mean_sum='mean', std_sum='std', data_count='count')
        .reset_index()
    )

    return grouped
    
# Build the per-genus summary and save it as a tab-separated file (row index omitted).
average_bgc_count = average_bgc_count_at_genus_level(data)
average_bgc_count.to_csv("../data/Fig1.tsv", sep='\t', index=False)

# Count genera whose average BGC count is at least 20 and at least 40 (thresholds are inclusive).

In [8]:
def count_high_average_bgc(grouped_df):
    """Select the rows whose 'mean_sum' reaches 20 and 40.

    Both comparisons are inclusive (``>=``), even though the returned names
    say "over".

    Returns a 3-tuple: the rows with ``mean_sum >= 20``, the rows with
    ``mean_sum >= 40``, and a dict holding the two row counts under the keys
    'count_over_20' and 'count_over_40'.

    Raises ValueError when ``grouped_df`` has no 'mean_sum' column.
    """
    required_columns = ['mean_sum']
    absent = [name for name in required_columns if name not in grouped_df.columns]
    if absent:
        raise ValueError(f"The DataFrame must contain columns: {required_columns}")

    mean_values = grouped_df['mean_sum']
    at_least_20 = grouped_df.loc[mean_values >= 20]
    at_least_40 = grouped_df.loc[mean_values >= 40]

    summary = {
        'count_over_20': len(at_least_20),
        'count_over_40': len(at_least_40),
    }
    return at_least_20, at_least_40, summary
# Split the per-genus table at the two mean_sum thresholds and show the counts.
count_over_20, count_over_40, result = count_high_average_bgc(average_bgc_count)
result

{'count_over_20': 118, 'count_over_40': 16}

In [9]:
# Keep only the high-average genera whose phylum is p__Proteobacteria.
proteobacteria = count_over_40.loc[count_over_40["phylum"].eq("p__Proteobacteria")]
proteobacteria

Unnamed: 0,phylum,genus,mean_sum,std_sum,data_count
1859,p__Proteobacteria,g__Archangium,44.142857,9.685532,7
2059,p__Proteobacteria,g__Corallococcus,54.826087,14.629201,46
2080,p__Proteobacteria,g__Cystobacter,43.666667,3.05505,3
2175,p__Proteobacteria,g__Enhygromyxa,40.333333,6.429101,3
2491,p__Proteobacteria,g__Minicystis,45.0,,1
2764,p__Proteobacteria,g__Pyxidicoccus,59.2,18.646716,5
3057,p__Proteobacteria,g__Vitiosangium,53.0,,1


# Genus with the largest average number of BGCs.

In [10]:
# Order genera by mean_sum, highest first, and show the first row.
average_bgc_count.sort_values(by="mean_sum", ascending=False).iloc[:1]

Unnamed: 0,phylum,genus,mean_sum,std_sum,data_count
2764,p__Proteobacteria,g__Pyxidicoccus,59.2,18.646716,5


# Strain with the largest number of BGCs.

In [11]:
# Order rows by `sum`, highest first; show the top row, limited to its first 11 columns.
data.sort_values(by="sum", ascending=False).iloc[:1, :11]

Unnamed: 0,assembly_accession,taxid,assembly_level,kingdom,phylum,class,order,family,genus,species,sum
203490,GCF_009864805.1,2690362,Contig,k__Bacteria,p__Actinobacteria,c__Actinomycetia,o__Streptomycetales,f__Streptomycetaceae,g__Streptomyces,s__Streptomyces sp. SID8382,142


In [12]:
# Add up the `sum` column over every row of the full table.
total_bgc = data["sum"].sum()
print(f"Total number of BGCs: {total_bgc}")

Total number of BGCs: 1295905


# Explore 17 previously neglected and/or underestimated genera that are rich in BGCs

In [13]:
# Rank genera by mean_sum, highest first.
sorted_average_bgc_count = average_bgc_count.sort_values(by="mean_sum", ascending=False)
# Keep only genera backed by more than 10 rows.
filtered_sorted_average_bgc_count = sorted_average_bgc_count.loc[
    lambda frame: frame['data_count'] > 10
]
# unique() keeps order of first appearance, so these are the first six phyla
# encountered when walking down the ranked list.
top6_phylum = filtered_sorted_average_bgc_count['phylum'].unique()[:6]
top6_phylum

array(['p__Proteobacteria', 'p__Actinobacteria', 'p__Cyanobacteria',
       'p__Bacteroidetes', 'p__Firmicutes', 'p__Acidobacteria'],
      dtype=object)

In [14]:
# Restrict to the six selected phyla and leave out g__Streptomyces.
filtered_df = filtered_sorted_average_bgc_count.loc[
    lambda frame: frame['phylum'].isin(top6_phylum) & frame['genus'].ne('g__Streptomyces')
]
# Within each phylum, keep the three genera with the highest mean_sum.
top_per_phylum = (
    filtered_df
    .sort_values(['phylum', 'mean_sum'], ascending=[True, False])
    .groupby('phylum')
    .head(3)
)
top_per_phylum

Unnamed: 0,phylum,genus,mean_sum,std_sum,data_count
14,p__Acidobacteria,g__Granulicella,9.176471,4.260799,17
12,p__Acidobacteria,g__Edaphobacter,8.0,1.664101,14
222,p__Actinobacteria,g__Kitasatospora,37.634146,12.026546,41
367,p__Actinobacteria,g__Saccharothrix,36.7,8.909486,20
241,p__Actinobacteria,g__Lentzea,32.92,5.670979,25
528,p__Bacteroidetes,g__Chitinophaga,17.268657,8.364032,67
485,p__Bacteroidetes,g__Aquimarina,10.690476,5.27529,42
770,p__Bacteroidetes,g__Spirosoma,7.518519,2.20786,27
963,p__Cyanobacteria,g__Nostoc,23.86747,6.191598,83
903,p__Cyanobacteria,g__Calothrix,21.25,7.077578,20


In [15]:
def extract_matching_rows(total_df, target_df, max_rows=100, random_state=42):
    """Pull the rows of ``total_df`` whose (phylum, genus) appears in ``target_df``.

    At most ``max_rows`` rows are kept per (phylum, genus) pair; larger groups
    are down-sampled reproducibly.

    Parameters
    ----------
    total_df : pandas.DataFrame
        Full table; must contain 'phylum' and 'genus' columns.
    target_df : pandas.DataFrame
        Table listing the (phylum, genus) pairs of interest.
    max_rows : int, optional
        Maximum number of rows retained per pair (default 100).
    random_state : int, optional
        Seed used for the per-group sampling (default 42).

    Returns
    -------
    pandas.DataFrame
        Selected rows, concatenated group by group, with columns that are
        entirely NaN removed. If nothing matches, an empty frame that keeps
        the merged columns is returned.
    """
    keys = ['phylum', 'genus']

    # De-duplicate the wanted pairs first: a pair listed twice in target_df
    # would otherwise duplicate every matching row of total_df in the merge.
    wanted_pairs = target_df[keys].drop_duplicates()
    merged_df = total_df.merge(wanted_pairs, on=keys, how='inner')

    # Iterate over the groups explicitly instead of using groupby.apply:
    # each group keeps its 'phylum'/'genus' columns regardless of how the
    # installed pandas version treats grouping columns inside apply.
    sampled_groups = [
        group.sample(n=min(len(group), max_rows), random_state=random_state)
        for _, group in merged_df.groupby(keys)
    ]

    # Nothing matched: pd.concat would raise on an empty list.
    if not sampled_groups:
        return merged_df.iloc[:0]

    result_df = pd.concat(sampled_groups)
    result_df = result_df.dropna(axis=1, how='all')
    return result_df

In [16]:
# Collect the rows for the selected genera (max_rows left at its default),
# save them as a tab-separated file without the row index, and show the table size.
result_df = extract_matching_rows(data, top_per_phylum)
result_df.to_csv("../data/Fig2.tsv", sep='\t', index=False)
result_df.shape

(829, 445)

In [21]:
# Reload the saved Fig2 table from disk and summarise its `sum` column.
fig2_df = pd.read_csv("../data/Fig2.tsv", delimiter="\t")
fig2_df["sum"].describe()

count    829.000000
mean      24.294331
std       13.281611
min        3.000000
25%       16.000000
50%       23.000000
75%       29.000000
max       96.000000
Name: sum, dtype: float64

In [26]:
# Take every column from position 11 onward.
# NOTE(review): assumes the first 11 columns of Fig2.tsv are the metadata
# columns ending with `sum`, so the rest are BGC classes — confirm against the file.
bgc_class_col = fig2_df.columns[11:]
bgc_class_col

Index(['NRPS', 'terpene', 'NRPS+T1PKS', 'bacteriocin', 'NRPS-like',
       'lanthipeptide', 'T1PKS', 'T3PKS', 'siderophore', 'transAT-PKS-like',
       ...
       'NRPS+other+terpene+transAT-PKS', 'NRPS+NRPS-like+T2PKS+bacteriocin',
       'NRPS+T1PKS+terpene+thioamide-NRP', 'NRPS+T2PKS+ectoine',
       'CDPS+blactam', 'LAP+NRPS+T1PKS+butyrolactone',
       'NRPS+T1PKS+hglE-KS+resorcinol',
       'NRPS+NRPS-like+PKS-like+T1PKS+T3PKS+transAT-PKS',
       'NRPS+NRPS-like+terpene+transAT-PKS',
       'NRPS+NRPS-like+PKS-like+T1PKS+lanthipeptide+transAT-PKS'],
      dtype='object', length=433)

In [32]:
# Column totals for the selected columns, largest first.
total_sum = fig2_df.loc[:, bgc_class_col].sum().sort_values(ascending=False)
# Express each column total as a percentage of the grand total.
percentage = total_sum.div(total_sum.sum()).mul(100)
percentage.head(5)

NRPS           31.201589
terpene        10.094340
NRPS+T1PKS      7.348560
bacteriocin     6.742800
NRPS-like       4.712016
dtype: float64