In [1]:
# import packages
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

### Source of info

- **LQ_gtdbtk.bac120.summary.tsv** - taxonomic info about representative MAGs from LQ clustering for Bacteria sp. Created with GTDB-Tk v2.0 (1,641 lines)
- **LQ_gtdbtk.ar53.summary.tsv** - taxonomic info about representative MAGs from LQ clustering for Archaea sp. Created with GTDB-Tk v2.0 (5 lines)
- **LQ_genomeInformation.csv** - info about completeness, contamination, strain_heterogeneity, length, N50. Created with dRep (15,430 lines)
- **LQ_Cdb.csv** - info about secondary_cluster. Created with dRep (9,475 lines, only bins going to clustering)
- **2023_Pantiukh_EstMB_MAGsDB_LQcl.csv** - list of representative genomes, one per cluster. Created with `ls *` over all dereplicated genomes
  
### Calculated parameters
- **bin_number_in_cluster** - calculated here based on the final table
- **bin_qual** - calculated here based on the final table

### Manually added info
- **rep_MAG_ID** - manually added. Format: [L][0001][sp_name], where L - LQ clustering, 0001 - number, based on bacteria prevalence, sp_name - species name, or undS (unknown species) when the species is unknown

In [2]:
## Step 0. Load the per-bin quality metrics table (dRep genomeInformation)
genomeInformation = (
    pd.read_csv('input/LQ_genomeInformation.csv')
    # 'bin_name' is the join key used by every later step
    .rename(columns={'genome': 'bin_name'})
    # discard the centrality column
    .drop(columns=['centrality'])
)
genomeInformation.tail()

Unnamed: 0,bin_name,completeness,contamination,strain_heterogeneity,length,N50
15424,VZR2LZ_vamb.278282.fa,67.24,0.0,0.0,899993,41034
15425,VZSPJW_maxbin.036.fa,93.97,47.08,26.23,3167676,2880
15426,VZTMOW_vamb.137436.fa,15.52,0.0,0.0,65154,65154
15427,VZTMOW_vamb.164616.fa,67.63,0.0,0.0,1364300,30903
15428,VZWEWM_maxbin.048.fa,33.62,0.47,0.0,2294493,5647


In [3]:
## Step 1. Add info about cluster
# keep only the bin name and its secondary cluster, renamed to the notebook's column names
cluster = (
    pd.read_csv('input/LQ_Cdb.csv')
    .rename(columns={'genome': 'bin_name'})[['bin_name', 'secondary_cluster']]
    .rename(columns={'secondary_cluster': 'cluster'})
)

# outer merge: bins without a cluster (and clusters without metrics) are both kept
mrg_st1 = pd.merge(genomeInformation, cluster, on='bin_name', how='outer')

# Label bins that have a cluster as 'LQ-<cluster>' and all others as 'undefined'.
# The placeholder is applied to the cluster column ONLY: filling the whole frame
# would put the string 'undefined' into numeric columns (completeness,
# contamination, ...) whenever they contain NaN, turning them into object dtype
# and breaking the numeric comparisons done later in the notebook.
has_cluster = mrg_st1['cluster'].notna()
mrg_st1['cluster'] = ('LQ-' + mrg_st1['cluster'].astype(str)).where(has_cluster, 'undefined')

mrg_st1.tail()

Unnamed: 0,bin_name,completeness,contamination,strain_heterogeneity,length,N50,cluster
15424,VZR2LZ_vamb.278282.fa,67.24,0.0,0.0,899993,41034,LQ-225_1.2.5.6
15425,VZSPJW_maxbin.036.fa,93.97,47.08,26.23,3167676,2880,LQ-326_1
15426,VZTMOW_vamb.137436.fa,15.52,0.0,0.0,65154,65154,undefined
15427,VZTMOW_vamb.164616.fa,67.63,0.0,0.0,1364300,30903,LQ-738_1.2.3.4.5.9
15428,VZWEWM_maxbin.048.fa,33.62,0.47,0.0,2294493,5647,undefined


In [4]:
## Step 2. Add info about representative genomes
representative = pd.read_csv('input/2023_Pantiukh_EstMB_MAGsDB_LQcl.csv')

# look up the cluster of every representative MAG via its bin name
mrg = representative.merge(mrg_st1, left_on='rep_MAG_name', right_on='bin_name', how='inner')
mrg = mrg[['rep_MAG_name', 'cluster']]

# extra row so that bins in the 'undefined' cluster get an 'undefined' representative
mrg.loc[len(mrg)] = ['undefined'] * len(mrg.columns)

# attach the representative MAG name to every bin through the shared cluster label
mrg_st2 = mrg_st1.merge(mrg, on='cluster', how='outer')

mrg_st2.tail()

Unnamed: 0,bin_name,completeness,contamination,strain_heterogeneity,length,N50,cluster,rep_MAG_name
15424,VY25ST_vamb.218068.fa,68.97,25.86,100.0,2406469,5865,LQ-457_24,VY25ST_vamb.218068.fa
15425,VY36ZZ_metabat.1.fa,86.8,26.88,44.0,3157159,5203,LQ-265_1,VY36ZZ_metabat.1.fa
15426,VYHIBD_maxbin.133.fa,72.65,41.38,34.29,2934952,10499,LQ-793_2,VYHIBD_maxbin.133.fa
15427,VYWBYK_metabat.123.fa,65.52,0.0,0.0,1468926,120694,LQ-696_1,VYWBYK_metabat.123.fa
15428,VZGZLV_maxbin.027.fa,100.0,49.29,74.0,3007490,1894,LQ-457_3,VZGZLV_maxbin.027.fa


In [5]:
## Step 3. Add info about taxonomy

# GTDB rank prefixes in lineage order, mapped to the output column names
GTDB_RANKS = {
    'd__': 'domain',
    'p__': 'phylum',
    'c__': 'class',
    'o__': 'order',
    'f__': 'family',
    'g__': 'genus',
    's__': 'sp',
}


def read_gtdbtk_taxonomy(path):
    """Read a GTDB-Tk summary table and split its lineage into one column per rank.

    Parameters
    ----------
    path : str
        Tab-separated GTDB-Tk summary file containing at least the
        ``user_genome`` and ``classification`` columns.

    Returns
    -------
    pandas.DataFrame
        Column ``rep_MAG_name`` (taken from ``user_genome``) followed by
        ``domain``, ``phylum``, ``class``, ``order``, ``family``, ``genus``
        and ``sp`` with the rank prefixes removed. Empty family, genus and
        species values are replaced by ``'undefined'``.
    """
    summary = pd.read_csv(path, sep='\t')
    # split the ';'-separated lineage once instead of once per rank
    lineage = summary['classification'].str.split(';', expand=True)

    # build a fresh frame rather than adding columns to a slice of `summary`
    taxonomy = pd.DataFrame({'rep_MAG_name': summary['user_genome']})
    for position, (prefix, rank) in enumerate(GTDB_RANKS.items()):
        # literal (non-regex) removal of the rank prefix, e.g. 'g__'
        taxonomy[rank] = lineage[position].str.replace(prefix, '', regex=False)

    # ranks left empty by GTDB-Tk get the notebook-wide placeholder
    low_ranks = ['family', 'genus', 'sp']
    taxonomy[low_ranks] = taxonomy[low_ranks].replace('', 'undefined')
    return taxonomy


# READ bacteria and archaea taxonomy with identical processing
gt = read_gtdbtk_taxonomy('input/LQ_gtdbtk.bac120.summary.tsv')
ar = read_gtdbtk_taxonomy('input/LQ_gtdbtk.ar53.summary.tsv')

# concat bacteria and archaea taxonomy
taxa = pd.concat([gt, ar], ignore_index=True)
# append '.fa' so the names match rep_MAG_name in mrg_st2
taxa['rep_MAG_name'] = taxa['rep_MAG_name'].astype(str) + '.fa'
# placeholder row giving 'undefined' taxonomy to the 'undefined' representative
taxa.loc[len(taxa)] = ['undefined'] * len(taxa.columns)

# add info about taxonomy to mrg_st2
mrg_st3 = pd.merge(mrg_st2, taxa, on='rep_MAG_name', how='outer')

mrg_st3.tail()


Unnamed: 0,bin_name,completeness,contamination,strain_heterogeneity,length,N50,cluster,rep_MAG_name,domain,phylum,class,order,family,genus,sp
15424,VY25ST_vamb.218068.fa,68.97,25.86,100.0,2406469,5865,LQ-457_24,VY25ST_vamb.218068.fa,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales_A,Pasteurellaceae,Haemophilus_D,undefined
15425,VY36ZZ_metabat.1.fa,86.8,26.88,44.0,3157159,5203,LQ-265_1,VY36ZZ_metabat.1.fa,Bacteria,Bacillota_A,Clostridia,Lachnospirales,Lachnospiraceae,Coprococcus_A,Coprococcus_A catus
15426,VYHIBD_maxbin.133.fa,72.65,41.38,34.29,2934952,10499,LQ-793_2,VYHIBD_maxbin.133.fa,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Oscillospiraceae,Limivicinus,undefined
15427,VYWBYK_metabat.123.fa,65.52,0.0,0.0,1468926,120694,LQ-696_1,VYWBYK_metabat.123.fa,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Acutalibacteraceae,UMGS1858,undefined
15428,VZGZLV_maxbin.027.fa,100.0,49.29,74.0,3007490,1894,LQ-457_3,VZGZLV_maxbin.027.fa,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales_A,Pasteurellaceae,Haemophilus_D,Haemophilus_D parainfluenzae_K


In [6]:
## Step 4. Add info about bins number in cluster
# count the non-null bin names per cluster; the result is indexed by cluster label
bn = (
    mrg_st3.groupby('cluster')[['bin_name']]
    .count()
    .rename(columns={'bin_name': 'bin_number_in_cluster'})
)
# attach the per-cluster count to every bin of that cluster
mrg_st4 = mrg_st3.merge(bn, on='cluster', how='outer')

mrg_st4.tail()

Unnamed: 0,bin_name,completeness,contamination,strain_heterogeneity,length,N50,cluster,rep_MAG_name,domain,phylum,class,order,family,genus,sp,bin_number_in_cluster
15424,VY25ST_vamb.218068.fa,68.97,25.86,100.0,2406469,5865,LQ-457_24,VY25ST_vamb.218068.fa,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales_A,Pasteurellaceae,Haemophilus_D,undefined,1
15425,VY36ZZ_metabat.1.fa,86.8,26.88,44.0,3157159,5203,LQ-265_1,VY36ZZ_metabat.1.fa,Bacteria,Bacillota_A,Clostridia,Lachnospirales,Lachnospiraceae,Coprococcus_A,Coprococcus_A catus,1
15426,VYHIBD_maxbin.133.fa,72.65,41.38,34.29,2934952,10499,LQ-793_2,VYHIBD_maxbin.133.fa,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Oscillospiraceae,Limivicinus,undefined,1
15427,VYWBYK_metabat.123.fa,65.52,0.0,0.0,1468926,120694,LQ-696_1,VYWBYK_metabat.123.fa,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Acutalibacteraceae,UMGS1858,undefined,1
15428,VZGZLV_maxbin.027.fa,100.0,49.29,74.0,3007490,1894,LQ-457_3,VZGZLV_maxbin.027.fa,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales_A,Pasteurellaceae,Haemophilus_D,Haemophilus_D parainfluenzae_K,1


In [7]:
# Write the full per-bin table (every bin that went into LQ clustering) to Excel,
# without the DataFrame index
mrg_st4.to_excel(excel_writer='results/LQcl_all_bins_all_info.xlsx', index=False)

In [8]:
## Step 5. Select only representative MAGs
mrg_st5 = (
    representative
    # inner join keeps only the bins that are themselves representatives
    .merge(mrg_st4, left_on='rep_MAG_name', right_on='bin_name', how='inner')
    # both inputs carry rep_MAG_name, so pandas suffixes them with _x / _y;
    # drop the right-hand copy together with the now redundant bin_name
    .drop(columns=['bin_name', 'rep_MAG_name_y'])
    .rename(columns={'rep_MAG_name_x': 'rep_MAG_name'})
)
mrg_st5.tail()

Unnamed: 0,rep_MAG_name,completeness,contamination,strain_heterogeneity,length,N50,cluster,domain,phylum,class,order,family,genus,sp,bin_number_in_cluster
1639,VZTMOW_maxbin.050_sub.fa,52.27,7.97,0.0,1164295,1203,LQ-992_1,Bacteria,Verrucomicrobiota,Verrucomicrobiae,Opitutales,CAG-312,Merdousia,undefined,4
1640,VZTMOW_maxbin.071.fa,66.77,25.86,41.18,1729642,49503,LQ-752_3,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Oscillospiraceae,Vescimonas,undefined,1
1641,VZTMOW_maxbin.072.fa,73.28,26.49,62.86,1821208,48328,LQ-752_2,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Oscillospiraceae,Vescimonas,undefined,1
1642,VZXIHP_vamb.46239.fa,73.75,0.0,0.0,2515878,11062,LQ-683_1,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,UBA4372,UBA4372 sp900543815,1
1643,VZZW6V_vamb.255038.fa,68.97,0.0,0.0,966784,22255,LQ-25_3,Bacteria,Bacillota_A,Clostridia,TANB77,UBA1234,HGM13618,HGM13618 sp900753975,3


In [10]:
## Step 6. Add info about rep_MAG_ID
# tab-separated table that supplies the rep_MAG_ID column, keyed by rep_MAG_name
mid = pd.read_csv('input/LQ_rename.csv', delimiter='\t')
# inner join: only representatives present in the rename table are kept
mrg_st6 = mrg_st5.merge(mid, how='inner', on='rep_MAG_name')
mrg_st6.tail()

Unnamed: 0,rep_MAG_name,completeness,contamination,strain_heterogeneity,length,N50,cluster,domain,phylum,class,order,family,genus,sp,bin_number_in_cluster,rep_MAG_ID
1639,VZTMOW_maxbin.050_sub.fa,52.27,7.97,0.0,1164295,1203,LQ-992_1,Bacteria,Verrucomicrobiota,Verrucomicrobiae,Opitutales,CAG-312,Merdousia,undefined,4,L0475_Merdousia_undS.fa
1640,VZTMOW_maxbin.071.fa,66.77,25.86,41.18,1729642,49503,LQ-752_3,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Oscillospiraceae,Vescimonas,undefined,1,L1642_Vescimonas_undS.fa
1641,VZTMOW_maxbin.072.fa,73.28,26.49,62.86,1821208,48328,LQ-752_2,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Oscillospiraceae,Vescimonas,undefined,1,L1643_Vescimonas_undS.fa
1642,VZXIHP_vamb.46239.fa,73.75,0.0,0.0,2515878,11062,LQ-683_1,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,UBA4372,UBA4372 sp900543815,1,L1644_UBA4372_sp900543815.fa
1643,VZZW6V_vamb.255038.fa,68.97,0.0,0.0,966784,22255,LQ-25_3,Bacteria,Bacillota_A,Clostridia,TANB77,UBA1234,HGM13618,HGM13618 sp900753975,3,L0578_HGM13618_sp900753975.fa


In [11]:
# Step 7. Add info about bins quality
completeness = mrg_st6['completeness']
contamination = mrg_st6['contamination']

conditions = [
    # HQ: completeness > 90 and contamination < 5
    (completeness > 90) & (contamination < 5),
    # MQ: contamination < 10 but not HQ
    ((completeness <= 90) & (contamination < 10))
    | ((completeness > 90) & (contamination >= 5) & (contamination < 10)),
    # LQ: contamination >= 10, whatever the completeness
    contamination >= 10,
]
values = ['HQ', 'MQ', 'LQ']

# An explicit string default is required: np.select falls back to the integer 0,
# which cannot be combined with string choices on recent NumPy (TypeError) and
# on older versions silently labels unmatched rows (e.g. NaN metrics) as '0'.
mrg_st6['bin_qual'] = np.select(conditions, values, default='undefined')

In [13]:
# Write the representative-MAG table (LQ clustering) to Excel, without the DataFrame index
mrg_st6.to_excel(excel_writer='results/LQcl_representative_MAGs_all_info.xlsx', index=False)