In [2]:
# import packages
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

### Source of info

- **results/HQcl_representative_MAGs_all_info.xlsx** - Representative genomes from HQ clustering. Created with Final_build_HQcl.ipynb (1,641 lines)
- **results/LQcl_representative_MAGs_all_info.xlsx** - Representative genomes from LQ clustering. Created with Final_build_LQcl.ipynb (5 lines)
- **results/HQLQcl_together_Cdb.csv** - info about secondary_cluster. Created with dREP (9,475 lines, only bins goin to clustering)
  
- **input/HQLQcl_representative_genomes_list.csv** - list of representatives genomes from cluster. Created with 'ls * all dereplicated 
  

In [9]:
## Step 1. Combine HQ and LQ representative genomes
# Combine HQcl_representative_MAGs_all_info.xlsx an LQcl_representative_MAGs_all_info.xlsx
HQcl = pd.read_excel('results/HQcl_representative_MAGs_all_info.xlsx')
LQcl = pd.read_excel('results/LQcl_representative_MAGs_all_info.xlsx')
# Combine HQcl and LQcl
HQcl_LQcl = pd.concat([HQcl, LQcl])
# reset index
HQcl_LQcl = HQcl_LQcl.reset_index(drop=True)
HQcl_LQcl.tail(2)


2257
1644
3901


Unnamed: 0,rep_MAG_name,pre_cluster,completeness,contamination,strain_heterogeneity,length,N50,bst_preCl_bin_name,cluster,domain,phylum,class,order,family,genus,sp,bin_number_in_cluster,bin_qual,rep_MAG_ID
3899,VZXIHP_vamb.46239.fa,,73.75,0.0,0.0,2515878,11062,,LQ-683_1,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,UBA4372,UBA4372 sp900543815,1,MQ,L1644_UBA4372_sp900543815.fa
3900,VZZW6V_vamb.255038.fa,,68.97,0.0,0.0,966784,22255,,LQ-25_3,Bacteria,Bacillota_A,Clostridia,TANB77,UBA1234,HGM13618,HGM13618 sp900753975,3,MQ,L0578_HGM13618_sp900753975.fa


In [22]:
## Step 1. Add info about cluster
cluster = pd.read_csv('input/HQLQcl_together_Cdb.csv')
# rename column genome
cluster = cluster.rename(columns={'genome': 'rep_MAG_ID'})
# keep only bin name and cluster columns
cluster = cluster[['rep_MAG_ID', 'secondary_cluster']]
cluster = cluster.rename(columns={'secondary_cluster': 'HQLQ_cluster'})
cluster.tail(2)

# merge HQcl_LQcl and cluster
mrg_st1 = pd.merge(HQcl_LQcl, cluster, on='rep_MAG_ID', how='outer')
mrg_st1.tail(2)

Unnamed: 0,rep_MAG_name,pre_cluster,completeness,contamination,strain_heterogeneity,length,N50,bst_preCl_bin_name,cluster,domain,phylum,class,order,family,genus,sp,bin_number_in_cluster,bin_qual,rep_MAG_ID,HQLQ_cluster
3899,VZXIHP_vamb.46239.fa,,73.75,0.0,0.0,2515878,11062,,LQ-683_1,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,UBA4372,UBA4372 sp900543815,1,MQ,L1644_UBA4372_sp900543815.fa,574_1
3900,VZZW6V_vamb.255038.fa,,68.97,0.0,0.0,966784,22255,,LQ-25_3,Bacteria,Bacillota_A,Clostridia,TANB77,UBA1234,HGM13618,HGM13618 sp900753975,3,MQ,L0578_HGM13618_sp900753975.fa,215_1


In [26]:
# Step 2. Read the list of representative genomes from HQLQ clustering
lst = pd.read_csv('input/HQLQcl_representative_genomes_list.csv')
lst.tail(2)

# define cluster for HQLQ_rep_MAGs
mrg_st2 = pd.merge(lst, mrg_st1, left_on='HQLQ_rep_MAG_ID', right_on='rep_MAG_ID', how='inner')
mrg_st2 = mrg_st2[['HQLQ_rep_MAG_ID', 'HQLQ_cluster']]
mrg_st2.tail(2)

# Step 3. merge mrg_st1 and mrg_st2
mrg_st3 = pd.merge(mrg_st1, mrg_st2, on='HQLQ_cluster', how='outer')
mrg_st3 = mrg_st3[['HQLQ_rep_MAG_ID','HQLQ_cluster','rep_MAG_ID', 'bin_qual','completeness', 'contamination',
       'strain_heterogeneity', 'length', 'N50', 'cluster', 'domain', 'phylum', 'class', 
       'order', 'family', 'genus', 'sp', 'bin_number_in_cluster','rep_MAG_name']]
mrg_st3.tail(2)

Unnamed: 0,HQLQ_rep_MAG_ID,HQLQ_cluster,rep_MAG_ID,bin_qual,completeness,contamination,strain_heterogeneity,length,N50,cluster,domain,phylum,class,order,family,genus,sp,bin_number_in_cluster,rep_MAG_name
3899,L1639_Oxalobacter_undS.fa,2063_1,L1639_Oxalobacter_undS.fa,LQ,73.38,17.45,7.14,2857230,1116,LQ-1022_2,Bacteria,Pseudomonadota,Gammaproteobacteria,Burkholderiales,Burkholderiaceae,Oxalobacter,undefined,1,VZR2LZ_maxbin.165.fa
3900,L1642_Vescimonas_undS.fa,1900_2,L1642_Vescimonas_undS.fa,LQ,66.77,25.86,41.18,1729642,49503,LQ-752_3,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Oscillospiraceae,Vescimonas,undefined,1,VZTMOW_maxbin.071.fa


In [27]:
# Step 4. Caclulate the number of genomes in each HQLQ_cluster
bn = mrg_st3[['rep_MAG_ID','HQLQ_cluster']].groupby('HQLQ_cluster').count()
# rename column
bn = bn.rename(columns={'rep_MAG_ID': 'rep_MAG_ID_in_HQLQcluster'})
# merge it with mrg_st3
mrg_st4 = pd.merge(mrg_st3, bn, on='HQLQ_cluster', how='outer')

mrg_st4.tail()


Unnamed: 0,HQLQ_rep_MAG_ID,HQLQ_cluster,rep_MAG_ID,bin_qual,completeness,contamination,strain_heterogeneity,length,N50,cluster,domain,phylum,class,order,family,genus,sp,bin_number_in_cluster,rep_MAG_name,rep_MAG_ID_in_HQLQcluster
3896,L1631_Haemophilus_D_parainfluenzae_K.fa,106_1,L1631_Haemophilus_D_parainfluenzae_K.fa,LQ,100.0,49.29,74.0,3007490,1894,LQ-457_3,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales_A,Pasteurellaceae,Haemophilus_D,Haemophilus_D parainfluenzae_K,1,VZGZLV_maxbin.027.fa,1
3897,L1632_HGM05190_sp900759815.fa,663_1,L1632_HGM05190_sp900759815.fa,MQ,57.05,0.0,0.0,1340836,8501,LQ-501_1,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Muribaculaceae,HGM05190,HGM05190 sp900759815,1,VZHO9I_vamb.250220.fa,1
3898,L1638_UMGS872_undS.fa,380_2,L1638_UMGS872_undS.fa,LQ,50.0,24.14,0.0,1283430,56641,LQ-185_4,Bacteria,Bacillota,Bacilli,RF39,UBA660,UMGS872,undefined,1,VZR2LZ_maxbin.071_sub.fa,1
3899,L1639_Oxalobacter_undS.fa,2063_1,L1639_Oxalobacter_undS.fa,LQ,73.38,17.45,7.14,2857230,1116,LQ-1022_2,Bacteria,Pseudomonadota,Gammaproteobacteria,Burkholderiales,Burkholderiaceae,Oxalobacter,undefined,1,VZR2LZ_maxbin.165.fa,1
3900,L1642_Vescimonas_undS.fa,1900_2,L1642_Vescimonas_undS.fa,LQ,66.77,25.86,41.18,1729642,49503,LQ-752_3,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Oscillospiraceae,Vescimonas,undefined,1,VZTMOW_maxbin.071.fa,1


Moove to representatives MAGs from cluster

In [31]:
## Step 5. Select only representative MAGs
mrg_st5 = pd.merge(lst, mrg_st4, left_on='HQLQ_rep_MAG_ID', right_on='rep_MAG_ID', how='inner')
# rename column genome
mrg_st5 = mrg_st5.rename(columns={'HQLQ_rep_MAG_ID_x': 'HQLQ_rep_MAG_ID'})
# drop column centrality
mrg_st5 = mrg_st5.drop(columns=['rep_MAG_ID','HQLQ_rep_MAG_ID_y'])
mrg_st5.tail()

Unnamed: 0,HQLQ_rep_MAG_ID,HQLQ_cluster,bin_qual,completeness,contamination,strain_heterogeneity,length,N50,cluster,domain,phylum,class,order,family,genus,sp,bin_number_in_cluster,rep_MAG_name,rep_MAG_ID_in_HQLQcluster
2723,L1631_Haemophilus_D_parainfluenzae_K.fa,106_1,LQ,100.0,49.29,74.0,3007490,1894,LQ-457_3,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales_A,Pasteurellaceae,Haemophilus_D,Haemophilus_D parainfluenzae_K,1,VZGZLV_maxbin.027.fa,1
2724,L1632_HGM05190_sp900759815.fa,663_1,MQ,57.05,0.0,0.0,1340836,8501,LQ-501_1,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Muribaculaceae,HGM05190,HGM05190 sp900759815,1,VZHO9I_vamb.250220.fa,1
2725,L1638_UMGS872_undS.fa,380_2,LQ,50.0,24.14,0.0,1283430,56641,LQ-185_4,Bacteria,Bacillota,Bacilli,RF39,UBA660,UMGS872,undefined,1,VZR2LZ_maxbin.071_sub.fa,1
2726,L1639_Oxalobacter_undS.fa,2063_1,LQ,73.38,17.45,7.14,2857230,1116,LQ-1022_2,Bacteria,Pseudomonadota,Gammaproteobacteria,Burkholderiales,Burkholderiaceae,Oxalobacter,undefined,1,VZR2LZ_maxbin.165.fa,1
2727,L1642_Vescimonas_undS.fa,1900_2,LQ,66.77,25.86,41.18,1729642,49503,LQ-752_3,Bacteria,Bacillota_A,Clostridia,Oscillospirales,Oscillospiraceae,Vescimonas,undefined,1,VZTMOW_maxbin.071.fa,1


## Save the files

In [34]:
# SAVE FINAL TABLE with all MAGs came from HQ an LQ clustering 
mrg_st4.to_excel('results/together_HQLQcl_all_MAGs_all_info.xlsx', index=False)

In [35]:
# SAVE FINAL TABLE with only HQLQ_rep_MAGs came from HQ an LQ clustering 
mrg_st5.to_excel('results/together_HQLQcl_representative_MAGs_all_info.xlsx', index=False)