# Calculating Excess Atom Fraction (EAF)
This script will calculate the EAF for each contig in the viral and cell enriched fractions.

# Import needed packages

In [26]:
import pandas as pd
from sklearn.preprocessing import normalize
import re
#from Bio import Entrez
import time
import glob

Functions:

In [2]:
# Function to search NCBI for taxa level
# # Set email for NCBI Entrez API
# Entrez.email = "your_email@example.com"
pd.set_option('display.precision', 5)
def get_taxonomic_level(taxid, retries=3):
    for attempt in range(retries):
        try:
            # Fetch the taxonomy record from NCBI Entrez
            handle = Entrez.efetch(db="taxonomy", id=str(taxid), retmode="xml")
            records = Entrez.read(handle)
            handle.close()
            
            # Extract the rank from the taxonomy record
            taxonomic_level = records[0].get("Rank", "no rank")
            return taxonomic_level
        except Exception as e:
            if attempt < retries - 1:
                time.sleep(1)  # Wait a bit before retrying
            else:
                return "error"

def modify_contig_taxa(row):
    prefix = prefixes.get(row['taxonomic_level'], '')
    return f"{prefix}{row['contig_taxa']}"


# Use coverage file from anvio to create a relative abundance dataframe. Save as a csv

In [3]:
# Define old and new names
oldnames = [
    "CLEAN_DAY7_DO_0_12C_CELL_ENRICHED_1_6", 
    "CLEAN_DAY7_DO_0_12C_CELL_ENRICHED_7",
    "CLEAN_DAY7_DO_0_12C_CELL_ENRICHED_8", 
    "CLEAN_DAY7_DO_0_12C_CELL_ENRICHED_9",
    "CLEAN_DAY7_DO_0_12C_CELL_ENRICHED_10_12",
    "CLEAN_DAY7_DO_0_13C_CELL_ENRICHED_1_5", 
    "CLEAN_DAY7_DO_0_13C_CELL_ENRICHED_6",
    "CLEAN_DAY7_DO_0_13C_CELL_ENRICHED_7", 
    "CLEAN_DAY7_DO_0_13C_CELL_ENRICHED_8",
    "CLEAN_DAY7_DO_0_13C_CELL_ENRICHED_9_12"
]

newnames = [
    "DAY7_DO_0_12C_CELL_ENRICHED_6", 
    "DAY7_DO_0_12C_CELL_ENRICHED_7",
    "DAY7_DO_0_12C_CELL_ENRICHED_8", 
    "DAY7_DO_0_12C_CELL_ENRICHED_9",
    "DAY7_DO_0_12C_CELL_ENRICHED_10",
    "DAY7_DO_0_13C_CELL_ENRICHED_5", 
    "DAY7_DO_0_13C_CELL_ENRICHED_6",
    "DAY7_DO_0_13C_CELL_ENRICHED_7", 
    "DAY7_DO_0_13C_CELL_ENRICHED_8",
    "DAY7_DO_0_13C_CELL_ENRICHED_9"
]


In [None]:
# Read the data
contig_cov = pd.read_csv("/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/anvio/merged_profile_db/drep_contigs_removed/drep_contigs_removed-COVs.txt", sep="\t")

In [None]:
# Rename columns
contig_cov.rename(columns=dict(zip(oldnames, newnames)), inplace=True)

In [None]:
# Convert to DataFrame with contig as row index
cell_relative_abundance = contig_cov.set_index("contig").transpose().reset_index()

In [None]:

# Normalize the data
normalized_data = normalize(cell_relative_abundance.drop(columns="index"), axis=1, norm='l1')
cell_relative_abundance = pd.DataFrame(normalized_data, columns=cell_relative_abundance.columns[1:])
cell_relative_abundance["samples"] = contig_cov.columns[1:]

In [None]:
# Reorder columns to have samples first
cell_relative_abundance = cell_relative_abundance[["samples"] + list(cell_relative_abundance.columns[:-1])]

In [None]:
# Save to CSV
cell_relative_abundance.to_csv("/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/analysis/drep_contigs_removed_rel_abun2.csv", index=False)
cell_relative_abundance

In [5]:
cell_relative_abundance_filter=cell_relative_abundance[cell_relative_abundance['samples'] != 'CLEAN_DAY0_DO_0_ENV_CELL_CONTROL_NONE']
cell_relative_abundance_filter

contig
day7_DO_0_12C_cell_enriched_000000049952    1.78839e-02
day7_DO_0_13C_cell_enriched_000001271829    1.51552e-02
day7_DO_0_12C_cell_enriched_000000013946    1.20845e-02
day7_DO_0_12C_cell_enriched_000001054855    1.19326e-02
day7_DO_0_13C_cell_enriched_000000556126    1.03738e-02
                                               ...     
day7_DO_0_12C_cell_enriched_000000749775    4.35894e-08
day7_DO_0_13C_cell_enriched_000000312182    3.93459e-08
day0_DO_0_env_cell_control_000001988746     2.19770e-08
day7_DO_0_13C_cell_enriched_000000339011    1.14433e-08
day7_DO_0_13C_cell_enriched_000000342924    0.00000e+00
Length: 606431, dtype: float64

In [6]:
# Calculate the sum of each column
column_sums = cell_relative_abundance_filter.sum()

# Identify columns with a sum of zero
columns_to_drop = column_sums[column_sums == 0].index

# Drop the columns
cell_relative_abundance_filter = cell_relative_abundance_filter.drop(columns=columns_to_drop)

contig
day7_DO_0_12C_cell_enriched_000000049952    1.78343e-02
day7_DO_0_13C_cell_enriched_000001271829    1.51552e-02
day7_DO_0_12C_cell_enriched_000000013946    1.20842e-02
day7_DO_0_12C_cell_enriched_000001054855    1.19323e-02
day7_DO_0_13C_cell_enriched_000000556126    1.03629e-02
                                               ...     
day0_DO_0_env_cell_control_000001418278     4.23800e-10
day0_DO_0_env_cell_control_000000143212     4.14162e-10
day0_DO_0_env_cell_control_000001017624     3.79464e-10
day0_DO_0_env_cell_control_000000540674     2.87552e-10
day0_DO_0_env_cell_control_000001780477     1.63170e-10
Length: 542706, dtype: float64

# Create density data frame

In [4]:


# Create the cell_density_df DataFrame
cell_density_df = pd.DataFrame({
    "fraction": ["6", "7", "8", "9", "10", "5", "6", "7", "8", "9"],
    "treatment": ["12C", "12C", "12C", "12C", "12C", "13C", "13C", "13C", "13C", "13C"],
    "density": [1.71328932, 1.70782552, 1.70236172, 1.69580516, 1.69034136,
                1.71547484, 1.70891828, 1.70345448, 1.69689792, 1.69143412],
    "qpcr_ratio": [0.038416781, 0.186560093, 0.987937393, 1, 0.113818084,
                   0.103393172, 1, 0.368097202, 0.224894639, 0.321954999],
    "filtrate_type": ["cell fraction"] * 10
})

In [5]:

# Remove rows where fraction is 12 and drop filtrate_type column
cell_density_df = cell_density_df[cell_density_df["fraction"] != "12"].drop(columns=["filtrate_type"])

# Save to CSV
cell_density_df.to_csv("/projects/luo_lab/Siders_data/results/tables/cell_density_table2.csv", index=False)


# Create a relative abundance table of the MAGs only

Create a data frame that has the lowest taxanomic level for each MAG

In [11]:
#!python3 /projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/Taxonomy/metadata/gtdb_to_ncbi_majority_vote.py --gtdbtk_output_dir /projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/Taxonomy/MAG/drep --bac120_metadata_file /projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/Taxonomy/metadata/bac120_metadata_r214.tar.gz --ar53_metadata_file /projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/Taxonomy/metadata/ar53_metadata_r214.tar.gz --output_file /projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/Taxonomy/MAG/drep/gtdbtk_to_ncbi_taxonomy.csv --gtdbtk_prefix gtdbtk

In [8]:
#Call in gtdbtk to ncbi prediction for cell enrichment MAGs. To make this I used this command:python3 gtdb_to_ncbi_majority_vote.py --gtdbtk_output_dir ../MAG --bac120_metadata_file bac120_metadata_r214.tar.gz --ar53_metadata_file ar53_metadata_r214.tar.gz --output_file ../MAG/gtdbtk_to_ncbi_taxonomy.csv --gtdbtk_prefix gtdbtk
mag_taxa = pd.read_csv("/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/Taxonomy/MAG/drep/gtdbtk_to_ncbi_taxonomy.csv", sep = '\t')
mag_taxa_sub=mag_taxa[["Genome ID", "Majority vote NCBI classification", "GTDB classification"]]
# Split the 'Majority vote NCBI classification' column into multiple columns
ncbi_cols = ['ncbi_domain', 'ncbi_phylum', 'ncbi_class', 'ncbi_order', 'ncbi_family', 'ncbi_genus', 'ncbi_species']
gtdb_cols = ['gtdb_domain', 'gtdb_phylum', 'gtdb_class', 'gtdb_order', 'gtdb_family', 'gtdb_genus', 'gtdb_species']

mag_taxa_sub[ncbi_cols] = mag_taxa['Majority vote NCBI classification'].str.split(';', expand=True)
mag_taxa_sub[gtdb_cols] = mag_taxa['GTDB classification'].str.split(';', expand=True)

# # Rename the columns
mag_taxa_sub = mag_taxa_sub.rename(columns={'Genome ID': 'MAG'})

# Define the replacement function
def replace_and_capitalize(match):
    if match.group(2):
        return f"{match.group(1).upper()}_{match.group(2)}"
    else:
        return pd.NA

for col in ncbi_cols:
    mag_taxa_sub[col] = mag_taxa_sub[col].str.replace(r"(.)__(.*)", replace_and_capitalize, regex=True)

for col in gtdb_cols:
    mag_taxa_sub[col] = mag_taxa_sub[col].str.replace(r"(.)__(.*)", replace_and_capitalize, regex=True)

# Replace empty strings with NaN
mag_taxa_sub = mag_taxa_sub.replace("", pd.NA)
# # Create 'taxonomy' column by coalescing across the classification columns
mag_taxa_sub['MAG_ncbi_taxa'] = mag_taxa_sub[['ncbi_species', 'ncbi_genus', 'ncbi_family', 'ncbi_order', 'ncbi_class', 'ncbi_phylum', 'ncbi_domain']].bfill(axis=1).iloc[:, 0]
mag_taxa_sub['MAG_gtdb_taxa'] = mag_taxa_sub[['gtdb_species', 'gtdb_genus', 'gtdb_family', 'gtdb_order', 'gtdb_class', 'gtdb_phylum', 'gtdb_domain']].bfill(axis=1).iloc[:, 0]

# # Select the desired columns and rename them
mag_taxa_sub = mag_taxa_sub[['MAG', 'MAG_ncbi_taxa', 'MAG_gtdb_taxa']]


# Define the new column 'MAG_taxa' using a lambda function
mag_taxa_sub['MAG_taxa_combined'] = mag_taxa_sub.apply(
    lambda row: row['MAG_gtdb_taxa'] if row['MAG_ncbi_taxa'] == row['MAG_gtdb_taxa'] 
    else f"{row['MAG_gtdb_taxa']} (NCBI: {row['MAG_ncbi_taxa']})", axis=1
)

# Create mask for rows starting with 'Candidatus '
mask = mag_taxa_sub['MAG_ncbi_taxa'].str.lower().str.startswith('candidatus ')

# For rows starting with 'Candidatus ', keep the first two words
mag_taxa_sub.loc[mask, 'MAG_ncbi_taxa'] = mag_taxa_sub.loc[mask, 'MAG_ncbi_taxa'].str.split(' ').str[:2].str.join(' ')

# For remaining rows, keep only the first word
mag_taxa_sub.loc[~mask, 'MAG_ncbi_taxa'] = mag_taxa_sub.loc[~mask, 'MAG_ncbi_taxa'].str.split(' ').str[0]

mag_taxa_sub['MAG'] = mag_taxa_sub['MAG'].str.replace('.', '_')
mag_taxa_sub

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mag_taxa_sub[ncbi_cols] = mag_taxa['Majority vote NCBI classification'].str.split(';', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mag_taxa_sub[ncbi_cols] = mag_taxa['Majority vote NCBI classification'].str.split(';', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mag_t

Unnamed: 0,MAG,MAG_ncbi_taxa,MAG_gtdb_taxa,MAG_taxa_combined
0,day7-DO-0-12C-cell-enriched_bin_13,D_Archaea,F_GW2011-AR1,F_GW2011-AR1 (NCBI: D_Archaea)
1,day7-DO-0-12C-cell-enriched_bin_43,P_Candidatus,F_GW2011-AR18,F_GW2011-AR18 (NCBI: P_Candidatus Woesearchaeota)
2,day7-DO-0-12C-cell-enriched_bin_45,D_Archaea,G_JAGVXH01,G_JAGVXH01 (NCBI: D_Archaea)
3,day7-DO-0-13C-cell-enriched_bin_66,P_Candidatus,G_JAHIYG01,G_JAHIYG01 (NCBI: P_Candidatus Altiarchaeota)
4,day0-DO-0-env-cell-control_bin_20,C_Gammaproteobacteria,F_GCF-002020875,F_GCF-002020875 (NCBI: C_Gammaproteobacteria)
...,...,...,...,...
153,day7-DO-0-12C-cell-enriched_bin_51,P_Lentisphaerae,F_JAIOPI01,F_JAIOPI01 (NCBI: P_Lentisphaerae)
154,day7-DO-0-13C-cell-enriched_bin_2,P_Verrucomicrobia,G_JABGOW01,G_JABGOW01 (NCBI: P_Verrucomicrobia)
155,day7-DO-0-13C-cell-enriched_bin_26,C_Phycisphaerae,G_JAIPFM01,G_JAIPFM01 (NCBI: C_Phycisphaerae)
156,day7-DO-0-13C-cell-enriched_bin_4,O_Pirellulales,G_M30B53,G_M30B53 (NCBI: O_Pirellulales)


Create the normalized abundance data frame for shift plot

In [9]:
mag_contigs = pd.read_csv("/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/MAGs/drep/extra_files/binning_results.txt", 
                          header=None,sep='\t', names=['organism', 'MAG'])
# Select the columns needed for the calcuations:
taxa=mag_taxa_sub[['MAG','MAG_taxa_combined']]
# Transform the data
MAG_relative_abundance = (
    cell_relative_abundance_filter
    .melt(id_vars="samples", var_name="organism", value_name="relative_abun")
    .merge(mag_contigs, on="organism")
    .drop(columns=["organism"])
    .groupby(["samples", "MAG"], as_index=False)["relative_abun"]
    .sum()
)

# Add treatment and day columns
MAG_relative_abundance["treatment"] = MAG_relative_abundance["samples"].apply(lambda x: "12C" if "12C" in x else "13C")
MAG_relative_abundance["day"] = MAG_relative_abundance["samples"].apply(lambda x: "DAY7" if "DAY7" in x else "Other")

# # Extract sample and fraction
MAG_relative_abundance[["sample", "fraction"]] = MAG_relative_abundance["samples"].str.extract(r"(.*)_([^_]+)$")

#Merge with taxa and density_values
MAG_relative_abundance = (
     MAG_relative_abundance
     .merge(taxa, how="left")
     .merge(cell_density_df, how="left")
 )

# Normalize relative abundance with qPCR ratio
qpcr_sum = (MAG_relative_abundance["relative_abun"] * MAG_relative_abundance["qpcr_ratio"]).sum()
MAG_relative_abundance["rel_abun_qpcr_norm"] = MAG_relative_abundance["relative_abun"] * MAG_relative_abundance["qpcr_ratio"] / qpcr_sum
MAG_relative_abundance = MAG_relative_abundance[MAG_relative_abundance['MAG_taxa_combined'].notna() & (MAG_relative_abundance['MAG_taxa_combined'] != '')]
MAG_relative_abundance.to_csv("/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/analysis/drep_contigs_removed_MAG_rel_abun_to_contigs2.csv", index=False)
MAG_relative_abundance

Unnamed: 0,samples,MAG,relative_abun,treatment,day,sample,fraction,MAG_taxa_combined,density,qpcr_ratio,rel_abun_qpcr_norm
0,DAY7_DO_0_12C_CELL_ENRICHED_10,day0-DO-0-env-cell-control_bin_1,1.12134e-06,12C,DAY7,DAY7_DO_0_12C_CELL_ENRICHED,10,G_QWPN01 (NCBI: G_Candidatus Kuenenia),1.69034,0.11382,1.22320e-07
1,DAY7_DO_0_12C_CELL_ENRICHED_10,day0-DO-0-env-cell-control_bin_100,1.36469e-07,12C,DAY7,DAY7_DO_0_12C_CELL_ENRICHED,10,O_JAIPIQ01 (NCBI: P_Candidatus Marinimicrobia),1.69034,0.11382,1.48865e-08
2,DAY7_DO_0_12C_CELL_ENRICHED_10,day0-DO-0-env-cell-control_bin_101,6.63199e-06,12C,DAY7,DAY7_DO_0_12C_CELL_ENRICHED,10,G_Synechococcus_C (NCBI: G_Synechococcus),1.69034,0.11382,7.23438e-07
3,DAY7_DO_0_12C_CELL_ENRICHED_10,day0-DO-0-env-cell-control_bin_102,1.61140e-07,12C,DAY7,DAY7_DO_0_12C_CELL_ENRICHED,10,C_Desulfarculia (NCBI: C_Deltaproteobacteria),1.69034,0.11382,1.75776e-08
4,DAY7_DO_0_12C_CELL_ENRICHED_10,day0-DO-0-env-cell-control_bin_103,7.91856e-07,12C,DAY7,DAY7_DO_0_12C_CELL_ENRICHED,10,O_Kiritimatiellales,1.69034,0.11382,8.63781e-08
...,...,...,...,...,...,...,...,...,...,...,...
1575,DAY7_DO_0_13C_CELL_ENRICHED_9,day7-DO-0-13C-cell-enriched_bin_57,6.43244e-03,13C,DAY7,DAY7_DO_0_13C_CELL_ENRICHED,9,G_4572-104 (NCBI: P_Tenericutes),1.69143,0.32195,1.98480e-03
1576,DAY7_DO_0_13C_CELL_ENRICHED_9,day7-DO-0-13C-cell-enriched_bin_6,6.19092e-06,13C,DAY7,DAY7_DO_0_13C_CELL_ENRICHED,9,G_UBA6154 (NCBI: O_Candidatus Nanopelagicales),1.69143,0.32195,1.91028e-06
1577,DAY7_DO_0_13C_CELL_ENRICHED_9,day7-DO-0-13C-cell-enriched_bin_61,2.82757e-05,13C,DAY7,DAY7_DO_0_13C_CELL_ENRICHED,9,O_LZORAL124-64-63 (NCBI: D_Bacteria),1.69143,0.32195,8.72479e-06
1578,DAY7_DO_0_13C_CELL_ENRICHED_9,day7-DO-0-13C-cell-enriched_bin_66,3.63616e-05,13C,DAY7,DAY7_DO_0_13C_CELL_ENRICHED,9,G_JAHIYG01 (NCBI: P_Candidatus Altiarchaeota),1.69143,0.32195,1.12198e-05


In [18]:
mag_contigs

Unnamed: 0,organism,MAG
0,day0_DO_0_env_cell_control_000000005806,day0-DO-0-env-cell-control_bin_100
1,day0_DO_0_env_cell_control_000000012483,day0-DO-0-env-cell-control_bin_100
2,day0_DO_0_env_cell_control_000000013412,day0-DO-0-env-cell-control_bin_100
3,day0_DO_0_env_cell_control_000000019303,day0-DO-0-env-cell-control_bin_100
4,day0_DO_0_env_cell_control_000000021219,day0-DO-0-env-cell-control_bin_100
...,...,...
68720,day7_DO_0_13C_cell_enriched_000001362116,day7-DO-0-13C-cell-enriched_bin_7
68721,day7_DO_0_13C_cell_enriched_000001363719,day7-DO-0-13C-cell-enriched_bin_7
68722,day7_DO_0_13C_cell_enriched_000001364281,day7-DO-0-13C-cell-enriched_bin_7
68723,day7_DO_0_13C_cell_enriched_000001365278,day7-DO-0-13C-cell-enriched_bin_7


# Create a relative abundance table of all contigs in cell enrichment only
NOTE: Need to rerun Kraken.
-Running now, Job ID 9309147

In [3]:
cell_kraken2_taxa=pd.read_csv("/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/kraken2_out/drep/kraken2_output.txt", sep="\t", header=None, 
                 names=["Classified", "organism","contig_taxa","length","kmer_mapping"])
cell_kraken2_taxa

Unnamed: 0,Classified,organism,contig_taxa,length,kmer_mapping
0,U,day0_DO_0_env_cell_control_000000000017,unclassified (taxid 0),1158,0:96 2895946:3 0:1025
1,U,day0_DO_0_env_cell_control_000000000047,unclassified (taxid 0),1363,0:1329
2,U,day0_DO_0_env_cell_control_000000000072,unclassified (taxid 0),1156,0:1122
3,U,day0_DO_0_env_cell_control_000000000074,unclassified (taxid 0),1434,0:1178 131567:5 0:217
4,U,day0_DO_0_env_cell_control_000000000077,unclassified (taxid 0),1297,0:654 9606:5 0:245 9606:3 0:356
...,...,...,...,...,...
639429,U,day7_DO_0_12C_cell_enriched_000001055312,unclassified (taxid 0),3127,0:392 1397:5 0:2696
639430,U,day7_DO_0_12C_cell_enriched_000001055321,unclassified (taxid 0),2133,0:115 1605838:5 0:787 679318:5 0:558 2747817:1...
639431,U,day7_DO_0_12C_cell_enriched_000001055326,unclassified (taxid 0),3791,0:638 1643:2 0:1872 1224145:7 0:111 455364:1 0...
639432,U,day7_DO_0_12C_cell_enriched_000001055341,unclassified (taxid 0),1301,0:235 2951988:1 0:300 3035472:2 0:729


In [11]:
#Call in kraken2 taxonomic predictions for cell enrichment contigs
cell_kraken2_taxa=pd.read_csv("/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/kraken2_out/drep/kraken2_output.txt", sep="\t", header=None, 
                 names=["Classified", "organism","contig_taxa","length","kmer_mapping"])
cell_kraken2_taxa = cell_kraken2_taxa[cell_kraken2_taxa["Classified"] == "C"]
# Extract the taxid from Column3 using regex
cell_kraken2_taxa['taxid'] = cell_kraken2_taxa['contig_taxa'].apply(lambda x: int(re.search(r'taxid (\d+)', x).group(1)))
cell_kraken2_taxa['contig_taxa'] = cell_kraken2_taxa['contig_taxa'].str.replace(r'\s*\(taxid \d+\)', '', regex=True)
# Get the unique taxids
unique_taxids = cell_kraken2_taxa['taxid'].unique()

# Create a dictionary to map taxid to taxonomic level
taxid_to_level = {taxid: get_taxonomic_level(taxid) for taxid in unique_taxids}

# Map the taxonomic levels back to the original dataframe
cell_kraken2_taxa['taxonomic_level'] = cell_kraken2_taxa['taxid'].map(taxid_to_level)

#Select the desired columns:
cell_kraken2_taxa_sub = cell_kraken2_taxa[["Classified", "organism", "contig_taxa", 'taxonomic_level']]

# Define a dictionary for prefixes based on taxonomic levels
prefixes = {
    'species': 'S_',
    'genus': 'G_',
    'superkingdom': 'SK_',
    'strain': 'ST_',
    'phylum': 'P_',
    'family': 'F_',
    'no rank': 'NR_',
    'order': 'O_',
    'class': 'C_',
    'clade': 'CL_'
}

cell_kraken2_taxa_sub['contig_taxa'] = cell_kraken2_taxa_sub.apply(modify_contig_taxa, axis=1)
cell_kraken2_taxa_sub=cell_kraken2_taxa_sub[['organism','contig_taxa']]
cell_kraken2_taxa_sub.to_csv('/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/analysis/kraken2_contig_taxa2.csv', index=False) 
cell_kraken2_taxa_sub


            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cell_kraken2_taxa_sub['contig_taxa'] = cell_kraken2_taxa_sub.apply(modify_contig_taxa, axis=1)


Unnamed: 0,organism,contig_taxa
272,day0_DO_0_env_cell_control_000000001653,S_Homo sapiens
656,day0_DO_0_env_cell_control_000000003819,NR_unclassified Synechococcus
920,day0_DO_0_env_cell_control_000000005199,O_Synechococcales
956,day0_DO_0_env_cell_control_000000005304,NR_unclassified Synechococcus
1065,day0_DO_0_env_cell_control_000000005867,S_Synechococcus sp. WH 8101
...,...,...
639272,day7_DO_0_12C_cell_enriched_000001053746,S_Synechococcus sp. HK01-R
639342,day7_DO_0_12C_cell_enriched_000001054405,G_Cutibacterium
639354,day7_DO_0_12C_cell_enriched_000001054490,S_Cutibacterium acnes
639387,day7_DO_0_12C_cell_enriched_000001054855,C_Gammaproteobacteria


# Functions for EAF calculations

These are the functions needed to calculated the Atom Fraction Excess (AFE): Formulas found in Hungate et al., 2015 "Quantitative microbial ecology through stable isotope probing"

In [7]:
def LongFormat(df, index, otu_col, value_col):
    return (df
     .set_index(index)
     .rename_axis([otu_col], axis=1).stack()
     .reset_index()
     .rename(columns={ 0: value_col}))

def reformate(df, index, otu_col, value_col):
    long = LongFormat(df, index, otu_col, value_col)
    long['treatment'] = long['samples'].apply(lambda x: "12C" if "12C" in x else "13C")
    long['day'] = long['samples'].apply(lambda x: "DAY7" if "DAY7" in x else "Other")
    long['fraction'] = long['samples'].str.extract(r'(\d+-\d+|\d+)$', expand=False)
    
    # Step 1: Create a mask indicating rows with count == 0
    zero_count_mask = long[value_col] == 0
    
    # Step 2: Identify 'samples' and 'organism' groups with any zero count
    groups_with_zero = long[zero_count_mask][['samples', 'organism']].drop_duplicates()
    
    # Step 3: Filter out these groups from the original DataFrame
    groups_with_zero['to_drop'] = True
    long = long.merge(groups_with_zero, on=['samples', 'organism'], how='left')
    
    long_filtered = long[long['to_drop'].isna()].drop(columns=['to_drop'])

    return long_filtered

def joindensitydf(df, densitydf, index, otu_col, value_col, common_column1,common_column2):
    reform = reformate(df, index, otu_col, value_col)
    reform2 = pd.merge(reform, densitydf, on=[common_column1,common_column2], how='left')

    rel_abun_qpcr_norm = reform2.groupby(['day', common_column1, 'organism']).filter(lambda x: len(x) > 2)

    rel_abun_qpcr_norm['count_qpcr'] = rel_abun_qpcr_norm['count'] * rel_abun_qpcr_norm['qpcr_ratio']

    # Group by 'day', 'treatment', 'organism' and calculate the sum for normalization
    grouped_sum = rel_abun_qpcr_norm.groupby(['day', common_column1, 'organism'])['count_qpcr'].transform('sum')

    # Calculate the normalized count
    rel_abun_qpcr_norm['count_qpcr_norm'] = rel_abun_qpcr_norm['count_qpcr'] / grouped_sum
    return rel_abun_qpcr_norm[['organism',common_column1,'density','count_qpcr_norm']]


def org_dna_denisity(df, densitydf, index, otu_col, value_col, common_column1,common_column2):
    joined_df=joindensitydf(df, densitydf, index, otu_col, value_col, common_column1,common_column2)
    # Group by contig and treatment, and calculate the sum of the product of rel_abun_qpcr_norm and density
    joined_df['weighted'] = joined_df['count_qpcr_norm'] * joined_df['density']
    grouped = joined_df.groupby(['organism', 'treatment']).apply(lambda x: (x['weighted']).sum())
    grouped = grouped.reset_index().rename(columns={ 0: 'weighted'})
    pivot_wider_df = grouped.pivot(index='organism', columns='treatment', values='weighted').reset_index()
    return pivot_wider_df

def AtomFractionExcess(df):
    #Shift
    df['shift'] = df['13C'] - df['12C']
    #Natural abundance molecular weight of each taxon: unlabeled
    df['M_light'] = 0.496 * df['GC'] + 307.691
    #Theoretical maximum molecular weight of fully labeled DNA
    df['M_HeavyMax'] = -0.4987282*df['GC'] + 9.974564 + df['M_light']
    #Molecular weight of DNA of taxon in labeled treatment
    df['M_Lab'] = (df['shift']/df['12C'] + 1) * df['M_light']
    df['EAF'] = (df['M_Lab'] - df['M_light']) / (df['M_HeavyMax'] - df['M_light']) * (1 - 0.01111233)
    return df

# Part 1: Calculate the EAF of contigs in the cell enriched fraction 

Step 1: Call in required data frames for EAF calculation of the cell enriched fraction


In [20]:
# Call in data frame of the gc content producted by seqkit: seqkit fx2tab --name --only-id --gc combined_cell_contigs_clean_headers.fa > combined_cell_contigs_gc_results.txt
GC_content=pd.read_csv("/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/processed/Assemblies/combined_cell_contigs_gc_results.txt", sep='\t',header=None, names=['organism','GC'])
GC_content['GC']=GC_content['GC']/100
GC_content

Unnamed: 0,organism,GC
0,day0_DO_0_env_cell_control_000000000017,0.5432
1,day0_DO_0_env_cell_control_000000000047,0.5106
2,day0_DO_0_env_cell_control_000000000072,0.5190
3,day0_DO_0_env_cell_control_000000000074,0.6227
4,day0_DO_0_env_cell_control_000000000077,0.2668
...,...,...
639429,day7_DO_0_13C_cell_enriched_000001368193,0.3937
639430,day7_DO_0_13C_cell_enriched_000001368197,0.6331
639431,day7_DO_0_13C_cell_enriched_000001368198,0.4803
639432,day7_DO_0_13C_cell_enriched_000001368200,0.2448


Step 2: Use the 'org_dna_density' function to calculate the mean DNA density of each contig

In [24]:
cell_relative_abundance_long=org_dna_denisity(cell_relative_abundance_filter, cell_density_df, 'samples','organism','count', 'treatment','fraction')
cell_relative_abundance_long

cell_relative_abundance_long = cell_relative_abundance_long[cell_relative_abundance_long['12C'].notna()]
cell_relative_abundance_long = cell_relative_abundance_long[cell_relative_abundance_long['13C'].notna()]

  grouped = joined_df.groupby(['organism', 'treatment']).apply(lambda x: (x['weighted']).sum())


In [25]:
cell_relative_abundance_long=pd.merge(cell_relative_abundance_long, GC_content)
cell_relative_abundance_long
test_sorted = cell_relative_abundance_long.sort_values(by='12C')
test_sorted

Unnamed: 0,organism,12C,13C,GC
31102,day7_DO_0_13C_cell_enriched_000000145017,1.69054,1.69588,0.4213
3558,day0_DO_0_env_cell_control_000000411487,1.69067,1.69183,0.3117
30345,day7_DO_0_13C_cell_enriched_000000091461,1.69088,1.70186,0.5081
9972,day0_DO_0_env_cell_control_000001473230,1.69091,1.69148,0.3530
11715,day0_DO_0_env_cell_control_000001898665,1.69098,1.69325,0.3171
...,...,...,...,...
14503,day7_DO_0_12C_cell_enriched_000000097819,1.71302,1.71016,0.6273
2947,day0_DO_0_env_cell_control_000000329791,1.71304,1.71045,0.6641
39008,day7_DO_0_13C_cell_enriched_000000814428,1.71309,1.70883,0.6202
15332,day7_DO_0_12C_cell_enriched_000000136616,1.71313,1.71300,0.6588


Step 3: Use the 'AtomFractionExcess' function to calculate the EAF of each contig and export to csv file to be used in R script for figure construction.

In [28]:
cell_atomic_fraction = pd.merge(mag_contigs,AtomFractionExcess(cell_relative_abundance_long), on=['organism'], how='outer')
#cell_atomic_fraction = cell_atomic_fraction[cell_atomic_fraction['12C'].notna()]
#cell_atomic_fraction = cell_atomic_fraction[cell_atomic_fraction['13C'].notna()]
# Replace empty strings in 'MAG' with "unbinned"
cell_atomic_fraction['MAG'] = cell_atomic_fraction['MAG'].fillna("unbinned")
cell_atomic_fraction
cell_atomic_fraction = cell_atomic_fraction[cell_atomic_fraction['EAF'].notna()]
cell_atomic_fraction.to_csv('/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/analysis/drep_cell_enrichment_atomic_fraction2.csv', index=False) 
# mag_atomic_fraction.to_csv('../../R/output_files/mag_atomic_fraction_E.csv', index=False) 
test_sorted = cell_atomic_fraction.sort_values(by='12C')
test_sorted

Unnamed: 0,organism,MAG,12C,13C,GC,shift,M_light,M_HeavyMax,M_Lab,EAF
82871,day7_DO_0_13C_cell_enriched_000000145017,unbinned,1.69054,1.69588,0.4213,0.00533,307.89996,317.66441,308.87104,0.09835
12752,day0_DO_0_env_cell_control_000000411487,unbinned,1.69067,1.69183,0.3117,0.00117,307.84560,317.66471,308.05811,0.02140
81618,day7_DO_0_13C_cell_enriched_000000091461,unbinned,1.69088,1.70186,0.5081,0.01097,307.94302,317.66418,309.94147,0.20329
41415,day0_DO_0_env_cell_control_000001473230,unbinned,1.69091,1.69148,0.3530,0.00057,307.86609,317.66460,307.96992,0.01048
51472,day0_DO_0_env_cell_control_000001898665,unbinned,1.69098,1.69325,0.3171,0.00227,307.84828,317.66470,308.26150,0.04163
...,...,...,...,...,...,...,...,...,...,...
58503,day7_DO_0_12C_cell_enriched_000000097819,unbinned,1.71302,1.71016,0.6273,-0.00286,308.00214,317.66385,307.48795,-0.05263
10286,day0_DO_0_env_cell_control_000000329791,day0-DO-0-env-cell-control_bin_80,1.71304,1.71045,0.6641,-0.00258,308.02039,317.66375,307.55567,-0.04766
96361,day7_DO_0_13C_cell_enriched_000000814428,unbinned,1.71309,1.70883,0.6202,-0.00427,307.99862,317.66387,307.23140,-0.07850
59640,day7_DO_0_12C_cell_enriched_000000136616,unbinned,1.71313,1.71300,0.6588,-0.00013,308.01776,317.66377,307.99394,-0.00244


In [29]:
#Merg cell_enriched_eaf,  mag_taxa_sub, and cell_kraken2_taxa_sub
# Perform the full join (equivalent to a merge in pandas)
cell_atomic_fraction2 = pd.merge(cell_atomic_fraction, mag_taxa_sub[["MAG","MAG_taxa_combined"]], how='outer')
cell_atomic_fraction2 = pd.merge(cell_atomic_fraction2, cell_kraken2_taxa_sub, how='outer')

# Apply the mutate logic using pandas' .apply and .loc
cell_atomic_fraction2['taxa'] = cell_atomic_fraction2.apply(
    lambda row: row['MAG_taxa_combined'] if pd.notna(row['MAG_taxa_combined']) else row['contig_taxa'], axis=1)
cell_atomic_fraction2['taxa'] = cell_atomic_fraction2['taxa'].replace("cellular organisms", pd.NA)
cell_atomic_fraction2['taxa'] = cell_atomic_fraction2['taxa'].apply(
    lambda x: x.replace("Unclassified ", "") if pd.notna(x) and "Unclassified" in x else x)
cell_atomic_fraction2['taxa'] = cell_atomic_fraction2['taxa'].apply(
    lambda x: x.replace("unclassified ", "") if pd.notna(x) and "unclassified" in x else x)
cell_atomic_fraction2['taxa'] = cell_atomic_fraction2['taxa'].str.replace(r'.*/\s*(.*$)', r'\1', regex=True)
cell_atomic_fraction2['MAG'] = cell_atomic_fraction2['MAG'].replace("", "unbinned")
# # Filter out rows where 'taxa' is NaN
cell_atomic_fraction2 = cell_atomic_fraction2.dropna(subset=['taxa'])
cell_atomic_fraction2 = cell_atomic_fraction2.dropna(subset=['organism'])
# #save the resulting DataFrame
cell_atomic_fraction2.to_csv('/projects/luo_lab/Rogers_SidersViralAnalysis_XXXX_20XX/data/analysis/drep_cell_enriched_contig_mag_taxa2.csv', index=False) 
cell_atomic_fraction2


Unnamed: 0,organism,MAG,12C,13C,GC,shift,M_light,M_HeavyMax,M_Lab,EAF,MAG_taxa_combined,contig_taxa,taxa
5,day0_DO_0_env_cell_control_000000000775,day0-DO-0-env-cell-control_bin_57,1.70689,1.70816,0.5850,0.00127,307.98116,317.66397,308.21052,0.02342,F_Alkalispirochaetaceae (NCBI: F_Spirochaetaceae),,F_Alkalispirochaetaceae (NCBI: F_Spirochaetaceae)
6,day0_DO_0_env_cell_control_000000000851,day0-DO-0-env-cell-control_bin_22,1.70732,1.70851,0.6089,0.00119,307.99301,317.66390,308.20775,0.02196,G_JADHUC01 (NCBI: F_Pirellulaceae),,G_JADHUC01 (NCBI: F_Pirellulaceae)
14,day0_DO_0_env_cell_control_000000001653,,,,,,,,,,,S_Homo sapiens,S_Homo sapiens
18,day0_DO_0_env_cell_control_000000002581,day0-DO-0-env-cell-control_bin_57,1.70338,1.70487,0.5243,0.00150,307.95105,317.66413,308.22182,0.02757,F_Alkalispirochaetaceae (NCBI: F_Spirochaetaceae),,F_Alkalispirochaetaceae (NCBI: F_Spirochaetaceae)
28,day0_DO_0_env_cell_control_000000003819,,,,,,,,,,,NR_unclassified Synechococcus,NR_Synechococcus
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49164,day7_DO_0_13C_cell_enriched_000001364281,day7-DO-0-13C-cell-enriched_bin_7,1.69983,1.69739,0.4474,-0.00244,307.91291,317.66434,307.47173,-0.04474,G_JAFGTC01 (NCBI: P_Candidatus Omnitrophica),,G_JAFGTC01 (NCBI: P_Candidatus Omnitrophica)
49174,day7_DO_0_13C_cell_enriched_000001365799,day7-DO-0-13C-cell-enriched_bin_7,1.69865,1.69772,0.4451,-0.00093,307.91177,317.66435,307.74398,-0.01701,G_JAFGTC01 (NCBI: P_Candidatus Omnitrophica),,G_JAFGTC01 (NCBI: P_Candidatus Omnitrophica)
49175,day7_DO_0_13C_cell_enriched_000001365842,,,,,,,,,,,F_Chlorobiaceae,F_Chlorobiaceae
49177,day7_DO_0_13C_cell_enriched_000001366290,day7-DO-0-13C-cell-enriched_bin_34,1.69629,1.69635,0.3999,0.00006,307.88935,317.66447,307.90047,0.00113,G_SDB (NCBI: G_Candidatus Cloacimonas),,G_SDB (NCBI: G_Candidatus Cloacimonas)


# Part 2: Calculate the EAF of the vOTUs present in the cell enriched fraction 

Call in the quality calculations from checkV

In [42]:
# List of parent directories
parent_dirs = [
    "day0-DO-0-env-virus-control",
    "day7-DO-0-12C-virus-enriched",
    "day7-DO-0-13C-virus-enriched"
]

# Use glob to find all quality_summary.tsv files in the checkv subdirectories
vquality_filepaths = [glob.glob(f"/projects/luo_lab/Siders_data/data/processed/Assemblies/{d}/checkv/quality_summary.tsv")[0] for d in parent_dirs]

# Read and merge files in one step, keeping headers only from the first file
merged_vquality = pd.concat([pd.read_csv(f, sep='\t') for f in vquality_filepaths], ignore_index=True)
merged_vquality['contig_id'] = merged_vquality['contig_id'].str.split('\|\|').str[0]
merged_vquality

  merged_vquality['contig_id'] = merged_vquality['contig_id'].str.split('\|\|').str[0]


Unnamed: 0,contig_id,contig_length,provirus,proviral_length,gene_count,viral_genes,host_genes,checkv_quality,miuvig_quality,completeness,completeness_method,contamination,kmer_freq,warnings
0,k141_1850343,17890,No,,20,6,0,Low-quality,Genome-fragment,25.40,HMM-based (lower-bound),0.0,1.00,
1,k141_2091625,13316,No,,14,2,0,Low-quality,Genome-fragment,20.92,HMM-based (lower-bound),0.0,1.00,
2,k141_364456,7120,No,,10,0,0,Not-determined,Genome-fragment,,,0.0,1.00,no viral genes detected
3,k141_1639972,9743,No,,12,3,1,Low-quality,Genome-fragment,16.66,AAI-based (medium-confidence),0.0,1.00,
4,k141_364466,5285,No,,8,0,0,Not-determined,Genome-fragment,,,0.0,1.05,no viral genes detected; low-confidence DTR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49686,k141_2093948,5541,No,,5,0,0,Not-determined,Genome-fragment,,,0.0,1.03,no viral genes detected; low-confidence DTR
49687,k141_2093968,7605,No,,11,0,0,Not-determined,Genome-fragment,,,0.0,1.02,no viral genes detected; low-confidence DTR
49688,k141_2093979,7626,No,,14,0,0,Not-determined,Genome-fragment,,,0.0,1.02,no viral genes detected; low-confidence DTR
49689,k141_2094046,9098,No,,11,0,1,Medium-quality,Genome-fragment,64.99,AAI-based (medium-confidence),0.0,1.01,no viral genes detected; low-confidence DTR


Call in file with the original contig names.

In [48]:
# List of parent directories
# Use glob to find all quality_summary.tsv files in the checkv subdirectories
vnames_filepaths = [glob.glob(f"/projects/luo_lab/Siders_data/data/processed/Assemblies/{d}/anvio/clean_fasta_file/reformat-report.txt")[0] for d in parent_dirs]
# Read and merge files in one step, keeping headers only from the first file
# Define the column names
vnames_columns = ['organism', 'contig_id']

# Read and merge files, assigning column names
vnames_merged = pd.concat([pd.read_csv(f, sep='\t', header=None, names=vnames_columns) for f in vnames_filepaths], ignore_index=True)
vnames_merged['contig_id'] = vnames_merged['contig_id'].str.split('__').str[0]

vnames_merged

Unnamed: 0,organism,contig_id
0,day0_DO_0_env_virus_control_000000000001,k141_729115
1,day0_DO_0_env_virus_control_000000000002,k141_729147
2,day0_DO_0_env_virus_control_000000000003,k141_912059
3,day0_DO_0_env_virus_control_000000000004,k141_365303
4,day0_DO_0_env_virus_control_000000000005,k141_1823202
...,...,...
48920,day7_DO_0_13C_virus_enriched_000000016814,k141_1560980
48921,day7_DO_0_13C_virus_enriched_000000016815,k141_2087707
48922,day7_DO_0_13C_virus_enriched_000000016816,k141_1393650
48923,day7_DO_0_13C_virus_enriched_000000016817,k141_696715


In [49]:
# Perform the inner join on 'contig_id'
vcontig_quality = pd.merge(vnames_merged, merged_vquality, on='contig_id', how='inner')

# Drop the 'contig_id' column
vcontig_quality = vcontig_quality.drop(columns=['contig_id'])
vcontig_quality

Unnamed: 0,organism,contig_length,provirus,proviral_length,gene_count,viral_genes,host_genes,checkv_quality,miuvig_quality,completeness,completeness_method,contamination,kmer_freq,warnings
0,day0_DO_0_env_virus_control_000000000001,16367,Yes,15000.0,17,5,2,Low-quality,Genome-fragment,22.19,AAI-based (medium-confidence),8.35,1.00,
1,day0_DO_0_env_virus_control_000000000002,10478,Yes,6828.0,11,3,3,Low-quality,Genome-fragment,9.01,HMM-based (lower-bound),34.83,1.00,
2,day0_DO_0_env_virus_control_000000000003,10561,Yes,7365.0,17,3,2,Low-quality,Genome-fragment,6.73,AAI-based (medium-confidence),30.26,1.00,
3,day0_DO_0_env_virus_control_000000000003,6803,No,,14,4,1,Low-quality,Genome-fragment,8.42,HMM-based (lower-bound),0.00,1.00,
4,day0_DO_0_env_virus_control_000000000004,28154,Yes,24691.0,47,10,2,Low-quality,Genome-fragment,30.67,HMM-based (lower-bound),12.30,1.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49583,day7_DO_0_13C_virus_enriched_000000016814,5549,No,,1,1,0,Low-quality,Genome-fragment,1.45,HMM-based (lower-bound),0.00,1.03,
49584,day7_DO_0_13C_virus_enriched_000000016815,5003,No,,2,0,0,Low-quality,Genome-fragment,8.61,AAI-based (high-confidence),0.00,1.00,no viral genes detected
49585,day7_DO_0_13C_virus_enriched_000000016816,10078,No,,1,1,0,Low-quality,Genome-fragment,4.27,AAI-based (high-confidence),0.00,1.00,
49586,day7_DO_0_13C_virus_enriched_000000016817,5125,No,,1,1,0,Low-quality,Genome-fragment,2.69,AAI-based (high-confidence),0.00,1.00,


Step 1: Call in required data frames for EAF calculation of the cell enriched fraction

In [8]:
viral_ce_cov = pd.read_csv("/projects/luo_lab/Siders_data/data/processed/anvio/merged_profile_db/derep_viral_ce/derep_viral_ce-COVs.txt", sep="\t")

# Rename columns
viral_ce_cov.rename(columns=dict(zip(oldnames, newnames)), inplace=True)

# Convert to DataFrame with contig as row index
viral_ce_relative_abundance = viral_ce_cov.set_index("contig").transpose().reset_index()

# Normalize the data
normalized_data = normalize(viral_ce_relative_abundance.drop(columns="index"), axis=1, norm='l1')
viral_ce_relative_abundance = pd.DataFrame(normalized_data, columns=viral_ce_relative_abundance.columns[1:])
viral_ce_relative_abundance["samples"] = viral_ce_cov.columns[1:]

# Reorder columns to have samples first
viral_ce_relative_abundance = viral_ce_relative_abundance[["samples"] + list(viral_ce_relative_abundance.columns[:-1])]

# Save to CSV
viral_ce_relative_abundance.to_csv("/projects/luo_lab/Siders_data/results/tables/drep_vcell_rel_abun2.csv", index=False)
viral_ce_relative_abundance

contig,samples,day0_DO_0_env_virus_control_000000000002,day0_DO_0_env_virus_control_000000000003,day0_DO_0_env_virus_control_000000000004,day0_DO_0_env_virus_control_000000000008,day0_DO_0_env_virus_control_000000000010,day0_DO_0_env_virus_control_000000000013,day0_DO_0_env_virus_control_000000000014,day0_DO_0_env_virus_control_000000000015,day0_DO_0_env_virus_control_000000000016,...,day7_DO_0_13C_virus_enriched_000000016806,day7_DO_0_13C_virus_enriched_000000016807,day7_DO_0_13C_virus_enriched_000000016808,day7_DO_0_13C_virus_enriched_000000016809,day7_DO_0_13C_virus_enriched_000000016810,day7_DO_0_13C_virus_enriched_000000016813,day7_DO_0_13C_virus_enriched_000000016815,day7_DO_0_13C_virus_enriched_000000016816,day7_DO_0_13C_virus_enriched_000000016817,day7_DO_0_13C_virus_enriched_000000016818
0,CLEAN_DAY0_DO_0_ENV_CELL_CONTROL_NONE,0.0,1.25543e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DAY7_DO_0_12C_CELL_ENRICHED_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1e-05,4e-05,0.0
2,DAY7_DO_0_12C_CELL_ENRICHED_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DAY7_DO_0_12C_CELL_ENRICHED_7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DAY7_DO_0_12C_CELL_ENRICHED_8,0.0,0.00011743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,DAY7_DO_0_12C_CELL_ENRICHED_9,0.0,0.000115665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3e-05,0.0
6,DAY7_DO_0_13C_CELL_ENRICHED_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,DAY7_DO_0_13C_CELL_ENRICHED_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,DAY7_DO_0_13C_CELL_ENRICHED_7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,DAY7_DO_0_13C_CELL_ENRICHED_8,0.0,0.000181532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Step 2: Use the 'org_dna_density' function to calculate the mean DNA density of each vOTUs:

In [10]:
viral_ce_relative_abundance_filter=viral_ce_relative_abundance[viral_ce_relative_abundance['samples'] != 'CLEAN_DAY0_DO_0_ENV_CELL_CONTROL_NONE']
viral_ce_relative_abundance_filter

# Calculate the sum of each column
column_sums = viral_ce_relative_abundance_filter.sum()

# Identify columns with a sum of zero
columns_to_drop = column_sums[column_sums == 0].index

# Drop the columns
viral_ce_relative_abundance_filter = viral_ce_relative_abundance_filter.drop(columns=columns_to_drop)

In [11]:
# Call in data frame of the gc content producted by seqkit: seqkit fx2tab --name --only-id --gc combined_cell_contigs_clean_headers.fa > combined_cell_contigs_gc_results.txt
vGC_content=pd.read_csv("/projects/luo_lab/Siders_data/results/tables/combined_vOTU_gc_results.txt", sep='\t',header=None, names=['organism','GC'])
vGC_content['GC']=vGC_content['GC']/100
vGC_content

Unnamed: 0,organism,GC
0,day0_DO_0_env_virus_control_000000000002,0.4238
1,day0_DO_0_env_virus_control_000000000003,0.4308
2,day0_DO_0_env_virus_control_000000000004,0.3146
3,day0_DO_0_env_virus_control_000000000008,0.5272
4,day0_DO_0_env_virus_control_000000000010,0.3953
...,...,...
31483,day7_DO_0_13C_virus_enriched_000000016813,0.3166
31484,day7_DO_0_13C_virus_enriched_000000016815,0.4090
31485,day7_DO_0_13C_virus_enriched_000000016816,0.3079
31486,day7_DO_0_13C_virus_enriched_000000016817,0.3582


In [12]:
viral_ce_relative_abundance_long=org_dna_denisity(viral_ce_relative_abundance_filter, cell_density_df, 'samples','organism','count', 'treatment','fraction')
viral_ce_relative_abundance_long

  grouped = joined_df.groupby(['organism', 'treatment']).apply(lambda x: (x['weighted']).sum())


treatment,organism,12C,13C
0,day0_DO_0_env_virus_control_000000000336,1.69567,
1,day0_DO_0_env_virus_control_000000000405,,1.69920
2,day0_DO_0_env_virus_control_000000000454,1.69553,
3,day0_DO_0_env_virus_control_000000000574,1.69618,1.69500
4,day0_DO_0_env_virus_control_000000000575,1.69687,
...,...,...,...
647,day7_DO_0_13C_virus_enriched_000000016717,,1.70897
648,day7_DO_0_13C_virus_enriched_000000016725,1.69576,1.69629
649,day7_DO_0_13C_virus_enriched_000000016739,1.69506,1.69216
650,day7_DO_0_13C_virus_enriched_000000016770,1.70857,1.70888


In [13]:
viral_ce_relative_abundance_long=pd.merge(viral_ce_relative_abundance_long, vGC_content)
viral_ce_relative_abundance_long

Unnamed: 0,organism,12C,13C,GC
0,day0_DO_0_env_virus_control_000000000336,1.69567,,0.3761
1,day0_DO_0_env_virus_control_000000000405,,1.69920,0.4922
2,day0_DO_0_env_virus_control_000000000454,1.69553,,0.3748
3,day0_DO_0_env_virus_control_000000000574,1.69618,1.69500,0.3790
4,day0_DO_0_env_virus_control_000000000575,1.69687,,0.4416
...,...,...,...,...
647,day7_DO_0_13C_virus_enriched_000000016717,,1.70897,0.4032
648,day7_DO_0_13C_virus_enriched_000000016725,1.69576,1.69629,0.4157
649,day7_DO_0_13C_virus_enriched_000000016739,1.69506,1.69216,0.3592
650,day7_DO_0_13C_virus_enriched_000000016770,1.70857,1.70888,0.5424


Step 3: Use the 'AtomFractionExcess' function to calculate the EAF of each vOTU in the cell enriched fraction and export to csv file to be used in R script for figure construction.

In [50]:
viralcell_atomic_fraction=AtomFractionExcess(viral_ce_relative_abundance_long).dropna()
# Filter out organisms with "Not-determined" in the checkv_quality column of vcontig_quality
filtered_organisms = vcontig_quality[vcontig_quality['checkv_quality'] != 'Not-determined']['organism']

# Filter viralcell_atomic_fraction to keep only organisms that are in the filtered list
viralcell_atomic_fraction_filtered = viralcell_atomic_fraction[viralcell_atomic_fraction['organism'].isin(filtered_organisms)]

viralcell_atomic_fraction_filtered.to_csv('/projects/luo_lab/Siders_data/results/tables/viral_atomic_fraction_ce.csv', index=False) 
viralcell_atomic_fraction_filtered


Unnamed: 0,organism,12C,13C,GC,shift,M_light,M_HeavyMax,M_Lab,EAF
3,day0_DO_0_env_virus_control_000000000574,1.69618,1.69500,0.3790,-0.00118,307.87898,317.66453,307.66538,-0.02159
7,day0_DO_0_env_virus_control_000000000649,1.69830,1.69781,0.4490,-0.00049,307.91370,317.66434,307.82503,-0.00899
9,day0_DO_0_env_virus_control_000000000681,1.69583,1.69681,0.3951,0.00098,307.88697,317.66449,308.06429,0.01793
11,day0_DO_0_env_virus_control_000000000689,1.70467,1.70766,0.5609,0.00299,307.96921,317.66403,308.50911,0.05507
16,day0_DO_0_env_virus_control_000000001151,1.70268,1.70516,0.5493,0.00248,307.96345,317.66407,308.41238,0.04576
...,...,...,...,...,...,...,...,...,...
639,day7_DO_0_13C_virus_enriched_000000015584,1.70128,1.70377,0.4313,0.00248,307.90492,317.66439,308.35450,0.04555
644,day7_DO_0_13C_virus_enriched_000000016480,1.69579,1.69419,0.3606,-0.00160,307.86986,317.66458,307.58011,-0.02925
645,day7_DO_0_13C_virus_enriched_000000016530,1.70618,1.70846,0.5991,0.00228,307.98815,317.66393,308.39981,0.04207
649,day7_DO_0_13C_virus_enriched_000000016739,1.69506,1.69216,0.3592,-0.00290,307.86916,317.66458,307.34257,-0.05316


# Part 3: Calculate the EAF of the vOTUs present in the virus enriched fraction 

Step 1: Call in required data frames for EAF calculation of the cell enriched fraction

In [15]:
# Define old and new names
voldnames = ["CLEAN_DAY7_DO_0_12C_VIRAL_10_12","CLEAN_DAY7_DO_0_12C_VIRAL_1_6",
              "CLEAN_DAY7_DO_0_12C_VIRAL_7","CLEAN_DAY7_DO_0_12C_VIRAL_8",
              "CLEAN_DAY7_DO_0_12C_VIRAL_9",
              
              "CLEAN_DAY7_DO_0_13C_VIRAL_10_12",
              "CLEAN_DAY7_DO_0_13C_VIRAL_1_6","CLEAN_DAY7_DO_0_13C_VIRAL_7",
              "CLEAN_DAY7_DO_0_13C_VIRAL_8","CLEAN_DAY7_DO_0_13C_VIRAL_9"]

vnewnames = ["DAY7_DO_0_12C_VIRAL_10","DAY7_DO_0_12C_VIRAL_6",
              "DAY7_DO_0_12C_VIRAL_7","DAY7_DO_0_12C_VIRAL_8",
              "DAY7_DO_0_12C_VIRAL_9",
              
              "DAY7_DO_0_13C_VIRAL_10",
              "DAY7_DO_0_13C_VIRAL_6","DAY7_DO_0_13C_VIRAL_7",
              "DAY7_DO_0_13C_VIRAL_8","DAY7_DO_0_13C_VIRAL_9"]



viral_ve_cov = pd.read_csv("/projects/luo_lab/Siders_data/data/processed/anvio/merged_profile_db/derep_viral_ve/derep_viral_ve-COVs.txt", sep="\t")
# Rename columns
viral_ve_cov.rename(columns=dict(zip(voldnames, vnewnames)), inplace=True)

# Convert to DataFrame with contig as row index
viral_ve_relative_abundance = viral_ve_cov.set_index("contig").transpose().reset_index()

# Normalize the data
normalized_data = normalize(viral_ve_relative_abundance.drop(columns="index"), axis=1, norm='l1')
viral_ve_relative_abundance = pd.DataFrame(normalized_data, columns=viral_ve_relative_abundance.columns[1:])
viral_ve_relative_abundance["samples"] = viral_ve_cov.columns[1:]

# Reorder columns to have samples first
viral_ve_relative_abundance = viral_ve_relative_abundance[["samples"] + list(viral_ve_relative_abundance.columns[:-1])]

# Save to CSV
viral_ve_relative_abundance.to_csv("/projects/luo_lab/Siders_data/results/tables/drep_ve_rel_abun2.csv", index=False)
viral_ve_relative_abundance

contig,samples,day0_DO_0_env_virus_control_000000000002,day0_DO_0_env_virus_control_000000000003,day0_DO_0_env_virus_control_000000000004,day0_DO_0_env_virus_control_000000000008,day0_DO_0_env_virus_control_000000000010,day0_DO_0_env_virus_control_000000000013,day0_DO_0_env_virus_control_000000000014,day0_DO_0_env_virus_control_000000000015,day0_DO_0_env_virus_control_000000000016,...,day7_DO_0_13C_virus_enriched_000000016806,day7_DO_0_13C_virus_enriched_000000016807,day7_DO_0_13C_virus_enriched_000000016808,day7_DO_0_13C_virus_enriched_000000016809,day7_DO_0_13C_virus_enriched_000000016810,day7_DO_0_13C_virus_enriched_000000016813,day7_DO_0_13C_virus_enriched_000000016815,day7_DO_0_13C_virus_enriched_000000016816,day7_DO_0_13C_virus_enriched_000000016817,day7_DO_0_13C_virus_enriched_000000016818
0,CLEAN_DAY0_DO_0_ENV_VIRUS_CONTROL_NONE,8.35282e-06,5.36621e-05,5.29988e-05,4.7685e-06,1.68847e-05,7.72054e-06,1.6383e-05,8.01369e-06,1.22192e-05,...,8.46745e-06,3.3401e-06,7.8278e-06,2.4174e-06,4.64015e-06,1.35697e-06,3.06054e-06,3.69259e-06,2.32607e-05,1.49478e-06
1,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_10_12,0.0,2.87647e-06,0.000107712,0.0,5.87443e-06,0.0,2.93039e-05,0.0,1.65145e-05,...,6.61986e-06,8.28778e-08,0.0,4.5674e-07,2.9911e-06,6.49639e-06,7.76822e-07,1.38716e-05,1.14609e-05,2.14097e-06
2,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_1_6,0.0,2.63818e-05,2.33857e-06,5.54156e-07,0.0,0.0,3.55447e-07,8.29177e-06,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.25565e-06,1.61855e-05
3,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_7,0.0,2.23651e-05,0.0,1.03088e-05,0.0,0.0,0.0,2.40083e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.56488e-07,1.3723e-05
4,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_8,4.6385e-06,1.60723e-05,8.65928e-05,0.0,1.24593e-05,0.0,2.64301e-05,0.0,0.0,...,2.93385e-05,4.1452e-06,6.69457e-07,7.57528e-06,8.49722e-06,3.4836e-05,2.98666e-08,0.0,4.94339e-05,2.86827e-06
5,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_9,7.18359e-06,0.000147398,1.15894e-05,0.0,1.99866e-05,0.0,0.0,1.52599e-05,0.0,...,0.0,0.0,3.22952e-06,6.93677e-06,2.0651e-06,1.19843e-06,2.1244e-05,0.0,2.0375e-05,1.57897e-05
6,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_10_12,0.0,1.35794e-06,0.00012135,0.0,3.24874e-06,0.0,3.22076e-05,0.0,5.49867e-06,...,1.59454e-05,5.17582e-06,0.0,0.0,1.26634e-05,2.10055e-05,0.0,1.22125e-05,6.15929e-05,0.0
7,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_1_6,0.0,1.36e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.76042e-07,0.0
8,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_7,7.92372e-07,3.30741e-05,3.60615e-06,1.68396e-05,3.19457e-07,0.0,1.98777e-06,2.09564e-05,3.51128e-07,...,1.90063e-06,6.5396e-07,2.89515e-06,3.06496e-07,1.05073e-06,0.0,3.08524e-06,0.0,3.44456e-06,1.31844e-05
9,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_8,6.99981e-06,9.41619e-05,1.93373e-05,8.40012e-07,1.27361e-05,0.0,5.35177e-06,1.16807e-05,1.59181e-06,...,1.45568e-05,5.84516e-06,1.42715e-05,1.36223e-05,1.10761e-05,1.21684e-05,1.31673e-05,9.37429e-07,3.88512e-05,1.36493e-05


In [57]:
viral_ve_cov

Unnamed: 0,contig,CLEAN_DAY0_DO_0_ENV_VIRUS_CONTROL_NONE,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_10_12,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_1_6,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_7,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_8,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_9,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_10_12,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_1_6,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_7,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_8,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_9
0,day0_DO_0_env_virus_control_000000000002,8.99883,0.00000,0.00000,0.00000,0.37229,0.61834,0.00000,0.00000,0.23784,2.50234,0.00000
1,day0_DO_0_env_virus_control_000000000003,57.81238,1.04887,6.27423,1.80315,1.28998,12.68748,0.09286,2.31795,9.92778,33.66169,0.23378
2,day0_DO_0_env_virus_control_000000000004,57.09776,39.27610,0.55617,0.00000,6.95003,0.99757,8.29821,0.00000,1.08245,6.91285,32.50595
3,day0_DO_0_env_virus_control_000000000008,5.13730,0.00000,0.13179,0.83113,0.00000,0.00000,0.00000,0.00000,5.05470,0.30029,0.00000
4,day0_DO_0_env_virus_control_000000000010,18.19059,2.14205,0.00000,0.00000,1.00000,1.72037,0.22216,0.00000,0.09589,4.55301,0.75402
...,...,...,...,...,...,...,...,...,...,...,...,...
31483,day7_DO_0_13C_virus_enriched_000000016813,1.46192,2.36884,0.00000,0.00000,2.79597,0.10316,1.43640,0.00000,0.00000,4.35003,5.15460
31484,day7_DO_0_13C_virus_enriched_000000016815,3.29724,0.28326,0.00000,0.00000,0.00240,1.82861,0.00000,0.00000,0.92609,4.70715,0.13344
31485,day7_DO_0_13C_virus_enriched_000000016816,3.97817,5.05813,0.00000,0.00000,0.00000,0.00000,0.83512,0.00000,0.00000,0.33512,4.35635
31486,day7_DO_0_13C_virus_enriched_000000016817,25.05970,4.17909,1.01210,0.02068,3.96762,1.75380,4.21186,0.13227,1.03394,13.88880,9.90207


Step 2: Use the 'org_dna_density' function to calculate the mean DNA density of each vOTUs:

In [20]:
viral_ve_relative_abundance_filter=viral_ve_relative_abundance[viral_ve_relative_abundance['samples'] != 'CLEAN_DAY0_DO_0_ENV_VIRUS_CONTROL_NONE']

# Calculate the sum of each column
column_sums = viral_ve_relative_abundance_filter.sum()

# Identify columns with a sum of zero
columns_to_drop = column_sums[column_sums == 0].index

# Drop the columns
viral_ve_relative_abundance_filter = viral_ve_relative_abundance_filter.drop(columns=columns_to_drop)
viral_ve_relative_abundance_filter

contig,samples,day0_DO_0_env_virus_control_000000000002,day0_DO_0_env_virus_control_000000000003,day0_DO_0_env_virus_control_000000000004,day0_DO_0_env_virus_control_000000000008,day0_DO_0_env_virus_control_000000000010,day0_DO_0_env_virus_control_000000000014,day0_DO_0_env_virus_control_000000000015,day0_DO_0_env_virus_control_000000000016,day0_DO_0_env_virus_control_000000000017,...,day7_DO_0_13C_virus_enriched_000000016806,day7_DO_0_13C_virus_enriched_000000016807,day7_DO_0_13C_virus_enriched_000000016808,day7_DO_0_13C_virus_enriched_000000016809,day7_DO_0_13C_virus_enriched_000000016810,day7_DO_0_13C_virus_enriched_000000016813,day7_DO_0_13C_virus_enriched_000000016815,day7_DO_0_13C_virus_enriched_000000016816,day7_DO_0_13C_virus_enriched_000000016817,day7_DO_0_13C_virus_enriched_000000016818
1,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_10_12,0.0,2.87647e-06,0.000107712,0.0,5.87443e-06,2.93039e-05,0.0,1.65145e-05,7.71198e-06,...,6.61986e-06,8.28778e-08,0.0,4.5674e-07,2.9911e-06,6.49639e-06,7.76822e-07,1.38716e-05,1.14609e-05,2.14097e-06
2,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_1_6,0.0,2.63818e-05,2.33857e-06,5.54156e-07,0.0,3.55447e-07,8.29177e-06,0.0,4.03292e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.25565e-06,1.61855e-05
3,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_7,0.0,2.23651e-05,0.0,1.03088e-05,0.0,0.0,2.40083e-05,0.0,9.81806e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.56488e-07,1.3723e-05
4,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_8,4.6385e-06,1.60723e-05,8.65928e-05,0.0,1.24593e-05,2.64301e-05,0.0,0.0,2.48916e-05,...,2.93385e-05,4.1452e-06,6.69457e-07,7.57528e-06,8.49722e-06,3.4836e-05,2.98666e-08,0.0,4.94339e-05,2.86827e-06
5,CLEAN_DAY7_DO_0_12C_VIRUS_ENRICHED_9,7.18359e-06,0.000147398,1.15894e-05,0.0,1.99866e-05,0.0,1.52599e-05,0.0,0.000337676,...,0.0,0.0,3.22952e-06,6.93677e-06,2.0651e-06,1.19843e-06,2.1244e-05,0.0,2.0375e-05,1.57897e-05
6,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_10_12,0.0,1.35794e-06,0.00012135,0.0,3.24874e-06,3.22076e-05,0.0,5.49867e-06,1.82505e-05,...,1.59454e-05,5.17582e-06,0.0,0.0,1.26634e-05,2.10055e-05,0.0,1.22125e-05,6.15929e-05,0.0
7,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_1_6,0.0,1.36e-05,0.0,0.0,0.0,0.0,0.0,0.0,7.03857e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.76042e-07,0.0
8,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_7,7.92372e-07,3.30741e-05,3.60615e-06,1.68396e-05,3.19457e-07,1.98777e-06,2.09564e-05,3.51128e-07,9.43983e-05,...,1.90063e-06,6.5396e-07,2.89515e-06,3.06496e-07,1.05073e-06,0.0,3.08524e-06,0.0,3.44456e-06,1.31844e-05
9,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_8,6.99981e-06,9.41619e-05,1.93373e-05,8.40012e-07,1.27361e-05,5.35177e-06,1.16807e-05,1.59181e-06,0.00018371,...,1.45568e-05,5.84516e-06,1.42715e-05,1.36223e-05,1.10761e-05,1.21684e-05,1.31673e-05,9.37429e-07,3.88512e-05,1.36493e-05
10,CLEAN_DAY7_DO_0_13C_VIRUS_ENRICHED_9,0.0,8.3749e-07,0.00011645,0.0,2.70123e-06,3.34125e-05,0.0,1.62125e-05,1.6606e-05,...,2.29748e-05,4.16374e-06,2.02514e-06,6.31811e-07,1.01923e-05,1.8466e-05,4.78039e-07,1.56063e-05,3.54735e-05,0.0


Step 3: Use the 'AtomFractionExcess' function to calculate the EAF of each vOTU in the viral enriched fraction and export to csv file to be used in R script for figure construction.

In [18]:
# Create the cell_density_df DataFrame
viral_density_df = pd.DataFrame({
    "fraction": ["6","7","8","9","10", "6","7","8","9","10"],
    "treatment": ["12C", "12C", "12C", "12C", "12C", "13C", "13C", "13C", "13C", "13C"],
    "density": [1.71001104,1.70454724,1.69908344,1.69361964,1.69034136,
                1.7111038,1.70454724,1.69799068,1.69252688,1.68706308],

    "qpcr_ratio": [0.166359643,0.390011325,0.62654109,1,0.531697631,
                   0.170752811,0.455733261,0.92387403,1,0.341295116],
    "filtrate_type": ["viral fraction"] * 10
})

# Remove rows where fraction is 12 and drop filtrate_type column
viral_density_df = viral_density_df[viral_density_df["fraction"] != "12"].drop(columns=["filtrate_type"])

# Save to CSV
viral_density_df.to_csv("/projects/luo_lab/Siders_data/results/tables/viral_density_table.csv", index=False)
viral_density_df

Unnamed: 0,fraction,treatment,density,qpcr_ratio
0,6,12C,1.71001,0.16636
1,7,12C,1.70455,0.39001
2,8,12C,1.69908,0.62654
3,9,12C,1.69362,1.0
4,10,12C,1.69034,0.5317
5,6,13C,1.7111,0.17075
6,7,13C,1.70455,0.45573
7,8,13C,1.69799,0.92387
8,9,13C,1.69253,1.0
9,10,13C,1.68706,0.3413


In [19]:
viral_ve_relative_abundance_long=org_dna_denisity(viral_ve_relative_abundance_filter, viral_density_df, 'samples','organism','count', 'treatment','fraction')
viral_ve_relative_abundance_long

  grouped = joined_df.groupby(['organism', 'treatment']).apply(lambda x: (x['weighted']).sum())


treatment,organism,12C,13C
0,day0_DO_0_env_virus_control_000000000003,1.69492,1.69918
1,day0_DO_0_env_virus_control_000000000004,1.69819,1.69339
2,day0_DO_0_env_virus_control_000000000010,1.69515,1.69705
3,day0_DO_0_env_virus_control_000000000014,1.69912,1.69349
4,day0_DO_0_env_virus_control_000000000015,1.69842,
...,...,...,...
25338,day7_DO_0_13C_virus_enriched_000000016813,1.69880,1.69459
25339,day7_DO_0_13C_virus_enriched_000000016815,1.69362,1.69846
25340,day7_DO_0_13C_virus_enriched_000000016816,,1.69281
25341,day7_DO_0_13C_virus_enriched_000000016817,1.69711,1.69550


In [21]:
viral_ve_relative_abundance_long=pd.merge(viral_ve_relative_abundance_long, vGC_content)
viral_ve_relative_abundance_long

Unnamed: 0,organism,12C,13C,GC
0,day0_DO_0_env_virus_control_000000000003,1.69492,1.69918,0.4308
1,day0_DO_0_env_virus_control_000000000004,1.69819,1.69339,0.3146
2,day0_DO_0_env_virus_control_000000000010,1.69515,1.69705,0.3953
3,day0_DO_0_env_virus_control_000000000014,1.69912,1.69349,0.3130
4,day0_DO_0_env_virus_control_000000000015,1.69842,,0.5054
...,...,...,...,...
25338,day7_DO_0_13C_virus_enriched_000000016813,1.69880,1.69459,0.3166
25339,day7_DO_0_13C_virus_enriched_000000016815,1.69362,1.69846,0.4090
25340,day7_DO_0_13C_virus_enriched_000000016816,,1.69281,0.3079
25341,day7_DO_0_13C_virus_enriched_000000016817,1.69711,1.69550,0.3582


In [52]:
viral_ve_atomic_fraction=AtomFractionExcess(viral_ve_relative_abundance_long).dropna()
# Filter viralcell_atomic_fraction to keep only organisms that are in the filtered list
viral_ve_atomic_fraction_filtered = viral_ve_atomic_fraction[viral_ve_atomic_fraction['organism'].isin(filtered_organisms)]
viral_ve_atomic_fraction_filtered.to_csv('/projects/luo_lab/Siders_data/results/tables/viral_atomic_fraction_ve.csv', index=False) 
viral_ve_atomic_fraction_filtered


Unnamed: 0,organism,12C,13C,GC,shift,M_light,M_HeavyMax,M_Lab,EAF
0,day0_DO_0_env_virus_control_000000000003,1.69492,1.69918,0.4308,0.00425,307.90468,317.66439,308.67730,0.07828
1,day0_DO_0_env_virus_control_000000000004,1.69819,1.69339,0.3146,-0.00480,307.84704,317.66471,306.97664,-0.08767
2,day0_DO_0_env_virus_control_000000000010,1.69515,1.69705,0.3953,0.00189,307.88707,317.66449,308.23066,0.03475
3,day0_DO_0_env_virus_control_000000000014,1.69912,1.69349,0.3130,-0.00563,307.84625,317.66471,306.82620,-0.10274
6,day0_DO_0_env_virus_control_000000000017,1.69516,1.69944,0.4616,0.00428,307.91995,317.66430,308.69667,0.07882
...,...,...,...,...,...,...,...,...,...
25336,day7_DO_0_13C_virus_enriched_000000016809,1.69584,1.69780,0.3767,0.00196,307.87784,317.66454,308.23395,0.03598
25337,day7_DO_0_13C_virus_enriched_000000016810,1.69756,1.69548,0.3584,-0.00208,307.86877,317.66459,307.49162,-0.03807
25338,day7_DO_0_13C_virus_enriched_000000016813,1.69880,1.69459,0.3166,-0.00420,307.84803,317.66470,307.08610,-0.07675
25339,day7_DO_0_13C_virus_enriched_000000016815,1.69362,1.69846,0.4090,0.00484,307.89386,317.66445,308.77312,0.08899
