# Split feature table by distribution 

## Overview

This notebook contains code to analyze mucus, tissue, and skeleton feature tables. To do this we are first going to split out each table to test if they are at above, below or at the prevelance and abundance predicted by neutral theory. 
The result of this first table will be 9 possible tables mucus_above, mucus_below, mucus_neutral, tissue_above, tissue_below...etc

## Import libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from qiime2 import Artifact
from os import listdir
import pandas as pd
from qiime2 import Metadata
from qiime2.plugins.diversity.visualizers import alpha_group_significance
from qiime2.plugins.feature_table.methods import filter_features
from qiime2.plugins.diversity.pipelines import alpha
from qiime2.plugins.feature_table.methods import filter_features_conditionally
from qiime2.plugins.feature_table.methods import merge
import qiime2.plugins.feature_table.actions as feature_table_actions

## Load data and metadata

In [3]:
# Load the data
print("About to load the feature table")
feature_table = Artifact.load("../../Neutral Model Analysis/input/carib_silva_merged_table.qza")
print("Done")

# Load the metadata
print("About to load the metadata table")
metadata = Metadata.load("../../Neutral Model Analysis/input/carib_merged_mapping.txt")
print("Done")

#Load the data as csv files
mucus_data = pd.read_csv("../input/M_rarefied_table.csv")
mucus_data = mucus_data.set_index("id")
mucus_data = mucus_data.rename_axis('#OTU ID')
mucus_data.to_csv("../output/M_rarefied_table_index_renamed.tsv", sep = '\t' )
print("Done")
mucus_data
!biom convert --input-fp ../output/M_rarefied_table_index_renamed.tsv -o ../output/M_rarefied_table_index_renamed.biom --table-type='OTU table' --to-json
# Turn BIOM file into QIIME 2 artifact (qza)
!qiime tools import \
  --input-path ../output/M_rarefied_table_index_renamed.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV100Format \
  --output-path ../output/M_rarefied_table.qza

# Validate QIIME 2 artifact file
!qiime tools validate ../output/M_rarefied_table.qza

#Load the data as csv files
tissue_data = pd.read_csv("../input/T_rarefied_table.csv")
tissue_data = tissue_data.set_index("id")
tissue_data = tissue_data.rename_axis('#OTU ID')
tissue_data.to_csv("../output/T_rarefied_table_index_renamed.tsv", sep = '\t' )
print("Done")
tissue_data
!biom convert --input-fp ../output/T_rarefied_table_index_renamed.tsv -o ../output/T_rarefied_table_index_renamed.biom --table-type='OTU table' --to-json
# Turn BIOM file into QIIME 2 artifact (qza)
!qiime tools import \
  --input-path ../output/T_rarefied_table_index_renamed.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV100Format \
  --output-path ../output/T_rarefied_table.qza

# Validate QIIME 2 artifact file
!qiime tools validate ../output/T_rarefied_table.qza


#Load the data as csv files
skeleton_data = pd.read_csv("../input/S_rarefied_table.csv")
skeleton_data = skeleton_data.set_index("id")
skeleton_data = skeleton_data.rename_axis('#OTU ID')
skeleton_data.to_csv("../output/S_rarefied_table_index_renamed.tsv", sep = '\t' )
print("Done")
skeleton_data
!biom convert --input-fp ../output/S_rarefied_table_index_renamed.tsv -o ../output/S_rarefied_table_index_renamed.biom --table-type='OTU table' --to-json
# Turn BIOM file into QIIME 2 artifact (qza)
!qiime tools import \
  --input-path ../output/S_rarefied_table_index_renamed.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV100Format \
  --output-path ../output/S_rarefied_table.qza

# Validate QIIME 2 artifact file
!qiime tools validate ../output/S_rarefied_table.qza

About to load the feature table
Done
About to load the metadata table
Done
Done
[32mImported ../output/M_rarefied_table_index_renamed.biom as BIOMV100Format to ../output/M_rarefied_table.qza[0m
[0m[32mResult ../output/M_rarefied_table.qza appears to be valid at level=max.[0m
[0mDone
[32mImported ../output/T_rarefied_table_index_renamed.biom as BIOMV100Format to ../output/T_rarefied_table.qza[0m
[0m[32mResult ../output/T_rarefied_table.qza appears to be valid at level=max.[0m
[0mDone
[32mImported ../output/S_rarefied_table_index_renamed.biom as BIOMV100Format to ../output/S_rarefied_table.qza[0m
[0m[32mResult ../output/S_rarefied_table.qza appears to be valid at level=max.[0m
[0m

## Calculating mucus alpha diversity

In the next section of code we load our mucus qza file and we calculate alpha diversity.

In [4]:
# Load QZA feature tables
mucus_table = Artifact.load("../output/M_rarefied_table.qza")

mucus_data
# Filter by abundance and prevalence
filtered_mucus_result = filter_features_conditionally(table=mucus_table, abundance=0.01, prevalence=1/50)
filtered_mucus_table = filtered_mucus_result.filtered_table
print("Done")

# Further filter by min frequency and sample occurrence
filtered_mucus_result = filter_features(table=filtered_mucus_table, min_frequency=100, min_samples=2, filter_empty_samples=True)
filtered_mucus_table = filtered_mucus_result.filtered_table
print("Done")

# Calculate observed features (species richness)
alpha_obs_mucus_result = alpha(table=filtered_mucus_table, metric="observed_features")
observed_mucus_alpha_diversity = alpha_obs_mucus_result.alpha_diversity
obs_mucus_alpha_diversity_df = observed_mucus_alpha_diversity.view(pd.Series)
print("Done")


# Calculate Gini index (evenness)
alpha_gini_mucus_result = alpha(table=filtered_mucus_table, metric="gini_index")
gini_index_mucus = alpha_gini_mucus_result.alpha_diversity
gini_mucus_df = gini_index_mucus.view(pd.Series)
print("Done")



Done
Done


  warn(f"{func.__name__} is deprecated as of {ver}.")
  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


Done
1256-012-C121-M               0.986786
1256-019-C123-M               0.965825
1256-022-C127-M               0.993080
1256-025-C128-M               0.944596
1256-028-C129-M               0.993080
                                ...   
E7.6.Mon.aequ.3.20150620.M    0.948491
E7.6.Mon.aequ.4.20150620.M    0.949207
E7.7.Pav.vari.1.20150622.M    0.834301
E7.7.Pav.vari.1.20150623.M    0.979678
E7.7.Pav.vari.2.20150622.M    0.857870
Name: gini_index, Length: 292, dtype: float64


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


In [10]:
## next step
##give path to the file and make a function to

In [9]:
tissue_table = Artifact.load("../output/T_rarefied_table.qza")
skeleton_table = Artifact.load("../output/S_rarefied_table.qza")


tissue_data
# Filter by abundance and prevalence
filtered_tissue_result = filter_features_conditionally(table=tissue_table, abundance=0.01, prevalence=1/50)
filtered_tissue_table = filtered_tissue_result.filtered_table
print("Done")

# Further filter by min frequency and sample occurrence
filtered_tissue_result = filter_features(table=filtered_tissue_table, min_frequency=100, min_samples=2, filter_empty_samples=True)
filtered_tissue_table = filtered_tissue_result.filtered_table
print("Done")

# Calculate observed features (species richness)
alpha_obs_tissue_result = alpha(table=filtered_tissue_table, metric="observed_features")
observed_tissue_features = alpha_obs_tissue_result.alpha_diversity
obs_tissue_df = observed_tissue_features.view(pd.Series)
print("Done")

# Calculate Gini index (evenness)
alpha_gini_tissue_result = alpha(table=filtered_tissue_table, metric="gini_index")
gini_tissue_index = alpha_gini_tissue_result.alpha_diversity
gini_tissue_df = gini_tissue_index.view(pd.Series)
print("Done")



skeleton_data
# Filter by abundance and prevalence
filtered_skeleton_result = filter_features_conditionally(table=skeleton_table, abundance=0.01, prevalence=1/50)
filtered_skeleton_table = filtered_skeleton_result.filtered_table
print("Done")

# Further filter by min frequency and sample occurrence
filtered_skeleton_result = filter_features(table=skeleton_table, min_frequency=100, min_samples=2, filter_empty_samples=True)
filtered_skeleton_table = filtered_skeleton_result.filtered_table
print("Done")

# Calculate observed features (species richness)
alpha_obs_skeleton_result = alpha(table=filtered_skeleton_table, metric="observed_features")
observed_skeleton_features = alpha_obs_skeleton_result.alpha_diversity
obs_skeleton_df = observed_skeleton_features.view(pd.Series)
print("Done")

# Calculate Gini index (evenness)
alpha_gini_skeleton_result = alpha(table=filtered_skeleton_table, metric="gini_index")
gini_skeleton_index = alpha_gini_skeleton_result.alpha_diversity
gini_skeleton_df = gini_skeleton_index.view(pd.Series)
print("Done")

Done
Done


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


Done


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


Done
Done
Done


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


Done
Done


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


In [4]:
%matplotlib inline
# Calculate mean richness for each sample type
means = [obs_mucus_df.mean(), obs_tissue_df.mean(), obs_skeleton_df.mean() ]

# Labels for the bars
labels = ['Mucus', 'Tissue', 'Skeleton']

# Plot
plt.figure(figsize=(8, 6))
plt.bar(labels, means)
plt.title('Average Observed Features (Richness)')
plt.ylabel('Observed Features')
plt.xlabel('Sample Type')
plt.tight_layout()
plt.show()

NameError: name 'obs_mucus_df' is not defined

## Calculate alpha diversity for mucus, skeleton, and tissue tables

In the next section of code we calculate alpha diversity for feature table.

In [5]:
def calc_alpha_diversity(feature_table_path, sample_type, metrics=["observed_features", "gini_index"]):
    """
    Calculate and return alpha diversity metrics for a feature table.
    
    Parameters:
    - feature_table_path: str, path to the .qza file
    - sample_type: str, name of the sample (e.g., 'tissue', 'skeleton', 'mucus')
    - metrics: list, alpha diversity metrics to calculate
    
    Returns:
    - DataFrame with sample IDs as index and metrics + sample type as columns
    """
    # Load feature table
    feature_table = Artifact.load(feature_table_path)

    # Filter by abundance and prevalence
    filtered_result = filter_features_conditionally(
        table=feature_table, abundance=0.01, prevalence=1/50)
    
    filtered_table = filtered_result.filtered_table
    print(f"{sample_type} - Done: abundance/prevalence filtering")

    # Further filter by min frequency and sample occurrence
    filtered_result = filter_features(
        table=filtered_table, min_frequency=100, min_samples=2, filter_empty_samples=True)
    
    filtered_table = filtered_result.filtered_table
    print(f"{sample_type} - Done: frequency/sample filtering")

    # Collect alpha diversity results
    alpha_results = {}
    for metric in metrics:
        alpha_result = alpha(table=filtered_table, metric=metric)
        alpha_series = alpha_result.alpha_diversity.view(pd.Series)
        alpha_results[metric] = alpha_series
        print(f"{sample_type} - Done: {metric}")

    # Combine into single DataFrame
    df = pd.DataFrame(alpha_results)
    df["sample_type"] = sample_type
    return df


In [7]:
mucus_df = calc_alpha_diversity("../output/M_rarefied_table.qza", "Mucus")
mucus_df.to_csv("Mucus_alpha_diversity.csv")

tissue_df = calc_alpha_diversity("../output/T_rarefied_table.qza", "Tissue")
tissue_df.to_csv("Tisse_alpha_diversity.csv")

skeleton_df = calc_alpha_diversity("../output/S_rarefied_table.qza", "Skeleton")
skeleton_df.to_csv("Skeleton_alpha_diversity.csv")

Mucus - Done: abundance/prevalence filtering
Mucus - Done: frequency/sample filtering


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


Mucus - Done: observed_features


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


Mucus - Done: gini_index
Tissue - Done: abundance/prevalence filtering
Tissue - Done: frequency/sample filtering


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


Tissue - Done: observed_features


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


Tissue - Done: gini_index
Skeleton - Done: abundance/prevalence filtering
Skeleton - Done: frequency/sample filtering


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


Skeleton - Done: observed_features
Skeleton - Done: gini_index


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


## Filter feature table

Filter the feature table to just microbes with a minimum abundance and prevalance.
Our feature table has many zero counts representing microbes that are present in just a few samples.
Therefore we want to remove rare microbes that are present in fewer than 1/50 samples, or less than 1% of the total abundance.

We also filter the feature table second time to a minimum frequency of 100 counts per microbe and occurence in 2 minimum samples.

In [None]:
# Apply filtering 
filtered_feature_table_results = filter_features_conditionally(table = feature_table, abundance = 0.01, prevalence = 1/50)
filtered_feature_table = filtered_feature_table_results.filtered_table
df = filtered_feature_table.view(pd.DataFrame)

In [None]:
# Then apply additional feature filtering
filtered_feature_table_results = filter_features(
    table=filtered_feature_table,
    min_frequency=100,  # Minimum total frequency for a feature to be retained
    min_samples=2,      # Minimum number of samples a feature must be present in
    filter_empty_samples=True  # Remove samples with no features after filtering    
)
filtered_feature_table = filtered_feature_table_results.filtered_table


## Calculate observed features

We calculate the observed features(species richness) of microbes from our filtered feature table. we're counting how many different types of microbes show up in each sample. The more different types of microbes we find, the higher our diversity count.

In [None]:
#calculate observed features

alpha_diversity_results = alpha(table = filtered_feature_table, metric = "observed_features")
observed_features = alpha_diversity_results.alpha_diversity

print(observed_features)

## Calculate gini index

Calculate the gini index of microbes from our filtered feature table. This measures how evenly our microbes are distributed across our samples. A lower Gini index (closer to 0) means the microbes are distributed pretty evenly, while a higher value (closer to 1) tells us that some microbes are much more abundant than others.

In [None]:
#calculate gini index

alpha_diversity_results = alpha(table = filtered_feature_table, metric = "gini_index")
gini_index = alpha_diversity_results.alpha_diversity

print(gini_index)

## Compare diversity of microbes

First we load in our sample metadata. Then we create visualization to help us see if there are significant differences in the number of microbes (observed features) between these groups and saves it as a QZV file that we can view. 

In [None]:
#compare observed features within alpha diversity
metadata = Metadata.load("../../Neutral Model Analysis/input/carib_merged_mapping.txt")

#get visualization of alpha group significance
alpha_group_significance_results = alpha_group_significance(alpha_diversity = observed_features, metadata = metadata)
observed_features_visualization = alpha_group_significance_results.visualization
observed_features_visualization.save("../../Neutral Model Analysis/output/observed_features_kruskal_wallis.qzv")


In [None]:
def calculate_alpha_diversity(feature_table, metrics):
    """Calculate multiple alpha diversity metrics for the feature table
    
    Parameters:
    feature_table -- QIIME2 artifact of the feature table
    metrics -- list of metrics to calculate
    
    Returns -- alpha diversity results
    """
    if metrics is None:
        metrics = ['observed_features', 'gini_index']
    
    alpha_diversity_results = {}
    
    #calculate observed features

    alpha_diversity_results = alpha(table = filtered_feature_table, metric = "observed_features")
    observed_features = alpha_diversity_results.alpha_diversity
    
    
    #calculate gini index
    alpha_diversity_results = alpha(table = filtered_feature_table, metric = "gini_index")
    gini_index = alpha_diversity_results.alpha_diversity


    

## Analyze combined microbial taxonomy 

First we load our data and convert it from csv to qza format

In [8]:
#Load the data as csv files
mucus_combined_table = pd.read_csv("../input/Mucus_combinded_table.csv") 
mucus_combined_table = mucus_combined_table.set_index("id")
mucus_combined_table = mucus_combined_table.rename_axis('#OTU ID')
mucus_combined_table.to_csv("../output/mucus_combined_table_index_renamed.tsv", sep = '\t' )
print("Done")
mucus_combined_table
!biom convert --input-fp ../output/mucus_combined_table_index_renamed.tsv -o ../output/mucus_combined_table.biom --table-type='OTU table' --to-json

# Turn BIOM file into QIIME 2 artifact (qza)
!qiime tools import \
  --input-path ../output/mucus_combined_table.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV100Format \
  --output-path ../output/mucus_combined_table.qza

#validate QIIME 2 artifact file
!qiime tools validate ../output/mucus_combined_table.qza

Done
[32mImported ../output/mucus_combined_table.biom as BIOMV100Format to ../output/mucus_combined_table.qza[0m
[0m[32mResult ../output/mucus_combined_table.qza appears to be valid at level=max.[0m
[0m

In [24]:
listdir("../input")

['Mucus_above_mapping.csv',
 'Mucus_above_table.tsv',
 'Mucus_above_taxonomy.tsv',
 'Mucus_below_mapping.csv',
 'Mucus_below_table.tsv',
 'Mucus_below_taxonomy.tsv',
 'Mucus_combinded_table.csv',
 'Mucus_neutral_table.tsv',
 'Mucus_neutral_taxonomy.tsv',
 'Mucus_qiime_neutral_mapping.csv',
 'Mucus_qiime_neutral_taxonomy.csv',
 'Skeleton_above_mapping.csv',
 'Skeleton_above_table.tsv',
 'Skeleton_above_taxonomy.tsv',
 'Skeleton_below_mapping.csv',
 'Skeleton_below_table.tsv',
 'Skeleton_below_taxonomy.tsv',
 'Skeleton_combinded_table.csv',
 'Skeleton_neutral_table.tsv',
 'Skeleton_neutral_taxonomy.tsv',
 'Skeleton_qiime_neutral_mapping.csv',
 'Skeleton_qiime_neutral_taxonomy.csv',
 'Tissue_above_mapping.csv',
 'Tissue_above_table.tsv',
 'Tissue_above_taxonomy.tsv',
 'Tissue_below_mapping.csv',
 'Tissue_below_table.tsv',
 'Tissue_below_taxonomy.tsv',
 'Tissue_combinded_table.csv',
 'Tissue_neutral_table.tsv',
 'Tissue_neutral_taxonomy.tsv',
 'Tissue_qiime_neutral_mapping.csv',
 'Tissue_q

## Test prevalence differences across compartments

we merged the tables by compartments to analyze if prevalence is different across compartments.

In [9]:
compartments = ["Mucus", "Tissue", "Skeleton"]
categories = ["above", "below", "neutral"]

# Convert TSV to CSV first
for compartment in compartments:
    for category in categories:
        input_tsv = f"../input/{compartment}_{category}_table.tsv"
        output_csv = f"../input/{compartment}_{category}_table.csv"
        tsv_data = pd.read_csv(input_tsv, sep='\t')
        tsv_data.to_csv(output_csv, index=False)
        print(f"Converted {input_tsv} to {output_csv}")

for compartment in compartments:
    for category in categories:
        print(f"{compartment}_{category}")
        input_csv = f"../input/{compartment}_{category}_table.csv"
        print(input_csv)
        comp_cat_table = pd.read_csv(input_csv, sep=',')
        print(comp_cat_table.columns)
        comp_cat_table = comp_cat_table.set_index ("id")
        comp_cat_table = comp_cat_table.rename_axis("#OTU ID")
        output_csv = f"../output/{compartment}_{category}_table_index_renamed.tsv"
        comp_cat_table.to_csv(output_csv, sep = '\t' )
        print(output_csv)
        output_biom = f"../output/{compartment}_{category}_table.biom"
        !biom convert --input-fp $output_csv -o $output_biom --table-type='OTU table' --to-json
    
        # Turn BIOM file into QIIME 2 artifact (qza)
        print(f"# Turn BIOM file into QIIME 2 artifact (qza) for {compartment}_{category}")
        output_qza = f"../output/{compartment}_{category}_feature_table.qza"
        !qiime tools import \
          --input-path {output_biom} \
          --type 'FeatureTable[Frequency]' \
          --input-format BIOMV100Format \
          --output-path {output_qza}
   
    
        # Validate the QIIME 2 artifact
        print(f"Validating QIIME 2 artifact: {output_qza}")
        !qiime tools validate {output_qza}


Converted ../input/Mucus_above_table.tsv to ../input/Mucus_above_table.csv
Converted ../input/Mucus_below_table.tsv to ../input/Mucus_below_table.csv
Converted ../input/Mucus_neutral_table.tsv to ../input/Mucus_neutral_table.csv
Converted ../input/Tissue_above_table.tsv to ../input/Tissue_above_table.csv
Converted ../input/Tissue_below_table.tsv to ../input/Tissue_below_table.csv
Converted ../input/Tissue_neutral_table.tsv to ../input/Tissue_neutral_table.csv
Converted ../input/Skeleton_above_table.tsv to ../input/Skeleton_above_table.csv
Converted ../input/Skeleton_below_table.tsv to ../input/Skeleton_below_table.csv
Converted ../input/Skeleton_neutral_table.tsv to ../input/Skeleton_neutral_table.csv
Mucus_above
../input/Mucus_above_table.csv
Index(['id', '1256-012-C121-M_above', '1256-019-C123-M_above',
       '1256-022-C127-M_above', '1256-025-C128-M_above',
       '1256-028-C129-M_above', '1256-040-C143-M_above',
       '1256-043-C144-M_above', '1256-046-C145-M_above',
       '1256

../output/Skeleton_above_table_index_renamed.tsv
# Turn BIOM file into QIIME 2 artifact (qza) for Skeleton_above
[32mImported ../output/Skeleton_above_table.biom as BIOMV100Format to ../output/Skeleton_above_feature_table.qza[0m
[0mValidating QIIME 2 artifact: ../output/Skeleton_above_feature_table.qza
[32mResult ../output/Skeleton_above_feature_table.qza appears to be valid at level=max.[0m
[0mSkeleton_below
../input/Skeleton_below_table.csv
Index(['id', '1256-004-C118-S_below', '1256-007-C119-S_below',
       '1256-010-C120-S_below', '1256-013-C121-S_below',
       '1256-016-C122-S_below', '1256-020-C123-S_below',
       '1256-023-C127-S_below', '1256-026-C128-S_below',
       '1256-029-C129-S_below',
       ...
       'E7.5.Gal.fasc.1.20150622.S_below', 'E7.6.Acr.hyac.1.20150622.S_below',
       'E7.6.Acr.hyac.2.20150622.S_below', 'E7.6.Acr.hyac.3.20150622.S_below',
       'E7.6.Mon.aequ.1.20150620.S_below', 'E7.6.Mon.aequ.2.20150620.S_below',
       'E7.6.Mon.aequ.3.20150620.

In [8]:
listdir("../output/")

['Mucus_alpha_diversity.csv',
 'Mucus_combined_table.biom',
 'mucus_combined_table.qza',
 'Mucus_combined_table_index_renamed.tsv',
 'Mucus_feature_table.qza',
 'Mucus_neutral_feature_table.qza',
 'Mucus_neutral_table.biom',
 'Mucus_neutral_table_index_renamed.tsv',
 'Skeleton_alpha_diversity.csv',
 'Skeleton_combined_table.biom',
 'Skeleton_combined_table_index_renamed.tsv',
 'Skeleton_feature_table.qza',
 'Skeleton_neutral_feature_table.qza',
 'Skeleton_neutral_table.biom',
 'Skeleton_neutral_table_index_renamed.tsv',
 'Tissue_alpha_diversity.csv',
 'Tissue_combined_table.biom',
 'Tissue_combined_table_index_renamed.tsv',
 'Tissue_feature_table.qza',
 'Tissue_neutral_feature_table.qza',
 'Tissue_neutral_table.biom',
 'Tissue_neutral_table_index_renamed.tsv']

## Merge feature tables across compartments and categories

Here we merge feature tables across compartments and categories. Our goal is to return a combined metadata file to allow us to compare alpha diversity and other parameters across categories. 

In [None]:
compartments = ["Mucus", "Tissue", "Skeleton"]
categories = ["neutral", "above", "below"]
merged_table = None
for category in categories:
    for compartment in compartments:
        print(f"my compartment is {compartment} and my category is {category}")
        input_table = f"../output/{compartment}_{category}_feature_table.qza"
        compartment_table = Artifact.load(input_table)
        
        
        if merged_table is None:
            merged_table = compartment_table
            continue 
        
        #merge tables using sum method
        merged_result = merge(
        tables = [compartment_table, merged_table],
        overlap_method='sum')
        
        print(f"Merged table results {merged_result}:")
        
        merged_table = merged_result.merged_table
        
        print(merged_result.merged_table)
        
merged_table.save("../output/all_compartment_category_feature_table.qza")
 

## Summarize all compartment feature table

Visualize the table from last step. 

In [12]:
# Summarize the feature table
print("Summarizing the feature table...")
summary_result, = feature_table_actions.summarize(
    table= merged_table
)

print(summary_result)

summary_result.save("../output/all_compartment_category_feature_table_summary.qzv")

Summarizing the feature table...


  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):


<visualization: Visualization uuid: a4deeec3-d9c9-4637-bfab-97b5d17a470a>


'../output/all_compartment_category_feature_table_summary.qzv'

## Create a combined metadata file across compartments and categories

Here we load our metadata for each compartment and category in order to combine them into a single QIIME2 metadata file. 

In [6]:
# Load the data

print("About to load the feature table")
feature_table = Artifact.load("../input/metadata_files/")
print("Done")

# Load the metadata using loop

compartments = ["Mucus", "Tissue", "Skeleton"]
categories = ["neutral", "above", "below"]
merged_metadata = None


for category in categories:
    for compartment in compartments:
        print(f"my compartment is {compartment} and my category is {category}")
        
        # Convert CSV to TSV
        input_csv = f"../input/{compartment}_{category}_metadata.csv"
        input_tsv = f"../input/{compartment}_{category}_metadata.tsv"
        
        df = pd.read_csv(input_csv)
        df.to_csv(input_tsv, sep='\t', index=False)
        
        compartment_metadata = Metadata.load(input_tsv)
        
        if merged_metadata is None:
            merged_metadata = compartment_metadata
            continue
        
        # Merge metadata
        merged_result = merge(
            tables=[compartment_metadata, merged_metadata])
        
        print(f"Merged metadata results {merged_result}:")
        merged_metadata = merged_result.merged_table

print("Done loading metadata")

# Convert to CSV
metadata_df = merged_metadata.to_dataframe()
metadata_df.to_csv("../output/merged_metadata.csv")
print("Metadata saved as CSV")

metadata = metadata_df


# Summarize the feature table
print("Summarizing the feature table...")
summary_result = feature_table.actions.summarize(
    table=feature_table,
    sample_metadata=sample_metadata
)


# alpha/beta diversity analysis

# Calculate alpha diversity
alpha_result = alpha(table=feature_table, metric="observed_features")
observed_features = alpha_result.alpha_diversity

alpha_result = alpha(table=feature_table, metric="gini_index")
gini_index = alpha_result.alpha_diversity


About to load the feature table


ValueError: Archive does not contain a correctly formatted VERSION file.

In [10]:
compartments = ["Mucus", "Tissue", "Skeleton"]
for compartment in compartments:
    print(compartment)
    input_csv = f"../input/{compartment}_combinded_table.csv"
    print(input_csv)
    combined_table = pd.read_csv(input_csv)
    combined_table = combined_table.set_index("id")
    combined_table = combined_table.rename_axis("#OTU ID")
    output_csv = f"../output/{compartment}_combined_table_index_renamed.tsv"
    combined_table.to_csv(output_csv, sep = '\t' )
    print(output_csv)
    output_biom = f"../output/{compartment}_combined_table.biom"
    !biom convert --input-fp $output_csv -o $output_biom --table-type='OTU table' --to-json
    
    # Turn BIOM file into QIIME 2 artifact (qza)
    print(f"# Turn BIOM file into QIIME 2 artifact (qza) for {compartment}")
    output_qza = f"../output/{compartment}_feature_table.qza"
    !qiime tools import \
      --input-path {output_biom} \
      --type 'FeatureTable[Frequency]' \
      --input-format BIOMV100Format \
      --output-path {output_qza}
    
    # Validate the QIIME 2 artifact
    print(f"Validating QIIME 2 artifact: {output_qza}")
    !qiime tools validate {output_qza}

# Define the alpha diversity calculation function
def calc_alpha_diversity(feature_table_path, sample_type, metrics=["observed_features", "gini_index"]):
    """
    Calculate and return alpha diversity metrics for a feature table.
    
    Parameters:
    - feature_table_path: str, path to the .qza file
    - sample_type: str, name of the sample (e.g., 'tissue', 'skeleton', 'mucus')
    - metrics: list, alpha diversity metrics to calculate
    
    Returns:
    - DataFrame with sample IDs as index and metrics + sample type as columns
    """
    # Load feature table
    feature_table = Artifact.load(feature_table_path)
    # Filter by abundance and prevalence
    filtered_result = filter_features_conditionally(
        table=feature_table, abundance=0.01, prevalence=1/50)
    
    filtered_table = filtered_result.filtered_table
    print(f"{sample_type} - Done: abundance/prevalence filtering")
    
    # Further filter by min frequency and abundance
    filtered_result = filter_features(
        table=filtered_table, min_frequency=100, min_samples=2, filter_empty_samples=True)
    
    filtered_table = filtered_result.filtered_table
    print(f"{sample_type} - Done: frequency/sample filtering")
   # Collect alpha diversity results
    alpha_results = {}
    for metric in metrics:
        alpha_result = alpha(table=filtered_table, metric=metric)
        alpha_series = alpha_result.alpha_diversity.view(pd.Series)
        alpha_results[metric] = alpha_series
        print(f"{sample_type} - Done: {metric}")
    # Combine into single DataFrame
    df = pd.DataFrame(alpha_results)
    df["sample_type"] = sample_type
    return df

# Alpha diversity calculation for all compartments
print("Calculating alpha diversity metrics for all compartments...")

# Define metrics to use
metrics = ["observed_features", "gini_index"]
all_alpha_results = []

# Loop through all compartments for alpha diversity calculation
for compartment in compartments:
    print(f"\nProcessing alpha diversity for {compartment}...")
    feature_table_qza = f"../output/{compartment}_feature_table.qza"
    
    # Calculate alpha diversity using the function
    alpha_df = calc_alpha_diversity(
        feature_table_path=feature_table_qza,
        sample_type=compartment.lower(),
        metrics=metrics
    )
    
    # Add to the collection
    all_alpha_results.append(alpha_df)
    
    # Save individual compartment results
    output_csv = f"../output/{compartment}_alpha_diversity.csv"
    alpha_df.to_csv(output_csv)
    print(f"Alpha diversity results saved to {output_csv}")


print("All processing complete!")



Mucus
../input/Mucus_combinded_table.csv
../output/Mucus_combined_table_index_renamed.tsv
# Turn BIOM file into QIIME 2 artifact (qza) for Mucus
[32mImported ../output/Mucus_combined_table.biom as BIOMV100Format to ../output/Mucus_feature_table.qza[0m
[0mValidating QIIME 2 artifact: ../output/Mucus_feature_table.qza
[32mResult ../output/Mucus_feature_table.qza appears to be valid at level=max.[0m
[0mTissue
../input/Tissue_combinded_table.csv
../output/Tissue_combined_table_index_renamed.tsv
# Turn BIOM file into QIIME 2 artifact (qza) for Tissue
[32mImported ../output/Tissue_combined_table.biom as BIOMV100Format to ../output/Tissue_feature_table.qza[0m
[0mValidating QIIME 2 artifact: ../output/Tissue_feature_table.qza
[32mResult ../output/Tissue_feature_table.qza appears to be valid at level=max.[0m
[0mSkeleton
../input/Skeleton_combinded_table.csv
../output/Skeleton_combined_table_index_renamed.tsv
# Turn BIOM file into QIIME 2 artifact (qza) for Skeleton
[32mImported ../

  warn(f"{func.__name__} is deprecated as of {ver}.")
  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


mucus - Done: observed_features


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


mucus - Done: gini_index
Alpha diversity results saved to ../output/Mucus_alpha_diversity.csv

Processing alpha diversity for Tissue...
tissue - Done: abundance/prevalence filtering
tissue - Done: frequency/sample filtering


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


tissue - Done: observed_features


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


tissue - Done: gini_index
Alpha diversity results saved to ../output/Tissue_alpha_diversity.csv

Processing alpha diversity for Skeleton...
skeleton - Done: abundance/prevalence filtering
skeleton - Done: frequency/sample filtering


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


skeleton - Done: observed_features
skeleton - Done: gini_index
Alpha diversity results saved to ../output/Skeleton_alpha_diversity.csv
All processing complete!


  df[cols] = df[cols].apply(pd.to_numeric, errors='ignore')


In [9]:
from os import listdir
listdir("../output")

['all_compartment_category_feature_table.qza',
 'all_compartment_category_feature_table_summary.qzv',
 'Mucus_above_feature_table.qza',
 'Mucus_above_table.biom',
 'Mucus_above_table_index_renamed.tsv',
 'Mucus_alpha_diversity.csv',
 'Mucus_below_feature_table.qza',
 'Mucus_below_table.biom',
 'Mucus_below_table_index_renamed.tsv',
 'Mucus_combined_table.biom',
 'mucus_combined_table.qza',
 'Mucus_combined_table_index_renamed.tsv',
 'Mucus_feature_table.qza',
 'Mucus_neutral_feature_table.qza',
 'Mucus_neutral_table.biom',
 'Mucus_neutral_table_index_renamed.tsv',
 'Skeleton_above_feature_table.qza',
 'Skeleton_above_table.biom',
 'Skeleton_above_table_index_renamed.tsv',
 'Skeleton_alpha_diversity.csv',
 'Skeleton_below_feature_table.qza',
 'Skeleton_below_table.biom',
 'Skeleton_below_table_index_renamed.tsv',
 'Skeleton_combined_table.biom',
 'Skeleton_combined_table_index_renamed.tsv',
 'Skeleton_feature_table.qza',
 'Skeleton_neutral_feature_table.qza',
 'Skeleton_neutral_table.bi

In [8]:
from os import listdir
listdir("../input")

['Mucus_above_mapping.csv',
 'Mucus_above_table.csv',
 'Mucus_above_table.tsv',
 'Mucus_above_taxonomy.tsv',
 'Mucus_below_mapping.csv',
 'Mucus_below_table.csv',
 'Mucus_below_table.tsv',
 'Mucus_below_taxonomy.tsv',
 'Mucus_combinded_table.csv',
 'Mucus_neutral_table.csv',
 'Mucus_neutral_table.tsv',
 'Mucus_neutral_taxonomy.tsv',
 'Mucus_qiime_neutral_mapping.csv',
 'Mucus_qiime_neutral_taxonomy.csv',
 'Skeleton_above_mapping.csv',
 'Skeleton_above_table.csv',
 'Skeleton_above_table.tsv',
 'Skeleton_above_taxonomy.tsv',
 'Skeleton_below_mapping.csv',
 'Skeleton_below_table.csv',
 'Skeleton_below_table.tsv',
 'Skeleton_below_taxonomy.tsv',
 'Skeleton_combinded_table.csv',
 'Skeleton_neutral_table.csv',
 'Skeleton_neutral_table.tsv',
 'Skeleton_neutral_taxonomy.tsv',
 'Skeleton_qiime_neutral_mapping.csv',
 'Skeleton_qiime_neutral_taxonomy.csv',
 'Tissue_above_mapping.csv',
 'Tissue_above_table.csv',
 'Tissue_above_table.tsv',
 'Tissue_above_taxonomy.tsv',
 'Tissue_below_mapping.csv',
