In [1]:
import json
import os
import sys
import inspect

from src.data import make_dataset
from src.features import build_features
from src.features import metrics_analysis

In [2]:
## Creating paths to store temp and out data ##
if not os.path.exists("data/temp"):
    os.makedirs("data/temp")
if not os.path.exists("data/out"):
    os.makedirs("data/out")

In [3]:
from qiime2.plugins import feature_table
from qiime2 import Artifact
from qiime2.plugins.sample_classifier.pipelines import classify_samples
from qiime2.plugins.feature_table.methods import filter_samples
from qiime2 import Metadata
from qiime2.plugins.diversity.pipelines import core_metrics
from qiime2.plugins.diversity.pipelines import core_metrics_phylogenetic
from qiime2.plugins.feature_table.visualizers import summarize
from qiime2.plugins.diversity.methods import umap


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import biom
import skbio

import seaborn as sns
# %matplotlib inline 

## Loading Data

In [4]:
## Obtaining file paths
with open("config/data-params.json") as fh:
    file_paths = json.load(fh)

In [6]:
table = make_dataset.read_feature_table(file_paths["feature_table_path"])
metadata = make_dataset.read_metadata(file_paths["metadata_path"])
tree = make_dataset.read_tree_table(file_paths["tree_path"])

  metadata = pd.read_csv(path, sep='\t', index_col=0)


In [7]:
biom_table = table.view(biom.Table)
print(biom_table.head())

# Constructed from biom file
#OTU ID	11666.BLANK7.7B	11666.BLANK5.5B	11666.G0341A	11666.BLANK3.3A	11666.BLANK5.5E
AACATAAGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGCCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGTCCGGAAACACTGGGCGTAAAGGGCGCGTAGGCGGTCTGTTAAGTCGGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCATAAAGGGCGCGTAGGTGGTTTGTTAAGTCAGATGTGAAATGTAGGGGCTCAACCCCTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGAGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0


## Load Metadata

In [11]:
qiime_metadata = Metadata.load("data/temp/final_metadata.tsv") #Cleaned metadata
qiime_metadata.to_dataframe()

Unnamed: 0_level_0,abdominal_obesity_ncep_v2,ckd_v2,diabetes2_v2,hypertension2_v2,precvd_v2,elevated_bp_selfmeds_v2,dyslipidemia_v2
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11666.G0001A,1.0,0.0,0.0,1.0,0.0,1.0,0.0
11666.G0002A,1.0,0.0,1.0,1.0,1.0,1.0,0.0
11666.G0003A,1.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0004A,0.0,1.0,0.0,1.0,0.0,1.0,0.0
11666.G0005A,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...
11666.G1777A,0.0,1.0,0.0,1.0,0.0,1.0,1.0
11666.G1778A,1.0,1.0,0.0,1.0,1.0,1.0,0.0
11666.G1779A,1.0,0.0,1.0,1.0,0.0,1.0,0.0
11666.G1780A,1.0,1.0,1.0,1.0,0.0,1.0,1.0


In [12]:
#Filter feature table based on cleaned metadata
updated_feature_table = filter_samples(table, metadata = qiime_metadata).filtered_table
biom_table = updated_feature_table.view(biom.Table)
biom_table #around 4000 rows was removed

57241 x 1750 <class 'biom.table.Table'> with 343411 nonzero entries (0% dense)

## Feature Table Exploration

### The analyses below does NOT incorporate a phylogenetic tree for now

In [14]:
summary = summarize(updated_feature_table, qiime_metadata)
summary.visualization.save('data/out/ft_summary')

'data/out/ft_summary.qzv'

From the summary visualizations and statistics, we see that most features only appear in less than 3 samples, therefore we are going to drop the features that appear less than 3 times in order to reduce noise.

In [15]:
feat_table = updated_feature_table.view(pd.DataFrame)
feat_table.shape

(1750, 57241)

In [16]:
#Drop the feature columns that appear in less than 3 samples
feat_table_3 = feat_table[feat_table.columns[((feat_table > 0).sum() > 3)]]

#Import DataFrame back into FeatureTable artifact and export the summary
cleaned_feature_table = Artifact.import_data("FeatureTable[Frequency]", feat_table_3)

summary_cleaned = summarize(cleaned_feature_table, qiime_metadata)
summary_cleaned.visualization.save('data/out/ft_summary_3')

'data/out/ft_summary_3.qzv'

In [17]:
feat_table_3.shape


(1750, 3988)

In [18]:
#Drop the feature columns that appear in less than 10 samples
feat_table_10 = feat_table[feat_table.columns[((feat_table > 0).sum() > 10)]]

#Import DataFrame back into FeatureTable artifact and export the summary
cleaned_feature_table = Artifact.import_data("FeatureTable[Frequency]", feat_table_10)

summary_cleaned = summarize(cleaned_feature_table, qiime_metadata)
summary_cleaned.visualization.save('data/out/ft_summary_10')

'data/out/ft_summary_10.qzv'

In [19]:
feat_table_10.shape


(1750, 2003)

# Feature Table Metrics Analysis

First figure out the feature table rarefication, save the plots generated by Qiime2 core_metrics, then create the following:
1. Distance matrices: Unifrac distance matrix, Jaccard distance matrix, Bray Curtis distance matrix
2. PCOA plots with the different distance matrices
    - save the plots for visualization
    
    
3. UMAP plots with the different distance matrices
    - save the plots for visualization
    
    
4. Finally follow up with a statistical test or regression
    - Ex) PERMANOVA test on PCOA results
    - Ex) Use the reduced dimension embeddings to feed into the regression model

With the information we gained from the summary, we decided to rarefy the table with a sampling depth of _7930_ we retained _12,489,750 (52.48%) features in 1575 (90.00%) samples_ at the specifed sampling depth. We made this decision to maximize the amount of features while preserving the amount of samples in our data.

In [38]:
# Create the feature table metrics object

feat_table = cleaned_feature_table
depth = 7930
metadata = qiime_metadata
feature_table_metrics = metrics_analysis.extract_core_metrics(feat_table, depth, metadata)


  warn(
  warn(


In [39]:
# # Create the feature table metrics object with tree
# feat_table = cleaned_feature_table
# depth = 7930
# metadata = qiime_metadata
# # feature_table_metrics = core_metrics_phylogenetic(feat_table, tree, depth, metadata)



In [40]:
# from qiime2.plugins.diversity_lib.methods import unweighted_unifrac
# '''
# Parameters
# ----------
# table : FeatureTable[Frequency | RelativeFrequency | PresenceAbsence]
#     The feature table containing the samples for which Unweighted Unifrac
#     should be computed.
# phylogeny : Phylogeny[Rooted]
#     Phylogenetic tree containing tip identifiers that correspond to the
#     feature identifiers in the table. This tree can contain tip ids that
#     are not present in the table, but all feature ids in the table must be
#     present in this tree.

# '''
# u_unifrac = unweighted_unifrac(feat_table, tree)
# u_unifrac

In [41]:
# from qiime2.plugins.phylogeny.methods import filter_table

# filtered_table_ = filter_table(feat_table, tree)
# filtered_table_.filtered_table.view(pd.DataFrame)

In [42]:
#Exploration of the dataset
# Print rarefied table
rarefied_table = feature_table_metrics.rarefied_table
rarefied_df = rarefied_table.view(pd.DataFrame)
rarefied_df.shape #Rarefied table dropped some samples that are less than the n=7930 sampling depth

(1537, 2003)

In [45]:
#Calculate Distance matrices
distance_matrices = metrics_analysis.extract_distance_matrices(feature_table_metrics)
distance_matrices

AttributeError: 'Results' object has no attribute 'unweighted_unifrac_distance_matrix'

In [None]:
#Calculate the PCOA matrices
pcoa_matrices = metrics_analysis.extract_pcoa_results(feature_table_metrics)
pcoa_matrices

In [None]:
#Calculate the Emperor visualization and output
pcoa_emperor_plots = metrics_analysis.extract_pcoa_emperor_vis(feature_table_metrics)
pcoa_emperor_plots

In [33]:
rarefied_table

<artifact: FeatureTable[Frequency] uuid: b84b901e-d02b-43c6-a6ea-5e05b363a478>

In [48]:
feature_table_metrics.jaccard_pcoa_results

'jc_pcoa_matrix.qza'

In [53]:
def save_pcoa_outputs(metrics):
    jac_pcoa = metrics.jaccard_pcoa_results
    bc_pcoa = metrics.bray_curtis_pcoa_results
    
    jaccard_emperor = metrics.jaccard_emperor
    bray_curtis_emperor = metrics.bray_curtis_emperor
    
    jac_pcoa.save('data/out/jac_pcoa_matrix')
    bc_pcoa.save('data/out/bc_pcoa_matrix')
    
    jaccard_emperor.save('data/out/jac_pcoa_emp')
    bray_curtis_emperor.save('data/out/bc_pcoa_emp')
    
    return

save_pcoa_outputs(feature_table_metrics)