In [1]:
import json
import os
from src.data import make_dataset
from src.features import build_features


In [2]:
## Creating paths to store temp and out data ##
if not os.path.exists("data/temp"):
    os.makedirs("data/temp")
if not os.path.exists("data/out"):
    os.makedirs("data/out")

In [3]:
from qiime2.plugins import feature_table
from qiime2 import Artifact
from qiime2.plugins.sample_classifier.pipelines import classify_samples
from qiime2.plugins.feature_table.methods import filter_samples
from qiime2 import Metadata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import biom
import seaborn as sns
# %matplotlib inline 

## Loading Data

In [4]:
## Obtaining file paths
with open("config/data-params.json") as fh:
    file_paths = json.load(fh)

In [5]:
table = make_dataset.read_feature_table(file_paths["feature_table_path"])
metadata = make_dataset.read_metadata(file_paths["metadata_path"])

  metadata = pd.read_csv(path, sep='\t', index_col=0)


In [7]:
biom_table = table.view(biom.Table)
print(biom_table.head())

# Constructed from biom file
#OTU ID	11666.BLANK7.7B	11666.BLANK5.5B	11666.G0341A	11666.BLANK3.3A	11666.BLANK5.5E
AACATAAGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGCCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGTCCGGAAACACTGGGCGTAAAGGGCGCGTAGGCGGTCTGTTAAGTCGGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCATAAAGGGCGCGTAGGTGGTTTGTTAAGTCAGATGTGAAATGTAGGGGCTCAACCCCTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGAGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0


## Load Metadata

In [8]:
qiime_metadata = Metadata.load("data/temp/final_metadata.tsv")
qiime_metadata

Metadata
--------
1644 IDs x 14 columns
abdominal_obesity_ncep_v2: ColumnProperties(type='numeric', missing_scheme='blank')
ckd_v2:                    ColumnProperties(type='categorical', missing_scheme='blank')
diabetes2_v2:              ColumnProperties(type='numeric', missing_scheme='blank')
hypertension2_v2:          ColumnProperties(type='numeric', missing_scheme='blank')
precvd_v2:                 ColumnProperties(type='numeric', missing_scheme='blank')
elevated_bp_selfmeds_v2:   ColumnProperties(type='numeric', missing_scheme='blank')
dyslipidemia_v2:           ColumnProperties(type='numeric', missing_scheme='blank')
gender_v2:                 ColumnProperties(type='categorical', missing_scheme='blank')
agegroup_c6_v2:            ColumnProperties(type='numeric', missing_scheme='blank')
bmigrp_c6_v2:              ColumnProperties(type='numeric', missing_scheme='blank')
center:                    ColumnProperties(type='categorical', missing_scheme='blank')
us_born_v2:             

In [9]:
#Filter feature table based on cleaned metadata
updated_feature_table = filter_samples(table, metadata = qiime_metadata).filtered_table
biom_table = updated_feature_table.view(biom.Table)
biom_table #around 4000 rows was removed

54448 x 1644 <class 'biom.table.Table'> with 323178 nonzero entries (0% dense)

## Feature Table Exploration

### The analyses below does NOT incorporate a phylogenetic tree for now

In [10]:
from qiime2.plugins.diversity.pipelines import core_metrics
from qiime2.plugins.feature_table.visualizers import summarize
from qiime2.plugins.diversity.methods import umap

In [11]:
summary = summarize(updated_feature_table, qiime_metadata)
#summary.visualization.save('ft_summary')

From the summary visualizations and statistics, we see that most features only appear in less than 3 samples, therefore we are going to drop the features that appear less than 3 times in order to reduce noise.

In [12]:
feat_table = updated_feature_table.view(pd.DataFrame)
feat_table.shape

(1644, 54448)

In [13]:
#Drop the feature columns that appear in less than 3 samples
feat_table = feat_table[feat_table.columns[((feat_table > 0).sum() > 3)]]
feat_table.shape

(1644, 3873)

In [14]:
# DO more cleaning here, not sure what to do yet

In [15]:
#Import DataFrame back into FeatureTable artifact and export the summary
cleaned_feature_table = Artifact.import_data("FeatureTable[Frequency]", feat_table)

summary_cleaned = summarize(cleaned_feature_table, qiime_metadata)
# summary_cleaned.visualization.save('ft_cleaned_summary')

In [16]:
metrics = core_metrics(cleaned_feature_table, sampling_depth = 10000, metadata = qiime_metadata)
metrics

  warn(
  warn(


Results (name = value)
---------------------------------------------------------------------------------------------------------------
rarefied_table              = <artifact: FeatureTable[Frequency] uuid: e6dcdcbd-e5f0-4d84-81c7-01d9cfa41347>
observed_features_vector    = <artifact: SampleData[AlphaDiversity] uuid: 343fc43d-e070-4db7-a4c5-55cfca9b185e>
shannon_vector              = <artifact: SampleData[AlphaDiversity] uuid: 47924f43-94ae-4e59-bd88-fc3fdf8963eb>
evenness_vector             = <artifact: SampleData[AlphaDiversity] uuid: d9f8e42c-952f-4520-ad10-c8b1d605f7c4>
jaccard_distance_matrix     = <artifact: DistanceMatrix uuid: 301aa8cc-097c-4192-a5b1-878e279518bb>
bray_curtis_distance_matrix = <artifact: DistanceMatrix uuid: 1091ba90-fa1d-4dce-9220-fcc871769714>
jaccard_pcoa_results        = <artifact: PCoAResults uuid: f0ee29ad-1ede-4b22-b4f3-e71814dab7e4>
bray_curtis_pcoa_results    = <artifact: PCoAResults uuid: 45702490-2199-44cf-841a-be1bf8c27b16>
jaccard_emperor           

## To implement
First figure out the feature table rarefication, save the plots generated by Qiime2 core_metrics, then create the following:
1. Distance matrices: Unifrac distance matrix, Jaccard distance matrix, Bray Curtis distance matrix
2. PCOA plots with the different distance matrices
    - save the plots for visualization
    
    
3. UMAP plots with the different distance matrices
    - save the plots for visualization
    
    
4. Finally follow up with a statistical test or regression
    - Ex) PERMANOVA test on PCOA results
    - Ex) Use the reduced dimension embeddings to feed into the regression model

In [20]:
#Jaccard Distance Matrix

In [21]:
#Bray-Curtis Distance Matrix

In [17]:
metrics.rarefied_table.view(pd.DataFrame)

Unnamed: 0,AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGCGGTCTGTTAAGTCGGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCCGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG,AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGCGGTCTGTTAAGTCGGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG,AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGCGGTTTGTTAAGTCGGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG,AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG,AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGTGTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG,AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGTTTGTTAAGTCAGATGTGAAATGTAGGGGCTCAACCCCTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTAGGTGTAAAGGGAGCGCAGGCGGATTGGCAAGTTGGGAGTGAAATCTATGGGCTCAACCCATAAATTGCTTTCAAAACTGTCAGTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGCGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACTATGGGCTCAACCCATAAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACCACGGGCTCAACCCGTGAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACCATAGGCTCAACCTATGGATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,...,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGACGGAACTCCCCGT,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGACGGAACTCCGCGG,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGACGGAACTCCGCGT,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGACGGGAATCCCCGG,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGACGGGACTCCCCGG,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGACGGGACTCCGCGG,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGACGGGACTCCGCGT,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGGCGGGAATCCCCGG,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGGCGGGACTCCGCGG,TCCGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTCCGCGTGTTGTGAAATGTCGCCGCTCACCGTCTGCCCTGCCGCGCGCCCTGGTTTCCTTGCGTCCGCCCCCCGTGGGCGGCCTTCGTGG
11666.G0393A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0149A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0091A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,143.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0377A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0363A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11666.G1518B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1501A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1685A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1536A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# metrics.jaccard_emperor.save('ft_jacc_emperor')

In [19]:
#reducer = umap.UMAP(n_components = 2, n_neighbors = 15, metric = 'jaccard', random_state = 0)
