In [None]:
import json
import os
import sys
import inspect

from src.data import make_dataset
from src.features import build_features
from src.features import metrics_analysis

In [None]:
## Creating paths to store temp and out data ##
# exist_ok=True makes each call idempotent (safe on re-run) and removes the
# check-then-create race of os.path.exists() followed by os.makedirs().
# If a path exists but is a regular file, FileExistsError is raised here instead of
# failing later when a cell tries to write into it.
os.makedirs("data/temp", exist_ok=True)
os.makedirs("data/out", exist_ok=True)

In [None]:
from qiime2.plugins import feature_table
from qiime2 import Artifact
from qiime2.plugins.sample_classifier.pipelines import classify_samples
from qiime2.plugins.feature_table.methods import filter_samples
from qiime2 import Metadata
from qiime2.plugins.diversity.pipelines import core_metrics
from qiime2.plugins.diversity.pipelines import core_metrics_phylogenetic
from qiime2.plugins.feature_table.visualizers import summarize
from qiime2.plugins.diversity.methods import umap


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import biom
import skbio

import seaborn as sns
# %matplotlib inline 

## Loading Data

In [None]:
## Obtaining file paths
# JSON is UTF-8 by specification; without an explicit encoding open() falls back to the
# locale default, which can mis-decode non-ASCII characters in paths on some platforms.
with open("config/data-params.json", encoding="utf-8") as fh:
    file_paths = json.load(fh)

In [None]:
# Resolve the input locations from the config, then load both inputs through the project readers
feature_table_path = file_paths["feature_table_path"]
metadata_path = file_paths["metadata_path"]

table = make_dataset.read_feature_table(feature_table_path)
metadata = make_dataset.read_metadata(metadata_path)

In [None]:
# View the feature-table artifact as a biom.Table and print the head of it as a quick sanity check
biom_table = table.view(biom.Table)
biom_preview = biom_table.head()
print(biom_preview)

### Loading the Phylogeny Tree

In [None]:
# Disabled on purpose: this builds the Phylogeny[Rooted] artifact from the raw tree file.
# Reading the tree with skbio takes very long and times out (see the note on the read
# line), so the cell that follows loads an already-saved artifact from disk instead.
# tree_path = file_paths["tree_path"]

# tree = skbio.TreeNode.read(tree_path) #READING TREE TAKES VERY LONG, TIMING OUT
# tree_artifact = Artifact.import_data('Phylogeny[Rooted]', tree)
# tree_artifact

In [None]:
# tree_artifact.save('tree_file')
# Bind the loaded artifact to a name so that later cells can use the tree; a bare
# Artifact.load(...) expression only displays the result and then drops the reference.
tree_artifact = Artifact.load('data/raw/tree_file.qza')
tree_artifact

## Load Metadata

In [None]:
# Load the cleaned sample metadata as a QIIME 2 Metadata object and display it.
# NOTE(review): no visible cell in this notebook writes this file — presumably it is
# produced by a separate cleaning step; confirm it exists before a fresh "Run All".
final_metadata_path = "data/temp/final_metadata.tsv"
qiime_metadata = Metadata.load(final_metadata_path)
qiime_metadata

In [None]:
# Restrict the feature table to the samples that are present in the cleaned metadata
filter_result = filter_samples(table, metadata=qiime_metadata)
updated_feature_table = filter_result.filtered_table

# Re-create the biom view from the filtered artifact and display it
biom_table = updated_feature_table.view(biom.Table)
biom_table  # around 4000 rows were removed

## Feature Table Exploration

### The analyses below do NOT incorporate a phylogenetic tree for now

In [None]:
# Build the QIIME 2 summary of the filtered feature table and save its visualization to data/out
ft_summary_path = 'data/out/ft_summary'
summary = summarize(updated_feature_table, qiime_metadata)
summary.visualization.save(ft_summary_path)

From the summary visualizations and statistics, we see that most features appear in fewer than 3 samples; therefore, we are going to drop the features that appear in fewer than 3 samples in order to reduce noise.

With the information we gained from the summary, we decided to rarefy the table with a sampling depth of _7930_. At the specified sampling depth we retained _12,489,750 (52.48%) features in 1575 (90.00%) samples_. We made this decision to maximize the number of features while preserving the number of samples in our data.

In [None]:
# View the metadata-filtered feature table artifact as a pandas DataFrame and show its shape.
# The filtering cells that follow treat the columns of this frame as features.
feat_table = updated_feature_table.view(pd.DataFrame)
feat_table.shape

In [None]:
# Drop the feature columns that appear in fewer than 3 samples, i.e. keep the features
# observed (count > 0) in at least 3 samples. The comparison is `>= 3` (not `> 3`) so
# that features present in exactly 3 samples are kept, as the stated threshold requires.
samples_per_feature = (feat_table > 0).sum()
feat_table_3 = feat_table.loc[:, samples_per_feature >= 3]
print(feat_table_3.shape)  # printed explicitly: a bare expression mid-cell is never displayed

# Import the DataFrame back into a FeatureTable artifact and export the summary.
# NOTE: `cleaned_feature_table` and `summary_cleaned` are re-bound by the 10-sample
# filtering cell that follows, so on a top-to-bottom run later cells see that table.
cleaned_feature_table = Artifact.import_data("FeatureTable[Frequency]", feat_table_3)

summary_cleaned = summarize(cleaned_feature_table, qiime_metadata)
summary_cleaned.visualization.save('data/out/ft_summary_3')

In [None]:
# Drop the feature columns that appear in fewer than 10 samples, i.e. keep the features
# observed (count > 0) in at least 10 samples. The comparison is `>= 10` (not `> 10`) so
# that features present in exactly 10 samples are kept, as the stated threshold requires.
samples_per_feature = (feat_table > 0).sum()
feat_table_10 = feat_table.loc[:, samples_per_feature >= 10]
print(feat_table_10.shape)  # printed explicitly: a bare expression mid-cell is never displayed

# Import the DataFrame back into a FeatureTable artifact and export the summary.
# NOTE: this re-binds `cleaned_feature_table` / `summary_cleaned` from the 3-sample
# filtering cell, so the metrics section consumes this 10-sample table on "Run All".
cleaned_feature_table = Artifact.import_data("FeatureTable[Frequency]", feat_table_10)

summary_cleaned = summarize(cleaned_feature_table, qiime_metadata)
summary_cleaned.visualization.save('data/out/ft_summary_10')

# Feature Table Metrics Analysis

First, determine the feature table rarefaction depth and save the plots generated by QIIME 2 core_metrics, then create the following:
1. Distance matrices: UniFrac distance matrix, Jaccard distance matrix, Bray-Curtis distance matrix
2. PCoA plots with the different distance matrices
    - save the plots for visualization
    
    
3. UMAP plots with the different distance matrices
    - save the plots for visualization
    
    
4. Finally follow up with a statistical test or regression
    - Ex) PERMANOVA test on PCOA results
    - Ex) Use the reduced dimension embeddings to feed into the regression model

In [None]:
# Create the feature table metrics object.
# The artifact is passed directly instead of being aliased as `feat_table`: that name
# holds the pandas DataFrame used by the feature-filtering cells, and re-binding it to
# an Artifact here would break those cells when they are re-run after this one.
depth = 7930  # rarefaction sampling depth chosen from the feature-table summary
metadata = qiime_metadata

feature_table_metrics = metrics_analysis.extract_core_metrics(cleaned_feature_table, depth, metadata)

In [None]:
#Exploration of the dataset
# Keep a handle on the rarefied table from the core-metrics results (nothing is printed here)
rarefied_table = feature_table_metrics.rarefied_table

In [None]:
# Pull the distance matrices out of the core-metrics results via the project helper
# and display them as the cell output.
distance_matrices = metrics_analysis.extract_distance_matrices(feature_table_metrics)
distance_matrices

In [None]:
# Pull the PCoA results out of the core-metrics results via the project helper
# and display them as the cell output.
pcoa_matrices = metrics_analysis.extract_pcoa_results(feature_table_metrics)
pcoa_matrices

In [None]:
# Pull the Emperor PCoA visualizations out of the core-metrics results via the project
# helper and display them as the cell output.
# NOTE(review): whether the helper also writes the plots to disk is not visible here — confirm.
pcoa_emperor_plots = metrics_analysis.extract_pcoa_emperor_vis(feature_table_metrics)
pcoa_emperor_plots

In [None]:
# View the rarefied table as a pandas DataFrame and show its shape.
# Rarefaction dropped the samples whose depth is below the n=7930 sampling depth.
rarefied_df = rarefied_table.view(pd.DataFrame)
rarefied_df.shape

In [None]:
# Re-display the distance matrices extracted earlier in this section
distance_matrices