## Importing Packages and Source Code

In [1]:
import json
import os
import sys
## biom used to view Qiime Artifacts
import biom
%matplotlib inline 

In [2]:
## Import Source Code
from src.data import make_dataset
from src.features import build_features, metrics_analysis
from src.models import make_models, evaluate_models
from src.visualizations import make_visualizations

In [3]:
## Creating paths to store temp and out data.
## exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
## race of testing os.path.exists() first; the parent "data" directory is
## created on demand by os.makedirs.
for output_dir in ("data/temp", "data/out"):
    os.makedirs(output_dir, exist_ok=True)

In [4]:
## Used to format graph
import matplotlib.pyplot as plt

## Reading Data

In [5]:
## Load the data file paths (the "feature_table_path" and "metadata_path"
## entries are read in the next cell) from the project config.
## JSON is UTF-8 by specification, so the encoding is pinned rather than left
## to the platform's locale default.
with open("config/data-params.json", encoding="utf-8") as fh:
    file_paths = json.load(fh)

In [6]:
## Load the feature table and the sample metadata from the paths listed in
## config/data-params.json, using the project's make_dataset readers.
feature_table = make_dataset.read_feature_table(file_paths["feature_table_path"])
metadata = make_dataset.read_metadata(file_paths["metadata_path"])

  metadata = pd.read_csv(path, sep='\t', index_col=0)


In [7]:
## View the feature table through the project helper and print its head;
## print() emits the tab-separated text preview shown in the cell output.
## biom_table is reused later for its .ids().
biom_table = make_dataset.feature_table_biom_view(feature_table)
print(biom_table.head())

# Constructed from biom file
#OTU ID	11666.BLANK7.7B	11666.BLANK5.5B	11666.G0341A	11666.BLANK3.3A	11666.BLANK5.5E
AACATAAGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGCCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGTCCGGAAACACTGGGCGTAAAGGGCGCGTAGGCGGTCTGTTAAGTCGGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCATAAAGGGCGCGTAGGTGGTTTGTTAAGTCAGATGTGAAATGTAGGGGCTCAACCCCTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0
AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGAGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG	0.0	0.0	0.0	0.0	0.0


In [8]:
## Preview the first rows of the raw sample metadata.
metadata.head()

Unnamed: 0_level_0,abdominal_obesity_idf_v2,abdominal_obesity_ncep_v2,ac_ratio_gt30,ac_ratio_gt30_v2,age_units,age_v2,agegroup_c6_nhanes_v2,agegroup_c6_v2,anonymized_name,antibiotic,...,taxon_id,title,type_stool,us_born_v2,weight_norm_overall_v2,weight_units,yes_no,yogurt,yrs_btwn_viv2,yrsus_c2_v2
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11666.BLANK1.1A,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,BLANK1.1A,not applicable,...,256318.0,GOLD study,not applicable,not applicable,not applicable,not applicable,,not applicable,not applicable,not applicable
11666.BLANK1.1A.ITS,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,BLANK1.1A.ITS,not applicable,...,256318.0,GOLD study,not applicable,not applicable,not applicable,not applicable,,not applicable,not applicable,not applicable
11666.BLANK1.1B,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,BLANK1.1B,not applicable,...,256318.0,GOLD study,not applicable,not applicable,not applicable,not applicable,,not applicable,not applicable,not applicable
11666.BLANK1.1B.ITS,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,BLANK1.1B.ITS,not applicable,...,256318.0,GOLD study,not applicable,not applicable,not applicable,not applicable,,not applicable,not applicable,not applicable
11666.BLANK1.1C,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,BLANK1.1C,not applicable,...,256318.0,GOLD study,not applicable,not applicable,not applicable,not applicable,,not applicable,not applicable,not applicable


# Create Features

In [9]:
## Load the feature-building parameters (e.g. the 'disease_cols' list used
## further down) from the project config.
## JSON is UTF-8 by specification, so the encoding is pinned rather than left
## to the platform's locale default.
with open("config/feature-params.json", encoding="utf-8") as fh:
    feature_params = json.load(fh)

In [10]:
## Organize the raw metadata, passing the ids from biom_table and every entry
## of feature_params as keyword arguments. The result is indexed as [0] and [1]
## in the cells that follow.
## NOTE(review): judging by the displayed outputs, [0] carries 0.0/1.0 disease
## flags and [1] carries T/F flags — confirm in build_features.organize_metadata.
organized_metadata = build_features.organize_metadata(metadata, biom_table.ids(), **feature_params)

In [11]:
## Display the first element of organized_metadata (shown with numeric
## 0.0/1.0 disease flags in the saved output).
organized_metadata[0]

Unnamed: 0_level_0,abdominal_obesity_ncep_v2,ckd_v2,diabetes2_v2,hypertension2_v2,precvd_v2,elevated_bp_selfmeds_v2,dyslipidemia_v2,hispanic_origin
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11666.G0001A,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0
11666.G0002A,1.0,0.0,1.0,1.0,1.0,1.0,0.0,3.0
11666.G0003A,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11666.G0004A,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
11666.G0005A,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...
11666.G1777A,0.0,1.0,0.0,1.0,0.0,1.0,1.0,4.0
11666.G1778A,1.0,1.0,0.0,1.0,1.0,1.0,0.0,4.0
11666.G1779A,1.0,0.0,1.0,1.0,0.0,1.0,0.0,4.0
11666.G1780A,1.0,1.0,1.0,1.0,0.0,1.0,1.0,4.0


In [12]:
## Display the second element of organized_metadata (shown with T/F disease
## flags in the saved output).
organized_metadata[1]

Unnamed: 0_level_0,abdominal_obesity_ncep_v2,ckd_v2,diabetes2_v2,hypertension2_v2,precvd_v2,elevated_bp_selfmeds_v2,dyslipidemia_v2,hispanic_origin
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11666.G0001A,T,F,F,T,F,T,F,3.0
11666.G0002A,T,F,T,T,T,T,F,3.0
11666.G0003A,T,F,F,F,F,F,F,1.0
11666.G0004A,F,T,F,T,F,T,F,1.0
11666.G0005A,T,F,F,T,F,T,F,1.0
...,...,...,...,...,...,...,...,...
11666.G1777A,F,T,F,T,F,T,T,4.0
11666.G1778A,T,T,F,T,T,T,F,4.0
11666.G1779A,T,F,T,T,F,T,F,4.0
11666.G1780A,T,T,T,T,F,T,T,4.0


# EDA

In [None]:
## Plot the 'ckd_v2' column of the T/F-encoded metadata with the project's
## create_bar_col_binary helper (presumably a T/F count bar chart — see
## src/visualizations/make_visualizations).
make_visualizations.create_bar_col_binary(organized_metadata[1], 'ckd_v2')

In [None]:
## Human-readable labels for the disease columns, used to relabel plots.
## NOTE(review): 'hypertension2_v2' appears in the organized metadata columns
## but has no label here — confirm the omission is intentional.
diseases_cols = {
    "abdominal_obesity_ncep_v2": "Obesity",
    "ckd_v2": "Chronic Kidney Disease",
    "diabetes2_v2": "Diabetes",
    "precvd_v2": "Pre-CVD",
    "elevated_bp_selfmeds_v2": "Elevated Blood Pressure",
    "dyslipidemia_v2": "Dyslipidemia",
}

In [None]:
## Plot per-disease counts from the numeric metadata, with the disease columns
## renamed to the readable labels from diseases_cols; the same labels are
## passed as the columns to plot.
make_visualizations.disease_counts_graph(organized_metadata[0].rename(columns=diseases_cols), disease_cols=diseases_cols.values())

In [None]:
## For each sample, add up the configured disease columns, then tabulate how
## many samples share each total, largest group first.
## (Assumes the disease columns hold 0/1 flags, as in the table displayed
## earlier — TODO confirm.)
(
    organized_metadata[0][feature_params['disease_cols']]
    .sum(axis=1)
    .value_counts()
    .sort_values(ascending=False)
)

In [None]:
## Plot the total-disease-count graph over the configured disease columns of
## the numeric metadata.
make_visualizations.total_disease_count_graph(organized_metadata[0],feature_params['disease_cols'])

# Model Building

In [None]:
## Load the model parameters (e.g. the 'disease_targets' list passed to the
## binary relevance model below) from the project config.
## JSON is UTF-8 by specification, so the encoding is pinned rather than left
## to the platform's locale default.
with open("config/model-params.json", encoding="utf-8") as fh:
    model_params = json.load(fh)

In [13]:
## Keep only the 'precvd_v2' column of the T/F-encoded metadata and display it.
## NOTE(review): precvd_undersample is not referenced by any later cell shown
## in this notebook, and the saved output (an 8-column Metadata summary) does
## not match this single-column selection — the output looks stale; confirm
## or remove.
precvd_undersample = organized_metadata[1][['precvd_v2']]
precvd_undersample

Metadata
--------
1747 IDs x 8 columns
abdominal_obesity_ncep_v2: ColumnProperties(type='categorical', missing_scheme='blank')
ckd_v2:                    ColumnProperties(type='categorical', missing_scheme='blank')
diabetes2_v2:              ColumnProperties(type='categorical', missing_scheme='blank')
hypertension2_v2:          ColumnProperties(type='categorical', missing_scheme='blank')
precvd_v2:                 ColumnProperties(type='categorical', missing_scheme='blank')
elevated_bp_selfmeds_v2:   ColumnProperties(type='categorical', missing_scheme='blank')
dyslipidemia_v2:           ColumnProperties(type='categorical', missing_scheme='blank')
hispanic_origin:           ColumnProperties(type='numeric', missing_scheme='blank')

Call to_dataframe() for a tabular representation.

In [None]:
## Build the balanced pre-CVD metadata from the T/F-encoded metadata.
## NOTE(review): later cells call .get_column() on this object and pass it as
## `metadata=` to QIIME 2 actions, so despite the `_df` suffix it is presumably
## a QIIME 2 Metadata object rather than a DataFrame — confirm.
balanced_precvd_df = build_features.balance_precvd(organized_metadata[1])

In [None]:
## TODO: the metadata path below is hard-coded; an upstream step should return
## the updated metadata path instead.
## Reads the T/F metadata file from data/temp as QIIME metadata and displays it.
qiime_metadata_tf = make_dataset.read_qiime_metadata("data/temp/final_metadata_tf.tsv")
qiime_metadata_tf

In [None]:
## Filter the feature table using the QIIME metadata, then display the result
## viewed as a biom.Table.
## NOTE(review): the meaning of the literal 4 is defined inside
## make_dataset.filter_feature_table — consider a named constant.
filtered_table = make_dataset.filter_feature_table(feature_table, 4, qiime_metadata_tf)
filtered_table.view(biom.Table)

In [None]:
## Fit the binary relevance model on the filtered table for the targets listed
## in model_params['disease_targets'], supplying the balanced pre-CVD metadata.
## NOTE(review): this call uses the keyword `precvd_data=` while the rarefied
## run at the end of the notebook uses `precvd_col=` for the same function —
## confirm which keyword make_models.binary_relevance_model accepts.
binary_relevance_model = make_models.binary_relevance_model(filtered_table, qiime_metadata_tf, model_params['disease_targets'],precvd_data=balanced_precvd_df)

In [None]:
## Collect accuracy scores from the fitted binary relevance model.
## NOTE(review): the model was built from model_params['disease_targets'] but
## is scored against feature_params['disease_cols'] — confirm the two lists
## name the same diseases.
disease_accuracy_scores = evaluate_models.binary_relevance_accuracy_scores(binary_relevance_model, feature_params['disease_cols'])

In [None]:
## Display the accuracy scores collected above.
disease_accuracy_scores

In [None]:
## Plot the accuracy scores with the project's visualization helper.
make_visualizations.binary_relevance_accuracy_scores_graph(disease_accuracy_scores)

In [None]:
## AUC scores per disease column, stored as [micro-average, macro-average].
## NOTE(review): these values are entered by hand; their source is not shown
## in this notebook — confirm they match the current model run.
aucs = {
    "abdominal_obesity_ncep_v2": [0.70, 0.52],
    "ckd_v2": [0.57, 0.56],
    "diabetes2_v2": [0.73, 0.61],
    "precvd_v2": [0.55, 0.56],
    "elevated_bp_selfmeds_v2": [0.63, 0.61],
    "dyslipidemia_v2": [0.67, 0.55],
}

In [None]:
## Split the [micro, macro] pairs stored in `aucs` into one dict per averaging
## scheme, keyed by disease column.
micro = {disease: pair[0] for disease, pair in aucs.items()}
macro = {disease: pair[1] for disease, pair in aucs.items()}

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
## Wrap each per-disease metric mapping in a Series named 'Percentage' so all
## three share one value column once they are stacked for plotting.
micro_average = pd.Series(micro, name='Percentage')
macro_average = pd.Series(macro, name='Percentage')
disease_accuracy_scores_series = pd.Series(disease_accuracy_scores, name='Percentage')

In [None]:
## Turn each Series into a frame (its index becomes the 'index' column) and tag
## every row with the metric it represents. A scalar passed to assign() is
## broadcast to all rows, so this works for any number of diseases instead of
## failing with a length mismatch when there are not exactly six.
## The names are rebound from Series to DataFrame here, so re-run the previous
## cell before executing this one a second time.
micro_average = micro_average.reset_index().assign(metric_type='micro-average AUC')
macro_average = macro_average.reset_index().assign(metric_type='macro-average AUC')
disease_accuracy_scores_series = disease_accuracy_scores_series.reset_index().assign(metric_type='Overall Accuracy')

In [None]:
## Stack the three metric frames into one long-format table for seaborn, name
## the disease column, and swap raw column names for readable disease labels.
performance_metrics_seaborn = (
    pd.concat([disease_accuracy_scores_series, micro_average, macro_average])
    .rename(columns={'index': 'Disease Type'})
    .assign(**{'Disease Type': lambda frame: frame['Disease Type'].replace(diseases_cols)})
)

In [None]:
import numpy as np

In [None]:
## Grouped bar chart: one group per disease, one bar per metric type.
## The seaborn theme is applied *before* the figure is created so the figure
## picks up the same rcParams on a fresh kernel as on a re-run of this cell.
sns.set(font_scale=2)
fig, ax = plt.subplots(figsize=(22, 10))
sns.barplot(data=performance_metrics_seaborn, x='Disease Type', y='Percentage', hue='metric_type', ax=ax)
## move_legend rebuilds the legend seaborn attached to the axes: a single
## untitled, frameless row centred just above the plot area. No separate
## ax.legend() call is needed because its settings would be replaced here.
sns.move_legend(
    ax, "lower center",
    bbox_to_anchor=(.5, 1), ncol=3, title=None, frameon=False,
)
## y=1.1 lifts the title above the relocated legend.
ax.set_title('Gradient Boosting Classifier Performance', y=1.1)
fig.savefig('performance_metrics_seaborn.png', dpi=300, bbox_inches='tight')

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
## Pull the test targets, predictions and class probabilities of the obesity
## model out of the binary relevance results and view them as pandas objects.
obesity_results = binary_relevance_model['abdominal_obesity_ncep_v2']
test_targets = obesity_results.test_targets.view(pd.Series)
predictions = obesity_results.predictions.view(pd.Series)
probabilities = obesity_results.probabilities.view(pd.DataFrame)

In [None]:
## ROC AUC of the obesity model, scored with the second probability column.
## NOTE(review): assumes column 1 holds the positive-class probability — confirm
## against the column order of `probabilities`.
## NOTE(review): for a single binary target scikit-learn's `average` argument
## appears to have no effect, so both calls likely yield the same number —
## confirm before reporting them as distinct micro/macro values.
positive_class_scores = probabilities.to_numpy()[:, 1]
print(roc_auc_score(test_targets, positive_class_scores, average='micro'))
roc_auc_score(test_targets, positive_class_scores)

# PERMANOVA Test

In [None]:
from qiime2 import Artifact

In [None]:
## Load the tree artifact that is passed as `phylogeny=` to the diversity
## metrics below.
## NOTE(review): this path is hard-coded, unlike the input paths read from
## config/data-params.json — consider moving it to the config.
tree_artifact = Artifact.load('data/raw/tree_file.qza')

In [None]:
## Extract the core diversity metrics for the filtered table with the project
## helper, using the QIIME metadata and the loaded tree.
## NOTE(review): 7930 is presumably the sampling (rarefaction) depth — confirm
## and consider a named constant.
metrics = metrics_analysis.extract_core_metrics(filtered_table, 7930, metadata = qiime_metadata_tf, phylogeny = tree_artifact)

In [None]:
from qiime2.plugins.diversity.pipelines import core_metrics_phylogenetic

In [None]:
from qiime2.plugins.feature_table.methods import filter_samples

In [None]:
## Filter the full feature table by the balanced pre-CVD metadata using
## QIIME 2's filter_samples, keeping the resulting filtered_table.
filtered_table_precvd = filter_samples(feature_table,metadata=balanced_precvd_df).filtered_table

In [None]:
## Run QIIME 2's core phylogenetic diversity pipeline on the pre-CVD subset.
## NOTE(review): sampling_depth=10 here differs greatly from the 7930 passed to
## extract_core_metrics earlier — confirm the low depth is intended.
precvd_metrics=core_metrics_phylogenetic(filtered_table_precvd, sampling_depth=10, metadata = balanced_precvd_df, phylogeny = tree_artifact)

In [None]:
## Run the project's PERMANOVA helper for the 'precvd_v2' column, passing the
## unweighted and the weighted UniFrac distance matrices of the pre-CVD subset.
metrics_analysis.permanova_test_all_diseases(precvd_metrics.unweighted_unifrac_distance_matrix,precvd_metrics.weighted_unifrac_distance_matrix,balanced_precvd_df,['precvd_v2'])

# Rarefied Results

In [None]:
## Display the rarefied table from the core metrics computed earlier.
metrics.rarefied_table

In [None]:
## Refit the binary relevance model on the rarefied table for the configured
## disease columns, supplying the 'precvd_v2' column of the balanced metadata.
## NOTE(review): this call uses the keyword `precvd_col=` and
## feature_params['disease_cols'], whereas the earlier fit uses `precvd_data=`
## and model_params['disease_targets'] — confirm which signature
## make_models.binary_relevance_model actually accepts.
rarefied_results = make_models.binary_relevance_model(metrics.rarefied_table, qiime_metadata_tf, feature_params['disease_cols'],precvd_col=balanced_precvd_df.get_column('precvd_v2'))