In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as x
import numpy as np

import qiime2 as q2
from qiime2 import Visualization

%matplotlib inline

In [4]:
# Project-relative directories shared by the cells in this notebook.
raw_data_dir, data_dir, vis_dir = (
    "../data/raw",        # unprocessed inputs (metadata.tsv is read from here)
    "../data/processed",  # QIIME 2 artifacts and derived tables
    "../results",         # not referenced by the cells visible in this section
)

In [None]:
# ANCOM

In [3]:
# Shell escape: run `qiime composition add-pseudocount` on table.qza and write
# the result to comp-table.qza (a FeatureTable[Composition] artifact), which is
# the table passed to the ANCOM command in the next step.
# `$data_dir` is expanded by IPython from the Python variable of the same name.
! qiime composition add-pseudocount \
    --i-table $data_dir/table.qza \
    --o-composition-table $data_dir/comp-table.qza


[32mSaved FeatureTable[Composition] to: ../data/processed/comp-table.qza[0m
[0m

In [5]:
# Shell escape: run `qiime composition ancom` on the composition table, using
# the Cohort_Number_Bin column of metadata_binned.tsv as the metadata column,
# and save the result as a visualization (ancom-results.qzv).
# NOTE(review): the .qzv is written under data_dir even though vis_dir is
# defined for results — confirm this is intended.
! qiime composition ancom \
    --i-table $data_dir/comp-table.qza \
    --m-metadata-file $data_dir/metadata_binned.tsv \
    --m-metadata-column Cohort_Number_Bin \
    --o-visualization $data_dir/ancom-results.qzv

[32mSaved Visualization to: ../data/processed/ancom-results.qzv[0m
[0m

In [6]:
# Load the saved ANCOM visualization; as the last expression of the cell it is
# rendered inline by the notebook.
ancom_qzv_path = f"{data_dir}/ancom-results.qzv"
Visualization.load(ancom_qzv_path)

In [None]:
# Chose the 6 significant features and mapped each of them to a taxon

In [10]:
# Widen text columns in DataFrame displays so the long taxonomy strings shown
# below are less likely to be truncated.
# The fully qualified option name is used on purpose: pandas resolves
# abbreviated keys by pattern matching, which can become ambiguous (and raise)
# if options with similar names are added in a later release.
pd.set_option('display.max_colwidth', 150)

In [11]:
# QIIME 2 artifact files can be loaded as Python objects; some (not all)
# Artifact types can then be viewed as a `pandas.DataFrame`.
taxonomy_artifact = q2.Artifact.load(f'{data_dir}/taxonomy.qza')
# `taxa` is indexed by Feature ID and carries Taxon and Confidence columns.
taxa = taxonomy_artifact.view(pd.DataFrame)

In [22]:
# Feature IDs of the six features chosen as significant (hard-coded here).
ancom_feature_ids = [
    'd383d75128d7423a9bbdb2076120e365',
    'aeb03963939e00b75d7370f4be601417',
    '6a125442b3d882bd11b5cfe1866470fd',
    'e3bff2e5d94dbb2b69f466ee85a1acf4',
    '5a0f522431143dce1339d7359fc37599',
    '833bf02443c2dece76422ef394ce48d0',
]
# Label-based row selection: the taxonomy rows for those IDs, in list order.
ancom = taxa.loc[ancom_feature_ids]

In [27]:
# Display the Taxon and Confidence assigned to each of the selected features.
ancom

Unnamed: 0_level_0,Taxon,Confidence
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1
d383d75128d7423a9bbdb2076120e365,d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysipelotrichales;f__Erysipelotrichaceae;g__[Clostridium]_innocuum_group;s__,0.9999975834612012
aeb03963939e00b75d7370f4be601417,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__,0.9999999641579512
6a125442b3d882bd11b5cfe1866470fd,d__Bacteria;p__Firmicutes;c__Clostridia;o__Peptostreptococcales-Tissierellales;f__Peptostreptococcaceae;g__Intestinibacter;s__,0.99396907266501
e3bff2e5d94dbb2b69f466ee85a1acf4,d__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium_sensu_stricto_1;s__,0.9993446613517992
5a0f522431143dce1339d7359fc37599,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__,0.9999999976258208
833bf02443c2dece76422ef394ce48d0,d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysipelotrichales;f__Erysipelatoclostridiaceae;g__Erysipelatoclostridium;s__,0.9999997011969576


In [None]:
# Data frame manipulation to map the feature frequencies of those 6 features to their timepoint (Cohort_Number)

In [71]:
# Load the filtered feature table artifact.
# NOTE(review): the ANCOM step above was run on table.qza, while the counts
# used from here on come from table-filtered.qza — confirm the two tables are
# meant to differ.
feature_table = q2.Artifact.load(f"{data_dir}/table-filtered.qza")
# Sample metadata (tab-separated) from the raw data directory.
# NOTE(review): ANCOM used metadata_binned.tsv / Cohort_Number_Bin, whereas the
# cells below use Cohort_Number from this raw file — verify they are consistent.
metadata = pd.read_csv(f"{raw_data_dir}/metadata.tsv", sep='\t')
# View as a DataFrame and transpose so that rows are feature IDs and columns
# are sample names (this is the orientation the join below relies on).
features = feature_table.view(pd.DataFrame).T

In [79]:
# Inner join on the row index (feature ID): keeps only the selected features
# that are also present in the feature table, and places their
# Taxon/Confidence columns next to the per-sample counts.
df1 = pd.concat(
    [ancom, features],
    axis="columns",
    join="inner",
)

In [91]:
# Flip so that samples become rows; the Taxon and Confidence columns of df1
# turn into two extra rows alongside the sample rows.
df2 = df1.T
# Index the metadata by sample name (the column itself is kept) so that it
# aligns with df2's row labels. This mutates `metadata` in place.
metadata.index = metadata["Sample_Name"]
# Outer join: every row label from either frame is kept, and cells with no
# match are left as NaN (e.g. the metadata columns of the Taxon/Confidence rows).
df3 = pd.concat(objs=[df2, metadata], axis="columns", join="outer")
df3

Unnamed: 0,d383d75128d7423a9bbdb2076120e365,aeb03963939e00b75d7370f4be601417,6a125442b3d882bd11b5cfe1866470fd,e3bff2e5d94dbb2b69f466ee85a1acf4,5a0f522431143dce1339d7359fc37599,833bf02443c2dece76422ef394ce48d0,Sample_Name,Patient_ID,Stool_Consistency,Patient_Sex,Sample_Day,Recovery_Day,Cohort_Number
Taxon,d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysipelotrichales;f__Erysipelotrichaceae;g__[Clostridium]_innocuum_group;s__,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__,d__Bacteria;p__Firmicutes;c__Clostridia;o__Peptostreptococcales-Tissierellales;f__Peptostreptococcaceae;g__Intestinibacter;s__,d__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium_sensu_stricto_1;s__,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__,d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysipelotrichales;f__Erysipelatoclostridiaceae;g__Erysipelatoclostridium;s__,,,,,,,
Confidence,0.9999975834612012,0.9999999641579513,0.9939690726650101,0.9993446613517993,0.9999999976258209,0.9999997011969575,,,,,,,
EG0024,1456.0,373.0,0.0,0.0,979.0,18666.0,EG0024,P004,formed,F,0.0,34.0,1.0
EG0031,0.0,0.0,0.0,0.0,0.0,23.0,EG0031,P021,formed,M,20.0,24.0,2.0
EG0039,403.0,218.0,154.0,34.0,1458.0,144.0,EG0039,P073,formed,M,0.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
EG2580,0.0,0.0,3073.0,0.0,989.0,2.0,EG2580,P042,liquid,F,13.0,17.0,2.0
EG2591,157.0,3028.0,1677.0,58.0,577.0,881.0,EG2591,P017,liquid,M,0.0,17.0,1.0
EG2608,1320.0,281.0,0.0,43908.0,850.0,7.0,EG2608,P034,formed,F,0.0,18.0,1.0
EG2638,0.0,0.0,0.0,0.0,0.0,0.0,EG2638,P017,semi-formed,M,12.0,17.0,2.0


In [92]:
# Split the samples by cohort number. Rows whose Cohort_Number is NaN (such as
# the Taxon and Confidence rows) match neither value and are dropped from both.
cohort_number = df3['Cohort_Number']
df_abduction = df3.loc[cohort_number.eq(1)]
df_recovery = df3.loc[cohort_number.eq(2)]

In [96]:
# Display the samples with Cohort_Number == 1.
df_abduction

Unnamed: 0,d383d75128d7423a9bbdb2076120e365,aeb03963939e00b75d7370f4be601417,6a125442b3d882bd11b5cfe1866470fd,e3bff2e5d94dbb2b69f466ee85a1acf4,5a0f522431143dce1339d7359fc37599,833bf02443c2dece76422ef394ce48d0,Sample_Name,Patient_ID,Stool_Consistency,Patient_Sex,Sample_Day,Recovery_Day,Cohort_Number
EG0024,1456.0,373.0,0.0,0.0,979.0,18666.0,EG0024,P004,formed,F,0.0,34.0,1.0
EG0039,403.0,218.0,154.0,34.0,1458.0,144.0,EG0039,P073,formed,M,0.0,,1.0
EG0055,0.0,448.0,0.0,0.0,64.0,95.0,EG0055,P020,liquid,F,0.0,28.0,1.0
EG0070,213.0,760.0,1852.0,0.0,3914.0,123.0,EG0070,P062,semi-formed,F,0.0,27.0,1.0
EG0136,322.0,232.0,0.0,0.0,197.0,1047.0,EG0136,P027,formed,M,0.0,33.0,1.0
EG0141,132.0,1519.0,0.0,0.0,182.0,5704.0,EG0141,P032,liquid,F,0.0,21.0,1.0
EG0194,124.0,0.0,0.0,0.0,9828.0,6671.0,EG0194,P029,liquid,M,0.0,20.0,1.0
EG0236,287.0,304.0,1641.0,0.0,78.0,403.0,EG0236,P033,semi-formed,M,0.0,47.0,1.0
EG0280,7977.0,0.0,0.0,0.0,0.0,8856.0,EG0280,P051,formed,M,0.0,11.0,1.0
EG0282,0.0,2542.0,4915.0,0.0,1816.0,2.0,EG0282,P044,liquid,F,0.0,23.0,1.0


In [94]:
# Display the samples with Cohort_Number == 2.
df_recovery

Unnamed: 0,d383d75128d7423a9bbdb2076120e365,aeb03963939e00b75d7370f4be601417,6a125442b3d882bd11b5cfe1866470fd,e3bff2e5d94dbb2b69f466ee85a1acf4,5a0f522431143dce1339d7359fc37599,833bf02443c2dece76422ef394ce48d0,Sample_Name,Patient_ID,Stool_Consistency,Patient_Sex,Sample_Day,Recovery_Day,Cohort_Number
EG0031,0.0,0.0,0.0,0.0,0.0,23.0,EG0031,P021,formed,M,20.0,24.0,2.0
EG0057,0.0,0.0,0.0,0.0,0.0,13.0,EG0057,P004,formed,F,35.0,34.0,2.0
EG0088,0.0,29.0,0.0,0.0,15.0,8.0,EG0088,P020,semi-formed,F,27.0,28.0,2.0
EG0101,0.0,169.0,0.0,5.0,14721.0,0.0,EG0101,P054,semi-formed,F,7.0,7.0,2.0
EG0118,0.0,0.0,0.0,0.0,15.0,7.0,EG0118,P062,formed,F,30.0,27.0,2.0
EG0175,0.0,0.0,0.0,0.0,14296.0,0.0,EG0175,P032,formed,F,19.0,21.0,2.0
EG0196,0.0,0.0,0.0,0.0,28.0,0.0,EG0196,P027,formed,M,35.0,33.0,2.0
EG0256,0.0,0.0,0.0,0.0,0.0,0.0,EG0256,P057,formed,M,17.0,19.0,2.0
EG0294,91.0,0.0,0.0,0.0,501.0,366.0,EG0294,P051,formed,M,7.0,11.0,2.0
EG0313,0.0,0.0,0.0,0.0,31.0,26.0,EG0313,P044,liquid,F,19.0,23.0,2.0


In [None]:
# All 6 features are less abundant in the recovered patients