In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as x
import numpy as np

import qiime2 as q2
from qiime2 import Visualization

%matplotlib inline

In [4]:
raw_data_dir = "../data/raw"
data_dir = "../data/processed"
vis_dir  = "../results"

In [None]:
# ANCOM

In [3]:
! qiime composition add-pseudocount \
    --i-table $data_dir/table.qza \
    --o-composition-table $data_dir/comp-table.qza


[32mSaved FeatureTable[Composition] to: ../data/processed/comp-table.qza[0m
[0m

In [5]:
! qiime composition ancom \
    --i-table $data_dir/comp-table.qza \
    --m-metadata-file $data_dir/metadata_binned.tsv \
    --m-metadata-column Cohort_Number_Bin \
    --o-visualization $data_dir/ancom-results.qzv

[32mSaved Visualization to: ../data/processed/ancom-results.qzv[0m
[0m

In [6]:
Visualization.load(f"{data_dir}/ancom-results.qzv")

In [14]:
! qiime tools export \
  --input-path $data_dir/ancom-results.qzv \
  --output-path $data_dir/ancom-stats

[32mExported ../data/processed/ancom-results.qzv as Visualization to directory ../data/processed/ancom-stats[0m


In [16]:
sig = pd.read_csv(f"{data_dir}/ancom-stats/ancom.tsv", sep='\t')
sig.head(10)
# Chose the 6 singificant features and map them to a taxon

Unnamed: 0.1,Unnamed: 0,W,Reject null hypothesis
0,d383d75128d7423a9bbdb2076120e365,1966,True
1,aeb03963939e00b75d7370f4be601417,1950,True
2,6a125442b3d882bd11b5cfe1866470fd,1930,True
3,e3bff2e5d94dbb2b69f466ee85a1acf4,1865,True
4,5a0f522431143dce1339d7359fc37599,1864,True
5,833bf02443c2dece76422ef394ce48d0,1834,True
6,df009054f19d9aac55f8a5bc2eeaa409,1756,False
7,648070229fc4f45e01a9481f1beefe43,1704,False
8,b222447694e2b10c02a7e80342ae6aca,1324,False
9,1c75ff6db2be53fea3de5274e3bd48e2,344,False


In [20]:
pd.set_option('max_colwidth', 150)

In [21]:
# note: QIIME 2 artifact files can be loaded as python objects! This is how.
taxa = q2.Artifact.load(f'{data_dir}/taxonomy.qza')
# view as a `pandas.DataFrame`. Note: Only some Artifact types can be transformed to DataFrames
taxa = taxa.view(pd.DataFrame)

In [22]:
ancom = taxa.loc[['d383d75128d7423a9bbdb2076120e365', 'aeb03963939e00b75d7370f4be601417', '6a125442b3d882bd11b5cfe1866470fd', 'e3bff2e5d94dbb2b69f466ee85a1acf4', '5a0f522431143dce1339d7359fc37599', '833bf02443c2dece76422ef394ce48d0']]

In [23]:
ancom

Unnamed: 0_level_0,Taxon,Confidence
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1
d383d75128d7423a9bbdb2076120e365,d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysipelotrichales;f__Erysipelotrichaceae;g__[Clostridium]_innocuum_group;s__,0.9999975834612012
aeb03963939e00b75d7370f4be601417,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__,0.9999999641579512
6a125442b3d882bd11b5cfe1866470fd,d__Bacteria;p__Firmicutes;c__Clostridia;o__Peptostreptococcales-Tissierellales;f__Peptostreptococcaceae;g__Intestinibacter;s__,0.99396907266501
e3bff2e5d94dbb2b69f466ee85a1acf4,d__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium_sensu_stricto_1;s__,0.9993446613517992
5a0f522431143dce1339d7359fc37599,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__,0.9999999976258208
833bf02443c2dece76422ef394ce48d0,d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysipelotrichales;f__Erysipelatoclostridiaceae;g__Erysipelatoclostridium;s__,0.9999997011969576


In [None]:
# Data frame maipulation to map feature frequency of those 6 features depending on their timepoint (Cohort_Number)

In [24]:
features = q2.Artifact.load(f"{data_dir}/table-filtered.qza")
metadata = pd.read_csv(f"{raw_data_dir}/metadata.tsv", sep='\t')
features = features.view(pd.DataFrame).transpose()

In [25]:
df1 = pd.concat([ancom, features], axis=1, join='inner')

In [27]:
df2 = df1.transpose()
metadata.index = metadata['Sample_Name']
df3 = pd.concat([df2, metadata], axis=1, join='outer')
df4 = df3.drop(columns = ['Sample_Name', 'Patient_ID', 'Stool_Consistency', 'Patient_Sex', 'Sample_Day', 'Recovery_Day'])

In [28]:
df_abduction = df4[df4['Cohort_Number'] == 1]
df_recovery = df4[df4['Cohort_Number'] == 2]

In [61]:
# Function to convert columns to numeric
def convert_to_numeric(col):
    # Convert to numeric, coercing errors to NaN
    return pd.to_numeric(col)

# Apply the function to all columns in the DataFrame
for column in df_abduction.columns:
    df_abduction[column] = convert_to_numeric(df_abduction[column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_abduction[column] = convert_to_numeric(df_abduction[column])


In [62]:
df_abduction.describe()

Unnamed: 0,d383d75128d7423a9bbdb2076120e365,aeb03963939e00b75d7370f4be601417,6a125442b3d882bd11b5cfe1866470fd,e3bff2e5d94dbb2b69f466ee85a1acf4,5a0f522431143dce1339d7359fc37599,833bf02443c2dece76422ef394ce48d0,Cohort_Number
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,1882.537037,1855.351852,996.759259,2938.425926,1308.425926,2212.5,1.0
std,4223.671709,3516.213809,3789.564081,10068.587755,2264.945654,5157.699227,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,10.25,0.0,0.0,0.0,38.75,0.0,1.0
50%,193.0,258.0,0.0,0.0,262.5,209.0,1.0
75%,1554.25,1499.25,588.75,290.25,1726.5,1362.75,1.0
max,19940.0,14867.0,27511.0,56796.0,10911.0,28565.0,1.0


In [63]:
# Function to convert columns to numeric
def convert_to_numeric(col):
    # Convert to numeric, coercing errors to NaN
    return pd.to_numeric(col)

# Apply the function to all columns in the DataFrame
for column in df_recovery.columns:
    df_recovery[column] = convert_to_numeric(df_recovery[column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recovery[column] = convert_to_numeric(df_recovery[column])


In [64]:
df_recovery.describe()

Unnamed: 0,d383d75128d7423a9bbdb2076120e365,aeb03963939e00b75d7370f4be601417,6a125442b3d882bd11b5cfe1866470fd,e3bff2e5d94dbb2b69f466ee85a1acf4,5a0f522431143dce1339d7359fc37599,833bf02443c2dece76422ef394ce48d0,Cohort_Number
count,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,248.0625,158.708333,64.020833,2.125,772.708333,245.916667,2.0
std,1294.267235,756.912737,443.549344,8.900454,2944.364903,921.962911,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,2.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,1.5,0.0,0.0,0.0,41.0,8.25,2.0
max,8826.0,4989.0,3073.0,57.0,14721.0,4160.0,2.0


In [None]:
# All 6 features are less abundant in the recovered patients