In [2]:
import io
import os
import glob
import zipfile
import subprocess
import numpy as np
import pandas as pd
import qiime2 as q2
from skbio import TreeNode 
from biom import load_table, Table
from biom.util import biom_open
from qiime2.plugins.feature_table.methods import merge
from qiime2.plugins.fragment_insertion.methods import sepp


In [231]:
# get map of qiita preps
qiita_mf = pd.read_csv('../data/qiita-tables-processing/qiita-mapping.csv').dropna(1)
qiita_mf.head(3)


Unnamed: 0,qiita_study,prep_name,run_name,100nt_deblur,sequencing_id
0,10894,2524,Run2 Lane 1,60614,lane_1_170216_D00611_0439_BCAJ1MANXX_Knight_2
1,10894,2525,Run2 Lane 2,57513,lane_2_170216_D00611_0439_BCAJ1MANXX_Knight_2
2,10894,2527,Run2 Lane 3,61023,lane_3_170216_D00611_0439_BCAJ1MANXX_Knight_2


In [233]:
# import the table ID's and import to Q2
mf = []
tbls = glob.glob('../data/qiita-tables-processing/biom-table-by-prep/*.biom')
for tbl in tbls:
    # import table
    tblid = tbl.split('/')[-1][:-5]
    tb = load_table(tbl)
    # meta-data subset
    mf_tmp = qiita_mf[qiita_mf['100nt_deblur'].isin([tblid])].copy()
    mf_tmp = pd.concat([mf_tmp for id_ in tb.ids()])
    mf_tmp.index = tb.ids()
    mf.append(mf_tmp)
    # import table to qiime2 and write
    q2tb = q2.Artifact.import_data('FeatureTable[Frequency]', tb)
    q2tb.save(os.path.join('../data/qiita-tables-processing/q2-tables', tblid))
# final metadata merged
mf = pd.concat(mf)
mf.index.name = '#SampleID'
q2.Metadata(mf).save('../data/qiita-tables-processing/qiita-mapped-metadata.qza')
mf.to_csv('../data/qiita-tables-processing/qiita-mapped-metadata.tsv', sep='\t')
mf.head(3)
    

Unnamed: 0_level_0,qiita_study,prep_name,run_name,100nt_deblur,sequencing_id
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10894.HR4234,10894,3908,Baby_2_1-6,58678,lane_1_171002_D00611_0535_BHY5LYBCXY_Knight_Gr...
10894.HR5600,10894,3908,Baby_2_1-6,58678,lane_1_171002_D00611_0535_BHY5LYBCXY_Knight_Gr...
10894.HR4183,10894,3908,Baby_2_1-6,58678,lane_1_171002_D00611_0535_BHY5LYBCXY_Knight_Gr...


In [43]:
# merge all the tables into one
!qiime feature-table merge\
    --i-tables ../data/qiita-tables-processing/q2-tables/*.qza\
    --p-overlap-method 'sum'\
    --o-merged-table ../data/qiita-tables-processing/merged-table.qza


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mSaved FeatureTable[Frequency] to: ../data/qiita-tables-processing/merged-table.qza[0m


In [277]:
# build the set of all rep-seqs
seqs_ = q2.Artifact.load('../data/qiita-tables-processing/merged-table.qza').view(Table).ids('observation')
seqs_ = '\n'.join(['>'+i+'\n'+i for i in seqs_])
f = open("../data/qiita-tables-processing/rep-seqs.fa", "w")
f.write(seqs_)
f.close()


In [278]:
# import the rep-seqs
!qiime tools import \
    --input-path ../data/qiita-tables-processing/rep-seqs.fa\
    --output-path ../data/qiita-tables-processing/rep-seqs.qza\
    --type 'FeatureData[Sequence]'


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mImported ../data/qiita-tables-processing/rep-seqs.fa as DNASequencesDirectoryFormat to ../data/qiita-tables-processing/rep-seqs.qza[0m


In [280]:
# run taxonomic classification (run on cluster - big compute step)
!qiime feature-classifier classify-sklearn \
  --i-classifier ../data/qiita-tables-processing/gg-13-8-99-515-806-nb-classifier.qza \
  --i-reads ../data/qiita-tables-processing/rep-seqs.qza \
  --o-classification ../data/qiita-tables-processing/taxonomy.qza


In [48]:
# generate sepp-insertion tree (run on cluster - big compute step)
!qiime fragment-insertion sepp\
    --i-representative-sequences ../data/qiita-tables-processing/rep-seqs.qza\
    --output-dir ../data/qiita-tables-processing/sepp-tree\


In [363]:
# table to filter
q2tb = q2.Artifact.load('../data/qiita-tables-processing/merged-table.qza').view(Table)
print(q2tb.shape)
# metadata (merged from MG on 01/21/2020 and ECAM metadata)
mf = pd.read_csv('../data/qiita-tables-processing/metadata.tsv',
                 sep='\t', index_col=0)
mf.index = [ind.replace('11648','10249') for ind in mf.index]
# filter table to match metadata
id_keep = sorted(set(mf.index) & set(q2tb.ids()))
q2tb = q2tb.filter(id_keep)
# ensure no zero sums
q2tb = q2tb.filter(q2tb.ids()[q2tb.sum('sample') > 0]) # samples
filt_ = q2tb.ids('observation')[q2tb.sum('observation') > 0] # features
q2tb = q2tb.filter(filt_, axis='observation')
# reindex metadata and add qiita prep map
mf = mf.reindex(q2tb.ids())
prepmf = pd.read_csv('../data/qiita-tables-processing/qiita-mapped-metadata.tsv',
                     sep='\t', index_col=0)
prepmf = prepmf.reindex(q2tb.ids())
mf = pd.concat([prepmf, mf],
               sort=True, axis=1)
mf.index.name = "#SampleID"
mf = mf[~mf.manuscript_use.isin(['No-LaneRunError'])]
# import the tree (get inserted seqs)
tree = q2.Artifact.load('../data/qiita-tables-processing/sepp-tree/tree.qza').view(TreeNode)
# filter out chloroplast/mitochondria hits
taxonomy = q2.Artifact.load('../data/qiita-tables-processing/taxonomy.qza').view(pd.DataFrame)
drop_ = set([t_ for t_ in taxonomy.Taxon if 'chloroplast' in t_.lower()\
                                         or 'mitochondria' in t_.lower()])
taxonomy = taxonomy[~taxonomy.Taxon.isin(drop_)]
# check shared ids for tree, table, and taxonomy
keep_ = list((set([node.name for node in tree.tips()])\
              & set(taxonomy.index))\
             & set(q2tb.ids('observation')))
#keep_ = list((set(taxonomy.index))\
#             & set(q2tb.ids('observation')))
# filter table 
q2tb = q2tb.filter(keep_, axis='observation')
# ensure no zero sums
q2tb = q2tb.filter(q2tb.ids()[q2tb.sum('sample') > 0]) # samples
filt_ = q2tb.ids('observation')[q2tb.sum('observation') > 0] # features
q2tb = q2tb.filter(filt_, axis='observation')
# match and write metadata
mf = mf.reindex(q2tb.ids())
q2.Metadata(mf).save('../data/processed-data/metadata.qza')
mf.to_csv('../data/processed-data/metadata.tsv', sep='\t')
# write table
print(q2tb.shape)
with biom_open('../data/processed-data/table.biom', 'w') as f:
    q2tb.to_hdf5(f, "example")
q2.Artifact.import_data('FeatureTable[Frequency]', q2tb).save('../data/processed-data/table.qza')
# write taxonomy
q2.Artifact.import_data('FeatureData[Taxonomy]', taxonomy).save('../data/processed-data/taxonomy.qza')
# write tree
q2.Artifact.import_data('Phylogeny[Rooted]', tree).save('../data/processed-data/tree.qza')

(189385, 12945)
(178142, 12319)


'../data/processed-data/tree.qza'

In [364]:
mf.qiita_study.value_counts()


10894.0    10090
11648.0     1044
1718.0       505
Name: qiita_study, dtype: int64

## verify if ECAM / pilot data cause technical effects


In [227]:
# merge all the tables into one
!qiime feature-table merge\
    --i-tables ../data/processed-data/table.qza\
    --i-tables ../data/qiita-tables-processing/other-studies/ECAM/qza/*.qza\
    --p-overlap-method 'sum'\
    --o-merged-table  ../data/qiita-tables-processing/other-studies/merged-table.qza


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mSaved FeatureTable[Frequency] to: ../data/qiita-tables-processing/other-studies/merged-table.qza[0m


In [228]:
# table
bt_all = q2.Artifact.load('../data/qiita-tables-processing/other-studies/merged-table.qza').view(Table)

# metadata to include
included_ = ['qiita_study', 'prep_name', 'run_name', '100nt_deblur',
           'country','body_site_corrected', 'subjectid_unique',
           'date_sampling_category_days_continuous', 'birth_mode',
           'baby_sex', 'seeding_method','current_abx','mother_abx_1st_trimester',
           'mother_abx_2nd_trimester','mother_abx_3rd_trimester','current_breast_feeding',
           'current_formula','current_solids','mom_baby']

# match metadata from ecam to ours
mf_seed = pd.read_csv('../data/processed-data/metadata.tsv',
                     sep='\t', index_col=0, dtype=str)
# match and join the ECAM metdata
mf_ecam = pd.read_csv('../data/qiita-tables-processing/other-studies/10249_20180418-081211.txt',
                     sep='\t', index_col=0, dtype=str)
mf_ecam = mf_ecam[~mf_ecam.day_of_life.isin(['na ','na'])]
mf_ecam['qiita_study'] = '11648'
mf_ecam['prep_name'] = '4474'
mf_ecam['run_name'] = 'ECAM'
mf_ecam['100nt_deblur'] = '48368'
mf_ecam['country'] = 'USA'
mf_ecam['body_site_corrected'] = mf_ecam.sample_type.replace({'Stool_Stabilizer':'Feces',
                                                               'Dry_Stool':'Dry_Stool',
                                                               'Rectal_Swab':'Rectal_Swab',
                                                               'Vaginal_Swab':'Vagina',
                                                               'Repeats':'Repeats'})
mf_ecam['subjectid_unique'] = mf_ecam.host_subject_id
mf_ecam['date_sampling_category_days_continuous'] = mf_ecam.day_of_life.astype(float)
mf_ecam['baby_sex'] = mf_ecam.sex.replace({'Female':'F',
                                          'Male':'M',
                                          'na':'unknown'})
mf_ecam['birth_mode'] = mf_ecam.delivery.replace({'Vaginal':'Vag',
                                                 'Cesarean':'CS',
                                                  'na':'unknown'})
mf_ecam['seeding_method'] = np.nan
mf_ecam['current_abx'] = mf_ecam.abx1_pmp_all_bymonth.replace({'pre':'No',
                                                             'mid':'Yes',
                                                              'post':'No'})
mf_ecam['mother_prenatal_gbs']  = 'No'
mf_ecam['mother_abx_perinatal'] = mf_ecam.mom_prenatal_abx.replace({'true':'Yes',
                                                                   'false':'No'})

mf_ecam['mother_abx_1st_trimester'] = 'No'
mf_ecam['mother_abx_2nd_trimester'] = 'No'
mf_ecam['mother_abx_3rd_trimester'] = 'No'
mf_ecam.loc[mf_ecam[mf_ecam.mom_prenatal_abx_trimester.isin(['1'])].index,
            'mother_abx_1st_trimester'] = 'Yes'
mf_ecam.loc[mf_ecam[mf_ecam.mom_prenatal_abx_trimester.isin(['2'])].index,
            'mother_abx_2nd_trimester'] = 'Yes'
mf_ecam.loc[mf_ecam[mf_ecam.mom_prenatal_abx_trimester.isin(['3'])].index,
            'mother_abx_3rd_trimester'] = 'Yes'


mf_ecam['diet_2_month_split'] = [v.split('_')[0]
                                 for v in mf_ecam.diet_2_month.values]
mf_ecam['current_breast_feeding'] = 'No'
mf_ecam['current_formula'] = 'No'
mf_ecam['current_solids'] = 'unknown'
mf_ecam.loc[mf_ecam[mf_ecam.diet_2_month_split.isin(['bd'])].index,
            'current_breast_feeding'] = 'Yes'
mf_ecam.loc[mf_ecam[mf_ecam.diet_2_month_split.isin(['fd'])].index,
            'current_formula'] = 'Yes'

mf_ecam['mom_baby'] = mf_ecam.mom_child.replace({'C':'Baby',
                                                 'M':'Mom'})
mf_ecam.index = mf_ecam.index.astype(str)

# subset
mf_ecam = mf_ecam[included_]
mf_seed = mf_seed[included_]

# merge the metadata
mf_both = pd.concat([mf_seed, mf_ecam], sort=True)
mf_both['sample_type'] = mf_both.body_site_corrected.replace({'Stool_Stabilizer':'Feces',
                                                              'Oral':'Mouth',
                                                              'Right_Forearm': 'Skin'})
mf_both = mf_both[mf_both.sample_type.isin(['Feces','Mouth','Skin'])]
# subset by type
body_type = {}
for btlabl, mf_bt in mf_both.groupby('sample_type'):
    mf_bt['date_sampling_category_days_continuous'] = mf_bt.date_sampling_category_days_continuous.astype(float)
    # filter table 
    bt = bt_all.copy()
    keep_ = list(set(mf_bt.index) & set(bt.ids()))
    bt = bt.filter(keep_)
    # ensure no zero sums
    bt = bt.filter(bt.ids()[bt.sum('sample') > 0]) # samples
    filt_ = bt.ids('observation')[bt.sum('observation') > 0] # features
    bt = bt.filter(filt_, axis='observation')
    mf_bt = mf_bt.reindex(bt.ids())
    mf_bt.index = mf_bt.index.astype(str)
    print(btlabl)
    print(mf_bt.qiita_study.value_counts())
    mf_bt.index.name = "#SampleID"
    body_type[btlabl] = (q2.Artifact.import_data('FeatureTable[Frequency]', bt),
                            q2.Metadata(mf_bt))

    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Feces
10894.0    1750
11648       806
1718.0      199
Name: qiita_study, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Mouth
10894.0    1750
1718.0       52
Name: qiita_study, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Skin
10894.0    1505
Name: qiita_study, dtype: int64


In [229]:
from qiime2.plugins.deicode.actions import rpca
from qiime2.plugins.emperor.actions import biplot
from skbio import DistanceMatrix
from skbio.stats.distance import permanova

bs_rpca = {}
perm_res = {}
for bs_lbl, (tbl, mta) in body_type.items():
    res = rpca(tbl)
    bs_rpca[bs_lbl] = res
    vis = biplot(res.biplot, mta)
    vis.visualization.save('../data/qiita-tables-processing/other-studies/%s-biplot.qzv' % (bs_lbl))
    dist_ = res.distance_matrix.view(DistanceMatrix)
    perm_tmp = permanova(dist_, mta.to_dataframe().reindex(dist_.ids)['prep_name'])
    perm_res[bs_lbl] = pd.DataFrame(perm_tmp)
    print(bs_lbl)
    print(perm_tmp)

perm_resdf = pd.concat(perm_res)
perm_resdf.to_csv('../data/qiita-tables-processing/other-studies/permanova-results.tsv', sep='\t')
perm_resdf


Feces
method name               PERMANOVA
test statistic name        pseudo-F
sample size                    2659
number of groups                 12
test statistic              16.4097
p-value                       0.001
number of permutations          999
Name: PERMANOVA results, dtype: object


KeyboardInterrupt: 

In [230]:
dist_ = res.distance_matrix.view(DistanceMatrix)
perm_tmp = permanova(dist_, mta.to_dataframe().reindex(dist_.ids)['prep_name'])
perm_tmp

ValueError: All values in the grouping vector are unique. This method cannot operate on a grouping vector with only unique values (e.g., there are no 'within' distances because each group of objects contains only a single object).

In [216]:
mf_ecam = pd.read_csv('../data/qiita-tables-processing/other-studies/10249_20180418-081211.txt',
                     sep='\t', index_col=0, dtype=str)
mf_ecam = mf_ecam[~mf_ecam.day_of_life.isin(['na ','na'])]
mf_ecam = mf_ecam[mf_ecam.mom_child.isin(['C'])]
mf_ecam.sample_type.value_counts()

Stool_Stabilizer    762
Dry_Stool           135
Repeats              23
Name: sample_type, dtype: int64

In [214]:
# table
bt_all = q2.Artifact.load('../data/qiita-tables-processing/other-studies/merged-table.qza').view(Table)



array(['1718.1.14.I01.DM.200P', '1718.11.26.I01.RP', '1718.BLANK24', ...,
       '10249.C014.09SS', '10249.C004.10SS', '10249.C009.05SS'],
      dtype=object)

In [341]:
# merge the two metadata tables

# metadata to include
included_ = ['country','body_site_corrected', 'subjectid_unique',
           'date_sampling_category_days_continuous', 'birth_mode_ms',
           'baby_sex', 'seeding_method','current_abx','mother_abx_1st_trimester',
           'mother_abx_2nd_trimester','mother_abx_3rd_trimester','current_breast_feeding',
           'current_formula','current_solids','mom_baby','manuscript_use',
             'mother_prenatal_gbs','mother_abx_perinatal']

# match metadata from ecam to ours
mf_seed = pd.read_csv('../data/qiita-tables-processing/seeding-metadata.tsv',
                     sep='\t', index_col=0, dtype=str)
# match and join the ECAM metdata
mf_ecam = pd.read_csv('../data/qiita-tables-processing/10249_20180418-081211.txt',
                     sep='\t', index_col=0, dtype=str)
mf_ecam = mf_ecam[~mf_ecam.day_of_life.isin(['na ','na'])]
mf_ecam['country'] = 'USA'
mf_ecam['body_site_corrected'] = mf_ecam.sample_type.replace({'Stool_Stabilizer':'Feces',
                                                               'Dry_Stool':'Dry_Stool',
                                                               'Rectal_Swab':'Rectal_Swab',
                                                               'Vaginal_Swab':'Vagina',
                                                               'Repeats':'Repeats'})
mf_ecam['subjectid_unique'] = 'ECAM.'+mf_ecam.host_subject_id.astype(str)
mf_ecam['date_sampling_category_days_continuous'] = mf_ecam.day_of_life.astype(float)
mf_ecam['baby_sex'] = mf_ecam.sex.replace({'Female':'F',
                                          'Male':'M',
                                          'na':'unknown'})
mf_ecam['birth_mode_ms'] = mf_ecam.delivery.replace({'Vaginal':'Vag',
                                                 'Cesarean':'CS',
                                                  'na':'unknown'})
mf_ecam['seeding_method'] = np.nan
mf_ecam['current_abx'] = mf_ecam.abx1_pmp_all_bymonth.replace({'pre':'No',
                                                             'mid':'Yes',
                                                              'post':'No'})
mf_ecam['mother_prenatal_gbs']  = 'No'
mf_ecam['mother_abx_perinatal'] = mf_ecam.mom_prenatal_abx.replace({'true':'Yes',
                                                                   'false':'No'})

mf_ecam['mother_abx_1st_trimester'] = 'No'
mf_ecam['mother_abx_2nd_trimester'] = 'No'
mf_ecam['mother_abx_3rd_trimester'] = 'No'
mf_ecam.loc[mf_ecam[mf_ecam.mom_prenatal_abx_trimester.isin(['1'])].index,
            'mother_abx_1st_trimester'] = 'Yes'
mf_ecam.loc[mf_ecam[mf_ecam.mom_prenatal_abx_trimester.isin(['2'])].index,
            'mother_abx_2nd_trimester'] = 'Yes'
mf_ecam.loc[mf_ecam[mf_ecam.mom_prenatal_abx_trimester.isin(['3'])].index,
            'mother_abx_3rd_trimester'] = 'Yes'

mf_ecam['manuscript_use'] = 'Possible'
mf_ecam['diet_2_month_split'] = [v.split('_')[0]
                                 for v in mf_ecam.diet_2_month.values]
mf_ecam['current_breast_feeding'] = 'No'
mf_ecam['current_formula'] = 'No'
mf_ecam['current_solids'] = 'unknown'
mf_ecam.loc[mf_ecam[mf_ecam.diet_2_month_split.isin(['bd'])].index,
            'current_breast_feeding'] = 'Yes'
mf_ecam.loc[mf_ecam[mf_ecam.diet_2_month_split.isin(['fd'])].index,
            'current_formula'] = 'Yes'

mf_ecam['mom_baby'] = mf_ecam.mom_child.replace({'C':'Baby',
                                                 'M':'Mom'})

keep_days = set(mf_seed.date_sampling_category_days_continuous.astype(float))
mf_ecam = mf_ecam[mf_ecam.date_sampling_category_days_continuous.isin(keep_days)]
mf_ecam.index = mf_ecam.index.astype(str)

# subset
mf_ecam = mf_ecam[included_]
mf_seed = mf_seed[included_]

# merge the metadata
mf_both = pd.concat([mf_seed, mf_ecam], sort=True)
mf_both['sample_type'] = mf_both.body_site_corrected.replace({'Stool_Stabilizer':'Feces',
                                                              'Oral':'Mouth',
                                                              'Right_Forearm': 'Skin'})
mf_both[mf_both=='unknown'] = np.nan
mf_both[mf_both=='Maybe'] = np.nan

ever_col = ['current_abx',
            'current_breast_feeding',
            'current_formula']
for ever_col_ in ever_col:
    ever_ = {k:set(df[ever_col_].dropna())
             for k, df in mf_both.groupby(['subjectid_unique'])}
    for k, v in ever_.items():
        if len(v) >= 2:
            ever_[k] = 'Yes'
        else:
            ever_[k] = 'No'
    ever_[np.nan] = np.nan
    mf_both['ever'+ ever_col_.replace('current','')] = [ever_[v] for v in mf_both.subjectid_unique]


mf_both.to_csv('../data/qiita-tables-processing/metadata.tsv', sep='\t')
mf_both.head(3)


KeyboardInterrupt: 

In [218]:
len(set(mf_ecam.index) & set(bt_all.ids()))


277

In [219]:
set(mf_ecam.index) - set(bt_all.ids())

{'10249.C001.01SS',
 '10249.C001.04SS',
 '10249.C001.29SS',
 '10249.C001.30SS',
 '10249.C001.31SS',
 '10249.C001.32SS',
 '10249.C001.34SD',
 '10249.C001.34SS',
 '10249.C001.35SS',
 '10249.C001.36SD',
 '10249.C001.36SS',
 '10249.C001.37SD',
 '10249.C001.37SS',
 '10249.C001.38SS',
 '10249.C001.39SS',
 '10249.C001.40SS',
 '10249.C001.41SS',
 '10249.C001.42SS',
 '10249.C002.01SS',
 '10249.C002.14SS',
 '10249.C002.15SS',
 '10249.C002.16SS',
 '10249.C002.17SS',
 '10249.C002.18SS',
 '10249.C002.19SD',
 '10249.C002.19SS',
 '10249.C002.20SS',
 '10249.C002.21SD',
 '10249.C002.21SS',
 '10249.C002.22SS',
 '10249.C002.23SS',
 '10249.C004.16SD',
 '10249.C004.16SS',
 '10249.C005.01SS',
 '10249.C005.15SS',
 '10249.C005.16SS',
 '10249.C005.17SS',
 '10249.C005.18SS',
 '10249.C005.19SD',
 '10249.C005.19SS',
 '10249.C005.20SS',
 '10249.C005.21SD',
 '10249.C005.21SS',
 '10249.C005.22SS',
 '10249.C005.23SS',
 '10249.C005.24SS',
 '10249.C007.01SS',
 '10249.C007.14SS',
 '10249.C007.17SS',
 '10249.C007.18SS',


In [220]:
btcheck = q2.Artifact.load('../data/qiita-tables-processing/other-studies/48368-ECAM.qza').view(Table)
btcheck.shape


(1797, 277)

In [110]:
# table
bt_all = q2.Artifact.load('../data/qiita-tables-processing/other-studies/merged-table.qza').view(Table)

# metadata merge studies
mf_one = pd.read_csv('../data/processed-data/metadata.tsv',
                     sep='\t', index_col=0, dtype=str)
mf_one['sample_type'] = mf_one['Body_Site_corrected']
mf_one = mf_one[mf_one.Mom_Baby.isin(['Baby'])]
mf_one = mf_one[mf_one.sample_type.isin(['Feces','Right_Forearm','Mouth'])]
mf_one = mf_one[~mf_one.manuscript_use.isin(['No-LaneRunError'])]

mf_one.index = mf_one.index.astype(str)

mf_ecam = pd.read_csv('../data/qiita-tables-processing/other-studies/10249_20180418-081211.txt',
                     sep='\t', index_col=0, dtype=str)
mf_ecam['qiita_study'] = '11648'
mf_ecam['prep_name'] = '4474'
mf_ecam['100nt_deblur'] = '48368'
mf_ecam.index = mf_ecam.index.astype(str)

mf_pilot = pd.read_csv('../data/qiita-tables-processing/other-studies/2010_20190328-120214.txt',
                     sep='\t', index_col=0, dtype=str)
mf_pilot['qiita_study'] = '2010'
mf_pilot['prep_name'] = '1054'
mf_pilot['100nt_deblur'] = '26756'
mf_pilot.index = mf_pilot.index.astype(str)

mf_check = pd.concat([mf_one, mf_ecam, mf_pilot], sort=True).dropna(subset=['sample_type']).dropna(1)
mf_check['sample_type'] = mf_check.sample_type.replace({'Stool_Stabilizer':'Feces',
                                                        'Oral':'Mouth',
                                                        'Right_Forearm': 'Skin'})
mf_check = mf_check[mf_check.sample_type.isin(['Feces','Mouth','Skin'])]
# subset by type
body_type = {}
for btlabl, mf_bt in mf_check.groupby('sample_type'):
    # filter table 
    bt = bt_all.copy()
    keep_ = list(set(mf_bt.index) & set(bt.ids()))
    bt = bt.filter(keep_)
    # ensure no zero sums
    bt = bt.filter(bt.ids()[bt.sum('sample') > 0]) # samples
    filt_ = bt.ids('observation')[bt.sum('observation') > 0] # features
    bt = bt.filter(filt_, axis='observation')
    mf_bt = mf_bt.reindex(bt.ids())
    mf_bt.index = mf_bt.index.astype(str)
    print(btlabl)
    print(mf_bt.qiita_study.value_counts())
    mf_bt.index.name = "#SampleID"
    body_type[btlabl] = (q2.Artifact.import_data('FeatureTable[Frequency]', bt),
                            q2.Metadata(mf_bt))


Feces
10894.0    991
11648      254
1718.0     198
2010         7
Name: qiita_study, dtype: int64
Mouth
10894.0    977
2010       174
1718.0      40
Name: qiita_study, dtype: int64
Skin
10894.0    758
2010       438
Name: qiita_study, dtype: int64


In [117]:
mf_one.columns

Index(['qiita_study', 'prep_name', 'run_name', '100nt_deblur', 'sequencing_id',
       'SeqCount', 'orig_sampleid', 'study_id', 'primer_plate', 'well', 'lane',
       'run', 'Hospital_Name', 'Village', 'State', 'Country',
       'IRB_Institution', 'Project_Name', 'Body_Site_orig',
       'Body_Site_corrected', 'Body_Site_Type', 'FamilyID', 'FamilyID_Unique',
       'Mom_Baby', 'SubjectID', 'SubjectID_Unique', 'Date_Sampling',
       'Real_Sampling_Time', 'Date_Sampling_Category',
       'Date_Sampling_Category_Days', 'Date_Sampling_Category_Days_continuous',
       'Baby_Sex', 'Birth_Mode', 'Seeding_Method', 'Baby_Birth_Date',
       'Current_ABX', 'Mother_Prenatal_GBS', 'Mother_ABX_Perinatal',
       'Mother_ABX_Perinatal_Name', 'Mother_ABX_1st_Trimester',
       'Mother_ABX_1st_Trimester_Name', 'Mother_ABX_2nd_Trimester',
       'Mother_ABX_2nd_Trimester_Name', 'Mother_ABX_3rd_Trimester',
       'Mother_ABX_3rd_Trimester_Name', 'Mother_Race',
       'Current_Breast_Feeding', 'Current

In [116]:
list(mf_pilot.columns)

['age',
 'age_baby_days',
 'age_mother_y',
 'age_unit',
 'altitude',
 'anonymized_name',
 'antibiotics_after_birth',
 'antibiotics_at_birth',
 'baby_birth_date',
 'baby_sex',
 'babygender_m_f',
 'birth_mode',
 'bmi',
 'body_habitat',
 'body_product',
 'body_site',
 'body_site_corrected',
 'body_site_orig',
 'body_site_type',
 'bodysite_oralmucosa_forehead_volarright_palmright_footright_vag',
 'collection_timestamp',
 'comment',
 'country',
 'current_abx',
 'current_breast_feeding',
 'current_formula',
 'currrent_solids',
 'date_sampling',
 'date_sampling_category',
 'date_sampling_category_days',
 'date_sampling_category_days_continuous',
 'delivery_vvaginal_ccs_icsinoc',
 'depth',
 'description',
 'diet',
 'diet_b_exclusisbreast__f_formula_bf_breasfedformula__bs_breasts',
 'diet_description',
 'disease',
 'dna_extracted',
 'drug_name',
 'drug_type',
 'elevation',
 'env',
 'env_biome',
 'env_feature',
 'ethnicgroup',
 'exclusive_breastfeed',
 'extracted_dna_avail_now',
 'family',
 'fam

In [226]:
in_ = '/Users/cmartino/Dropbox/bin/mg_seeding/data/qiita-tables-processing/other-studies/ECAM/biom'

for biom_ in glob.glob(in_+'/*'):
    # import table to qiime2 and write
    q2tb = q2.Artifact.import_data('FeatureTable[Frequency]', biom_)
    q2tb.save(biom_.replace('biom', 'qza'))


In [58]:
mta.to_dataframe()['qiita_study']

#SampleID
10894.CH1547    10894
10894.CH688     10894
10894.CH1563    10894
10894.CH636     10894
10894.CH998     10894
10894.CH843     10894
10894.CH867     10894
10894.CH1010    10894
10894.CH290     10894
10894.CH127     10894
10894.CH1356    10894
10894.CH1384    10894
10894.CH1372    10894
10894.CH1575    10894
10894.CH668     10894
10894.CH1551    10894
10894.CH1519    10894
10894.CH871     10894
10894.CH1006    10894
10894.CH855     10894
10894.CH1022    10894
10894.CH1392    10894
10894.CH115     10894
10894.CH1364    10894
10894.CH839     10894
10894.CH131     10894
10894.CH286     10894
10894.CH1340    10894
10894.CH990     10894
10894.CH1523    10894
                ...  
2010.3113        2010
2010.3114        2010
2010.3117        2010
2010.3118        2010
2010.3119        2010
2010.3126        2010
2010.3127        2010
2010.3128        2010
2010.3131        2010
2010.3132        2010
2010.3133        2010
2010.3138        2010
2010.3139        2010
2010.314         2010


In [9]:
tbl, mta = body_type['Skin']
res = rpca(tbl)


In [19]:
permanova(res..view(DistanceMatrix), mta.to_dataframe()['qiita_study'])


10894    850
2010     487
Name: qiita_study, dtype: int64

In [16]:
vis = biplot(res.biplot, mta)
vis.visualization.save('')


In [4]:
from qiime2.plugins import available_plugins


In [6]:
from qiime2.plugins.deicode.actions import rpca

In [5]:
available_plugins()

{'qiime2.plugins.alignment',
 'qiime2.plugins.composition',
 'qiime2.plugins.cutadapt',
 'qiime2.plugins.dada2',
 'qiime2.plugins.deblur',
 'qiime2.plugins.deicode',
 'qiime2.plugins.demux',
 'qiime2.plugins.diversity',
 'qiime2.plugins.emperor',
 'qiime2.plugins.feature_classifier',
 'qiime2.plugins.feature_table',
 'qiime2.plugins.fragment_insertion',
 'qiime2.plugins.gemelli',
 'qiime2.plugins.gneiss',
 'qiime2.plugins.longitudinal',
 'qiime2.plugins.metadata',
 'qiime2.plugins.mmvec',
 'qiime2.plugins.phylogeny',
 'qiime2.plugins.quality_control',
 'qiime2.plugins.quality_filter',
 'qiime2.plugins.qurro',
 'qiime2.plugins.sample_classifier',
 'qiime2.plugins.songbird',
 'qiime2.plugins.taxa',
 'qiime2.plugins.types',
 'qiime2.plugins.vsearch'}

In [281]:
!qiime dev refresh-cache

[33mQIIME is caching your current deployment for improved performance. This may take a few moments and should only happen once per deployment.[0m
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [291]:
!qiime deicode --help

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Usage: [34mqiime deicode[0m [OPTIONS] COMMAND [ARGS]...

  Description: This is a QIIME 2 plugin supporting Robust Aitchison on
  feature tables

  Plugin website: https://github.com/biocore/DEICODE

  Getting user support: Please post to the QIIME 2 forum for help with this
  plugin: https://forum.qiime2.org

[1mOptions[0m:
  [34m--version[0m    Show the version and exit.


In [46]:
q2tb = q2.Artifact.import_data('FeatureTable[Frequency]', '../data/qiita-tables-processing/other-studies/48368-ECAM.biom')
q2tb.save('../data/qiita-tables-processing/other-studies/48368-ECAM.qza')




'../data/qiita-tables-processing/other-studies/48368-ECAM.qza'

In [None]:
# match and write metadata
mf = mf.reindex(q2tb.ids())
q2.Metadata(mf).save('../data/processed-data/metadata.qza')
mf.to_csv('../data/processed-data/metadata.tsv', sep='\t')

In [72]:

mf = pd.read_csv('../data/qiita-tables-processing/metadata.tsv',
                 sep='\t', index_col=0)
mf['Hospital_Name'] = mf['Hospital_Name'].replace(' ','', regex=True)
mf['Body_Site_orig'] = mf['Body_Site_orig'].replace(' ','', regex=True)
mf.to_csv('../data/qiita-tables-processing/metadata.tsv', sep='\t')

In [163]:
mf.shape

(11171, 50)

In [169]:
mf_old = pd.read_csv('../data/qiita-tables-processing/metadata.tsv',
                 sep='\t', index_col=0)
set(mf_old.index) - set(mf.index)


{'10894.4723',
 '10894.4795',
 '10894.4801',
 '10894.5771',
 '10894.5902',
 '10894.5921',
 '10894.5940',
 '10894.5946',
 '10894.6046',
 '10894.6517',
 '10894.6770',
 '10894.6794',
 '10894.6795',
 '10894.6943',
 '10894.6944',
 '10894.6945',
 '10894.6946',
 '10894.7004',
 '10894.7005',
 '10894.7006',
 '10894.7018',
 '10894.7019',
 '10894.7021',
 '10894.7029',
 '10894.7030',
 '10894.7031',
 '10894.7054',
 '10894.7055',
 '10894.7056',
 '10894.7057',
 '10894.7068',
 '10894.7069',
 '10894.7070',
 '10894.7079',
 '10894.7080',
 '10894.7081',
 '10894.7082',
 '10894.7093',
 '10894.7094',
 '10894.7095',
 '10894.7118',
 '10894.7119',
 '10894.7120',
 '10894.7143',
 '10894.7144',
 '10894.7145',
 '10894.7146',
 '10894.7168',
 '10894.7169',
 '10894.7170',
 '10894.7193',
 '10894.7194',
 '10894.7195',
 '10894.7218',
 '10894.7219',
 '10894.7220',
 '10894.7221',
 '10894.7343',
 '10894.7344',
 '10894.7345',
 '10894.7368',
 '10894.7369',
 '10894.7370',
 '10894.7393',
 '10894.7394',
 '10894.7395',
 '10894.74

In [171]:
mf_old.reindex(set(mf_old.index) - set(mf.index)).Use.value_counts()

No-control              411
Possible                321
No-notrelevant          119
Possible-Replicate       66
No-PerinatalABXinVag     20
No-misc                  11
No-GBSPositive            7
No-LaneRunError           1
Name: Use, dtype: int64

In [180]:
prepmf = pd.read_csv('../data/qiita-tables-processing/qiita-mapped-metadata.tsv',
                     sep='\t', index_col=0)
[i for i in prepmf.index if '1654' in i]


['10894.CH1654']

In [None]:
10894.12261.MGDB.ESP.JUL18.0754, 12261.MGDB.ESP.JUL18.0754

In [195]:
prepmf[prepmf.prep_name.isin(['8322'])]

Unnamed: 0_level_0,qiita_study,prep_name,run_name,100nt_deblur,sequencing_id
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10894.12261.MGDB.ESP.JUL18.0754,10894,8322,NYURun25,85233,NYURun25
10894.12261.MGDB.ESP.JUL18.0536,10894,8322,NYURun25,85233,NYURun25
10894.12261.MGDB.ESP.JUL18.1243,10894,8322,NYURun25,85233,NYURun25
10894.12261.MGDB.ESP.JUL18.0245,10894,8322,NYURun25,85233,NYURun25
10894.12261.MGDB.ESP.JUL18.0390,10894,8322,NYURun25,85233,NYURun25
10894.12261.MGDB.ESP.JUL18.0199,10894,8322,NYURun25,85233,NYURun25
10894.12261.MGDB.ESP.JUL18.1228,10894,8322,NYURun25,85233,NYURun25
10894.12261.MGDB.ESP.JUL18.0027,10894,8322,NYURun25,85233,NYURun25
10894.12261.MGDB.ESP.JUL18.1126,10894,8322,NYURun25,85233,NYURun25
10894.12261.MGDB.ESP.JUL18.0297,10894,8322,NYURun25,85233,NYURun25


In [197]:
# metadata (jincheng)
mf = pd.read_csv('../data/qiita-tables-processing/metadata.tsv', sep='\t', index_col=0)
mf['Hospital_Name'] = mf['Hospital_Name'].replace(' ','', regex=True)
mf['Body_Site_orig'] = mf['Body_Site_orig'].replace(' ','', regex=True)
mf.to_csv('../data/qiita-tables-processing/metadata.tsv', sep='\t')


In [160]:
# metadata (jincheng)
mf = pd.read_csv('../data/qiita-tables-processing/metadata.tsv', sep='\t', index_col=0)
missing_ = sorted(set(prepmf.index) - set(mf.index))
print(len(missing_))
rename_ = {'.'.join(i.split('.')[1:]):i for i in missing_ if '10894' in i}
mf = mf.rename(rename_, axis=0)
missing_ = sorted(set(prepmf.index) - set(mf.index))
print(len(missing_))
#mf.to_csv('../data/qiita-tables-processing/metadata.tsv', sep='\t')


558
558


In [193]:
rename_

{'BLANK.1.10B': '10894.BLANK.1.10B',
 'BLANK.1.10D': '10894.BLANK.1.10D',
 'BLANK.1.10G': '10894.BLANK.1.10G',
 'BLANK.1.10H': '10894.BLANK.1.10H',
 'BLANK.1.11A': '10894.BLANK.1.11A',
 'BLANK.1.11H': '10894.BLANK.1.11H',
 'BLANK.1.12A': '10894.BLANK.1.12A',
 'BLANK.1.12G': '10894.BLANK.1.12G',
 'BLANK.1.12H': '10894.BLANK.1.12H',
 'BLANK.10.11G': '10894.BLANK.10.11G',
 'BLANK.10.12A': '10894.BLANK.10.12A',
 'BLANK.10.12B': '10894.BLANK.10.12B',
 'BLANK.10.12C': '10894.BLANK.10.12C',
 'BLANK.10.12E': '10894.BLANK.10.12E',
 'BLANK.10.12F': '10894.BLANK.10.12F',
 'BLANK.10.12G': '10894.BLANK.10.12G',
 'BLANK.10.12H': '10894.BLANK.10.12H',
 'BLANK.11.12F': '10894.BLANK.11.12F',
 'BLANK.11.12G': '10894.BLANK.11.12G',
 'BLANK.12.12F': '10894.BLANK.12.12F',
 'BLANK.13.12C': '10894.BLANK.13.12C',
 'BLANK.13.12E': '10894.BLANK.13.12E',
 'BLANK.13.12F': '10894.BLANK.13.12F',
 'BLANK.13.12H': '10894.BLANK.13.12H',
 'BLANK.2.11E': '10894.BLANK.2.11E',
 'BLANK.2.11F': '10894.BLANK.2.11F',
 'BLANK.

In [150]:
len([i for i in missing_ if '1718' in i])

454

In [151]:
454 + 452

906

In [152]:
len(missing_)

1010

In [126]:
check_ = '../data/qiita-tables-processing/biom-table-by-prep/56612.biom'
load_table(check_).ids()

array(['1718.1.14.I01.DM.200P', '1718.11.26.I01.RP', '1718.BLANK24',
       '1718.12.17.I01.TONG', '1718.11.7.WASH.AN', '1718.11.27.I01.RP',
       '1718.11.7.dad01.FH', '1718.11.10.dad01.RP', '1718.11.18.I01.FH',
       '1718.2.9.I01.VAG', '1718.11.19.I01.DM.440A',
       '1718.11.11.mum01.TONG', '1718.11.19.I01.FH', '1718.11.24.I01.RP',
       '1718.11.10.mum01.VAG', '1718.2.2.mum01.RP', '1718.11.26.dad01.FH',
       '1718.11.10.I01.DM.1235A', '1718.11.30.I01.DM.500P',
       '1718.11.24.mum01.FH', '1718.11.15.I01.TONG',
       '1718.3.9.dad01.FH.DS', '1718.11.20.I01.RP',
       '1718.12.1.mum01.TONG', '1718.12.21.I01.DM.1246P',
       '1718.3.10.mum01.FH.DS', '1718.1.8.I01.TONG', '1718.11.5.mum01.FH',
       '1718.11.12.mum01.RP', '1718.11.22.I01.RP', '1718.11.30.mum01.BR',
       '1718.12.1.I01.DM.1130P', '1718.12.31.dad01.FECE',
       '1718.12.24.dad01.FH', '1718.12.31.mum01.LP', '1718.11.23.I01.RP',
       '1718.12.17.I01.LP', '1718.11.9.dad01.BR', '1718.11.10.dad01.TONG',
     

In [133]:
[i for i in mf.index if '12261' in i]

['12261.MGDB.ESP.JUL18.0001',
 '12261.MGDB.ESP.JUL18.0002',
 '12261.MGDB.ESP.JUL18.0007',
 '12261.MGDB.ESP.JUL18.0012',
 '12261.MGDB.ESP.JUL18.0022',
 '12261.MGDB.ESP.JUL18.0027',
 '12261.MGDB.ESP.JUL18.0037',
 '12261.MGDB.ESP.JUL18.0038',
 '12261.MGDB.ESP.JUL18.0043',
 '12261.MGDB.ESP.JUL18.0048',
 '12261.MGDB.ESP.JUL18.0053',
 '12261.MGDB.ESP.JUL18.0058',
 '12261.MGDB.ESP.JUL18.0063',
 '12261.MGDB.ESP.JUL18.0068',
 '12261.MGDB.ESP.JUL18.0209',
 '12261.MGDB.ESP.JUL18.0210',
 '12261.MGDB.ESP.JUL18.0215',
 '12261.MGDB.ESP.JUL18.0220',
 '12261.MGDB.ESP.JUL18.0225',
 '12261.MGDB.ESP.JUL18.0230',
 '12261.MGDB.ESP.JUL18.0235',
 '12261.MGDB.ESP.JUL18.0245',
 '12261.MGDB.ESP.JUL18.0246',
 '12261.MGDB.ESP.JUL18.0251',
 '12261.MGDB.ESP.JUL18.0256',
 '12261.MGDB.ESP.JUL18.0261',
 '12261.MGDB.ESP.JUL18.0266',
 '12261.MGDB.ESP.JUL18.0271',
 '12261.MGDB.ESP.JUL18.0276',
 '12261.1.PCR.NC',
 '12261.1.DNA.Ext.NC',
 '12261.2.PCR.NC',
 '12261.2.DNA.Ext.NC',
 '12261.3.PCR.NC',
 '12261.3.DNA.Ext.NC',
 '12

In [112]:
mf = pd.read_csv('../data/qiita-tables-processing/metadata.tsv', sep='\t', index_col=0)
mf


Unnamed: 0_level_0,orig_sampleid,study_id,primer_plate,well,lane,run,Hospital_Name,Village,State,Country,...,Mother_ABX_2nd_Trimester,Mother_ABX_2nd_Trimester_Name,Mother_ABX_3rd_Trimester,Mother_ABX_3rd_Trimester_Name,Mother_Race,Current_Breast_Feeding,Current_Formula,Current_Solids,Exclusive_BreastFeed,Use
X.SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10894.BL576,BL576,10894,6,C3,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL1079,BL1079,10894,7,E1,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL1095,BL1095,10894,7,A2,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL1135,BL1135,10894,7,C3,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL808,BL808,10894,6,B5,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL924,BL924,10894,6,B8,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL1178,BL1178,10894,7,E4,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,Yes,No,No-notrelevant
10894.BL1250,BL1250,10894,7,G6,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,Yes,No,No-notrelevant
10894.BL1282,BL1282,10894,7,G7,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,Yes,No,No-notrelevant
10894.BL1322,BL1322,10894,7,A9,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,Yes,No,No-notrelevant


In [None]:
mf = pd.read_csv('../data/qiita-tables-processing/metadata.tsv', sep='\t', index_col=0)
pd.concat([mf, prepmf], axis=1, sort=True)


In [101]:
len(set(mf.index) & set(q2tb.ids()))


10719

In [102]:
len(q2tb.ids())

11729

In [115]:
11729 - 10719

1010

In [90]:
mf.index.name = "#SampleID"


Unnamed: 0_level_0,orig_sampleid,study_id,primer_plate,well,lane,run,Hospital_Name,Village,State,Country,...,Mother_ABX_2nd_Trimester,Mother_ABX_2nd_Trimester_Name,Mother_ABX_3rd_Trimester,Mother_ABX_3rd_Trimester_Name,Mother_Race,Current_Breast_Feeding,Current_Formula,Current_Solids,Exclusive_BreastFeed,Use
X.SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10894.BL576,BL576,10894,6,C3,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL1079,BL1079,10894,7,E1,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL1095,BL1095,10894,7,A2,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL1135,BL1135,10894,7,C3,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL808,BL808,10894,6,B5,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL924,BL924,10894,6,B8,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,No,Yes,No-notrelevant
10894.BL1178,BL1178,10894,7,E4,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,Yes,No,No-notrelevant
10894.BL1250,BL1250,10894,7,G6,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,Yes,No,No-notrelevant
10894.BL1282,BL1282,10894,7,G7,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,Yes,No,No-notrelevant
10894.BL1322,BL1322,10894,7,A9,5,2,,,,Bolivia,...,No,,No,,Amerindian,Yes,No,Yes,No,No-notrelevant


In [85]:
[t_ for t_ in taxonomy.Taxon if 'mitochondria' in t_.lower()]

['k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria',
 'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria; g__Acanthamoeba; s__palestinensis',
 'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria',
 'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria',
 'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria',
 'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria',
 'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria',
 'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria; g__Cucurbita; s__pepo',
 'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria; g__Abies; s__homolepis',
 'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsial

In [None]:
# generate rarified table (5000)
TODO


In [None]:
# merge the metadata
TODO
