In [1]:
#!/usr/bin/env python3
#~/miniconda3/bin/python3

import gzip
import pandas as pd
import numpy as np
from numpy import array
import ntpath

from Bio import SeqIO
from glob import glob
from itertools import product

from functools import partial
from multiprocessing import Pool
import os.path
import pickle

from sklearn.preprocessing import normalize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RepeatedStratifiedKFold

import config_file_AEB

species_directory = config_file_AEB.species_directory
data_directory = config_file_AEB.data_directory
analysis_directory = config_file_AEB.analysis_directory  
scripts_directory = config_file_AEB.scripts_directory 

In [2]:
data_sets_to_use = [
    [['MetaHIT'], ['MetaHIT']],
   # [['Qin_et_al'], ['Qin_et_al']],
   # [['Zeller_2014'], ['Zeller_2014']],
   # [['LiverCirrhosis'], ['LiverCirrhosis']]
   ]

In [3]:
def load_species(data_sets):
    
    for data_set in data_sets:
        print(data_set)

        if data_set == "MetaHIT":
            cnts = pd.read_csv(species_directory + 'MetaHIT_ids.spcsAbundance.noNA.txt', sep = '\t')
            df_labels = pd.read_csv(data_directory + "metadata/MetaHIT_ids.txt", sep = '\t')
            labels = df_labels[['subject_id', 'ibd']]
            ids = labels['subject_id'].str.replace('.', '_').str.replace('-', '_')
            df_labels['subject_id'] = ids
            labels = df_labels[['subject_id', 'ibd']].drop_duplicates().set_index("subject_id")
            intersection = pd.merge(labels, cnts, how='inner', left_index=True, right_index=True)
            
            labels = np.asarray(intersection['ibd']).astype(np.int)
            
            species = intersection.iloc[:,1:]
            species_cnts = species.values
            
            features = list(species.columns)
            
            return species_cnts, labels, features

        
        elif data_set=='Qin_et_al':
            cnts = pd.read_csv(species_directory + 'Qin_2012_ids_all.spcsAbundance.noNA.txt', sep = '\t')
            df_labels = pd.read_csv(data_directory + "metadata/Qin_2012_ids_all.txt", sep = '\t')
            labels = df_labels[['subject_id', 't2d']].drop_duplicates().set_index("subject_id")
            intersection = pd.merge(labels, cnts, how='inner', left_index=True, right_index=True)

            labels = np.asarray(intersection['t2d']).astype(np.int)
            
            species = intersection.iloc[:,1:]
            species_cnts = species.values
            
            features = list(species.columns)
            
            return species_cnts, labels, features
        
        elif data_set=='Zeller_2014':
            cnts = pd.read_csv(species_directory + 'Zeller.spcsAbundance.noNA.txt', sep = '\t')
            df_labels = pd.read_csv(data_directory + "metadata/Zeller_metadata.txt", sep = '\t')
            labels = df_labels[['Sample ID', 'Group']].drop_duplicates().set_index("Sample ID")
            labels['Group'] = labels['Group'].astype('category').cat.rename_categories(['1','0'])
            intersection = pd.merge(labels, cnts, how='inner', left_index=True, right_index=True).dropna()
            
            labels = np.asarray(intersection['Group']).astype(np.int)
            
            species = intersection.iloc[:,1:]
            species_cnts = species.values
            
            features = list(species.columns)
            
            return species_cnts, labels, features
        
        elif data_set=='LiverCirrhosis':
            cnts = pd.read_csv(species_directory + 'LiverCirrhosis.spcsAbundance.noNA.txt', sep = '\t')
            df_labels = pd.read_csv(data_directory + "metadata/LiverCirrhosis.txt", sep = '\t')
            labels = df_labels[['Sample ID', 'Cirrhotic(Y or N)']].drop_duplicates().set_index("Sample ID")
            labels['Group'] = labels['Cirrhotic(Y or N)'].astype('category').cat.rename_categories(['0','1'])
            labels = labels.drop(['Cirrhotic(Y or N)'], axis = 1)
            intersection = pd.merge(labels, cnts, how='inner', left_index=True, right_index=True).dropna()
            
            labels = np.asarray(intersection['Group']).astype(np.int)
            
            species = intersection.iloc[:,1:]
            species_cnts = species.values
            
            features = list(species.columns)
            
            return species_cnts, labels, features           

In [4]:
for data_set in data_sets_to_use:
    data_set = data_set[0]
    species_cnts, labelz, feats = load_species(data_set)
    print("LOADED DATASET " + str(data_set[0]) + ": " + str(len(species_cnts)) + " SAMPLES")
    print(species_cnts.shape)
    print(labelz.shape)
    print(str(len(feats)) + " features\n")

MetaHIT
LOADED DATASET MetaHIT: 110 SAMPLES
(110, 3302)
(110,)
3302 features



In [5]:
species_cnts

array([[0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.     ],
       [0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.     ],
       [0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.     ],
       ...,
       [0.     , 0.     , 0.     , ..., 0.     , 0.     , 0.     ],
       [3.40237, 3.40237, 3.39891, ..., 0.     , 0.     , 0.     ],
       [0.2426 , 0.2426 , 0.23865, ..., 0.     , 0.     , 0.     ]])

In [6]:
feats

['k__Archaea',
 'k__Archaea|p__Euryarchaeota',
 'k__Archaea|p__Euryarchaeota|c__Methanobacteria',
 'k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales',
 'k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae',
 'k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter',
 'k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii',
 'k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii|t__Methanobrevibacter_smithii_unclassified',
 'k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_unclassified',
 'k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera',
 'k__Arch

In [7]:
df = pd.DataFrame(species_cnts)

In [8]:
df.columns = feats

In [9]:
df['k__Archaea'].head()

0    0.00000
1    0.00000
2    0.00000
3    0.27928
4    4.02688
Name: k__Archaea, dtype: float64

In [10]:
df['k__Bacteria'].head()

0    100.00000
1    100.00000
2    100.00000
3     99.71785
4     95.97312
Name: k__Bacteria, dtype: float64

In [11]:
np.any(np.isnan(species_cnts)) # returns False

False

In [12]:
np.all(np.isfinite(species_cnts)) # returns True

True

In [13]:
means = species_cnts.mean(axis=0)

stdevs = species_cnts.std(axis=0)

In [14]:
np.count_nonzero(means==0)

2154

In [15]:
cov = species_cnts.std(axis=0) / species_cnts.mean(axis=0)

  """Entry point for launching an IPython kernel.


In [16]:
np.sum(np.isnan(cov))

2154

In [17]:
sums = species_cnts.sum(axis=0)

In [18]:
np.count_nonzero(sums==0)

2154

In [19]:
species_cnts.shape

(110, 3302)

In [20]:
species_cnts = species_cnts[:, ~np.all(species_cnts == 0, axis=0)]

In [21]:
species_cnts.shape

(110, 1148)

In [22]:
species_cnts[:,0]

array([0.00000e+00, 0.00000e+00, 0.00000e+00, 2.79280e-01, 4.02688e+00,
       4.30680e-01, 6.24600e-02, 2.36000e-03, 0.00000e+00, 1.23530e-01,
       1.17900e-01, 5.19140e-01, 0.00000e+00, 6.22800e-02, 1.11740e-01,
       2.87770e-01, 9.26000e-03, 1.11880e-01, 0.00000e+00, 7.48500e-02,
       1.12840e-01, 0.00000e+00, 0.00000e+00, 7.27000e-03, 0.00000e+00,
       0.00000e+00, 2.02130e-01, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.07270e-01,
       7.59100e-02, 1.65070e-01, 2.95970e-01, 1.08130e-01, 0.00000e+00,
       4.91910e-01, 0.00000e+00, 4.61900e-02, 3.53000e-03, 6.52590e-01,
       0.00000e+00, 0.00000e+00, 1.82310e-01, 0.00000e+00, 1.24999e+00,
       1.68280e-01, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.36790e-01,
       1.28140e-01, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.48240e-01,
       0.00000e+00, 5.16600e-02, 2.19000e-02, 1.93665e+00, 0.00000e+00,
       3.34830e-01, 1.81930e-01, 0.00000e+00, 0.00000e+00, 0.000

In [23]:
means = species_cnts.mean(axis=0)

stdevs = species_cnts.std(axis=0)

In [24]:
list(means)

[0.38868127272727276,
 0.38868127272727276,
 0.3866297272727273,
 0.3866297272727273,
 0.3866297272727273,
 0.3854663636363636,
 0.3224129090909092,
 0.3224129090909092,
 0.06305336363636364,
 0.0011632727272727274,
 0.0011632727272727274,
 0.0011632727272727274,
 99.60958554545455,
 0.008683636363636362,
 0.008683636363636362,
 0.008683636363636362,
 0.008683636363636362,
 0.0001683636363636364,
 2.5425330909090906,
 2.5425330909090906,
 0.008684090909090909,
 0.001371,
 0.001371,
 0.00010854545454545454,
 0.00010854545454545454,
 0.000642,
 0.000642,
 0.0004696363636363637,
 0.0004696363636363637,
 0.003049181818181818,
 0.003049181818181818,
 0.0023497272727272724,
 0.0023497272727272724,
 0.0006994545454545454,
 1.8589854545454547,
 1.8589854545454547,
 1.8581020909090913,
 1.0805627272727272,
 1.0805627272727272,
 0.003733727272727273,
 0.003733727272727273,
 0.10435927272727273,
 0.10435927272727273,
 0.04079636363636364,
 0.04079636363636364,
 0.5351637272727272,
 0.526658181818

In [25]:
list(stdevs)

[1.2209403253186821,
 1.2209403253186821,
 1.2132820772459671,
 1.2132820772459671,
 1.2132820772459671,
 1.2129277198151043,
 0.967235269857852,
 0.967235269857852,
 0.2791879271461079,
 0.004438447949266398,
 0.004438447949266398,
 0.004438447949266398,
 1.2217515549582434,
 0.016594385399190278,
 0.016594385399190278,
 0.016594385399190278,
 0.016594385399190278,
 0.0011325470307494976,
 3.71931301500932,
 3.71931301500932,
 0.019109454313861334,
 0.007298099372750789,
 0.007298099372750789,
 0.0007576048818644143,
 0.0007576048818644143,
 0.0051992850557465225,
 0.0051992850557465225,
 0.004903147584093808,
 0.004903147584093808,
 0.007688343494811797,
 0.007688343494811797,
 0.007458735020594177,
 0.007458735020594177,
 0.0023897618125372157,
 3.147463807647108,
 3.147463807647108,
 3.1473014385894857,
 2.2129183674205217,
 2.2129183674205217,
 0.019704884113478562,
 0.019704884113478562,
 0.35332776059967347,
 0.35332776059967347,
 0.16803527011113564,
 0.16803527011113564,
 0.97

In [26]:
cov = species_cnts.std(axis=0) / species_cnts.mean(axis=0)

In [27]:
np.any(np.isnan(cov))

False

In [28]:
np.all(np.isfinite(species_cnts)) # returns True

True

In [29]:
np.count_nonzero(stdevs==0)
np.count_nonzero(means==0)

0

In [30]:
sample_mean = species_cnts.mean(axis=0)
sample_std = species_cnts.std(axis=0)

In [31]:
# Standardize both training and test samples with the training mean and std
x_train = (species_cnts - sample_mean) / sample_std


In [32]:
list(x_train)

[array([-0.31834584, -0.31834584, -0.31866434, ..., -0.09578263,
        -0.09578263, -0.09578263]),
 array([-0.31834584, -0.31834584, -0.31866434, ..., -0.09578263,
        -0.09578263, -0.09578263]),
 array([-0.31834584, -0.31834584, -0.31866434, ..., -0.09578263,
        -0.09578263, -0.09578263]),
 array([-0.08960411, -0.08960411, -0.08847879, ..., -0.09578263,
        -0.09578263, -0.09578263]),
 array([ 2.97983337,  2.97983337,  2.98526644, ..., -0.09578263,
        -0.09578263, -0.09578263]),
 array([ 0.03439867,  0.03439867,  0.0363067 , ..., -0.09578263,
        -0.09578263, -0.09578263]),
 array([-0.26718855, -0.26718855, -0.26718414, ..., -0.09578263,
        -0.09578263, -0.09578263]),
 array([-0.3164129 , -0.3164129 , -0.3167192 , ..., -0.09578263,
        -0.09578263, -0.09578263]),
 array([-0.31834584, -0.31834584, -0.31866434, ..., -0.09578263,
        -0.09578263, -0.09578263]),
 array([-0.21716972, -0.21716972, -0.2168496 , ..., -0.09578263,
        -0.09578263, -0.09