In [1]:
#!/usr/bin/env python3
#~/miniconda3/bin/python3

import gzip
import pandas as pd
import numpy as np
from numpy import array
import ntpath

from Bio import SeqIO
from glob import glob
from itertools import product

from functools import partial
from multiprocessing import Pool
import os.path
import pickle

from sklearn.preprocessing import normalize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RepeatedStratifiedKFold

import config_file

# Pull the project directory settings out of config_file in one step so the
# rest of the notebook can refer to them by short module-level names.
(
    species_directory,
    data_directory,
    analysis_directory,
    scripts_directory,
) = (
    config_file.species_directory,
    config_file.data_directory,
    config_file.analysis_directory,
    config_file.scripts_directory,
)

In [2]:
# One entry per cohort, each shaped [[name], [name]].
# NOTE(review): the two inner lists are presumably [training sets, test sets];
# only the first one is consumed by the loading cell below - confirm.
data_sets_to_use = [
    [[cohort], [cohort]]
    for cohort in ('MetaHIT', 'Qin_et_al', 'Zeller_2014', 'LiverCirrhosis')
]

In [3]:
def _split_labels_and_features(intersection, label_column):
    """Split a merged label/abundance table into arrays.

    Parameters
    ----------
    intersection : pandas.DataFrame
        Result of merging the label table with the abundance table. The label
        column is expected to be the first column; everything after it is
        treated as a feature column.
    label_column : str
        Name of the column holding the class labels.

    Returns
    -------
    tuple
        ``(species_cnts, labels, features)``: the feature values as a NumPy
        array, the labels cast to integers, and the feature column names.
    """
    # Builtin int instead of np.int: the alias was removed in NumPy 1.24.
    labels = np.asarray(intersection[label_column]).astype(int)
    # Drop the leading label column; the remainder are the abundance columns.
    species = intersection.iloc[:, 1:]
    return species.values, labels, list(species.columns)


def load_species(data_sets):
    """Load abundances and binary labels for the first recognised data set.

    The names in ``data_sets`` are visited in order (each one is printed).
    Names that are not recognised are skipped, and the function returns as
    soon as one recognised data set has been loaded, so later entries are
    never read.

    Recognised names: ``'MetaHIT'``, ``'Qin_et_al'``, ``'Zeller_2014'`` and
    ``'LiverCirrhosis'``. Files are read from the module-level
    ``species_directory`` and ``data_directory`` paths.

    Parameters
    ----------
    data_sets : iterable of str
        Candidate data set names.

    Returns
    -------
    tuple
        ``(species_cnts, labels, features)`` where ``species_cnts`` is the
        array of abundance values (one row per sample), ``labels`` is an
        integer array with one entry per sample, and ``features`` is the list
        of abundance column names.

    Raises
    ------
    ValueError
        If ``data_sets`` contains no recognised name. (Previously this case
        silently returned ``None`` and failed later at the call site.)
    """
    for data_set in data_sets:
        print(data_set)

        if data_set == "MetaHIT":
            cnts = pd.read_csv(species_directory + 'MetaHIT_ids.spcsAbundance.noNA.txt', sep='\t')
            df_labels = pd.read_csv(data_directory + "metadata/MetaHIT_ids.txt", sep='\t')
            # Normalise the ids so they can match the abundance table index.
            # regex=False makes '.' a literal dot on every pandas version
            # instead of depending on the version-specific default.
            df_labels['subject_id'] = (
                df_labels['subject_id']
                .str.replace('.', '_', regex=False)
                .str.replace('-', '_', regex=False)
            )
            labels = df_labels[['subject_id', 'ibd']].drop_duplicates().set_index("subject_id")
            intersection = pd.merge(labels, cnts, how='inner', left_index=True, right_index=True)
            label_column = 'ibd'

        elif data_set == 'Qin_et_al':
            cnts = pd.read_csv(species_directory + 'Qin_2012_ids_all.spcsAbundance.noNA.txt', sep='\t')
            df_labels = pd.read_csv(data_directory + "metadata/Qin_2012_ids_all.txt", sep='\t')
            labels = df_labels[['subject_id', 't2d']].drop_duplicates().set_index("subject_id")
            intersection = pd.merge(labels, cnts, how='inner', left_index=True, right_index=True)
            label_column = 't2d'

        elif data_set == 'Zeller_2014':
            cnts = pd.read_csv(species_directory + 'Zeller.spcsAbundance.noNA.txt', sep='\t')
            df_labels = pd.read_csv(data_directory + "metadata/Zeller_metadata.txt", sep='\t')
            labels = df_labels[['Sample ID', 'Group']].drop_duplicates().set_index("Sample ID")
            # Categories are renamed positionally: the first category becomes
            # '1' and the second '0'.
            # NOTE(review): assumes exactly two groups - confirm which group
            # ends up first against the metadata file.
            labels['Group'] = labels['Group'].astype('category').cat.rename_categories(['1', '0'])
            intersection = pd.merge(labels, cnts, how='inner', left_index=True, right_index=True).dropna()
            label_column = 'Group'

        elif data_set == 'LiverCirrhosis':
            cnts = pd.read_csv(species_directory + 'LiverCirrhosis.spcsAbundance.noNA.txt', sep='\t')
            df_labels = pd.read_csv(data_directory + "metadata/LiverCirrhosis.txt", sep='\t')
            labels = df_labels[['Sample ID', 'Cirrhotic(Y or N)']].drop_duplicates().set_index("Sample ID")
            # Positional rename: first category -> '0', second -> '1'.
            # NOTE(review): assumes exactly two categories - confirm.
            labels['Group'] = labels['Cirrhotic(Y or N)'].astype('category').cat.rename_categories(['0', '1'])
            labels = labels.drop(['Cirrhotic(Y or N)'], axis=1)
            intersection = pd.merge(labels, cnts, how='inner', left_index=True, right_index=True).dropna()
            label_column = 'Group'

        else:
            # Unrecognised names are skipped so a later entry can still match.
            continue

        return _split_labels_and_features(intersection, label_column)

    raise ValueError(
        "No recognised data set in " + repr(data_sets) + "; expected one of "
        "'MetaHIT', 'Qin_et_al', 'Zeller_2014', 'LiverCirrhosis'"
    )

In [4]:
# Sanity check: load each configured cohort (using the first list of every
# entry in data_sets_to_use) and report how many samples and features it has.
for data_set in (entry[0] for entry in data_sets_to_use):
    species_cnts, labelz, feats = load_species(data_set)
    print(f"LOADED DATASET {data_set[0]}: {len(species_cnts)} SAMPLES")
    print(species_cnts.shape)
    print(labelz.shape)
    print(f"{len(feats)} features\n")

MetaHIT
LOADED DATASET MetaHIT: 110 SAMPLES
(110, 3302)
(110,)
3302 features

Qin_et_al
LOADED DATASET Qin_et_al: 271 SAMPLES
(271, 3302)
(271,)
3302 features

Zeller_2014
LOADED DATASET Zeller_2014: 121 SAMPLES
(121, 3302)
(121,)
3302 features

LiverCirrhosis
LOADED DATASET LiverCirrhosis: 232 SAMPLES
(232, 3302)
(232,)
3302 features

