In [1]:
import matplotlib
matplotlib.use('Agg') # this suppresses the console for plotting
import matplotlib.pyplot as plt

In [2]:
import bz2
import numpy as np
from numpy import random, array
import pandas as pd
import os
import os.path
import pylab
import importlib
import imp
from importlib import reload
import gzip
import ntpath
from Bio import SeqIO
from glob import glob
from itertools import product
from functools import partial
from multiprocessing import Pool
import pickle
from scipy import interp
import argparse

In [3]:
from sklearn.preprocessing import normalize, label_binarize, LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.utils import shuffle
from sklearn.model_selection import RepeatedStratifiedKFold

In [4]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.callbacks import History, TensorBoard
from keras import backend as K
backend = K.backend()

Using TensorFlow backend.


In [5]:
# import our private scripts
import load_spAB
import deep_learning_models
import plotting_utils_AEB
import stats_utils_AEB_110718
import config_file_AEB

species_directory = config_file_AEB.species_directory
data_directory = config_file_AEB.data_directory
analysis_directory = config_file_AEB.analysis_directory  
scripts_directory = config_file_AEB.scripts_directory 

In [14]:
def run_model(data_set, norm_input, encoding_dim, encoded_activation, input_dropout_pct, dropout_pct, num_epochs, batch_size, n_splits, n_repeats, compute_informative_features, plot_iteration, graph_dir, outFile):
    
    # format strings for outputting the paramters associated with this run:
    summary_string, plotting_string= stats_utils_AEB_110718.format_input_parameters_printing(data_set, norm_input, encoding_dim, encoded_activation,input_dropout_pct,dropout_pct,num_epochs,batch_size,n_splits,n_repeats,compute_informative_features,plot_iteration)

    outFile_header='data_set\tnorm_input\tencoding_dim\tencoded_activation\tinput_dropout_pct\tdropout_pct\tnum_epochs\tbatch_size\tn_splits\tn_repeats\t'

    #################
    # Load the data # 
    #################
    print('Loading data...')

    data_normalized, labels, rskf = load_spAB.load_single_disease(data_set, n_splits, n_repeats, precomputed_kfolds=False)
    
    print("Dimensions of normalized species cnts: " + str(data_normalized.shape))
    print("Dimensions of labels: " + str(labels.shape))
    # rskf = repeated stratified k fold. This contains all the kfold-by-iteration combos. 


    ###################################################
    # iterate through the data kfolds and iterations #
    ###################################################

    # Create a dictionary to store the metrics of each fold 
    aggregated_statistics={} # key=n_repeat, values= dictionary with stats
    
    #needed to change datatype to list bc I was getting - TypeError: 'generator' object is not subscriptable
    rskf = list(rskf)

    for n_repeat in range(0,len(rskf[0])):
        
        print('Iteration %s...' %n_repeat)
        
        aggregated_statistics[n_repeat] = {}
        
        train_idx = rskf[0][n_repeat]
        test_idx = rskf[1][n_repeat]
        x_train, y_train = data_normalized[train_idx], labels[train_idx]
        x_test, y_test = data_normalized[test_idx], labels[test_idx]
    
        #standardize the data, mean=0, std=1
        if norm_input:
            x_train, x_test= stats_utils_AEB_110718.standardize_data(x_train, x_test)
    
        ###########################################
        # set up a model (supervised learning)    #
        ###########################################
        # note that the model has to be instantiated each time a new fold is started otherwise the weights will not start from scratch. 
    
        input_dim=len(data_normalized[0]) # this is the number of input kmers

        model=deep_learning_models.create_supervised_model(input_dim, encoding_dim, encoded_activation,input_dropout_pct, dropout_pct)
    
        #weightFile = os.environ['HOME'] + '/deep_learning_microbiome/data/weights.txt'
       
        ##################################################
        # Fit the model with the train data of this fold #
        ##################################################
        history = History()
        # history is a dictionary. To get the keys, type print(history.history.keys())
        
        model.fit(x_train, y_train, 
                  epochs=num_epochs, 
                  batch_size=len(x_train), 
                  shuffle=True,
                  validation_data=(x_test, y_test),
                  verbose=0,
                  callbacks=[history])
    
        # predict using the held out data
        y_pred=model.predict(x_test)
        
        # save the weights of this model. TODO 
    
        ################################################################
        # Compute summary statistics                                   #
        ################################################################
        # Store the results of this fold in aggregated_statistics
        aggregated_statistics = stats_utils_AEB_110718.compute_summary_statistics(y_test, y_pred, history, aggregated_statistics, n_repeat)

        # could  plot everything (roc, accuracy vs epoch, loss vs epoch, confusion matrix, precision recall) for each fold, but this will produce a lot of graphs. 
        if compute_informative_features:
            shap_values, shap_values_summed = stats_utils_AEB_110718.compute_shap_values_deeplearning(input_dim, model, x_test)
            aggregated_statistics[n_repeat]['shap_values_summed']=shap_values_summed
            aggregated_statistics[n_repeat]['shap_values']=shap_values

        # also plot:
        #shap.summary_plot(shap_values, X, plot_type="bar")
        #shap.summary_plot(shap_values, X)

    ##############################################
    # aggregate the results from all the k-folds #
    # Print and Plot                             #
    ##############################################
    print('Aggregating statistics across iterations and printing/plotting...')

    stats_utils_AEB_110718.aggregate_statistics_across_folds(aggregated_statistics, rskf, n_splits, outFile, summary_string, plotting_string, outFile_header)


    ###################
    # Aggregate shap: #
    ###################

    if compute_informative_features: 
        print('Computing informative features with Shap...')
        stats_utils_AEB_110718.aggregate_shap(aggregated_statistics, rskf)


    #####################################
    # TSNE visualization                #
    # Annamarie                         #
    # find the weights of the best fold #
    #####################################




##############################
# parser for the config dict #
##############################
def parse_config_and_run(config_dict, outFile):
    data_sets=config_dict['data_set']
    #kmer_sizes=config_dict['kmer_size']
    norm_inputs=config_dict['norm_input']
    encoding_dims=config_dict['encoding_dim']
    encoded_activations=config_dict['encoded_activation']
    input_dropout_pcts=config_dict['input_dropout_pct']
    dropout_pcts=config_dict['dropout_pct'] 
    num_epochss=config_dict['num_epochs']
    batch_sizes=config_dict['batch_size']
    n_splitss=config_dict['n_splits']
    n_repeatss=config_dict['n_repeats']
    compute_informative_featuress=config_dict['compute_informative_features']
    plot_iterations=config_dict['plot_iteration'] 
    graph_dirs=config_dict['graph_dir'] 

    for data_set in data_sets:
        #for kmer_size in kmer_sizes:
        for norm_input in norm_inputs:
            for encoding_dim in encoding_dims:
                for encoded_activation in encoded_activations:
                    for input_dropout_pct in input_dropout_pcts:
                        for dropout_pct in dropout_pcts:
                            for num_epochs in num_epochss:
                                for batch_size in batch_sizes:
                                    for n_splits in n_splitss:
                                        for n_repeats in n_repeatss:
                                            for compute_informative_features in compute_informative_featuress:
                                                for plot_iteration in plot_iterations:
                                                    for graph_dir in graph_dirs:
                                                        
                                                        run_model(data_set, 
                                                                      #kmer_size,
                                                                      norm_input,
                                                                      encoding_dim,
                                                                      encoded_activation,
                                                                      input_dropout_pct,
                                                                      dropout_pct,
                                                                      num_epochs,
                                                                      batch_size,
                                                                      n_splits,
                                                                      n_repeats,
                                                                      compute_informative_features,
                                                                      plot_iteration,
                                                                      graph_dir, 
                                                                      outFile)

In [7]:
# making sure loader is working

data_set = ['MetaHIT']
species_cnts, labelz, feats = load_spAB.load_species(data_set)
print("LOADED DATASET " + str(data_set) + ": " + str(len(species_cnts)) + " SAMPLES")

# Checking the data
np.count_nonzero(species_cnts==0, axis = 1) # a lot of nonzeros
np.any(np.isnan(species_cnts)) # returns False
np.all(np.isfinite(species_cnts)) # returns True

MetaHIT
LOADED DATASET ['MetaHIT']: 110 SAMPLES


True

In [8]:
feats

In [15]:
config_dict=config_file_AEB.config
#for data_set in data_sets_to_use:
parse_config_and_run(config_dict, outFile="summary_statistics_11718.txt")

Parameters being tested:
MetaHIT
Normalize input? False
Encoding dim: 8
Encoded activation: sigmoid
Input dropout percent: 0
Dropout percent: 0
Num epochs: 400
Batch size: 16
n_splits (k-folds): 10
n_repeats (iterations): 5
Compute infromative features with Shap? False
Plots for each iteration? False

Loading data...
MetaHIT
Dimensions of normalized species cnts: (110, 3302)
Dimensions of labels: (110,)
Iteration 0...


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Iteration 1...
Aggregating statistics across iterations and printing/plotting...
Saving summary statistics to file /pollard/home/abustion/deep_learning_microbiome/analysis/summary_statistics_11718.txt
