In [1]:
import os
import pandas as pd
import numpy as np
import csv

from itertools import cycle, product
import argparse
import warnings

from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, RepeatedStratifiedKFold, train_test_split
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.decomposition import NMF
from sklearn.exceptions import ConvergenceWarning

import matplotlib.pyplot as plt
plt.switch_backend('Agg') # this suppresses the console for plotting  
import seaborn as sns

# import private scripts
import load_kmer_cnts_jf
import stats_utils_AEB

In [2]:
# Output location (the dated sub-directory must already exist).
date = 'testing_120418/'
output_dir = '{home}/deep_learning_microbiome/analysis/kmers/linear/{date}'.format(
    home=os.environ['HOME'], date=str(date))

# Silence convergence warnings for the rest of the session.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Run parameters start out unset; several are assigned by later cells.
#   cv_gridsearch - number of folds for grid search
#   cv_testfolds  - number of folds for actual training of the best model
#   n_iter        - number of iterations of cross-validation
kmer_size = cv_gridsearch = cv_testfolds = n_iter = random_state = None

In [3]:
# Data sets to be tested, one [first_list, second_list] pair per entry.
# Intended use: healthy samples are extracted from the first list, diseased
# samples from the second, and the two sets are then combined.
# Currently both lists of a pair name the same data set.
data_sets_to_use = [
    [[name], [name]]
    for name in (
        'MetaHIT',
        'Qin_et_al',
        #'Zeller_2014',
        #'LiverCirrhosis',
        #'Karlsson_2013_no_adapter',
        #'RA_no_adapter',
        #'LeChatelier',
        #'Feng',
    )
]

In [4]:
# Hyper-parameter grids, keyed by the short model name used in `learn_type`.
param_dict = dict(
    # random forest
    rf=dict(
        n_estimators=[400],
        criterion=["gini"],
        max_features=["sqrt"],
        max_depth=[None],
        min_samples_split=[2],
        n_jobs=[1],
    ),
    # L1-penalised logistic regression
    lassoLR=dict(
        penalty=["l1"],
        solver=["saga", "liblinear"],
    ),
)

# Figuring out the target distribution

In [None]:
# Notebook-only overrides so this cell can be run interactively.
kmer_size = 5
n_factor = 5

# NOTE: x and y are overwritten on every iteration, so after the loop they
# describe only the LAST entry of data_sets_to_use.
for data_set in data_sets_to_use:
    # Only the first list of each pair is handed to the loader.
    data_set = data_set[0]

    # Load k-mer counts and labels, restricted to the allowed labels
    allowed_labels = ['0', '1']
    kmer_cnts, accessions, labelz, domain_labels = load_kmer_cnts_jf.load_kmers(kmer_size, data_set, allowed_labels)
    print("LOADED DATASET " + str(data_set[0]) + ": " + str(len(kmer_cnts)) + " SAMPLES")
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    labelz = np.asarray(labelz).astype(int)

    # n == 0 keeps the normalised k-mer features; any other n replaces them
    # with n NMF factors. Only n == 0 is exercised here.
    for n in [0]:
        # Row-wise L1 normalisation is shared by both variants.
        data_normalized = normalize(kmer_cnts, axis=1, norm='l1')
        if n != 0:
            data_normalized = stats_utils_AEB.NMF_factor(data_normalized, kmer_size, n_components=int(n),
                                                         title=(str(data_set) + str(kmer_size) + "mers"
                                                                + str(n) + "factors"))
        # Fixed seed so the sample order is reproducible.
        data_normalized, labels = shuffle(data_normalized, labelz, random_state=0)
        x = data_normalized
        y = labels

In [None]:
# Display the k-mer counts returned by the loader for the last data set loaded.
kmer_cnts

In [None]:
# Class balance of the label vector: distinct labels, their counts, and the
# share of the second label among the first two.
targets, counts = np.unique(y, return_counts=True)

print(targets, counts, sep="\n")
print(counts[1] / (counts[1] + counts[0]))

# RF

In [26]:
# Notebook-only overrides so this cell can be run interactively.
learn_type = 'rf'
cv_gridsearch = 10   # folds per repeat
n_iter_grid = 1      # number of repeats
#n_splits = 5
#n_repeats = 1

param_grid = param_dict[learn_type]

# Metrics evaluated for every grid candidate; the keys become the suffixes of
# the cv_results_ entries (e.g. 'mean_test_AUC').
scoring = {
    'Acc': 'accuracy',
    'AUC': 'roc_auc',
    'F1': 'f1', # weighted average of precision and recall
    'Precision': 'precision',
    'Recall': 'recall'
}

# `random_state` comes from the configuration cell. It was previously unused,
# so neither the forest nor the CV splits could be seeded; None (the current
# setting) keeps the old unseeded behaviour.
# n_estimators=500 is only a base value: the grid's own n_estimators is the
# one actually used for every candidate.
estimator = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=2, n_jobs=1,
                                   random_state=random_state)
k_fold = RepeatedStratifiedKFold(n_splits=cv_gridsearch, n_repeats=n_iter_grid, random_state=random_state)
# refit=False: only the CV scores are wanted, no final model is trained.
grid_search = GridSearchCV(estimator, param_grid, cv=k_fold, n_jobs=1, scoring=scoring, refit=False)
grid_search = grid_search.fit(x, y)
grid_search_results = grid_search.cv_results_

In [27]:
# Display the raw cv_results_ dictionary of the RF grid search.
grid_search_results



{'mean_fit_time': array([0.31941969]),
 'std_fit_time': array([0.02139739]),
 'mean_score_time': array([0.11171954]),
 'std_score_time': array([0.02184605]),
 'param_criterion': masked_array(data=['gini'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[None],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=['sqrt'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[400],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_jobs': masked_array(data=[1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'criterion': 'gini',
   'max_depth': None,
   'max_features': 'sq

In [28]:
# Number of trees used by each grid candidate, as a plain array.
trees = np.array(grid_search_results['param_n_estimators'])

In [29]:
# Rank of every grid candidate under each metric, taken from cv_results_.
rank_auc, rank_acc, rank_f1 = (
    np.array(grid_search_results['rank_test_' + metric])
    for metric in ('AUC', 'Acc', 'F1')
)

In [31]:
# Mean test score of every grid candidate, one array per metric.
aucs, accuracies, f1s, precisions, recalls = (
    np.array(grid_search_results['mean_test_' + metric])
    for metric in ('AUC', 'Acc', 'F1', 'Precision', 'Recall')
)

In [None]:
# Parameter dictionaries of the grid candidates, in the same order as the score arrays.
all_params = np.array(grid_search_results['params'])

In [None]:
# Write one CSV row per RF grid-search candidate.
#
# `graph_dir` is not defined anywhere in this notebook, so the original cell
# raised NameError; `output_dir` from the configuration cell is used instead,
# matching the LassoLR summary cell further down. The target directory is
# created if missing so open() cannot fail on a fresh checkout.
summ_dir = output_dir + '101418_can_delete/'
os.makedirs(summ_dir, exist_ok=True)

with open(summ_dir + 'summ_table.csv', 'w', newline='') as csvfile:
    fieldnames = ['dataset', 'kmer_size', 'n_splits', 'n_repeats', 'acc', 'auc', 'precision', 'recall', 'f1',
                  'model', 'NMF_factors', 'params']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for i in range(len(aucs)):
        # Use a separate name here: rebinding `param_grid` (as the original
        # did) silently replaced the grid used by the grid-search cell.
        params = all_params[i]
        criterion = params["criterion"]
        max_depth = params["max_depth"]
        max_features = params["max_features"]
        min_samples_split = params["min_samples_split"]
        n_estimators = params["n_estimators"]
        n_jobs = 1
        # Unfitted estimator configured like candidate i (not used further in this cell).
        current_estimator = RandomForestClassifier(criterion=criterion, max_depth=max_depth,
                                                   max_features=max_features, min_samples_split=min_samples_split,
                                                   n_estimators=n_estimators, n_jobs=n_jobs)

        # results saved to csv
        writer.writerow({
            'dataset': str(data_set),
            'kmer_size': kmer_size,
            'n_splits': cv_gridsearch,
            'n_repeats': n_iter_grid,
            'acc': accuracies[i],
            'auc': aucs[i],
            'precision': precisions[i],
            'recall': recalls[i],
            'f1': f1s[i],
            'model': learn_type,
            'NMF_factors': n_factor,
            'params': str(all_params[i])
        })

In [None]:
# Read the RF summary table back for inspection.
# `graph_dir` is not defined anywhere in this notebook (NameError), so the
# path is built from `output_dir`, the directory set in the configuration cell.
df_known_summ = pd.read_csv(output_dir + '101418_can_delete/summ_table.csv')
df_known_summ

# Trying shap

In [None]:
# load JS visualization code to notebook
# NOTE(review): `shap` is not imported in the import cell at the top of this
# notebook — confirm where it is brought into scope before running this cell.
shap.initjs()

In [None]:
# Default-parameter random forest fitted on the current x / y (input to the SHAP cells).
clf = RandomForestClassifier().fit(x, y)

In [None]:
# Build a SHAP tree explainer for the fitted forest and compute SHAP values for x.
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(x)

In [None]:
# Summary plot of the SHAP values against the feature matrix x.
# NOTE(review): the import cell switches matplotlib to the 'Agg' backend, so
# this figure may not render inline — confirm.
shap.summary_plot(shap_values, x)

# LassoLR

In [5]:
# Notebook-only overrides for the L1 logistic-regression run:
# model key, folds per repeat, number of repeats.
learn_type, cv_gridsearch, n_iter_grid = 'lassoLR', 10, 20

param_grid = param_dict[learn_type]

# Metrics of interest for this model.
scoring = dict(Acc='accuracy', AUC='roc_auc')

In [6]:
kmer_size = 5
n_factor = 5

# NOTE: every iteration overwrites kmer_cnts / labelz, so after the loop they
# hold only the LAST entry of data_sets_to_use.
for data_set in data_sets_to_use:
    # Only the first list of each pair is handed to the loader.
    data_set = data_set[0]

    # Load k-mer counts and labels, restricted to the allowed labels
    allowed_labels = ['0', '1']
    kmer_cnts, accessions, labelz, domain_labels = load_kmer_cnts_jf.load_kmers(kmer_size, data_set, allowed_labels)
    print("LOADED DATASET " + str(data_set[0]) + ": " + str(len(kmer_cnts)) + " SAMPLES")
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    labelz = np.asarray(labelz).astype(int)

MetaHIT
LOADED DATASET MetaHIT: 110 SAMPLES
Qin_et_al
LOADED DATASET Qin_et_al: 344 SAMPLES


In [7]:
# Build the model inputs. n == 0 keeps the normalised k-mer features, any
# other n replaces them with n NMF factors. x / y are overwritten on every
# pass, so they end up holding the result for the last n in the list.
for n in [0, 5]:
    # Row-wise L1 normalisation is shared by both variants.
    data_normalized = normalize(kmer_cnts, axis=1, norm='l1')
    if n != 0:
        nmf_title = str(data_set) + str(kmer_size) + "mers" + str(n) + "factors"
        data_normalized = stats_utils_AEB.NMF_factor(data_normalized, kmer_size,
                                                     n_components=int(n), title=nmf_title)
    # Fixed seed keeps the sample order reproducible.
    data_normalized, labels = shuffle(data_normalized, labelz, random_state=0)
    x, y = data_normalized, labels

In [12]:
# Repeated stratified CV splitter shared by the LogisticRegressionCV fits below.
# `random_state` comes from the configuration cell and was previously unused,
# so the splits could not be seeded; None keeps the old unseeded behaviour.
k_fold = RepeatedStratifiedKFold(n_splits=cv_gridsearch, n_repeats=n_iter_grid, random_state=random_state)

In [22]:
# L1 logistic regression with C chosen by cross-validated ROC AUC.
estimator_auc = LogisticRegressionCV(
    cv=k_fold,
    penalty='l1',
    solver='saga',
    scoring='roc_auc',
    n_jobs=1,
).fit(x, y)

In [23]:
# L1 logistic regression with C chosen by cross-validated accuracy.
estimator_acc = LogisticRegressionCV(
    cv=k_fold,
    penalty='l1',
    solver='saga',
    scoring='accuracy',
    n_jobs=1,
).fit(x, y)

In [52]:
# L1 logistic regression with C chosen by cross-validated F1.
# This model used to be stored in `estimator_acc`, overwriting the
# accuracy-scored model; run top to bottom, the `acc` computed from
# `estimator_acc` further down would then have been an F1 score.
estimator_f1 = LogisticRegressionCV(cv=k_fold, penalty='l1', scoring='f1', solver='saga', n_jobs=1).fit(x, y)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [48]:
# Best mean ROC AUC: average the class-1 score grid along axis 0, then take the maximum.
# (presumably axis 0 indexes CV folds and the remaining axis the candidate C values — confirm)
auc = np.max(np.mean(estimator_auc.scores_[1], axis=0))

In [49]:
# Best mean score of `estimator_acc`: average the class-1 score grid along axis 0, then take the maximum.
# (presumably axis 0 indexes CV folds and the remaining axis the candidate C values — confirm)
acc = np.max(np.mean(estimator_acc.scores_[1], axis=0))

In [51]:
# One-line run summary; the "params" field is the fixed solver name.
print("".join([
    ">auroc:", str(auc),
    " acc:", str(acc),
    " dataset:", str(data_set),
    " model:", learn_type,
    " NMFfactors: ", str(n_factor),
    " kmer_size:", str(kmer_size),
    #" no_trees:", str(trees[i]),
    " params:", str('saga'),
]))

>auroc:0.6580728565936177 acc:0.6049537815126055 dataset:['Qin_et_al'] model:rf NMFfactors: 5 kmer_size:5 params:saga


In [55]:
# Write the single LassoLR summary row to CSV.
# The target directory is created first: open() raises FileNotFoundError when
# the sub-directory does not exist yet.
summ_dir = output_dir + '121018_can_delete/'
os.makedirs(summ_dir, exist_ok=True)

with open(summ_dir + 'summ_table.csv', 'w', newline='') as csvfile:
    fieldnames = ['dataset', 'kmer_size', 'n_splits', 'n_repeats', 'acc', 'auc', 'precision', 'recall', 'f1',
                  'model', 'NMF_factors', 'params']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # results saved to csv; precision / recall / f1 are not computed for this
    # model, so DictWriter leaves those columns empty.
    writer.writerow({
        'dataset': str(data_set),
        'kmer_size': kmer_size,
        'n_splits': cv_gridsearch,
        'n_repeats': n_iter_grid,
        'acc': acc,
        'auc': auc,
        'model': learn_type,
        'NMF_factors': n_factor,
        'params': 'saga'
    })

In [41]:
#save best models
# Prints one summary line per grid-search candidate.
# NOTE(review): aucs / accuracies / f1s / recalls / precisions / all_params are
# not assigned in the LassoLR cells shown in this notebook; run top to bottom
# they still hold the RF grid-search results, whose params have no "solver" or
# "penalty" key — confirm which grid search is meant to feed this cell.
for i in range(len(aucs)):
    # Rebinds the global `param_grid` to a single candidate's parameters.
    param_grid = all_params[i]
    n_jobs = 1
    solver = param_grid["solver"]
    penalty = param_grid["penalty"]
    # Unfitted estimator configured like candidate i; it is neither fitted nor stored in this cell.
    current_estimator = LogisticRegression(solver=solver, penalty=penalty, n_jobs=n_jobs)
        
    print(
        ">auroc:" + str(aucs[i]) + 
        " acc:" + (str(accuracies[i]) + 
        " f1:" + (str(f1s[i]) +
        " recall:" + str(recalls[i]) +
        " precision:" + str(precisions[i]) +
        " dataset:" + str(data_set) +
        " model:" + learn_type +  
        " NMFfactors: "+ str(n_factor) + 
        " kmer_size:" + str(kmer_size) + 
        #" no_trees:" + str(trees[i]))) +
        " params:" + str(all_params[i]))))

>auroc:0.5 acc:0.77 f1:0.0021818181818181823 recall:0.005454545454545455 precision:0.0013636363636363637 dataset:['MetaHIT'] model:lassoLR NMFfactors: 5 kmer_size:5 params:{'penalty': 'l1', 'solver': 'saga'}
>auroc:0.5 acc:0.7727272727272727 f1:0.0 recall:0.0 precision:0.0 dataset:['MetaHIT'] model:lassoLR NMFfactors: 5 kmer_size:5 params:{'penalty': 'l1', 'solver': 'liblinear'}


# With LassoCV instead

In [22]:
# Notebook-only overrides for this section:
# model key, folds per repeat, number of repeats.
learn_type, cv_gridsearch, n_iter_grid = 'lassoLR', 10, 1

param_grid = param_dict[learn_type]

# Metrics of interest for this model.
scoring = dict(Acc='accuracy', AUC='roc_auc')

In [23]:
kmer_size = 5
n_factor = 25

# NOTE: every iteration overwrites kmer_cnts / labelz, so after the loop they
# hold only the LAST entry of data_sets_to_use.
for data_set in data_sets_to_use:
    # Only the first list of each pair is handed to the loader.
    data_set = data_set[0]

    # Load k-mer counts and labels, restricted to the allowed labels
    allowed_labels = ['0', '1']
    kmer_cnts, accessions, labelz, domain_labels = load_kmer_cnts_jf.load_kmers(kmer_size, data_set, allowed_labels)
    print("LOADED DATASET " + str(data_set[0]) + ": " + str(len(kmer_cnts)) + " SAMPLES")
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    labelz = np.asarray(labelz).astype(int)

MetaHIT
LOADED DATASET MetaHIT: 110 SAMPLES


In [24]:
# Build the model inputs. n == 0 keeps the normalised k-mer features, any
# other n replaces them with n NMF factors. x / y are overwritten on every
# pass, so they end up holding the result for the last n in the list.
for n in [0, 25]:
    # Row-wise L1 normalisation is shared by both variants.
    data_normalized = normalize(kmer_cnts, axis=1, norm='l1')
    if n != 0:
        nmf_title = str(data_set) + str(kmer_size) + "mers" + str(n) + "factors"
        data_normalized = stats_utils_AEB.NMF_factor(data_normalized, kmer_size,
                                                     n_components=int(n), title=nmf_title)
    # Fixed seed keeps the sample order reproducible.
    data_normalized, labels = shuffle(data_normalized, labelz, random_state=0)
    x, y = data_normalized, labels

In [25]:
# Parameter grid for the current learner (same lookup as in this section's setup cell).
param_grid = param_dict[learn_type]

In [26]:
# Metrics of interest: display name -> scikit-learn scorer string.
scoring = dict(Acc='accuracy', AUC='roc_auc')

In [27]:
# Repeated stratified CV splitter for the LogisticRegressionCV fit below.
# `random_state` comes from the configuration cell and was previously unused,
# so the splits could not be seeded; None keeps the old unseeded behaviour.
k_fold = RepeatedStratifiedKFold(n_splits=cv_gridsearch, n_repeats=n_iter_grid, random_state=random_state)

In [28]:
# Unfitted L1 logistic regression; C is selected by cross-validated ROC AUC.
estimator = LogisticRegressionCV(
    cv=k_fold,
    penalty='l1',
    solver='saga',
    scoring='roc_auc',
)

In [29]:
# Fit on the shuffled features/labels; as the cell's last expression, the fitted estimator is also echoed.
estimator.fit(x,y)

LogisticRegressionCV(Cs=10, class_weight=None,
           cv=<sklearn.model_selection._split.RepeatedStratifiedKFold object at 0x7faeb4f57048>,
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
           random_state=None, refit=True, scoring='roc_auc', solver='saga',
           tol=0.0001, verbose=0)

In [30]:
# Inspect the cross-validation score grid, a dict keyed by class label
# (presumably one row per CV fold and one column per candidate C — confirm).
estimator.scores_

{1: array([[0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
         0.5       , 0.59259259, 0.44444444, 0.40740741, 0.44444444],
        [0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
         0.5       , 0.62962963, 0.62962963, 0.62962963, 0.62962963],
        [0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
         0.5       , 0.51851852, 0.44444444, 0.44444444, 0.40740741],
        [0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
         0.5       , 0.81481481, 0.7037037 , 0.77777778, 0.77777778],
        [0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
         0.5       , 0.62962963, 0.81481481, 0.81481481, 0.77777778],
        [0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
         0.5       , 0.5       , 1.        , 1.        , 1.        ],
        [0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
         0.5       , 0.3125    , 0.625     , 0.6875    , 0.6875    ],
        [0.5       , 0.5