Commit
vica train and vica evaluate were refactored and now work
Adam R. Rivers committed Dec 11, 2017
1 parent a5fc9b5 commit fe7540a
Showing 6 changed files with 121 additions and 71 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
@@ -4,8 +4,8 @@ python:
- "3.6"
# command to install dependencies
before_install:
- wget https://sourceforge.net/projects/bbmap/files/BBMap_37.68.tar.gz -O /tmp/BBMap_37.68.tar.gz
- tar -xvf /tmp/BBMap_37.68.tar.gz
- wget https://sourceforge.net/projects/bbmap/files/BBMap_37.75.tar.gz -O /tmp/BBMap_37.75.tar.gz
- tar -xvf /tmp/BBMap_37.75.tar.gz
- export PATH=$PATH:$PWD/bbmap
- wget https://github.com/hyattpd/Prodigal/archive/v2.6.3.tar.gz -O /tmp/Prodigalv2.6.3.tar.gz
- tar -xvf /tmp/Prodigalv2.6.3.tar.gz
2 changes: 1 addition & 1 deletion vica/__init__.py
@@ -13,4 +13,4 @@
 from .split_shred import *
 from .tfrecord_maker import *
 from .vica_cli import *
-# from .train_eval import *
+from .train_eval import *
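Re-enabling this wildcard import exposes the refactored module at the package level. A minimal sketch of what that changes for callers; the argument values here are made up for illustration:

    # Sketch: after this commit the submodule and its public names are
    # importable from the top-level package (argument values are hypothetical).
    import vica

    vica.train_eval.train(infiles="train.tfrecord",
                          out="saved_model_dir",
                          modeldir="model_dir",
                          n_classes=2,
                          configpath="vica/data/config_default.yml")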
4 changes: 2 additions & 2 deletions vica/data/config_default.yml
@@ -40,11 +40,11 @@ minhash:
 # Taxfilter, a file generated by vica split containing the taxids in the test set
 taxfilter: /Users/rivers/Documents/vica_docs/testtrain1/test/test_taxids.txt
 # taxfilterlevel: level to exclude taxa related to the sequences in taxfilter
-taxfilterlevel: "order" # ["species", "genus", "family", "order","class","phylum"]
+taxfilterlevel: "genus" # ["species", "genus", "family", "order","class","phylum"]
 # parameters for the get_features module
 memory: "-Xmx14g"
 get_features:
-tempdir: "/Users/rivers/Desktop/gftd"
+tempdir: "/Users/rivers/Desktop/gftd3"
 # parameters for the classify module
 classifier: null
 train_eval:
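The train_eval block itself is collapsed in this view. A short sketch of how the refactored code consumes this file; the key paths assume the nesting implied by the hunk header ("minhash:") and the lookups that train_eval.py performs below, and yaml.safe_load stands in for the yaml.load call used in the diff:

    # Sketch: reading config_default.yml the way the refactored code does.
    import yaml

    with open("vica/data/config_default.yml") as cf:
        config = yaml.safe_load(cf)

    level = config["minhash"]["taxfilterlevel"]            # "genus" after this commit
    ksize = config["khmer_features"]["ksize"]              # k-mer size for _featureshape
    codonlength = config["train_eval"]["codonlength"]      # feature widths read below
    minhashlength = config["train_eval"]["minhashlength"]
    batch = config["train_eval"]["train_batch_size"]       # batching/epochs for input_fn
    epochs = config["train_eval"]["epochs"]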
2 changes: 1 addition & 1 deletion vica/minhash.py
@@ -111,7 +111,7 @@ def _parse_comparesketch(file):
             next
         elif line.startswith("Query:"):
             ll = line.strip().split("\t")
-            key1 = ll[6].split(":")[1]
+            key1 = ll[6].split(":")[1].strip()
             tempdf[key1] = {}
         elif line.startswith("WKID"):
             next
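The one-character change above guards against whitespace around the colon-delimited fields of the sketch comparison output. A small illustration of the failure mode, using a made-up record line:

    # Why the added .strip() matters: "Query: name" splits to a value with a
    # leading space, which then silently becomes a mismatched dict key.
    line = "Query: seq_1\tWKID: 99.95\tKID: 98.70\n"   # hypothetical record
    fields = line.strip().split("\t")
    fields[0].split(":")[1]           # ' seq_1'  -> polluted key
    fields[0].split(":")[1].strip()   # 'seq_1'   -> clean key, as committed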
178 changes: 114 additions & 64 deletions vica/train_eval.py
@@ -1,57 +1,61 @@
 #!/usr/bin/env python3
 """train_eval.py: a module to train models and evaluate models from tfrecords
-of features. It uses The Tensorflow 1.3+ datasets api and estimator api"""
+of features. It uses the Tensorflow 1.3+ datasets api and estimator api.
+"""
 
 
 import os
 import urllib
 import tempfile
 import time
 import datetime
 from collections import Counter
+import functools
 import logging
 
 import yaml
 import numpy as np
 import tensorflow as tf
 import csv
 
 import vica
 
-with open(configpath) as cf:
+with open(vica.CONFIG_PATH) as cf:
     config = yaml.load(cf)
 
-def _featureshape(k):
+def _featureshape(k=5, codonlength=177, minhashlength=267):
     """Determine the shape of the features for each feature type including
     for kmers of different lengths."""
-    codonlength = 177
-    minhashlength= 267
     kmerdim = len(vica.khmer_features.iterate_kmer(k)) - 1
     kmer = tf.feature_column.numeric_column(key='kmer', shape=(kmerdim))
-    codon = tf.feature_column.numeric_column(key='codon', shape=(config["train_eval"]["codonlength"]))
-    minhash = tf.feature_column.numeric_column(key='minhash', shape=(config["train_eval"]["minhashlength"]))
+    codon = tf.feature_column.numeric_column(key='codon', shape=(codonlength))
+    minhash = tf.feature_column.numeric_column(key='minhash', shape=(minhashlength))
     return kmerdim, kmer, codon, minhash
 
-global modeldir, epochs, kmerdim, kmer, codon, minhash, n_classes, filenames
-
-def train_input_fn():
+def base_input_fn(codonlength, minhashlength, kmerdim, shuffle, shuffle_buffer_size, batch, epochs, filenames):
     """the function for feeding and processing training data"""
-    filenames = tf.placeholder(tf.string, shape=[None])
+    # filenames = tf.placeholder(tf.string, shape=[None])
     dataset = tf.contrib.data.TFRecordDataset(filenames)
     def parser(record):
         keys_to_features = {"id": tf.FixedLenFeature((), tf.string),
                             "label": tf.FixedLenFeature((), tf.int64),
                             "kmer": tf.FixedLenFeature([kmerdim], tf.float32),
-                            "codon": tf.FixedLenFeature([config["train_eval"]["codonlength"]], tf.float32),
-                            "minhash": tf.FixedLenFeature([config["train_eval"]["minhashlength"]], tf.float32)}
+                            "codon": tf.FixedLenFeature([codonlength], tf.float32),
+                            "minhash": tf.FixedLenFeature([minhashlength], tf.float32)}
         parsed = tf.parse_single_example(record, keys_to_features)
         return {'kmer': parsed['kmer'], 'codon': parsed['codon'], 'minhash': parsed['minhash']}, parsed['label']
     dataset = dataset.map(parser)
-    dataset = dataset.shuffle(buffer_size=10000)
-    dataset = dataset.batch(32)
+    if shuffle:
+        dataset = dataset.shuffle(shuffle_buffer_size)
+    dataset = dataset.batch(batch)
     dataset = dataset.repeat(epochs)
     iterator = dataset.make_one_shot_iterator()
     features, labels = iterator.get_next()
     return features, labels
 
 def test_input_fn():
-    """ """the function for feeding and processing training data""""""
+    """the function for feeding and processing training data"""
     filenames = tf.placeholder(tf.string, shape=[None])
     dataset = tf.contrib.data.TFRecordDataset(filenames)
     def parser(record):
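The hunk above collapses the separate train/test input functions into one base_input_fn, parameterized over feature widths, shuffling, batching, and epochs. A condensed, self-contained sketch of the same pattern; tf.data.TFRecordDataset is the post-1.4 home of the tf.contrib.data class used in the diff, the feature names and widths mirror the code, and the default values are illustrative:

    # Sketch of the parameterized TFRecord input pipeline introduced above.
    import tensorflow as tf

    def input_fn(filenames, kmerdim, codonlength=177, minhashlength=267,
                 shuffle=True, shuffle_buffer_size=10000, batch=32, epochs=1):
        dataset = tf.data.TFRecordDataset(filenames)

        def parser(record):
            keys_to_features = {
                "label": tf.FixedLenFeature((), tf.int64),
                "kmer": tf.FixedLenFeature([kmerdim], tf.float32),
                "codon": tf.FixedLenFeature([codonlength], tf.float32),
                "minhash": tf.FixedLenFeature([minhashlength], tf.float32)}
            parsed = tf.parse_single_example(record, keys_to_features)
            features = {k: parsed[k] for k in ("kmer", "codon", "minhash")}
            return features, parsed["label"]

        dataset = dataset.map(parser)
        if shuffle:                       # train: randomize; eval: keep order
            dataset = dataset.shuffle(shuffle_buffer_size)
        dataset = dataset.batch(batch).repeat(epochs)
        return dataset.make_one_shot_iterator().get_next()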
@@ -71,81 +75,127 @@ def parser(record):


 # Model definitions
-combined_estimator = tf.estimator.DNNLinearCombinedClassifier(
-    model_dir = modeldir,
-    n_classes=n_classes,
-    weight_column=None,
-    linear_feature_columns=[minhash],
-    linear_optimizer='Ftrl',
-    dnn_feature_columns=[kmer, codon],
-    dnn_dropout=0.5,
-    dnn_activation_fn=tf.nn.relu,
-    dnn_hidden_units=[256, 32],
-    dnn_optimizer='Adagrad')
-
-dnn_estimator = tf.estimator.DNNClassifier(
-    model_dir = modeldir,
-    n_classes=n_classes,
-    weight_column=None,
-    feature_columns=[kmer, codon],
-    dropout=0.5,
-    activation_fn=tf.nn.relu,
-    hidden_units=[256, 32],
-    optimizer='Adagrad')
-
-def train(infiles, out, modeldir, n_classes, configpath):
+def mk_dnnlogistic_estimator(modeldir, n_classes, minhash, kmer, codon):
+    dnnlogistic_estimator = tf.estimator.DNNLinearCombinedClassifier(
+        model_dir = modeldir,
+        n_classes=n_classes,
+        weight_column=None,
+        linear_feature_columns=[minhash],
+        linear_optimizer='Ftrl',
+        dnn_feature_columns=[kmer, codon],
+        dnn_dropout=0.5,
+        dnn_activation_fn=tf.nn.relu,
+        dnn_hidden_units=[256, 32],
+        dnn_optimizer='Adagrad')
+    return dnnlogistic_estimator
+
+def mk_dnn_estimator(modeldir, n_classes, kmer, codon):
+    dnn_estimator = tf.estimator.DNNClassifier(
+        model_dir = modeldir,
+        n_classes=n_classes,
+        weight_column=None,
+        feature_columns=[kmer, codon],
+        dropout=0.5,
+        activation_fn=tf.nn.relu,
+        hidden_units=[256, 32],
+        optimizer='Adagrad')
+    return dnn_estimator
+
+def train(infiles, out, modeldir, n_classes, configpath):
     """Main training function called by vica_cli; trains a Tensorflow model,
     returning a modeldir and TFmodel file used by the tensorflow serving api.
     """
     try:
-        logging.info("Beginning tensorflow model training. to see results in real-time run 'tensorboard --logdir=path/to/log-directory'")
+        logging.info("Beginning tensorflow model training. To see results in real-time run 'tensorboard --logdir={}'".format(modeldir))
         with open(configpath, "r") as cf:
             global config
             config = yaml.load(cf)
-        kmerdim, kmer, codon, minhash = _featureshape(config["train_eval"]["ksize"])
-        filenames = [infiles]
-        epochs = config["train_eval"]["epochs"]
-        modeldir = modeldir
-        global modeldir, epochs, kmerdim, kmer, codon, minhash, n_classes, filenames
-        if config["train_eval"]["model"]args.model == "DNN":
-            dnn_estimator.train(input_fn={train_input_fn: filenames})
+        kmerdim, kmer, codon, minhash = _featureshape(config["khmer_features"]["ksize"])
+        input_fn = functools.partial(base_input_fn,
+                                     codonlength=config["train_eval"]["codonlength"],
+                                     minhashlength=config["train_eval"]["minhashlength"],
+                                     kmerdim=kmerdim,
+                                     shuffle=True,
+                                     shuffle_buffer_size=10000,
+                                     batch=config["train_eval"]["train_batch_size"],
+                                     epochs=config["train_eval"]["epochs"],
+                                     filenames=infiles)
+        if config["train_eval"]["model"] == "DNN":
+            estimator = mk_dnn_estimator(modeldir=modeldir,
+                                         n_classes=int(n_classes),
+                                         kmer=kmer,
+                                         codon=codon)
+            estimator.train(input_fn=input_fn)
         elif config["train_eval"]["model"] == "DNNLogistic":
-            combined_estimator.train(input_fn={train_input_fn: filenames})
+            estimator = mk_dnnlogistic_estimator(modeldir=modeldir,
+                                                 n_classes=int(n_classes),
+                                                 minhash=minhash,
+                                                 kmer=kmer,
+                                                 codon=codon)
+            estimator.train(input_fn=input_fn)
     except:
-        loggin.exception(" during tensorflow model training the following exception occured:")
+        logging.exception("During tensorflow model training the following exception occurred:")
         raise SystemExit(1)
+    try:
+        # Save results if successful
+        feature_spec={"id": tf.FixedLenFeature((), tf.string),
+                      "kmer": tf.FixedLenFeature([kmerdim], tf.float32),
+                      "codon": tf.FixedLenFeature([config["train_eval"]["codonlength"]], tf.float32),
+                      "minhash": tf.FixedLenFeature([config["train_eval"]["minhashlength"]], tf.float32)}
+        serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
+        estimator.export_savedmodel(out, serving_input_receiver_fn)
+    except:
+        logging.exception("While exporting the model after tensorflow training the following exception occurred:")
 
 
-def eval(infiles, out, modeldir, n_classes, configpath):
+def eval(infiles, out, modeldir, n_classes, configpath):
     """Main evaluation function called by vica_cli. Loads a model from a model directory,
     returning a file of predictions. In the future it will have other evaluation data.
     """
     try:
-        logging.info("Beginning tensorflow model evaluation. to see results in real-time run 'tensorboard --logdir=path/to/log-directory'")
+        logging.info("Beginning tensorflow model evaluation. To see results in real-time run 'tensorboard --logdir={}'".format(modeldir))
        with open(configpath, "r") as cf:
             global config
             config = yaml.load(cf)
-        kmerdim, kmer, codon, minhash = _featureshape(args.ksize)
-        filenames = [infiles]
-        epochs = config["train_eval"]["epochs"]
-        modeldir = modeldir
-        global modeldir, epochs, kmerdim, kmer, codon, minhash, n_classes, filenames
+        kmerdim, kmer, codon, minhash = _featureshape(config["khmer_features"]["ksize"])
+        input_fn = functools.partial(base_input_fn,
+                                     codonlength=config["train_eval"]["codonlength"],
+                                     minhashlength=config["train_eval"]["minhashlength"],
+                                     kmerdim=kmerdim,
+                                     shuffle=False,
+                                     shuffle_buffer_size=0,
+                                     batch=config["train_eval"]["eval_batch_size"],
+                                     epochs=1,
+                                     filenames=infiles)
         if config["train_eval"]["model"] == "DNN":
-            preds = dnn_estimator.train(input_fn={test_input_fn: filenames})
+            estimator = mk_dnn_estimator(modeldir=modeldir,
+                                         n_classes=int(n_classes),
+                                         kmer=kmer,
+                                         codon=codon)
+            results = estimator.evaluate(input_fn=input_fn)
+            preds = estimator.predict(input_fn=input_fn)
         elif config["train_eval"]["model"] == "DNNLogistic":
-            preds = combined_estimator.train(input_fn={test_input_fn: filenames})
+            estimator = mk_dnnlogistic_estimator(modeldir=modeldir,
+                                                 n_classes=int(n_classes),
+                                                 minhash=minhash,
+                                                 kmer=kmer,
+                                                 codon=codon)
+            results = estimator.evaluate(input_fn=input_fn)
+            preds = estimator.predict(input_fn=input_fn)
+        logging.info("Tensorflow model performance. See also {}.".format(out))
-        logging.info(preds)
-        if not os.path.exists(out)
+        if not os.path.exists(out):
             os.mkdir(out)
-        predictions = os.path.join(out,"modelpredictions.txt")
-        with open(predictions), "w") as outfile:
+        predictions = os.path.join(out, "modelpredictions.txt")
+        with open(predictions, "w") as outfile:
             csv_writer_instance = csv.writer(outfile, lineterminator='\n')
             for rec in preds:
                 plist = rec['probabilities']
                 pliststr = [str(x) for x in plist]
                 ll = [rec['classes'][0].decode("utf-8"), str(rec['class_ids'][0])]
                 ll.extend(pliststr)
                 csv_writer_instance.writerow(ll)
+        for key in sorted(results):
+            logging.info('{}: {}'.format(key, results[key]))
     except:
-        loggin.exception(" during tensorflow model evaluation the following exception occured:")
+        logging.exception("During tensorflow model evaluation the following exception occurred:")
2 changes: 1 addition & 1 deletion vica/vica_cli.py
@@ -244,7 +244,7 @@ def main():
             n_classes= args.n_classes,
             configpath= args.config)
     except:
-        logging.exception()
+        logging.exception("vica_cli.py: The following exception occurred:")
         raise SystemExit(1)


