44 changes: 27 additions & 17 deletions selene_sdk/evaluate_model.py
@@ -59,7 +59,12 @@ class EvaluateModel(object):
Default is None. Specify an ordered list of features for which to
run the evaluation. The features in this list must be identical to or
a subset of `features`, and in the order you want the resulting
`test_targets.npz` and `test_predictions.npz` to be saved.
`test_targets.npz` and `test_predictions.npz` to be saved. If using
a FileSampler or H5DataLoader for the evaluation, you can pass in
a dataset whose targets matrix contains only these features. Note
that this subsetted targets matrix MUST be ordered the same way
as `features`; the predictions and targets `.npz` outputs will be
reordered according to `use_features_ord`.

Attributes
----------
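To make the `use_features_ord` contract described in the docstring concrete, here is a minimal sketch with hypothetical feature names (not from the codebase): the subsetted targets matrix follows the `features` order, while the saved `.npz` outputs follow `use_features_ord`.

```python
features = ["CTCF", "DNase", "H3K27ac"]   # full model feature list
use_features_ord = ["H3K27ac", "CTCF"]    # desired output column order

# A subsetted targets matrix passed via FileSampler/H5DataLoader must be
# ordered like `features`, i.e. columns ["CTCF", "H3K27ac"]. The columns
# of test_targets.npz and test_predictions.npz are then written in
# ["H3K27ac", "CTCF"] order, matching `use_features_ord`.
```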
@@ -117,17 +122,14 @@ def __init__(self,
self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True)

self.features = features
self.features = np.array(features)
self._use_ixs = list(range(len(features)))
if use_features_ord is not None:
feature_ixs = {f: ix for (ix, f) in enumerate(features)}
self._use_ixs = []
self.features = []

for f in use_features_ord:
if f in feature_ixs:
self._use_ixs.append(feature_ixs[f])
self.features.append(f)
else:
warnings.warn(("Feature {0} in `use_features_ord` "
"does not match any features in the list "
@@ -157,11 +159,23 @@ def __init__(self,

self._test_data, self._all_test_targets = \
self.sampler.get_data_and_targets(self.batch_size, n_test_samples)
# TODO: we should be able to do this on the sampler end instead of
# here. the current workaround is problematic, since
# self._test_data still has the full featureset in it, and we
# select the subset during `evaluate`
self._all_test_targets = self._all_test_targets[:, self._use_ixs]

self._use_testmat_ixs = self._use_ixs[:]
# if the targets shape is the same as the subsetted features,
# reindex based on the subsetted list
if self._all_test_targets.shape[1] == len(self._use_ixs):
subset_features = {self.features[ix]: i for (i, ix) in
enumerate(sorted(self._use_ixs))}
self._use_testmat_ixs = [
subset_features[f] for f in self.features[self._use_ixs]]

self._all_test_targets = self._all_test_targets[
:, self._use_testmat_ixs]
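A small worked example of the reindexing above, with hypothetical feature names. When the sampler's targets matrix already contains only the subset (ordered like `features`), the computed indices map its columns into `use_features_ord` order:

```python
import numpy as np

features = np.array(["A", "B", "C", "D"])  # full feature list
use_ixs = [3, 1]                           # use_features_ord = ["D", "B"]

# Subsetted targets matrix is ordered like `features`: columns ["B", "D"].
subset_features = {features[ix]: i for (i, ix) in enumerate(sorted(use_ixs))}
use_testmat_ixs = [subset_features[f] for f in features[use_ixs]]
print(use_testmat_ixs)  # [1, 0] -> take column "D" first, then "B"
```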

# save the targets dataset now
np.savez_compressed(
os.path.join(self.output_dir, "test_targets.npz"),
data=self._all_test_targets)

# reset Genome base ordering when applicable.
if (hasattr(self.sampler, "reference_sequence") and
@@ -179,7 +193,7 @@ def _write_features_ordered_to_file(self):
"""
fp = os.path.join(self.output_dir, 'use_features_ord.txt')
with open(fp, 'w+') as file_handle:
for f in self.features:
for f in self.features[self._use_ixs]:
file_handle.write('{0}\n'.format(f))

def _get_feature_from_index(self, index):
@@ -196,7 +210,7 @@ def _get_feature_from_index(self, index):
The name of the feature/target at the specified index.

"""
return self.features[index]
return self.features[self._use_ixs][index]

def evaluate(self):
"""
@@ -216,7 +230,7 @@ def evaluate(self):
all_predictions = []
for (inputs, targets) in self._test_data:
inputs = torch.Tensor(inputs)
targets = torch.Tensor(targets[:, self._use_ixs])
targets = torch.Tensor(targets[:, self._use_testmat_ixs])

if self.use_cuda:
inputs = inputs.cuda()
@@ -246,10 +260,6 @@ def evaluate(self):
os.path.join(self.output_dir, "test_predictions.npz"),
data=all_predictions)

np.savez_compressed(
os.path.join(self.output_dir, "test_targets.npz"),
data=self._all_test_targets)

loss = np.average(batch_losses)
logger.info("test loss: {0}".format(loss))
for name, score in average_scores.items():
18 changes: 8 additions & 10 deletions selene_sdk/samplers/file_samplers/mat_file_sampler.py
@@ -105,8 +105,7 @@ def __init__(self,
self._tgts_batch_axis = targets_batch_axis
self.n_samples = self._sample_seqs.shape[self._seq_batch_axis]

self._sample_indices = np.arange(
self.n_samples).tolist()
self._sample_indices = np.arange(self.n_samples).tolist()
self._sample_next = 0

self._shuffle = shuffle
@@ -138,7 +137,7 @@ def sample(self, batch_size=1):
"""
sample_up_to = self._sample_next + batch_size
use_indices = None
if sample_up_to >= len(self._sample_indices):
if sample_up_to > len(self._sample_indices):
if self._shuffle:
np.random.shuffle(self._sample_indices)
self._sample_next = 0
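The comparison change from `>=` to `>` fixes an off-by-one: a batch that exactly exhausts the remaining indices should still be served before any reshuffle. A minimal sketch of the boundary case, outside the sampler:

```python
indices = list(range(10))                 # 10 samples
sample_next, batch_size = 5, 5
sample_up_to = sample_next + batch_size   # 10 == len(indices)

# With `>=`, this exact-fit batch would trigger a reshuffle/reset and
# indices 5..9 would be skipped; with `>`, they are consumed first.
if sample_up_to > len(indices):           # False here: the final batch fits
    sample_next = 0
use_indices = indices[sample_next:sample_up_to]  # [5, 6, 7, 8, 9]
```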
@@ -237,19 +236,18 @@ def get_data_and_targets(self, batch_size, n_samples=None):
"initialization. Please use `get_data` instead.")
if not n_samples:
n_samples = self.n_samples

sequences_and_targets = []
targets_mat = []

count = batch_size
count = 0
while count < n_samples:
seqs, tgts = self.sample(batch_size=batch_size)
sample_size = min(n_samples - count, batch_size)
seqs, tgts = self.sample(batch_size=sample_size)
sequences_and_targets.append((seqs, tgts))
targets_mat.append(tgts)
count += batch_size
remainder = batch_size - (count - n_samples)
seqs, tgts = self.sample(batch_size=remainder)
sequences_and_targets.append((seqs, tgts))
targets_mat.append(tgts)
count += sample_size

# TODO: should not assume targets are always integers
targets_mat = np.vstack(targets_mat).astype(float)
return sequences_and_targets, targets_mat
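The rewritten loop draws exactly `n_samples` by shrinking the final batch, rather than over-sampling and patching with a remainder batch as before. A self-contained sketch of the corrected accounting, assuming a hypothetical sampler with the same `sample(batch_size)` interface:

```python
def draw_exactly(sampler, n_samples, batch_size):
    batches, count = [], 0
    while count < n_samples:
        size = min(n_samples - count, batch_size)  # final batch may be short
        batches.append(sampler.sample(batch_size=size))
        count += size
    return batches  # e.g. n_samples=7, batch_size=3 -> batch sizes 3, 3, 1
```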
11 changes: 6 additions & 5 deletions selene_sdk/utils/performance_metrics.py
@@ -250,7 +250,7 @@ def get_feature_specific_scores(data, get_feature_from_index_fn):

def auc_u_test(labels, predictions):
"""
Outputs the area under the ROC curve associated with a certain
set of labels and the predictions given by the trained model.
Computed from the U statistic.

@@ -265,8 +265,8 @@ def auc_u_test(labels, predictions):
Returns
-------
float
AUC value of the given label, prediction pairs.

"""
len_pos = int(np.sum(labels))
len_neg = len(labels) - len_pos
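For reference, the U-statistic computation can be completed along these lines. This is a hedged sketch, not necessarily the file's exact implementation; it assumes ties receive average ranks via `scipy.stats.rankdata`:

```python
import numpy as np
from scipy.stats import rankdata

def auc_u_test_sketch(labels, predictions):
    labels = np.asarray(labels)
    len_pos = int(np.sum(labels))
    len_neg = len(labels) - len_pos
    # Sum of the ranks of the positive examples (average ranks break ties).
    rank_sum = np.sum(rankdata(predictions)[labels == 1])
    # Mann-Whitney U statistic, normalized to [0, 1] to give the AUC.
    u_value = rank_sum - (len_pos * (len_pos + 1)) / 2
    return u_value / (len_pos * len_neg)
```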
@@ -316,7 +316,8 @@ class PerformanceMetrics(object):
def __init__(self,
get_feature_from_index_fn,
report_gt_feature_n_positives=10,
metrics=dict(roc_auc=roc_auc_score, average_precision=average_precision_score)):
metrics=dict(roc_auc=roc_auc_score,
average_precision=average_precision_score)):
"""
Creates a new object of the `PerformanceMetrics` class.
"""
@@ -467,7 +468,7 @@ def write_feature_scores_to_file(self, output_path):
cols = '\t'.join(["class"] + metric_cols)
with open(output_path, 'w+') as file_handle:
file_handle.write("{0}\n".format(cols))
for feature, metric_scores in sorted(feature_scores.items()):
for feature, metric_scores in feature_scores.items():
if not metric_scores:
file_handle.write("{0}\t{1}\n".format(feature, "\t".join(["NA"] * len(metric_cols))))
else: