diff --git a/selene_sdk/evaluate_model.py b/selene_sdk/evaluate_model.py
index 1b3a8300..9e67ccbd 100644
--- a/selene_sdk/evaluate_model.py
+++ b/selene_sdk/evaluate_model.py
@@ -59,7 +59,12 @@ class EvaluateModel(object):
         Default is None. Specify an ordered list of features for which
         to run the evaluation. The features in this list must be identical
         to or a subset of `features`, and in the order you want the resulting
-        `test_targets.npz` and `test_predictions.npz` to be saved.
+        `test_targets.npz` and `test_predictions.npz` to be saved. If using
+        a FileSampler or H5DataLoader for the evaluation, you can pass in
+        a dataset whose targets matrix contains only these features, but
+        note that this subsetted targets matrix MUST be ordered the same
+        way as `features`; the predictions and targets .npz output
+        will then be reordered according to `use_features_ord`.

     Attributes
     ----------
@@ -117,17 +122,14 @@ def __init__(self,
         self.output_dir = output_dir
         os.makedirs(output_dir, exist_ok=True)

-        self.features = features
+        self.features = np.array(features)
         self._use_ixs = list(range(len(features)))
         if use_features_ord is not None:
             feature_ixs = {f: ix for (ix, f) in enumerate(features)}
             self._use_ixs = []
-            self.features = []
-
             for f in use_features_ord:
                 if f in feature_ixs:
                     self._use_ixs.append(feature_ixs[f])
-                    self.features.append(f)
                 else:
                     warnings.warn(("Feature {0} in `use_features_ord` "
                                    "does not match any features in the list "
@@ -157,11 +159,23 @@ def __init__(self,
         self._test_data, self._all_test_targets = \
             self.sampler.get_data_and_targets(self.batch_size, n_test_samples)

-        # TODO: we should be able to do this on the sampler end instead of
-        # here. the current workaround is problematic, since
-        # self._test_data still has the full featureset in it, and we
-        # select the subset during `evaluate`
-        self._all_test_targets = self._all_test_targets[:, self._use_ixs]
+
+        self._use_testmat_ixs = self._use_ixs[:]
+        # if the targets shape matches the subsetted feature list,
+        # reindex based on the subsetted list
+        if self._all_test_targets.shape[1] == len(self._use_ixs):
+            subset_features = {self.features[ix]: i for (i, ix) in
+                               enumerate(sorted(self._use_ixs))}
+            self._use_testmat_ixs = [
+                subset_features[f] for f in self.features[self._use_ixs]]
+
+        self._all_test_targets = self._all_test_targets[
+            :, self._use_testmat_ixs]
+
+        # save the targets dataset now
+        np.savez_compressed(
+            os.path.join(self.output_dir, "test_targets.npz"),
+            data=self._all_test_targets)

         # reset Genome base ordering when applicable.
         if (hasattr(self.sampler, "reference_sequence") and
@@ -179,7 +193,7 @@ def _write_features_ordered_to_file(self):
         """
         fp = os.path.join(self.output_dir, 'use_features_ord.txt')
         with open(fp, 'w+') as file_handle:
-            for f in self.features:
+            for f in self.features[self._use_ixs]:
                 file_handle.write('{0}\n'.format(f))

     def _get_feature_from_index(self, index):
@@ -196,7 +210,7 @@ def _get_feature_from_index(self, index):
             The name of the feature/target at the specified index.

         """
-        return self.features[index]
+        return self.features[self._use_ixs][index]

     def evaluate(self):
         """
@@ -216,7 +230,7 @@ def evaluate(self):
         all_predictions = []
         for (inputs, targets) in self._test_data:
             inputs = torch.Tensor(inputs)
-            targets = torch.Tensor(targets[:, self._use_ixs])
+            targets = torch.Tensor(targets[:, self._use_testmat_ixs])

             if self.use_cuda:
                 inputs = inputs.cuda()
@@ -246,10 +260,6 @@ def evaluate(self):
             os.path.join(self.output_dir, "test_predictions.npz"),
             data=all_predictions)

-        np.savez_compressed(
-            os.path.join(self.output_dir, "test_targets.npz"),
-            data=self._all_test_targets)
-
         loss = np.average(batch_losses)
         logger.info("test loss: {0}".format(loss))
         for name, score in average_scores.items():
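The reindexing added in `__init__` above is easy to get wrong, so here is a minimal standalone sketch of what `_use_ixs` and `_use_testmat_ixs` end up holding. The feature names are hypothetical; only `numpy` is assumed:

```python
import numpy as np

# Hypothetical feature list and requested evaluation order.
features = np.array(["CTCF", "DNase", "H3K27ac", "H3K4me3"])
use_features_ord = ["H3K27ac", "CTCF"]

# `_use_ixs`: indices of the requested features within the full list,
# in the requested order.
feature_ixs = {f: ix for (ix, f) in enumerate(features)}
use_ixs = [feature_ixs[f] for f in use_features_ord]        # [2, 0]

# Case 1: the targets matrix has all feature columns, so indexing
# with `_use_ixs` directly yields the requested order.
full_targets = np.random.randint(0, 2, size=(5, len(features)))
reordered = full_targets[:, use_ixs]

# Case 2: the targets matrix was already subsetted to the requested
# features, stored in feature-list order (i.e. sorted(use_ixs)).
# `_use_testmat_ixs` is the permutation from that storage order to
# the requested order.
subset_targets = full_targets[:, sorted(use_ixs)]           # CTCF, H3K27ac
subset_features = {features[ix]: i for (i, ix) in enumerate(sorted(use_ixs))}
use_testmat_ixs = [subset_features[f] for f in features[use_ixs]]  # [1, 0]

assert np.array_equal(subset_targets[:, use_testmat_ixs], reordered)
```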
""" - return self.features[index] + return self.features[self._use_ixs][index] def evaluate(self): """ @@ -216,7 +230,7 @@ def evaluate(self): all_predictions = [] for (inputs, targets) in self._test_data: inputs = torch.Tensor(inputs) - targets = torch.Tensor(targets[:, self._use_ixs]) + targets = torch.Tensor(targets[:, self._use_testmat_ixs]) if self.use_cuda: inputs = inputs.cuda() @@ -246,10 +260,6 @@ def evaluate(self): os.path.join(self.output_dir, "test_predictions.npz"), data=all_predictions) - np.savez_compressed( - os.path.join(self.output_dir, "test_targets.npz"), - data=self._all_test_targets) - loss = np.average(batch_losses) logger.info("test loss: {0}".format(loss)) for name, score in average_scores.items(): diff --git a/selene_sdk/samplers/file_samplers/mat_file_sampler.py b/selene_sdk/samplers/file_samplers/mat_file_sampler.py index f18bd1a2..393764a3 100644 --- a/selene_sdk/samplers/file_samplers/mat_file_sampler.py +++ b/selene_sdk/samplers/file_samplers/mat_file_sampler.py @@ -105,8 +105,7 @@ def __init__(self, self._tgts_batch_axis = targets_batch_axis self.n_samples = self._sample_seqs.shape[self._seq_batch_axis] - self._sample_indices = np.arange( - self.n_samples).tolist() + self._sample_indices = np.arange(self.n_samples).tolist() self._sample_next = 0 self._shuffle = shuffle @@ -138,7 +137,7 @@ def sample(self, batch_size=1): """ sample_up_to = self._sample_next + batch_size use_indices = None - if sample_up_to >= len(self._sample_indices): + if sample_up_to > len(self._sample_indices): if self._shuffle: np.random.shuffle(self._sample_indices) self._sample_next = 0 @@ -237,19 +236,18 @@ def get_data_and_targets(self, batch_size, n_samples=None): "initialization. Please use `get_data` instead.") if not n_samples: n_samples = self.n_samples + sequences_and_targets = [] targets_mat = [] - count = batch_size + count = 0 while count < n_samples: - seqs, tgts = self.sample(batch_size=batch_size) + sample_size = min(n_samples - count, batch_size) + seqs, tgts = self.sample(batch_size=sample_size) sequences_and_targets.append((seqs, tgts)) targets_mat.append(tgts) - count += batch_size - remainder = batch_size - (count - n_samples) - seqs, tgts = self.sample(batch_size=remainder) - sequences_and_targets.append((seqs, tgts)) - targets_mat.append(tgts) + count += sample_size + # TODO: should not assume targets are always integers targets_mat = np.vstack(targets_mat).astype(float) return sequences_and_targets, targets_mat diff --git a/selene_sdk/utils/performance_metrics.py b/selene_sdk/utils/performance_metrics.py index 201bf23c..d16e1501 100644 --- a/selene_sdk/utils/performance_metrics.py +++ b/selene_sdk/utils/performance_metrics.py @@ -250,7 +250,7 @@ def get_feature_specific_scores(data, get_feature_from_index_fn): def auc_u_test(labels, predictions): """ - Outputs the area under the the ROC curve associated with a certain + Outputs the area under the the ROC curve associated with a certain set of labels and the predictions given by the training model. Computed from the U statistic. 
diff --git a/selene_sdk/utils/performance_metrics.py b/selene_sdk/utils/performance_metrics.py
index 201bf23c..d16e1501 100644
--- a/selene_sdk/utils/performance_metrics.py
+++ b/selene_sdk/utils/performance_metrics.py
@@ -250,7 +250,7 @@ def get_feature_specific_scores(data, get_feature_from_index_fn):

 def auc_u_test(labels, predictions):
     """
-    Outputs the area under the the ROC curve associated with a certain
+    Outputs the area under the ROC curve associated with a certain
     set of labels and the predictions given by the training model.
     Computed from the U statistic.

@@ -265,8 +265,8 @@ def auc_u_test(labels, predictions):
     Returns
     -------
     float
-        AUC value of given label, prediction pairs
-
+        AUC value of the given label, prediction pairs
+
     """
     len_pos = int(np.sum(labels))
     len_neg = len(labels) - len_pos
@@ -316,7 +316,8 @@ class PerformanceMetrics(object):
     def __init__(self,
                  get_feature_from_index_fn,
                  report_gt_feature_n_positives=10,
-                 metrics=dict(roc_auc=roc_auc_score, average_precision=average_precision_score)):
+                 metrics=dict(roc_auc=roc_auc_score,
+                              average_precision=average_precision_score)):
         """
         Creates a new object of the `PerformanceMetrics` class.
         """
@@ -467,7 +468,7 @@ def write_feature_scores_to_file(self, output_path):
         cols = '\t'.join(["class"] + metric_cols)
         with open(output_path, 'w+') as file_handle:
             file_handle.write("{0}\n".format(cols))
-            for feature, metric_scores in sorted(feature_scores.items()):
+            for feature, metric_scores in feature_scores.items():
                 if not metric_scores:
                     file_handle.write("{0}\t{1}\n".format(feature,
                         "\t".join(["NA"] * len(metric_cols))))
                 else:
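For reference, the docstring touched above describes `auc_u_test` as computing AUC from the U statistic. A self-contained sketch of that relationship, using `scipy.stats.rankdata` for the ranking (an assumption for illustration only; selene's own implementation and its tie handling may differ):

```python
import numpy as np
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score

def auc_from_u(labels, predictions):
    """AUC via the Mann-Whitney U statistic:
    AUC = (R_pos - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg),
    where R_pos is the rank sum of the positive examples."""
    labels = np.asarray(labels)
    ranks = rankdata(predictions)      # average ranks handle ties
    n_pos = int(labels.sum())
    n_neg = len(labels) - n_pos
    r_pos = ranks[labels == 1].sum()
    u = r_pos - n_pos * (n_pos + 1) / 2
    return u / (n_pos * n_neg)

labels = np.array([0, 0, 1, 1])
preds = np.array([0.1, 0.4, 0.35, 0.8])
assert np.isclose(auc_from_u(labels, preds),
                  roc_auc_score(labels, preds))  # both give 0.75
```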