diff --git a/selene_sdk/evaluate_model.py b/selene_sdk/evaluate_model.py
index 1b3a8300..9e67ccbd 100644
--- a/selene_sdk/evaluate_model.py
+++ b/selene_sdk/evaluate_model.py
@@ -59,7 +59,12 @@ class EvaluateModel(object):
         Default is None. Specify an ordered list of features for which
         to run the evaluation. The features in this list must be identical
         to or a subset of `features`, and in the order you want the resulting
-        `test_targets.npz` and `test_predictions.npz` to be saved.
+        `test_targets.npz` and `test_predictions.npz` to be saved. If using
+        a FileSampler or H5DataLoader for the evaluation, you can pass in
+        a dataset whose targets matrix contains only these features, but
+        note that this subsetted targets matrix MUST be ordered the same
+        way as `features`; the predictions and targets .npz output
+        will then be reordered according to `use_features_ord`.

     Attributes
     ----------
@@ -117,17 +122,14 @@ def __init__(self,
         self.output_dir = output_dir
         os.makedirs(output_dir, exist_ok=True)

-        self.features = features
+        self.features = np.array(features)
         self._use_ixs = list(range(len(features)))
         if use_features_ord is not None:
             feature_ixs = {f: ix for (ix, f) in enumerate(features)}
             self._use_ixs = []
-            self.features = []
-
             for f in use_features_ord:
                 if f in feature_ixs:
                     self._use_ixs.append(feature_ixs[f])
-                    self.features.append(f)
                 else:
                     warnings.warn(("Feature {0} in `use_features_ord` "
                                    "does not match any features in the list "
@@ -157,11 +159,23 @@ def __init__(self,
         self._test_data, self._all_test_targets = \
             self.sampler.get_data_and_targets(self.batch_size, n_test_samples)

-        # TODO: we should be able to do this on the sampler end instead of
-        # here. the current workaround is problematic, since
-        # self._test_data still has the full featureset in it, and we
-        # select the subset during `evaluate`
-        self._all_test_targets = self._all_test_targets[:, self._use_ixs]
+
+        self._use_testmat_ixs = self._use_ixs[:]
+        # if the targets shape matches the subsetted feature list,
+        # reindex based on the subsetted list
+        if self._all_test_targets.shape[1] == len(self._use_ixs):
+            subset_features = {self.features[ix]: i for (i, ix) in
+                               enumerate(sorted(self._use_ixs))}
+            self._use_testmat_ixs = [
+                subset_features[f] for f in self.features[self._use_ixs]]
+
+        self._all_test_targets = self._all_test_targets[
+            :, self._use_testmat_ixs]
+
+        # save the targets dataset now
+        np.savez_compressed(
+            os.path.join(self.output_dir, "test_targets.npz"),
+            data=self._all_test_targets)

         # reset Genome base ordering when applicable.
         if (hasattr(self.sampler, "reference_sequence") and
@@ -179,7 +193,7 @@ def _write_features_ordered_to_file(self):
         """
         fp = os.path.join(self.output_dir, 'use_features_ord.txt')
         with open(fp, 'w+') as file_handle:
-            for f in self.features:
+            for f in self.features[self._use_ixs]:
                 file_handle.write('{0}\n'.format(f))

     def _get_feature_from_index(self, index):
@@ -196,7 +210,7 @@ def _get_feature_from_index(self, index):
             The name of the feature/target at the specified index.

         """
-        return self.features[index]
+        return self.features[self._use_ixs][index]

     def evaluate(self):
         """
@@ -216,7 +230,7 @@ def evaluate(self):
         all_predictions = []
         for (inputs, targets) in self._test_data:
             inputs = torch.Tensor(inputs)
-            targets = torch.Tensor(targets[:, self._use_ixs])
+            targets = torch.Tensor(targets[:, self._use_testmat_ixs])

             if self.use_cuda:
                 inputs = inputs.cuda()
@@ -246,10 +260,6 @@ def evaluate(self):
             os.path.join(self.output_dir, "test_predictions.npz"),
             data=all_predictions)

-        np.savez_compressed(
-            os.path.join(self.output_dir, "test_targets.npz"),
-            data=self._all_test_targets)
-
         loss = np.average(batch_losses)
         logger.info("test loss: {0}".format(loss))
         for name, score in average_scores.items():
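The reindexing added in `__init__` above is easy to get wrong, so here is a minimal standalone sketch of what `_use_ixs` and `_use_testmat_ixs` end up holding. The feature names are hypothetical; only `numpy` is assumed:

```python
import numpy as np

# Hypothetical feature list and requested evaluation order.
features = np.array(["CTCF", "DNase", "H3K27ac", "H3K4me3"])
use_features_ord = ["H3K27ac", "CTCF"]

# `_use_ixs`: indices of the requested features within the full list,
# in the requested order.
feature_ixs = {f: ix for (ix, f) in enumerate(features)}
use_ixs = [feature_ixs[f] for f in use_features_ord]        # [2, 0]

# Case 1: the targets matrix has all feature columns, so indexing
# with `_use_ixs` directly yields the requested order.
full_targets = np.random.randint(0, 2, size=(5, len(features)))
reordered = full_targets[:, use_ixs]

# Case 2: the targets matrix was already subsetted to the requested
# features, stored in feature-list order (i.e. sorted(use_ixs)).
# `_use_testmat_ixs` is the permutation from that storage order to
# the requested order.
subset_targets = full_targets[:, sorted(use_ixs)]           # CTCF, H3K27ac
subset_features = {features[ix]: i for (i, ix) in enumerate(sorted(use_ixs))}
use_testmat_ixs = [subset_features[f] for f in features[use_ixs]]  # [1, 0]

assert np.array_equal(subset_targets[:, use_testmat_ixs], reordered)
```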
""" - return self.features[index] + return self.features[self._use_ixs][index] def evaluate(self): """ @@ -216,7 +230,7 @@ def evaluate(self): all_predictions = [] for (inputs, targets) in self._test_data: inputs = torch.Tensor(inputs) - targets = torch.Tensor(targets[:, self._use_ixs]) + targets = torch.Tensor(targets[:, self._use_testmat_ixs]) if self.use_cuda: inputs = inputs.cuda() @@ -246,10 +260,6 @@ def evaluate(self): os.path.join(self.output_dir, "test_predictions.npz"), data=all_predictions) - np.savez_compressed( - os.path.join(self.output_dir, "test_targets.npz"), - data=self._all_test_targets) - loss = np.average(batch_losses) logger.info("test loss: {0}".format(loss)) for name, score in average_scores.items(): diff --git a/selene_sdk/samplers/file_samplers/mat_file_sampler.py b/selene_sdk/samplers/file_samplers/mat_file_sampler.py index f18bd1a2..393764a3 100644 --- a/selene_sdk/samplers/file_samplers/mat_file_sampler.py +++ b/selene_sdk/samplers/file_samplers/mat_file_sampler.py @@ -105,8 +105,7 @@ def __init__(self, self._tgts_batch_axis = targets_batch_axis self.n_samples = self._sample_seqs.shape[self._seq_batch_axis] - self._sample_indices = np.arange( - self.n_samples).tolist() + self._sample_indices = np.arange(self.n_samples).tolist() self._sample_next = 0 self._shuffle = shuffle @@ -138,7 +137,7 @@ def sample(self, batch_size=1): """ sample_up_to = self._sample_next + batch_size use_indices = None - if sample_up_to >= len(self._sample_indices): + if sample_up_to > len(self._sample_indices): if self._shuffle: np.random.shuffle(self._sample_indices) self._sample_next = 0 @@ -237,19 +236,18 @@ def get_data_and_targets(self, batch_size, n_samples=None): "initialization. Please use `get_data` instead.") if not n_samples: n_samples = self.n_samples + sequences_and_targets = [] targets_mat = [] - count = batch_size + count = 0 while count < n_samples: - seqs, tgts = self.sample(batch_size=batch_size) + sample_size = min(n_samples - count, batch_size) + seqs, tgts = self.sample(batch_size=sample_size) sequences_and_targets.append((seqs, tgts)) targets_mat.append(tgts) - count += batch_size - remainder = batch_size - (count - n_samples) - seqs, tgts = self.sample(batch_size=remainder) - sequences_and_targets.append((seqs, tgts)) - targets_mat.append(tgts) + count += sample_size + # TODO: should not assume targets are always integers targets_mat = np.vstack(targets_mat).astype(float) return sequences_and_targets, targets_mat diff --git a/selene_sdk/utils/performance_metrics.py b/selene_sdk/utils/performance_metrics.py index 201bf23c..d16e1501 100644 --- a/selene_sdk/utils/performance_metrics.py +++ b/selene_sdk/utils/performance_metrics.py @@ -250,7 +250,7 @@ def get_feature_specific_scores(data, get_feature_from_index_fn): def auc_u_test(labels, predictions): """ - Outputs the area under the the ROC curve associated with a certain + Outputs the area under the the ROC curve associated with a certain set of labels and the predictions given by the training model. Computed from the U statistic. 
diff --git a/selene_sdk/utils/performance_metrics.py b/selene_sdk/utils/performance_metrics.py
index 201bf23c..d16e1501 100644
--- a/selene_sdk/utils/performance_metrics.py
+++ b/selene_sdk/utils/performance_metrics.py
@@ -250,7 +250,7 @@ def get_feature_specific_scores(data, get_feature_from_index_fn):

 def auc_u_test(labels, predictions):
     """
-    Outputs the area under the the ROC curve associated with a certain
+    Outputs the area under the ROC curve associated with a certain
     set of labels and the predictions given by the training model.
     Computed from the U statistic.

@@ -265,8 +265,8 @@ def auc_u_test(labels, predictions):
     Returns
     -------
     float
-        AUC value of given label, prediction pairs
-
+        AUC value of the given label, prediction pairs
+
     """
     len_pos = int(np.sum(labels))
     len_neg = len(labels) - len_pos
@@ -316,7 +316,8 @@ class PerformanceMetrics(object):
     def __init__(self,
                  get_feature_from_index_fn,
                  report_gt_feature_n_positives=10,
-                 metrics=dict(roc_auc=roc_auc_score, average_precision=average_precision_score)):
+                 metrics=dict(roc_auc=roc_auc_score,
+                              average_precision=average_precision_score)):
         """
         Creates a new object of the `PerformanceMetrics` class.
         """
@@ -467,7 +468,7 @@ def write_feature_scores_to_file(self, output_path):
         cols = '\t'.join(["class"] + metric_cols)
         with open(output_path, 'w+') as file_handle:
             file_handle.write("{0}\n".format(cols))
-            for feature, metric_scores in sorted(feature_scores.items()):
+            for feature, metric_scores in feature_scores.items():
                 if not metric_scores:
                     file_handle.write("{0}\t{1}\n".format(feature,
                         "\t".join(["NA"] * len(metric_cols))))
                 else:
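For reference, the docstring touched above describes `auc_u_test` as computing AUC from the U statistic. A self-contained sketch of that relationship, using `scipy.stats.rankdata` for the ranking (an assumption for illustration only; selene's own implementation and its tie handling may differ):

```python
import numpy as np
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score

def auc_from_u(labels, predictions):
    """AUC via the Mann-Whitney U statistic:
    AUC = (R_pos - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg),
    where R_pos is the rank sum of the positive examples."""
    labels = np.asarray(labels)
    ranks = rankdata(predictions)      # average ranks handle ties
    n_pos = int(labels.sum())
    n_neg = len(labels) - n_pos
    r_pos = ranks[labels == 1].sum()
    u = r_pos - n_pos * (n_pos + 1) / 2
    return u / (n_pos * n_neg)

labels = np.array([0, 0, 1, 1])
preds = np.array([0.1, 0.4, 0.35, 0.8])
assert np.isclose(auc_from_u(labels, preds),
                  roc_auc_score(labels, preds))  # both give 0.75
```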