Make SavePrimaryDB faster

We avoid reading in the entire file by reading just the first byte. This also reduces the wait between polls from 60 to 10 seconds, so small test runs complete noticeably faster. As a side effect of streaming the file, we have to disable Toil caching, because readGlobalFileStream doesn't have a cache=False option.
ComparativeGenomicsToolkit · Jul 6, 2017 · 08694b3
1 parent 26fcc44
commit 08694b3
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 6 deletions.
diff --git a/src/cactus/pipeline/cactus_workflow.py b/src/cactus/pipeline/cactus_workflow.py
@@ -300,11 +300,11 @@ def run(self, fileStore):
         stopKtserver(dbElem)
         # Wait for the file to appear in the right place. This may take a while
         while True:
-            path = fileStore.readGlobalFile(self.cactusWorkflowArguments.snapshotID, cache=False)
-            stat = os.stat(path)
-            if stat.st_size > 0:
-                break
-            time.sleep(60)
+            with fileStore.readGlobalFileStream(self.cactusWorkflowArguments.snapshotID) as f:
+                if f.read(1) != '':
+                    # The file is no longer empty
+                    break
+            time.sleep(10)
         # We have the file now
         intermediateResultsUrl = getattr(self.cactusWorkflowArguments, 'intermediateResultsUrl', None)
         if intermediateResultsUrl is not None:

diff --git a/src/cactus/progressive/cactus_progressive.py b/src/cactus/progressive/cactus_progressive.py
@@ -442,7 +442,12 @@ def main():
     setLoggingFromOptions(options)
 
     options.cactusDir = os.path.abspath(options.cactusDir)
-
+
+    # Caching generally slows down the cactus workflow, plus some
+    # methods like readGlobalFileStream don't support forced
+    # reads directly from the job store rather than from cache.
+    options.disableCaching = True
+
     #Create the progressive cactus project 
     projWrapper = ProjectWrapper(options)
     projWrapper.writeXml()