Skip to content

Commit

Permalink
Fix pickling error when jobstate file doesn't exist and fix threading …
Browse files Browse the repository at this point in the history
…error when lock file exists then disappears (#4575)

Co-authored-by: Brandon Walker <walkerbd@dali1.dali.hpc.ncats.nih.gov>
Co-authored-by: Adam Novak <anovak@soe.ucsc.edu>
  • Loading branch information
3 people authored and stxue1 committed Sep 20, 2023
1 parent 59c19bb commit ec2aab8
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
4 changes: 4 additions & 0 deletions src/toil/fileStores/nonCachingFileStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,10 @@ def _getAllJobStates(cls, coordination_dir: str) -> Iterator[Dict[str, str]]:
# This is a FileNotFoundError.
# job finished & deleted its jobState file since the jobState files were discovered
continue
elif e.errno == 5:
# This is an OSError: [Errno 5] Input/output error (the jobState file seems to
# disappear on network file systems sometimes)
continue
else:
raise

Expand Down
7 changes: 6 additions & 1 deletion src/toil/lib/threading.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,12 @@ def leave(self) -> Iterator[bool]:
# There is someone claiming to be here. Are they alive?
full_path = os.path.join(self.lockfileDir, item)

fd = os.open(full_path, os.O_RDONLY)
try:
fd = os.open(full_path, os.O_RDONLY)
except OSError as e:
# The file may have suddenly disappeared (seen on network file systems); skip this entry.
continue

try:
fcntl.lockf(fd, fcntl.LOCK_SH | fcntl.LOCK_NB)
except OSError as e:
Expand Down

0 comments on commit ec2aab8

Please sign in to comment.