Skip to content

Commit 935ba81

Browse files
committed
feat: expose lock & add timeout on push job so that queue can't be stuck
BREAKING CHANGE: earlier it was not necessary to ensure that we operate on the taken lock, so it was possible for the lock to expire during long jobs and for another process to take control of the job, creating multiple parallel workers. To avoid that, the `onJobComplete` function exposes `lock` as a property, which allows us to preserve backward compatibility. However, it must be noted that the user must take care of extending the lock during lengthy tasks, and that jobs will fail with a timeout error if the result is not resolved within the default lock timeout multiplied by 2, or within the custom timeout provided to the `push` function.
1 parent ef94761 commit 935ba81

File tree

4 files changed

+312
-253
lines changed

4 files changed

+312
-253
lines changed

__tests__/integration.js

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ describe('integration tests', () => {
1818
client: this.redis,
1919
pubsub: this.pubsub,
2020
pubsubChannel: 'dlock',
21+
lock: {
22+
timeout: 2000,
23+
},
2124
});
2225
return null;
2326
})
@@ -93,6 +96,31 @@ describe('integration tests', () => {
9396
});
9497
});
9598

99+
it('#push: fails after timeout', () => {
100+
const job = sinon.spy();
101+
const onComplete = sinon.spy();
102+
const failedToQueue = sinon.spy();
103+
const unexpectedError = sinon.spy();
104+
105+
return Promise.map(this.queueManagers, (queueManager, idx) => {
106+
const id = String(idx % 3);
107+
return queueManager.dlock
108+
.push(id, (...args) => onComplete(...args)) /* to ensure functions are unique */
109+
.then(job)
110+
.catch(isLockAcquisitionError, failedToQueue)
111+
.catch(unexpectedError);
112+
})
113+
.delay(4500) /* must be called after timeout * 2 */
114+
.then(() => {
115+
assert.equal(job.callCount, 3);
116+
assert.equal(onComplete.callCount, 10);
117+
assert.equal(onComplete.withArgs(sinon.match({ message: 'queue-no-response' })).callCount, 10);
118+
assert.equal(failedToQueue.callCount, 7, 'unexpected error was raised');
119+
assert.equal(unexpectedError.called, false, 'fatal error was raised');
120+
return null;
121+
});
122+
});
123+
96124
it('#push: when job fails onComplete is called with an error', () => {
97125
const args = new Error('fail');
98126
const job = sinon.spy(next => next(args));
@@ -233,9 +261,6 @@ describe('integration tests', () => {
233261
.map(Array(50), (_, i) => {
234262
const semaphore = this.semaphores[i % this.semaphores.length];
235263
return Promise.using(semaphore.take(), async () => {
236-
process.stderr.write(this.counter);
237-
process.stderr.write('\n');
238-
239264
this.counter += 1;
240265
// if it's possible for other contestants
241266
// to run out of semaphore lock - this.counter will

package.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,21 +47,21 @@
4747
"@makeomatic/deploy": "^5.0.2",
4848
"@makeomatic/last-release-npm": "^1.0.1",
4949
"babel-cli": "^6.26.0",
50-
"babel-eslint": "^8.1.2",
50+
"babel-eslint": "^8.2.1",
5151
"babel-plugin-istanbul": "^4.1.5",
5252
"babel-plugin-transform-strict-mode": "^6.24.1",
5353
"babel-register": "^6.26.0",
5454
"codecov": "^3.0.0",
5555
"cross-env": "^5.1.3",
56-
"eslint": "^4.14.0",
56+
"eslint": "^4.15.0",
5757
"eslint-config-makeomatic": "^2.0.1",
5858
"eslint-plugin-import": "^2.8.0",
5959
"eslint-plugin-promise": "^3.6.0",
6060
"ioredis": "^3.2.2",
61-
"jest-cli": "^22.0.4",
61+
"jest-cli": "^22.1.1",
6262
"mocha": "^4.1.0",
6363
"nyc": "^11.4.1",
64-
"sinon": "^4.1.3"
64+
"sinon": "^4.1.6"
6565
},
6666
"engine": {
6767
"node": ">= 8.9.0"

src/distributed-callback-queue.js

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ const notLockAcquisitionError = e => e.name !== 'LockAcquisitionError';
2222
const isBoolean = filter(Boolean);
2323
const toFlattenedTruthyArray = compose(isBoolean, flatten);
2424
const couldNotAcquireLockError = new LockAcquisitionError('job is already running');
25+
const TimeoutError = new Promise.TimeoutError('queue-no-response');
2526

2627
/**
2728
* @class DistributedCallbackQueue
@@ -32,7 +33,7 @@ const couldNotAcquireLockError = new LockAcquisitionError('job is already runnin
3233
* @param {redisClient} pubsub: redis connection that will be used for notifications
3334
* @param {String} pubsubChannel - will be used to pass notifications
3435
* @param {Object} lock - configuration for redislock:
35-
* @param {Number} timeout - defaults to 1000
36+
* @param {Number} timeout - defaults to 10000
3637
* @param {Number} retries - defaults to 0
3738
* @param {Number} delay - defaults to 100
3839
* @param {Object|Boolean} log: sets up logger. If set to false suppresses all warnings
@@ -55,7 +56,7 @@ class DistributedCallbackQueue {
5556

5657
const lockOptions = defaults(options.lock || {}, {
5758
timeout: 10000,
58-
retries: 1,
59+
retries: 2,
5960
delay: 100,
6061
});
6162

@@ -124,10 +125,11 @@ class DistributedCallbackQueue {
124125
* Adds callback to distributed queue
125126
* @param {String} suffix - queue identifier
126127
* @param {Function} next - callback to be called when request is finished
128+
* @param {number} [timeout=this.lockOptions.timeout * 2] - fail after <timeout>, set to 0 to disable
127129
* @returns {Promise} if promise is resolved then we must act, if it's rejected - then
128130
* somebody else is working on the same task right now
129131
*/
130-
push(suffix, next) {
132+
push(suffix, next, timeout = this.lockOptions.timeout * 2) {
131133
assert(suffix, 'must be a truthy string');
132134

133135
// first queue locally to make use of pending requests
@@ -140,6 +142,13 @@ class DistributedCallbackQueue {
140142
return Promise.reject(couldNotAcquireLockError);
141143
}
142144

145+
if (timeout) {
146+
/* we are first in the local queue */
147+
const onTimeout = setTimeout(callbackQueue._call, timeout, lockRedisKey, [TimeoutError], this.logger);
148+
/* if we have no response from dlock -> without timeout, clean local queue */
149+
callbackQueue.add(lockRedisKey, () => clearTimeout(onTimeout));
150+
}
151+
143152
// create lock
144153
const lock = this.getLock();
145154

@@ -254,33 +263,45 @@ class DistributedCallbackQueue {
254263
* all queued callbacks
255264
*/
256265
createWorker(lockRedisKey, lock) {
257-
const dlock = this;
258266
/**
259267
* This function must be called when job has been completed
260268
* @param {Error} err
261269
* @param {Array} ...args
262270
*/
263-
return (err, ...args) => {
271+
const broadcastJobStatus = async (err, ...args) => {
272+
/* clean ref */
273+
broadcastJobStatus.lock = null;
274+
264275
// must release lock now. Technically there could be an event
265276
// where lock had not been released, notification already emitted
266277
// and callback is stuck in the queue, to avoid that we can add retry
267278
// to lock acquisition. Decision and constraints are up to you. Ideally
268279
// you would want to cache result of the function for some time - and then
269280
// this race is completed. Multi() command is not possible to use here
270-
return lock
271-
.release()
272-
.then(() => {
273-
// emit event
274-
// at this point we are sure that this job still belongs to us,
275-
// if it doesn't - we can't publish response, because this task may be acquired
276-
// by someone else
277-
return dlock.publish(lockRedisKey, err, ...args);
278-
})
279-
.catch((error) => {
280-
// because a job may take too much time, other listeners must implement timeout/retry strategy
281-
dlock.logger.warn('failed to release lock and publish results', error);
282-
});
281+
try {
282+
// ensure lock still belongs to us
283+
await lock.extend();
284+
} catch (error) {
285+
// because a job may take too much time, other listeners must implement timeout/retry strategy
286+
this.logger.warn('failed to release lock and publish results', error);
287+
return null;
288+
}
289+
290+
// emit event
291+
// at this point we are sure that this job still belongs to us,
292+
// if it doesn't - we can't publish response, because this task may be acquired
293+
// by someone else
294+
return this
295+
.publish(lockRedisKey, err, ...args)
296+
/* ensure we release the lock once publish is completed */
297+
/* during race conditions we rely on _retry_ setting to re-acquire lock */
298+
.finally(() => lock.release().reflect());
283299
};
300+
301+
// set associated lock -> lengthy jobs must extend this
302+
broadcastJobStatus.lock = lock;
303+
304+
return broadcastJobStatus;
284305
}
285306
}
286307

0 commit comments

Comments
 (0)