diff --git a/master/buildbot/data/buildrequests.py b/master/buildbot/data/buildrequests.py
index ca8160e38df..210d6378477 100644
--- a/master/buildbot/data/buildrequests.py
+++ b/master/buildbot/data/buildrequests.py
@@ -21,6 +21,7 @@
 from buildbot.db.buildrequests import AlreadyClaimedError
 from buildbot.db.buildrequests import NotClaimedError
 from buildbot.process import results
+from buildbot.process.results import RETRY
 
 
 class Db2DataMixin(object):
@@ -218,6 +219,7 @@ def unclaimBuildRequests(self, brids):
     @defer.inlineCallbacks
     def completeBuildRequests(self, brids, results, complete_at=None,
                               _reactor=reactor):
+        assert results != RETRY, "a buildrequest cannot be completed with a retry status!"
         if not brids:
             # empty buildrequest list. No need to call db API
             defer.returnValue(True)
diff --git a/master/buildbot/db/buildrequests.py b/master/buildbot/db/buildrequests.py
index e8177b8ef95..a847006e2dc 100644
--- a/master/buildbot/db/buildrequests.py
+++ b/master/buildbot/db/buildrequests.py
@@ -22,6 +22,7 @@
 from buildbot.db import NULL
 from buildbot.db import base
+from buildbot.process.results import RETRY
 from buildbot.util import datetime2epoch
 from buildbot.util import epoch2datetime
 
@@ -202,6 +203,7 @@ def thd(conn):
 
     def completeBuildRequests(self, brids, results, complete_at=None,
                               _reactor=reactor):
+        assert results != RETRY, "a buildrequest cannot be completed with a retry status!"
         if complete_at is not None:
             complete_at = datetime2epoch(complete_at)
         else:
@@ -231,7 +233,7 @@ def thd(conn):
             # if an incorrect number of rows were updated, then we failed.
             if res.rowcount != len(batch):
-                log.msg("tried to complete %d buildreqests, "
+                log.msg("tried to complete %d buildrequests, "
                         "but only completed %d" % (len(batch), res.rowcount))
                 transaction.rollback()
                 raise NotClaimedError
diff --git a/master/buildbot/process/builder.py b/master/buildbot/process/builder.py
index ed6d13c54be..17c772de835 100644
--- a/master/buildbot/process/builder.py
+++ b/master/buildbot/process/builder.py
@@ -28,7 +28,7 @@
 from buildbot.process import buildrequest
 from buildbot.process import workerforbuilder
 from buildbot.process.build import Build
-from buildbot.status.builder import RETRY
+from buildbot.process.results import RETRY
 from buildbot.util import service as util_service
 from buildbot.util import ascii2unicode
 from buildbot.util import epoch2datetime
diff --git a/master/buildbot/test/fake/step.py b/master/buildbot/test/fake/step.py
index c09a5dd7022..e1d7b136715 100644
--- a/master/buildbot/test/fake/step.py
+++ b/master/buildbot/test/fake/step.py
@@ -13,7 +13,7 @@
 #
 # Copyright Buildbot Team Members
 
-from twisted.internet.defer import Deferred
+from twisted.internet import defer
 
 from buildbot.process.buildstep import BuildStep
 from buildbot.process.results import CANCELLED
@@ -30,6 +30,7 @@ class BuildStepController(object):
     def __init__(self, **kwargs):
         self.step = ControllableBuildStep(self, **kwargs)
         self.running = False
+        self.auto_finish_results = None
 
     def finish_step(self, result):
         assert self.running
         d, self._run_deferred = self._run_deferred, None
         d.callback(result)
 
+    def auto_finish_step(self, result):
+        self.auto_finish_results = result
+        if self.running:
+            self.finish_step(result)
 
 
 class ControllableBuildStep(BuildStep):
@@ -50,10 +56,11 @@ def __init__(self, controller):
         self._controller = controller
 
     def run(self):
+        if self._controller.auto_finish_results is not None:
+            return defer.succeed(self._controller.auto_finish_results)
         assert not self._controller.running
-        self._controller.running = True
-        self._controller._run_deferred = Deferred()
+        self._controller._run_deferred = defer.Deferred()
         return self._controller._run_deferred
 
     def interrupt(self, reason):
diff --git a/master/buildbot/test/integration/test_hyper.py b/master/buildbot/test/integration/test_hyper.py
index 7fa4e66abf8..116eb3a76c3 100644
--- a/master/buildbot/test/integration/test_hyper.py
+++ b/master/buildbot/test/integration/test_hyper.py
@@ -20,7 +20,15 @@
 
 from twisted.internet import defer
 
+from buildbot import util
+from buildbot.config import BuilderConfig
+from buildbot.plugins import schedulers
+from buildbot.plugins import steps
+from buildbot.process.factory import BuildFactory
+from buildbot.process.results import CANCELLED
+from buildbot.process.results import RETRY
 from buildbot.process.results import SUCCESS
+from buildbot.test.fake.step import BuildStepController
 from buildbot.test.util.integration import RunMasterBase
 from buildbot.worker import hyper as workerhyper
 from buildbot.worker.hyper import HyperLatentWorker
@@ -52,36 +60,116 @@ def setUp(self):
 
     @defer.inlineCallbacks
     def test_trigger(self):
-        yield self.setupConfig(masterConfig(), startWorker=False)
-
-        change = dict(branch="master",
-                      files=["foo.c"],
-                      author="me@foo.com",
-                      comments="good stuff",
-                      revision="HEAD",
-                      project="none"
-                      )
-        yield self.doForceBuild(wantSteps=True, useChange=change, wantLogs=True)
+        yield self.setupConfig(masterConfig(num_concurrent=NUM_CONCURRENT), startWorker=False)
+        yield self.doForceBuild()
+
         builds = yield self.master.data.get(("builds",))
         # if there are some retry, there will be more builds
         self.assertEqual(len(builds), 1 + NUM_CONCURRENT)
         for b in builds:
             self.assertEqual(b['results'], SUCCESS)
 
+    @defer.inlineCallbacks
+    def test_trigger_controlled_step(self):
+        stepcontroller = BuildStepController()
+        yield self.setupConfig(masterConfig(num_concurrent=1, extra_steps=[stepcontroller.step]),
+                               startWorker=False)
+
+        d = self.doForceBuild()
+        builds = []
+        while len(builds) != 2:
+            builds = yield self.master.data.get(("builds",))
+            yield util.asyncSleep(.1)
+
+        while not stepcontroller.running:
+            yield util.asyncSleep(.1)
+
+        stepcontroller.finish_step(SUCCESS)
+        yield d
+        builds = yield self.master.data.get(("builds",))
+        for b in builds:
+            self.assertEqual(b['results'], SUCCESS)
+
+    @defer.inlineCallbacks
+    def test_trigger_controlled_step_killing_worker_in_between(self):
+        stepcontroller = BuildStepController()
+        yield self.setupConfig(masterConfig(num_concurrent=1, extra_steps=[stepcontroller.step]),
+                               startWorker=False)
+
+        d = self.doForceBuild()
+        builds = []
+        while len(builds) != 2:
+            builds = yield self.master.data.get(("builds",))
+            yield util.asyncSleep(.1)
+
+        while not stepcontroller.running:
+            yield util.asyncSleep(.1)
+
+        worker = self.master.workers.workers['hyper0']
+        worker.client.remove_container(worker.instance['Id'], v=True, force=True)
+
+        # wait until the build is retried
+        while len(builds) == 2:
+            builds = yield self.master.data.get(("builds",))
+            yield util.asyncSleep(.1)
+        stepcontroller.auto_finish_step(SUCCESS)
+
+        yield d
+        builds = yield self.master.data.get(("builds",))
+        self.assertEqual(len(builds), 5, msg=None)
+        # the first two builds were retried
+        self.assertEqual(builds[0]['results'], RETRY)
+        self.assertEqual(builds[1]['results'], RETRY)
+        self.assertEqual(builds[2]['results'], SUCCESS)
+        self.assertEqual(builds[3]['results'], SUCCESS)
+        self.assertEqual(builds[4]['results'], SUCCESS)
+
+    @defer.inlineCallbacks
+    def test_trigger_controlled_step_stopped_worker_in_between(self):
+        stepcontroller = BuildStepController()
+        yield self.setupConfig(masterConfig(num_concurrent=1, extra_steps=[stepcontroller.step]),
+                               startWorker=False)
+
+        d = self.doForceBuild()
+        builds = []
+        while len(builds) != 2:
+            builds = yield self.master.data.get(("builds",))
+            yield util.asyncSleep(.1)
+
+        while not stepcontroller.running:
+            yield util.asyncSleep(.1)
+
+        worker = self.master.workers.workers['hyper0']
+        worker.client.stop(worker.instance['Id'])
+        # wait until the build is retried
+        while len(builds) == 2:
+            builds = yield self.master.data.get(("builds",))
+            yield util.asyncSleep(.1)
+        stepcontroller.auto_finish_step(SUCCESS)
+
+        yield d
+        builds = yield self.master.data.get(("builds",))
+        self.assertEqual(len(builds), 5, msg=None)
+        # the first two builds were retried
+        self.assertEqual(builds[0]['results'], RETRY)
+        self.assertEqual(builds[1]['results'], RETRY)
+        self.assertEqual(builds[2]['results'], SUCCESS)
+        self.assertEqual(builds[3]['results'], SUCCESS)
+        self.assertEqual(builds[4]['results'], SUCCESS)
+
 
 
 # master configuration
-def masterConfig():
+def masterConfig(num_concurrent, extra_steps=None):
+    if extra_steps is None:
+        extra_steps = []
     c = {}
-    from buildbot.config import BuilderConfig
-    from buildbot.process.factory import BuildFactory
-    from buildbot.plugins import steps, schedulers
 
     c['schedulers'] = [
-        schedulers.AnyBranchScheduler(
-            name="sched",
+        schedulers.ForceScheduler(
+            name="force",
             builderNames=["testy"])]
     triggereables = []
-    for i in range(NUM_CONCURRENT):
+    for i in range(num_concurrent):
         c['schedulers'].append(
             schedulers.Triggerable(
                 name="trigsched" + str(i),
@@ -96,13 +184,15 @@ def masterConfig():
     f.addStep(steps.ShellCommand(command='echo world'))
     f2 = BuildFactory()
     f2.addStep(steps.ShellCommand(command='echo ola'))
+    for step in extra_steps:
+        f2.addStep(step)
     c['builders'] = [
         BuilderConfig(name="testy",
                       workernames=["hyper0"],
                       factory=f),
         BuilderConfig(name="build",
                       workernames=["hyper" + str(i)
-                                   for i in range(NUM_CONCURRENT)],
+                                   for i in range(num_concurrent)],
                       factory=f2)]
     hyperconfig = workerhyper.Hyper.guess_config()
     if isinstance(hyperconfig, string_types):
@@ -114,8 +204,10 @@ def masterConfig():
                           hyperconfig[
                               'secretkey'],
                           'buildbot/buildbot-worker:master', masterFQDN=masterFQDN)
-        for i in range(NUM_CONCURRENT)
+        for i in range(num_concurrent)
     ]
+    # uncomment for debugging what happens if things look locked.
+    # c['www'] = {'port': 8080}
     # if the masterFQDN is forced (proxy case), then we use 9989 default port
     # else, we try to find a free port
     if masterFQDN is not None:
diff --git a/master/buildbot/test/integration/test_latent.py b/master/buildbot/test/integration/test_latent.py
index f1ae5729b21..d34e858b923 100644
--- a/master/buildbot/test/integration/test_latent.py
+++ b/master/buildbot/test/integration/test_latent.py
@@ -28,7 +28,7 @@
 from buildbot.interfaces import LatentWorkerSubstantiatiationCancelled
 from buildbot.process.buildstep import BuildStep
 from buildbot.process.factory import BuildFactory
-from buildbot.process.results import CANCELLED
+from buildbot.process.results import RETRY
 from buildbot.process.results import SUCCESS
 from buildbot.test.fake.latent import LatentController
 from buildbot.test.fake.reactor import NonThreadPool
@@ -523,7 +523,7 @@ def test_worker_close_connection_while_building(self):
         controller.disconnect_worker(self)
         builds = self.successResultOf(
             master.data.get(("builds",)))
-        self.assertEqual(builds[0]['results'], CANCELLED)
+        self.assertEqual(builds[0]['results'], RETRY)
 
         # Request one build.
         self.createBuildrequest(master, [builder_id])
diff --git a/master/buildbot/test/util/integration.py b/master/buildbot/test/util/integration.py
index 484ce1fd834..f0095cc12de 100644
--- a/master/buildbot/test/util/integration.py
+++ b/master/buildbot/test/util/integration.py
@@ -28,6 +28,7 @@
 from zope.interface import implementer
 
 from buildbot.config import MasterConfig
+from buildbot.data import resultspec
 from buildbot.interfaces import IConfigLoader
 from buildbot.master import BuildMaster
 from buildbot.plugins import worker
@@ -155,24 +156,24 @@ def doForceBuild(self, wantSteps=False, wantProperties=False,
 
         # in order to allow trigger based integration tests
         # we wait until the first started build is finished
-        self.firstBuildId = None
+        self.firstBuildRequestId = None
 
         def newCallback(_, data):
-            if self.firstBuildId is None:
-                self.firstBuildId = data['buildid']
+            if self.firstBuildRequestId is None:
+                self.firstBuildRequestId = data['buildrequestid']
                 newConsumer.stopConsuming()
 
         def finishedCallback(_, data):
-            if self.firstBuildId == data['buildid']:
+            if self.firstBuildRequestId == data['buildrequestid']:
                 d.callback(data)
 
         newConsumer = yield self.master.mq.startConsuming(
             newCallback,
-            ('builds', None, 'new'))
+            ('buildrequests', None, 'new'))
 
         finishedConsumer = yield self.master.mq.startConsuming(
             finishedCallback,
-            ('builds', None, 'finished'))
+            ('buildrequests', None, 'complete'))
 
         if useChange is False:
             # use data api to force a build
@@ -182,7 +183,13 @@ def finishedCallback(_, data):
             yield self.master.data.updates.addChange(**useChange)
 
         # wait until we receive the build finished event
-        build = yield d
+        buildrequest = yield d
+        print(buildrequest)
+        builds = yield self.master.data.get(
+            ('builds',),
+            filters=[resultspec.Filter('buildrequestid', 'eq', [buildrequest['buildrequestid']])])
+        # if the build has been retried, there will be several matching builds. We return the last build.
+        build = builds[-1]
         finishedConsumer.stopConsuming()
         yield self.enrichBuild(build, wantSteps, wantProperties, wantLogs)
         defer.returnValue(build)
diff --git a/master/buildbot/worker/hyper.py b/master/buildbot/worker/hyper.py
index 41637032a35..ac51b1f3531 100644
--- a/master/buildbot/worker/hyper.py
+++ b/master/buildbot/worker/hyper.py
@@ -35,6 +35,7 @@
 try:
     import docker
     from hyper_sh import Client as Hyper
+    from docker.errors import NotFound
     [docker, Hyper]
 except ImportError:
     Hyper = None
@@ -95,12 +96,6 @@ class HyperLatentWorker(AbstractLatentWorker):
     def checkConfig(self, name, password, hyper_host,
                     hyper_accesskey, hyper_secretkey, image, hyper_size="s3", masterFQDN=None,
                     **kwargs):
-        # Set build_wait_timeout to 0s if not explicitely set: Starting a
-        # container is almost immediate, we can affort doing so for each build.
-
-        if 'build_wait_timeout' not in kwargs:
-            kwargs['build_wait_timeout'] = 0
-
         AbstractLatentWorker.checkConfig(self, name, password, **kwargs)
 
         if not Hyper:
@@ -120,6 +115,12 @@ def client(self):
     @defer.inlineCallbacks
     def reconfigService(self, name, password, hyper_host,
                         hyper_accesskey, hyper_secretkey, image, hyper_size="s3", masterFQDN=None,
                         **kwargs):
+        # Set build_wait_timeout to 0s if not explicitly set: starting a
+        # container is almost immediate, so we can afford doing so for each build.
+
+        if 'build_wait_timeout' not in kwargs:
+            kwargs['build_wait_timeout'] = 0
+
         yield AbstractLatentWorker.reconfigService(self, name, password, **kwargs)
         self.manager = yield HyperLatentManager.getService(self.master, hyper_host, hyper_accesskey,
@@ -150,9 +151,6 @@ def deferToThread(self, meth, *args, **kwargs):
 
     @defer.inlineCallbacks
     def start_instance(self, build):
-        if self.instance is not None:
-            raise ValueError('instance active')
-
         image = yield build.render(self.image)
         yield self.deferToThread(self._thd_start_instance, image)
         defer.returnValue(True)
@@ -162,9 +160,13 @@ def getContainerName(self):
 
     def _thd_cleanup_instance(self):
         instances = self.client.containers(
+            all=1,
             filters=dict(name=self.getContainerName()))
         for instance in instances:
-            self.client.remove_container(instance['Id'], v=True, force=True)
+            try:
+                self.client.remove_container(instance['Id'], v=True, force=True)
+            except NotFound:
+                pass  # that's a race condition
 
     def _thd_start_instance(self, image):
         t1 = time.time()
@@ -213,7 +215,15 @@ def _thd_stop_instance(self, fast):
         log.debug('{name}:{containerid}: Stopping container', name=self.name,
                   containerid=self.shortid)
         t1 = time.time()
-        self.client.stop(self.instance['Id'])
+        try:
+            self.client.stop(self.instance['Id'])
+        except NotFound:
+            # That's ok. The container was already deleted, probably by an admin;
+            # let's fail nicely.
+            log.warn('{name}:{containerid}: container was already deleted!', name=self.name,
+                     containerid=self.shortid)
+            self.instance = None
+            return
         t2 = time.time()
         if not fast:
             self.client.wait(self.instance['Id'])
diff --git a/master/buildbot/worker/latent.py b/master/buildbot/worker/latent.py
index 76fc3171f9c..29cfe5e2def 100644
--- a/master/buildbot/worker/latent.py
+++ b/master/buildbot/worker/latent.py
@@ -195,6 +195,9 @@ def _substantiation_failed(self, failure):
         return self._mail_missing_message(subject, text)
 
     def canStartBuild(self):
+        # we were disconnected, but not all the builds have been cleaned up yet
+        if self.conn is None and self.building:
+            return False
         if self.insubstantiating:
             return False
         return AbstractWorker.canStartBuild(self)
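Note (not part of the patch): the new BuildStepController.auto_finish_step() lets the hyper tests above block the first build on a controlled step and then arm an automatic result so that the retried builds complete on their own. The sketch below re-implements that controller pattern with plain Twisted deferreds instead of importing buildbot.test.fake.step, so the StepController name and the run_step() hook are illustrative stand-ins, not buildbot APIs.

# Illustrative sketch only; it assumes nothing beyond Twisted being installed.
from twisted.internet import defer


class StepController(object):
    """Stand-in for buildbot.test.fake.step.BuildStepController."""

    def __init__(self):
        self.running = False
        self.auto_finish_results = None
        self._run_deferred = None

    def run_step(self):
        # Called by the controlled step when a build reaches it.
        if self.auto_finish_results is not None:
            # auto-finish is armed: later (retried) runs complete immediately
            return defer.succeed(self.auto_finish_results)
        assert not self.running
        self.running = True
        self._run_deferred = defer.Deferred()
        return self._run_deferred

    def finish_step(self, result):
        # Release the one run that is currently blocked in run_step().
        assert self.running
        self.running = False
        d, self._run_deferred = self._run_deferred, None
        d.callback(result)

    def auto_finish_step(self, result):
        # Arm auto-finish and, if a run is currently blocked, release it too.
        self.auto_finish_results = result
        if self.running:
            self.finish_step(result)


controller = StepController()
d = controller.run_step()            # first build blocks here
controller.auto_finish_step("done")  # releases it and arms auto-finish
assert d.called
assert controller.run_step().called  # a retried run finishes right away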
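A second note, also illustrative rather than part of the patch: the invariant enforced here is that a build whose worker disappears ends in RETRY and its build request goes back to the queue instead of being completed, and a latent worker that has lost its connection refuses new work until its old builds are cleaned up. The snippet restates those two guards with hypothetical stand-in classes; only the RETRY import is the real buildbot symbol.

# Sketch of the two guards added by this patch, using stand-in classes.
from buildbot.process.results import RETRY


class FakeBuildRequestCompleter(object):
    """Mirrors the assertion added to completeBuildRequests()."""

    def complete(self, brids, results):
        # A RETRY result means "requeue", so it must never reach completion.
        assert results != RETRY, \
            "a buildrequest cannot be completed with a retry status!"
        return brids  # the real code updates the buildrequests table here


class FakeLatentWorker(object):
    """Mirrors the guard added to AbstractLatentWorker.canStartBuild()."""

    def __init__(self):
        self.conn = None          # no active connection to the worker
        self.building = set()     # builds not yet cleaned up after a disconnect
        self.insubstantiating = False

    def canStartBuild(self):
        # Disconnected but old builds are still around: refuse new work so the
        # pending build requests can be retried cleanly first.
        if self.conn is None and self.building:
            return False
        if self.insubstantiating:
            return False
        return True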