Skip to content

Commit

Permalink
hyper: make sure the build is properly retried in case of container d…
Browse files Browse the repository at this point in the history
…ying

This is a real scenario that happens on the metabuildbot_travis
were we get ValueError('instance active') errors.

This is because somehow the workers are killed by the infra.

This tests raised lots of corner cases in the buildbot core, which are fixed in the previous commits
  • Loading branch information
tardyp committed Oct 21, 2016
1 parent 6539d7c commit 98bd296
Show file tree
Hide file tree
Showing 9 changed files with 167 additions and 44 deletions.
2 changes: 2 additions & 0 deletions master/buildbot/data/buildrequests.py
Expand Up @@ -21,6 +21,7 @@
from buildbot.db.buildrequests import AlreadyClaimedError
from buildbot.db.buildrequests import NotClaimedError
from buildbot.process import results
from buildbot.process.results import RETRY


class Db2DataMixin(object):
Expand Down Expand Up @@ -218,6 +219,7 @@ def unclaimBuildRequests(self, brids):
@defer.inlineCallbacks
def completeBuildRequests(self, brids, results, complete_at=None,
_reactor=reactor):
assert results != RETRY, "a buildrequest cannot be completed with a retry status!"
if not brids:
# empty buildrequest list. No need to call db API
defer.returnValue(True)
Expand Down
4 changes: 3 additions & 1 deletion master/buildbot/db/buildrequests.py
Expand Up @@ -22,6 +22,7 @@

from buildbot.db import NULL
from buildbot.db import base
from buildbot.process.results import RETRY
from buildbot.util import datetime2epoch
from buildbot.util import epoch2datetime

Expand Down Expand Up @@ -202,6 +203,7 @@ def thd(conn):

def completeBuildRequests(self, brids, results, complete_at=None,
_reactor=reactor):
assert results != RETRY, "a buildrequest cannot be completed with a retry status!"
if complete_at is not None:
complete_at = datetime2epoch(complete_at)
else:
Expand Down Expand Up @@ -231,7 +233,7 @@ def thd(conn):

# if an incorrect number of rows were updated, then we failed.
if res.rowcount != len(batch):
log.msg("tried to complete %d buildreqests, "
log.msg("tried to complete %d buildrequests, "
"but only completed %d" % (len(batch), res.rowcount))
transaction.rollback()
raise NotClaimedError
Expand Down
2 changes: 1 addition & 1 deletion master/buildbot/process/builder.py
Expand Up @@ -28,7 +28,7 @@
from buildbot.process import buildrequest
from buildbot.process import workerforbuilder
from buildbot.process.build import Build
from buildbot.status.builder import RETRY
from buildbot.process.results import RETRY
from buildbot.util import service as util_service
from buildbot.util import ascii2unicode
from buildbot.util import epoch2datetime
Expand Down
13 changes: 10 additions & 3 deletions master/buildbot/test/fake/step.py
Expand Up @@ -13,7 +13,7 @@
#
# Copyright Buildbot Team Members

from twisted.internet.defer import Deferred
from twisted.internet import defer

from buildbot.process.buildstep import BuildStep
from buildbot.process.results import CANCELLED
Expand All @@ -30,13 +30,19 @@ class BuildStepController(object):
def __init__(self, **kwargs):
self.step = ControllableBuildStep(self, **kwargs)
self.running = False
self.auto_finish_results = None

def finish_step(self, result):
assert self.running
self.running = False
d, self._run_deferred = self._run_deferred, None
d.callback(result)

def auto_finish_step(self, result):
self.auto_finish_results = result
if self.running:
self.finish_step(result)


class ControllableBuildStep(BuildStep):

Expand All @@ -50,10 +56,11 @@ def __init__(self, controller):
self._controller = controller

def run(self):
if self._controller.auto_finish_results is not None:
return defer.succeed(self._controller.auto_finish_results)
assert not self._controller.running

self._controller.running = True
self._controller._run_deferred = Deferred()
self._controller._run_deferred = defer.Deferred()
return self._controller._run_deferred

def interrupt(self, reason):
Expand Down
130 changes: 111 additions & 19 deletions master/buildbot/test/integration/test_hyper.py
Expand Up @@ -20,7 +20,15 @@

from twisted.internet import defer

from buildbot import util
from buildbot.config import BuilderConfig
from buildbot.plugins import schedulers
from buildbot.plugins import steps
from buildbot.process.factory import BuildFactory
from buildbot.process.results import CANCELLED
from buildbot.process.results import RETRY
from buildbot.process.results import SUCCESS
from buildbot.test.fake.step import BuildStepController
from buildbot.test.util.integration import RunMasterBase
from buildbot.worker import hyper as workerhyper
from buildbot.worker.hyper import HyperLatentWorker
Expand Down Expand Up @@ -52,36 +60,116 @@ def setUp(self):

@defer.inlineCallbacks
def test_trigger(self):
yield self.setupConfig(masterConfig(), startWorker=False)

change = dict(branch="master",
files=["foo.c"],
author="me@foo.com",
comments="good stuff",
revision="HEAD",
project="none"
)
yield self.doForceBuild(wantSteps=True, useChange=change, wantLogs=True)
yield self.setupConfig(masterConfig(num_concurrent=NUM_CONCURRENT), startWorker=False)
yield self.doForceBuild()

builds = yield self.master.data.get(("builds",))
# if there are some retry, there will be more builds
self.assertEqual(len(builds), 1 + NUM_CONCURRENT)
for b in builds:
self.assertEqual(b['results'], SUCCESS)

@defer.inlineCallbacks
def test_trigger_controlled_step(self):
stepcontroller = BuildStepController()
yield self.setupConfig(masterConfig(num_concurrent=1, extra_steps=[stepcontroller.step]),
startWorker=False)

d = self.doForceBuild()
builds = []
while len(builds) != 2:
builds = yield self.master.data.get(("builds",))
util.asyncSleep(.1)

while not stepcontroller.running:
yield util.asyncSleep(.1)

stepcontroller.finish_step(SUCCESS)
yield d
builds = yield self.master.data.get(("builds",))
for b in builds:
self.assertEqual(b['results'], SUCCESS)

@defer.inlineCallbacks
def test_trigger_controlled_step_killing_worker_in_between(self):
stepcontroller = BuildStepController()
yield self.setupConfig(masterConfig(num_concurrent=1, extra_steps=[stepcontroller.step]),
startWorker=False)

d = self.doForceBuild()
builds = []
while len(builds) != 2:
builds = yield self.master.data.get(("builds",))
yield util.asyncSleep(.1)

while not stepcontroller.running:
yield util.asyncSleep(.1)

worker = self.master.workers.workers['hyper0']
worker.client.remove_container(worker.instance['Id'], v=True, force=True)

# wait that the build is retried
while len(builds) == 2:
builds = yield self.master.data.get(("builds",))
yield util.asyncSleep(.1)
stepcontroller.auto_finish_step(SUCCESS)

yield d
builds = yield self.master.data.get(("builds",))
self.assertEqual(len(builds), 5, msg=None)
# the two first builds were retried
self.assertEqual(builds[0]['results'], RETRY)
self.assertEqual(builds[1]['results'], RETRY)
self.assertEqual(builds[2]['results'], SUCCESS)
self.assertEqual(builds[3]['results'], SUCCESS)
self.assertEqual(builds[4]['results'], SUCCESS)

@defer.inlineCallbacks
def test_trigger_controlled_step_stopped_worker_in_between(self):
stepcontroller = BuildStepController()
yield self.setupConfig(masterConfig(num_concurrent=1, extra_steps=[stepcontroller.step]),
startWorker=False)

d = self.doForceBuild()
builds = []
while len(builds) != 2:
builds = yield self.master.data.get(("builds",))
yield util.asyncSleep(.1)

while not stepcontroller.running:
yield util.asyncSleep(.1)

worker = self.master.workers.workers['hyper0']
worker.client.stop(worker.instance['Id'])
# wait that the build is retried
while len(builds) == 2:
builds = yield self.master.data.get(("builds",))
yield util.asyncSleep(.1)
stepcontroller.auto_finish_step(SUCCESS)

yield d
builds = yield self.master.data.get(("builds",))
self.assertEqual(len(builds), 5, msg=None)
# the two first builds were retried
self.assertEqual(builds[0]['results'], RETRY)
self.assertEqual(builds[1]['results'], RETRY)
self.assertEqual(builds[2]['results'], SUCCESS)
self.assertEqual(builds[3]['results'], SUCCESS)
self.assertEqual(builds[4]['results'], SUCCESS)


# master configuration
def masterConfig():
def masterConfig(num_concurrent, extra_steps=None):
if extra_steps is None:
extra_steps = []
c = {}
from buildbot.config import BuilderConfig
from buildbot.process.factory import BuildFactory
from buildbot.plugins import steps, schedulers

c['schedulers'] = [
schedulers.AnyBranchScheduler(
name="sched",
schedulers.ForceScheduler(
name="force",
builderNames=["testy"])]
triggereables = []
for i in range(NUM_CONCURRENT):
for i in range(num_concurrent):
c['schedulers'].append(
schedulers.Triggerable(
name="trigsched" + str(i),
Expand All @@ -96,13 +184,15 @@ def masterConfig():
f.addStep(steps.ShellCommand(command='echo world'))
f2 = BuildFactory()
f2.addStep(steps.ShellCommand(command='echo ola'))
for step in extra_steps:
f2.addStep(step)
c['builders'] = [
BuilderConfig(name="testy",
workernames=["hyper0"],
factory=f),
BuilderConfig(name="build",
workernames=["hyper" + str(i)
for i in range(NUM_CONCURRENT)],
for i in range(num_concurrent)],
factory=f2)]
hyperconfig = workerhyper.Hyper.guess_config()
if isinstance(hyperconfig, string_types):
Expand All @@ -114,8 +204,10 @@ def masterConfig():
hyperconfig[
'secretkey'], 'buildbot/buildbot-worker:master',
masterFQDN=masterFQDN)
for i in range(NUM_CONCURRENT)
for i in range(num_concurrent)
]
# un comment for debugging what happens if things looks locked.
# c['www'] = {'port': 8080}
# if the masterFQDN is forced (proxy case), then we use 9989 default port
# else, we try to find a free port
if masterFQDN is not None:
Expand Down
4 changes: 2 additions & 2 deletions master/buildbot/test/integration/test_latent.py
Expand Up @@ -28,7 +28,7 @@
from buildbot.interfaces import LatentWorkerSubstantiatiationCancelled
from buildbot.process.buildstep import BuildStep
from buildbot.process.factory import BuildFactory
from buildbot.process.results import CANCELLED
from buildbot.process.results import RETRY
from buildbot.process.results import SUCCESS
from buildbot.test.fake.latent import LatentController
from buildbot.test.fake.reactor import NonThreadPool
Expand Down Expand Up @@ -523,7 +523,7 @@ def test_worker_close_connection_while_building(self):
controller.disconnect_worker(self)
builds = self.successResultOf(
master.data.get(("builds",)))
self.assertEqual(builds[0]['results'], CANCELLED)
self.assertEqual(builds[0]['results'], RETRY)

# Request one build.
self.createBuildrequest(master, [builder_id])
Expand Down
21 changes: 14 additions & 7 deletions master/buildbot/test/util/integration.py
Expand Up @@ -28,6 +28,7 @@
from zope.interface import implementer

from buildbot.config import MasterConfig
from buildbot.data import resultspec
from buildbot.interfaces import IConfigLoader
from buildbot.master import BuildMaster
from buildbot.plugins import worker
Expand Down Expand Up @@ -155,24 +156,24 @@ def doForceBuild(self, wantSteps=False, wantProperties=False,

# in order to allow trigger based integration tests
# we wait until the first started build is finished
self.firstBuildId = None
self.firstBuildRequestId = None

def newCallback(_, data):
if self.firstBuildId is None:
self.firstBuildId = data['buildid']
if self.firstBuildRequestId is None:
self.firstBuildRequestId = data['buildrequestid']
newConsumer.stopConsuming()

def finishedCallback(_, data):
if self.firstBuildId == data['buildid']:
if self.firstBuildRequestId == data['buildrequestid']:
d.callback(data)

newConsumer = yield self.master.mq.startConsuming(
newCallback,
('builds', None, 'new'))
('buildrequests', None, 'new'))

finishedConsumer = yield self.master.mq.startConsuming(
finishedCallback,
('builds', None, 'finished'))
('buildrequests', None, 'complete'))

if useChange is False:
# use data api to force a build
Expand All @@ -182,7 +183,13 @@ def finishedCallback(_, data):
yield self.master.data.updates.addChange(**useChange)

# wait until we receive the build finished event
build = yield d
buildrequest = yield d
print(buildrequest)
builds = yield self.master.data.get(
('builds',),
filters=[resultspec.Filter('buildrequestid', 'eq', [buildrequest['buildrequestid']])])
# if the build has been retried, there will be several matching builds. We return the last build
build = builds[-1]
finishedConsumer.stopConsuming()
yield self.enrichBuild(build, wantSteps, wantProperties, wantLogs)
defer.returnValue(build)
Expand Down

0 comments on commit 98bd296

Please sign in to comment.