diff --git a/NEWS b/NEWS index 69d0991f4c0..9e611c2f69a 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,10 @@ User visible changes in Buildbot. -*- outline -*- +** Builder.ping no longer accepts timeout argument (#664). + +The implementation was not robust enough and could cause the master to +unexpectedly disconnect the slave. + ** MailNotifier's customMesg replaced by messageFormatter The customMesg mechanism had the unfortunate side effect of loading all data diff --git a/buildbot/interfaces.py b/buildbot/interfaces.py index ac735ce7a2e..68aa2f567a6 100644 --- a/buildbot/interfaces.py +++ b/buildbot/interfaces.py @@ -1043,12 +1043,13 @@ def getBuild(number): the build that is currently in progress: once the build finishes, there is nothing to control anymore.""" - def ping(timeout=30): + def ping(): """Attempt to contact the slave and see if it is still alive. This returns a Deferred which fires with either True (the slave is still - alive) or False (the slave did not respond). As a side effect, adds - an event to this builder's column in the waterfall display - containing the results of the ping.""" + alive) or False (the slave did not respond). As a side effect, adds an + event to this builder's column in the waterfall display containing the + results of the ping. Note that this may not fail for a long time, it is + implemented in terms of the timeout on the underlying TCP connection.""" # TODO: this ought to live in ISlaveControl, maybe with disconnect() # or something. However the event that is emitted is most useful in # the Builder column, so it kinda fits here too. diff --git a/buildbot/process/builder.py b/buildbot/process/builder.py index dca57907133..5ac77aa2ff0 100644 --- a/buildbot/process/builder.py +++ b/buildbot/process/builder.py @@ -120,7 +120,7 @@ def _attachFailure(self, why, where): def prepare(self, builder_status): return defer.succeed(None) - def ping(self, timeout, status=None): + def ping(self, status=None): """Ping the slave to make sure it is still there. Returns a Deferred that fires with True if it is. @@ -140,7 +140,7 @@ def ping(self, timeout, status=None): self.ping_watchers.insert(0, d2) # I think it will make the tests run smoother if the status # is updated before the ping completes - Ping().ping(self.remote, timeout).addCallback(self._pong) + Ping().ping(self.remote).addCallback(self._pong) def reset_state(res): if self.state == PINGING: @@ -173,9 +173,8 @@ def detached(self): class Ping: running = False - timer = None - def ping(self, remote, timeout): + def ping(self, remote): assert not self.running self.running = True log.msg("sending ping") @@ -185,40 +184,14 @@ def ping(self, remote, timeout): remote.callRemote("print", "ping").addCallbacks(self._pong, self._ping_failed, errbackArgs=(remote,)) - - # We use either our own timeout or the (long) TCP timeout to detect - # silently-missing slaves. This might happen because of a NAT - # timeout or a routing loop. If the slave just shuts down (and we - # somehow missed the FIN), we should get a "connection refused" - # message. - self.timer = reactor.callLater(timeout, self._ping_timeout, remote) return self.d - def _ping_timeout(self, remote): - log.msg("ping timeout") - # force the BuildSlave to disconnect, since this indicates that - # the bot is unreachable. - del self.timer - remote.broker.transport.loseConnection() - # the forcibly-lost connection will now cause the ping to fail - - def _stopTimer(self): - if not self.running: - return - self.running = False - - if self.timer: - self.timer.cancel() - del self.timer - def _pong(self, res): log.msg("ping finished: success") - self._stopTimer() self.d.callback(True) def _ping_failed(self, res, remote): log.msg("ping finished: failure") - self._stopTimer() # the slave has some sort of internal error, disconnect them. If we # don't, we'll requeue a build and ping them again right away, # creating a nasty loop. @@ -315,12 +288,12 @@ def _attachFailure(self, why, where): self.state = LATENT return AbstractSlaveBuilder._attachFailure(self, why, where) - def ping(self, timeout, status=None): + def ping(self, status=None): if not self.slave.substantiated: if status: status.addEvent(["ping", "latent"]).finish() return defer.succeed(True) - return AbstractSlaveBuilder.ping(self, timeout, status) + return AbstractSlaveBuilder.ping(self, status) class Builder(pb.Referenceable): @@ -363,7 +336,6 @@ class Builder(pb.Referenceable): """ expectations = None # this is created the first time we get a good build - START_BUILD_TIMEOUT = 10 CHOOSE_SLAVES_RANDOMLY = True # disabled for determinism during tests def __init__(self, setup, builder_status): @@ -755,13 +727,16 @@ def startBuild(self, build, sb): log.msg("starting build %s using slave %s" % (build, sb)) d = sb.prepare(self.builder_status) def _ping(ign): - # ping the slave to make sure they're still there. If they're + # ping the slave to make sure they're still there. If they've # fallen off the map (due to a NAT timeout or something), this # will fail in a couple of minutes, depending upon the TCP - # timeout. TODO: consider making this time out faster, or at - # least characterize the likely duration. + # timeout. + # + # TODO: This can unnecessarily suspend the starting of a build, in + # situations where the slave is live but is pushing lots of data to + # us in a build. log.msg("starting build %s.. pinging the slave %s" % (build, sb)) - return sb.ping(self.START_BUILD_TIMEOUT) + return sb.ping() d.addCallback(_ping) d.addCallback(self._startBuild_1, build, sb) return d @@ -888,13 +863,13 @@ def getPendingBuilds(self): def getBuild(self, number): return self.original.getBuild(number) - def ping(self, timeout=30): + def ping(self): if not self.original.slaves: self.original.builder_status.addPointEvent(["ping", "no slave"]) return defer.succeed(False) # interfaces.NoSlaveError dl = [] for s in self.original.slaves: - dl.append(s.ping(timeout, self.original.builder_status)) + dl.append(s.ping(self.original.builder_status)) d = defer.DeferredList(dl) d.addCallback(self._gatherPingResults) return d diff --git a/buildbot/scripts/runner.py b/buildbot/scripts/runner.py index a68ba7eee18..9542bac69e1 100644 --- a/buildbot/scripts/runner.py +++ b/buildbot/scripts/runner.py @@ -21,7 +21,11 @@ class MakerBase(usage.Options): ["quiet", "q", "Do not emit the commands being run"], ] - #["basedir", "d", None, "Base directory for the buildmaster"], + longdesc = """ + Operates upon the specified (or the current directory, if not + specified). + """ + opt_h = usage.Options.opt_help def parseArgs(self, *args): @@ -264,7 +268,7 @@ class UpgradeMasterOptions(MakerBase): ] def getSynopsis(self): - return "Usage: buildbot upgrade-master [options] " + return "Usage: buildbot upgrade-master [options] []" longdesc = """ This command takes an existing buildmaster working directory and @@ -318,7 +322,7 @@ class MasterOptions(MakerBase): "limit the number of kept old twisted log files"], ] def getSynopsis(self): - return "Usage: buildbot create-master [options] " + return "Usage: buildbot create-master [options] []" longdesc = """ This command creates a buildmaster working directory and buildbot.tac @@ -613,18 +617,18 @@ class StartOptions(MakerBase): ['quiet', 'q', "Don't display startup log messages"], ] def getSynopsis(self): - return "Usage: buildbot start " + return "Usage: buildbot start []" class StopOptions(MakerBase): def getSynopsis(self): - return "Usage: buildbot stop " + return "Usage: buildbot stop []" class ReconfigOptions(MakerBase): optFlags = [ ['quiet', 'q', "Don't display log messages about reconfiguration"], ] def getSynopsis(self): - return "Usage: buildbot reconfig " + return "Usage: buildbot reconfig []" @@ -633,7 +637,7 @@ class RestartOptions(MakerBase): ['quiet', 'q', "Don't display startup log messages"], ] def getSynopsis(self): - return "Usage: buildbot restart " + return "Usage: buildbot restart []" class DebugClientOptions(usage.Options): optFlags = [ diff --git a/buildbot/test/runutils.py b/buildbot/test/runutils.py index 1f6b02545fc..9a77de1fa84 100644 --- a/buildbot/test/runutils.py +++ b/buildbot/test/runutils.py @@ -222,15 +222,15 @@ def shutdownSlave(self, slavename, buildername): def _shutdownSlave_done(self, res, slavename): del self.slaves[slavename] - def killSlave(self): + def killSlave(self, slavename="bot1", buildername="dummy"): # the slave has died, its host sent a FIN. The .notifyOnDisconnect # callbacks will terminate the current step, so the build should be # flunked (no further steps should be started). - self.slaves['bot1'].bf.continueTrying = 0 - bot = self.slaves['bot1'].getServiceNamed("bot") - broker = bot.builders["dummy"].remote.broker + self.slaves[slavename].bf.continueTrying = 0 + bot = self.slaves[slavename].getServiceNamed("bot") + broker = bot.builders[buildername].remote.broker broker.transport.loseConnection() - del self.slaves['bot1'] + del self.slaves[slavename] def disappearSlave(self, slavename="bot1", buildername="dummy", allowReconnect=False): diff --git a/buildbot/test/test_locks.py b/buildbot/test/test_locks.py index cf54a7b6671..79d1c398cf8 100644 --- a/buildbot/test/test_locks.py +++ b/buildbot/test/test_locks.py @@ -496,3 +496,72 @@ def _testLock3_1(self, res): # ("done", 1), ("done", 3), # ("start", 2), ("done", 2)]) +class BuilderLocks(RunMixin, unittest.TestCase): + config = """\ +from buildbot import locks +from buildbot.process import factory +from buildbot.buildslave import BuildSlave +from buildbot.config import BuilderConfig +s = factory.s +from buildbot.test.test_locks import LockStep + +BuildmasterConfig = c = {} +c['slaves'] = [BuildSlave('bot1', 'sekrit'), BuildSlave('bot2', 'sekrit')] +c['schedulers'] = [] +c['slavePortnum'] = 0 + +master_lock = locks.MasterLock('master', maxCount=2) +f_excl = factory.BuildFactory([s(LockStep, timeout=0, + locks=[master_lock.access("exclusive")])]) +f_count = factory.BuildFactory([s(LockStep, timeout=0, + locks=[master_lock])]) + +slaves = ['bot1', 'bot2'] +c['builders'] = [ + BuilderConfig(name='excl_A', slavenames=slaves, factory=f_excl), + BuilderConfig(name='excl_B', slavenames=slaves, factory=f_excl), + BuilderConfig(name='count_A', slavenames=slaves, factory=f_count), + BuilderConfig(name='count_B', slavenames=slaves, factory=f_count), +] +""" + + def setUp(self): + N = 'test_builder' + RunMixin.setUp(self) + self.reqs = [BuildRequest("forced build", SourceStamp(), N) + for i in range(4)] + self.events = [] + for i in range(4): + self.reqs[i].number = i + self.reqs[i].events = self.events + d = self.master.loadConfig(self.config) + d.addCallback(lambda res: self.master.startService()) + d.addCallback(lambda res: self.connectSlaves(["bot1", "bot2"], + ["excl_A", "excl_B", + "count_A", "count_B"])) + return d + + def testOrder(self): + self.control.getBuilder("excl_A").requestBuild(self.reqs[0]) + self.control.getBuilder("excl_B").requestBuild(self.reqs[1]) + self.control.getBuilder("count_A").requestBuild(self.reqs[2]) + self.control.getBuilder("count_B").requestBuild(self.reqs[3]) + d = defer.DeferredList([r.waitUntilFinished() + for r in self.reqs]) + d.addCallback(self._testOrder) + return d + + def _testOrder(self, res): + # excl_A and excl_B cannot overlap with any other steps. + self.assert_(("start", 0) in self.events) + self.assert_(("done", 0) in self.events) + self.assert_(self.events.index(("start", 0)) + 1 == + self.events.index(("done", 0))) + + self.assert_(("start", 1) in self.events) + self.assert_(("done", 1) in self.events) + self.assert_(self.events.index(("start", 1)) + 1 == + self.events.index(("done", 1))) + + # FIXME: We really want to test that count_A and count_B were + # overlapped, but don't have a reliable way to do this. diff --git a/buildbot/test/test_run.py b/buildbot/test/test_run.py index aebc2b45d9c..e602a6c9e8f 100644 --- a/buildbot/test/test_run.py +++ b/buildbot/test/test_run.py @@ -240,7 +240,7 @@ def testPing(self): return d def _testPing_1(self, res): - d = interfaces.IControl(self.master).getBuilder("dummy").ping(1) + d = interfaces.IControl(self.master).getBuilder("dummy").ping() d.addCallback(self._testPing_2) return d @@ -330,13 +330,13 @@ def testIdle2(self): # forcing a build will work: the build detect that the slave is no # longer available and will be re-queued. Wait 5 seconds, then check # to make sure the build is still in the 'waiting for a slave' queue. - self.control.getBuilder("dummy").original.START_BUILD_TIMEOUT = 1 req = BuildRequest("forced build", SourceStamp(), "test_builder") self.failUnlessEqual(req.startCount, 0) self.control.getBuilder("dummy").requestBuild(req) - # this should ping the slave, which doesn't respond, and then give up - # after a second. The BuildRequest will be re-queued, and its - # .startCount will be incremented. + # this should ping the slave, which doesn't respond (and eventually + # times out). The BuildRequest will be re-queued, and its .startCount + # will be incremented. + self.killSlave() d = defer.Deferred() d.addCallback(self._testIdle2_1, req) reactor.callLater(3, d.callback, None) @@ -486,7 +486,7 @@ def testDisappear(self): bc = self.control.getBuilder("dummy") # ping should succeed - d = bc.ping(1) + d = bc.ping() d.addCallback(self._testDisappear_1, bc) return d @@ -496,8 +496,9 @@ def _testDisappear_1(self, res, bc): # now, before any build is run, make the slave disappear self.disappearSlave(allowReconnect=False) - # at this point, a ping to the slave should timeout - d = bc.ping(1) + # initiate the ping and then kill the slave, to simulate a disconnect. + d = bc.ping() + self.killSlave() d.addCallback(self. _testDisappear_2) return d def _testDisappear_2(self, res): diff --git a/buildbot/test/test_slaves.py b/buildbot/test/test_slaves.py index 310a613af94..15a6e09a0ae 100644 --- a/buildbot/test/test_slaves.py +++ b/buildbot/test/test_slaves.py @@ -143,11 +143,11 @@ def testFallback2(self): b1 = self.master.botmaster.builders["b1"] # reduce the ping time so we'll failover faster - b1.START_BUILD_TIMEOUT = 1 assert b1.CHOOSE_SLAVES_RANDOMLY b1.CHOOSE_SLAVES_RANDOMLY = False self.disappearSlave("bot1", "b1", allowReconnect=False) d = self.doBuild("b1") + self.killSlave("bot1", "b1") d.addCallback(self._testFallback2_1) return d def _testFallback2_1(self, res): diff --git a/docs/buildbot.texinfo b/docs/buildbot.texinfo index 97efa6b5e4c..b6c521bf37e 100644 --- a/docs/buildbot.texinfo +++ b/docs/buildbot.texinfo @@ -1045,6 +1045,13 @@ command: buildbot start @var{BASEDIR} @end example +The @var{BASEDIR} is option and can be omitted if the current directory +contains the buildbot configuration (the @file{buildbot.tac} file). + +@example +buildbot start +@end example + This command will start the daemon and then return, so normally it will not produce any output. To verify that the programs are indeed running, look for a pair of files named @file{twistd.log} and