Skip to content

Commit

Permalink
Merge branch 'bug664'
Browse files Browse the repository at this point in the history
  • Loading branch information
Dustin J. Mitchell committed Dec 29, 2009
2 parents 7e4e948 + 9bbedb5 commit e9b44bf
Show file tree
Hide file tree
Showing 9 changed files with 126 additions and 64 deletions.
5 changes: 5 additions & 0 deletions NEWS
@@ -1,5 +1,10 @@
User visible changes in Buildbot. -*- outline -*-

** Builder.ping no longer accepts timeout argument (#664).

The implementation was not robust enough and could cause the master to
unexpectedly disconnect the slave.

** MailNotifier's customMesg replaced by messageFormatter

The customMesg mechanism had the unfortunate side effect of loading all data
Expand Down
9 changes: 5 additions & 4 deletions buildbot/interfaces.py
Expand Up @@ -1043,12 +1043,13 @@ def getBuild(number):
the build that is currently in progress: once the build finishes,
there is nothing to control anymore."""

def ping(timeout=30):
def ping():
"""Attempt to contact the slave and see if it is still alive. This
returns a Deferred which fires with either True (the slave is still
alive) or False (the slave did not respond). As a side effect, adds
an event to this builder's column in the waterfall display
containing the results of the ping."""
alive) or False (the slave did not respond). As a side effect, adds an
event to this builder's column in the waterfall display containing the
results of the ping. Note that this may not fail for a long time, it is
implemented in terms of the timeout on the underlying TCP connection."""
# TODO: this ought to live in ISlaveControl, maybe with disconnect()
# or something. However the event that is emitted is most useful in
# the Builder column, so it kinda fits here too.
Expand Down
53 changes: 14 additions & 39 deletions buildbot/process/builder.py
Expand Up @@ -120,7 +120,7 @@ def _attachFailure(self, why, where):
def prepare(self, builder_status):
return defer.succeed(None)

def ping(self, timeout, status=None):
def ping(self, status=None):
"""Ping the slave to make sure it is still there. Returns a Deferred
that fires with True if it is.
Expand All @@ -140,7 +140,7 @@ def ping(self, timeout, status=None):
self.ping_watchers.insert(0, d2)
# I think it will make the tests run smoother if the status
# is updated before the ping completes
Ping().ping(self.remote, timeout).addCallback(self._pong)
Ping().ping(self.remote).addCallback(self._pong)

def reset_state(res):
if self.state == PINGING:
Expand Down Expand Up @@ -173,9 +173,8 @@ def detached(self):

class Ping:
running = False
timer = None

def ping(self, remote, timeout):
def ping(self, remote):
assert not self.running
self.running = True
log.msg("sending ping")
Expand All @@ -185,40 +184,14 @@ def ping(self, remote, timeout):
remote.callRemote("print", "ping").addCallbacks(self._pong,
self._ping_failed,
errbackArgs=(remote,))

# We use either our own timeout or the (long) TCP timeout to detect
# silently-missing slaves. This might happen because of a NAT
# timeout or a routing loop. If the slave just shuts down (and we
# somehow missed the FIN), we should get a "connection refused"
# message.
self.timer = reactor.callLater(timeout, self._ping_timeout, remote)
return self.d

def _ping_timeout(self, remote):
log.msg("ping timeout")
# force the BuildSlave to disconnect, since this indicates that
# the bot is unreachable.
del self.timer
remote.broker.transport.loseConnection()
# the forcibly-lost connection will now cause the ping to fail

def _stopTimer(self):
if not self.running:
return
self.running = False

if self.timer:
self.timer.cancel()
del self.timer

def _pong(self, res):
log.msg("ping finished: success")
self._stopTimer()
self.d.callback(True)

def _ping_failed(self, res, remote):
log.msg("ping finished: failure")
self._stopTimer()
# the slave has some sort of internal error, disconnect them. If we
# don't, we'll requeue a build and ping them again right away,
# creating a nasty loop.
Expand Down Expand Up @@ -315,12 +288,12 @@ def _attachFailure(self, why, where):
self.state = LATENT
return AbstractSlaveBuilder._attachFailure(self, why, where)

def ping(self, timeout, status=None):
def ping(self, status=None):
if not self.slave.substantiated:
if status:
status.addEvent(["ping", "latent"]).finish()
return defer.succeed(True)
return AbstractSlaveBuilder.ping(self, timeout, status)
return AbstractSlaveBuilder.ping(self, status)


class Builder(pb.Referenceable):
Expand Down Expand Up @@ -363,7 +336,6 @@ class Builder(pb.Referenceable):
"""

expectations = None # this is created the first time we get a good build
START_BUILD_TIMEOUT = 10
CHOOSE_SLAVES_RANDOMLY = True # disabled for determinism during tests

def __init__(self, setup, builder_status):
Expand Down Expand Up @@ -755,13 +727,16 @@ def startBuild(self, build, sb):
log.msg("starting build %s using slave %s" % (build, sb))
d = sb.prepare(self.builder_status)
def _ping(ign):
# ping the slave to make sure they're still there. If they're
# ping the slave to make sure they're still there. If they've
# fallen off the map (due to a NAT timeout or something), this
# will fail in a couple of minutes, depending upon the TCP
# timeout. TODO: consider making this time out faster, or at
# least characterize the likely duration.
# timeout.
#
# TODO: This can unnecessarily suspend the starting of a build, in
# situations where the slave is live but is pushing lots of data to
# us in a build.
log.msg("starting build %s.. pinging the slave %s" % (build, sb))
return sb.ping(self.START_BUILD_TIMEOUT)
return sb.ping()
d.addCallback(_ping)
d.addCallback(self._startBuild_1, build, sb)
return d
Expand Down Expand Up @@ -888,13 +863,13 @@ def getPendingBuilds(self):
def getBuild(self, number):
return self.original.getBuild(number)

def ping(self, timeout=30):
def ping(self):
if not self.original.slaves:
self.original.builder_status.addPointEvent(["ping", "no slave"])
return defer.succeed(False) # interfaces.NoSlaveError
dl = []
for s in self.original.slaves:
dl.append(s.ping(timeout, self.original.builder_status))
dl.append(s.ping(self.original.builder_status))
d = defer.DeferredList(dl)
d.addCallback(self._gatherPingResults)
return d
Expand Down
18 changes: 11 additions & 7 deletions buildbot/scripts/runner.py
Expand Up @@ -21,7 +21,11 @@ class MakerBase(usage.Options):
["quiet", "q", "Do not emit the commands being run"],
]

#["basedir", "d", None, "Base directory for the buildmaster"],
longdesc = """
Operates upon the specified <basedir> (or the current directory, if not
specified).
"""

opt_h = usage.Options.opt_help

def parseArgs(self, *args):
Expand Down Expand Up @@ -264,7 +268,7 @@ class UpgradeMasterOptions(MakerBase):
]

def getSynopsis(self):
return "Usage: buildbot upgrade-master [options] <basedir>"
return "Usage: buildbot upgrade-master [options] [<basedir>]"

longdesc = """
This command takes an existing buildmaster working directory and
Expand Down Expand Up @@ -318,7 +322,7 @@ class MasterOptions(MakerBase):
"limit the number of kept old twisted log files"],
]
def getSynopsis(self):
return "Usage: buildbot create-master [options] <basedir>"
return "Usage: buildbot create-master [options] [<basedir>]"

longdesc = """
This command creates a buildmaster working directory and buildbot.tac
Expand Down Expand Up @@ -613,18 +617,18 @@ class StartOptions(MakerBase):
['quiet', 'q', "Don't display startup log messages"],
]
def getSynopsis(self):
return "Usage: buildbot start <basedir>"
return "Usage: buildbot start [<basedir>]"

class StopOptions(MakerBase):
def getSynopsis(self):
return "Usage: buildbot stop <basedir>"
return "Usage: buildbot stop [<basedir>]"

class ReconfigOptions(MakerBase):
optFlags = [
['quiet', 'q', "Don't display log messages about reconfiguration"],
]
def getSynopsis(self):
return "Usage: buildbot reconfig <basedir>"
return "Usage: buildbot reconfig [<basedir>]"



Expand All @@ -633,7 +637,7 @@ class RestartOptions(MakerBase):
['quiet', 'q', "Don't display startup log messages"],
]
def getSynopsis(self):
return "Usage: buildbot restart <basedir>"
return "Usage: buildbot restart [<basedir>]"

class DebugClientOptions(usage.Options):
optFlags = [
Expand Down
10 changes: 5 additions & 5 deletions buildbot/test/runutils.py
Expand Up @@ -222,15 +222,15 @@ def shutdownSlave(self, slavename, buildername):
def _shutdownSlave_done(self, res, slavename):
del self.slaves[slavename]

def killSlave(self):
def killSlave(self, slavename="bot1", buildername="dummy"):
# the slave has died, its host sent a FIN. The .notifyOnDisconnect
# callbacks will terminate the current step, so the build should be
# flunked (no further steps should be started).
self.slaves['bot1'].bf.continueTrying = 0
bot = self.slaves['bot1'].getServiceNamed("bot")
broker = bot.builders["dummy"].remote.broker
self.slaves[slavename].bf.continueTrying = 0
bot = self.slaves[slavename].getServiceNamed("bot")
broker = bot.builders[buildername].remote.broker
broker.transport.loseConnection()
del self.slaves['bot1']
del self.slaves[slavename]

def disappearSlave(self, slavename="bot1", buildername="dummy",
allowReconnect=False):
Expand Down
69 changes: 69 additions & 0 deletions buildbot/test/test_locks.py
Expand Up @@ -496,3 +496,72 @@ def _testLock3_1(self, res):
# ("done", 1), ("done", 3),
# ("start", 2), ("done", 2)])

class BuilderLocks(RunMixin, unittest.TestCase):
config = """\
from buildbot import locks
from buildbot.process import factory
from buildbot.buildslave import BuildSlave
from buildbot.config import BuilderConfig
s = factory.s
from buildbot.test.test_locks import LockStep
BuildmasterConfig = c = {}
c['slaves'] = [BuildSlave('bot1', 'sekrit'), BuildSlave('bot2', 'sekrit')]
c['schedulers'] = []
c['slavePortnum'] = 0
master_lock = locks.MasterLock('master', maxCount=2)
f_excl = factory.BuildFactory([s(LockStep, timeout=0,
locks=[master_lock.access("exclusive")])])
f_count = factory.BuildFactory([s(LockStep, timeout=0,
locks=[master_lock])])
slaves = ['bot1', 'bot2']
c['builders'] = [
BuilderConfig(name='excl_A', slavenames=slaves, factory=f_excl),
BuilderConfig(name='excl_B', slavenames=slaves, factory=f_excl),
BuilderConfig(name='count_A', slavenames=slaves, factory=f_count),
BuilderConfig(name='count_B', slavenames=slaves, factory=f_count),
]
"""

def setUp(self):
N = 'test_builder'
RunMixin.setUp(self)
self.reqs = [BuildRequest("forced build", SourceStamp(), N)
for i in range(4)]
self.events = []
for i in range(4):
self.reqs[i].number = i
self.reqs[i].events = self.events
d = self.master.loadConfig(self.config)
d.addCallback(lambda res: self.master.startService())
d.addCallback(lambda res: self.connectSlaves(["bot1", "bot2"],
["excl_A", "excl_B",
"count_A", "count_B"]))
return d

def testOrder(self):
self.control.getBuilder("excl_A").requestBuild(self.reqs[0])
self.control.getBuilder("excl_B").requestBuild(self.reqs[1])
self.control.getBuilder("count_A").requestBuild(self.reqs[2])
self.control.getBuilder("count_B").requestBuild(self.reqs[3])
d = defer.DeferredList([r.waitUntilFinished()
for r in self.reqs])
d.addCallback(self._testOrder)
return d

def _testOrder(self, res):
# excl_A and excl_B cannot overlap with any other steps.
self.assert_(("start", 0) in self.events)
self.assert_(("done", 0) in self.events)
self.assert_(self.events.index(("start", 0)) + 1 ==
self.events.index(("done", 0)))

self.assert_(("start", 1) in self.events)
self.assert_(("done", 1) in self.events)
self.assert_(self.events.index(("start", 1)) + 1 ==
self.events.index(("done", 1)))

# FIXME: We really want to test that count_A and count_B were
# overlapped, but don't have a reliable way to do this.
17 changes: 9 additions & 8 deletions buildbot/test/test_run.py
Expand Up @@ -240,7 +240,7 @@ def testPing(self):
return d

def _testPing_1(self, res):
d = interfaces.IControl(self.master).getBuilder("dummy").ping(1)
d = interfaces.IControl(self.master).getBuilder("dummy").ping()
d.addCallback(self._testPing_2)
return d

Expand Down Expand Up @@ -330,13 +330,13 @@ def testIdle2(self):
# forcing a build will work: the build detect that the slave is no
# longer available and will be re-queued. Wait 5 seconds, then check
# to make sure the build is still in the 'waiting for a slave' queue.
self.control.getBuilder("dummy").original.START_BUILD_TIMEOUT = 1
req = BuildRequest("forced build", SourceStamp(), "test_builder")
self.failUnlessEqual(req.startCount, 0)
self.control.getBuilder("dummy").requestBuild(req)
# this should ping the slave, which doesn't respond, and then give up
# after a second. The BuildRequest will be re-queued, and its
# .startCount will be incremented.
# this should ping the slave, which doesn't respond (and eventually
# times out). The BuildRequest will be re-queued, and its .startCount
# will be incremented.
self.killSlave()
d = defer.Deferred()
d.addCallback(self._testIdle2_1, req)
reactor.callLater(3, d.callback, None)
Expand Down Expand Up @@ -486,7 +486,7 @@ def testDisappear(self):
bc = self.control.getBuilder("dummy")

# ping should succeed
d = bc.ping(1)
d = bc.ping()
d.addCallback(self._testDisappear_1, bc)
return d

Expand All @@ -496,8 +496,9 @@ def _testDisappear_1(self, res, bc):
# now, before any build is run, make the slave disappear
self.disappearSlave(allowReconnect=False)

# at this point, a ping to the slave should timeout
d = bc.ping(1)
# initiate the ping and then kill the slave, to simulate a disconnect.
d = bc.ping()
self.killSlave()
d.addCallback(self. _testDisappear_2)
return d
def _testDisappear_2(self, res):
Expand Down
2 changes: 1 addition & 1 deletion buildbot/test/test_slaves.py
Expand Up @@ -143,11 +143,11 @@ def testFallback2(self):

b1 = self.master.botmaster.builders["b1"]
# reduce the ping time so we'll failover faster
b1.START_BUILD_TIMEOUT = 1
assert b1.CHOOSE_SLAVES_RANDOMLY
b1.CHOOSE_SLAVES_RANDOMLY = False
self.disappearSlave("bot1", "b1", allowReconnect=False)
d = self.doBuild("b1")
self.killSlave("bot1", "b1")
d.addCallback(self._testFallback2_1)
return d
def _testFallback2_1(self, res):
Expand Down
7 changes: 7 additions & 0 deletions docs/buildbot.texinfo
Expand Up @@ -1045,6 +1045,13 @@ command:
buildbot start @var{BASEDIR}
@end example

The @var{BASEDIR} is option and can be omitted if the current directory
contains the buildbot configuration (the @file{buildbot.tac} file).

@example
buildbot start
@end example

This command will start the daemon and then return, so normally it
will not produce any output. To verify that the programs are indeed
running, look for a pair of files named @file{twistd.log} and
Expand Down

0 comments on commit e9b44bf

Please sign in to comment.