Skip to content

Commit a3919d8

Browse files
committed
swim: be ready to idle round steps when net is slow
First of all, the problem in a nutshell was that an ev_timer with a non-zero 'repeat' field is in fact an ev_periodic. It is restarted *automatically*, even if the user calls neither ev_timer_again() nor ev_timer_start(). This led to a situation where a round message send was scheduled, and the next round step timer alarm fired before the message was actually sent. That, in turn, led to an assertion failure on an attempt to schedule the task twice. This patch fixes the swim test harness to behave like ev_timer with 'repeat' > 0, and stops the timer on the first idle round step - it will be restarted once the currently hanging task is finally sent. Follow up #3234
1 parent 5001ecd commit a3919d8

File tree

4 files changed

+52
-5
lines changed

4 files changed

+52
-5
lines changed

src/lib/swim/swim.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1077,15 +1077,29 @@ swim_begin_step(struct ev_loop *loop, struct ev_timer *t, int events)
10771077
(void) events;
10781078
(void) loop;
10791079
struct swim *swim = (struct swim *) t->data;
1080+
/*
1081+
* There are possible false-positive wakeups. They can
1082+
* appear, when a round task was scheduled, but event
1083+
* loop was too busy to send the task, and the timer
1084+
* alarms again. In such a case stop it - it makes no
1085+
* sense to waste time on idle wakeups. Completion
1086+
* callback will restart the timer.
1087+
*/
1088+
if (swim_task_is_scheduled(&swim->round_step_task)) {
1089+
swim_ev_timer_stop(loop, t);
1090+
return;
1091+
}
10801092
if (! rlist_empty(&swim->round_queue))
10811093
say_verbose("SWIM %d: continue the round", swim_fd(swim));
10821094
else
10831095
swim_new_round(swim);
10841096
/*
10851097
* Possibly empty, if no members but self are specified.
10861098
*/
1087-
if (rlist_empty(&swim->round_queue))
1099+
if (rlist_empty(&swim->round_queue)) {
1100+
swim_ev_timer_stop(loop, t);
10881101
return;
1102+
}
10891103
swim_encode_round_msg(swim);
10901104
struct swim_member *m =
10911105
rlist_first_entry(&swim->round_queue, struct swim_member,
@@ -1104,6 +1118,10 @@ swim_complete_step(struct swim_task *task,
11041118
(void) rc;
11051119
(void) task;
11061120
struct swim *swim = swim_by_scheduler(scheduler);
1121+
/*
1122+
* The timer could have been stopped by the step begin
1123+
* function, if the sending took too long.
1124+
*/
11071125
swim_ev_timer_again(loop(), &swim->round_tick);
11081126
/*
11091127
* It is possible that the original member was deleted

test/unit/swim.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -933,10 +933,27 @@ swim_test_encryption(void)
933933
swim_finish_test();
934934
}
935935

936+
static void
937+
swim_test_slow_net(void)
938+
{
939+
swim_start_test(0);
940+
struct swim_cluster *cluster = swim_cluster_new(2);
941+
swim_cluster_interconnect(cluster, 0, 1);
942+
swim_cluster_block_io(cluster, 0);
943+
swim_cluster_block_io(cluster, 1);
944+
945+
note("slow network leads to idle round steps, they should not produce "\
946+
"a new message");
947+
swim_run_for(5);
948+
949+
swim_cluster_delete(cluster);
950+
swim_finish_test();
951+
}
952+
936953
static int
937954
main_f(va_list ap)
938955
{
939-
swim_start_test(19);
956+
swim_start_test(20);
940957

941958
(void) ap;
942959
swim_test_ev_init();
@@ -961,6 +978,7 @@ main_f(va_list ap)
961978
swim_test_payload_refutation();
962979
swim_test_indirect_ping();
963980
swim_test_encryption();
981+
swim_test_slow_net();
964982

965983
swim_test_transport_free();
966984
swim_test_ev_free();

test/unit/swim.result

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
*** main_f ***
2-
1..19
2+
1..20
33
*** swim_test_one_link ***
44
1..6
55
ok 1 - no rounds - no fullmesh
@@ -195,4 +195,9 @@ ok 18 - subtests
195195
ok 3 - cluster works after encryption has been disabled
196196
ok 19 - subtests
197197
*** swim_test_encryption: done ***
198+
*** swim_test_slow_net ***
199+
1..0
200+
# slow network leads to idle round steps, they should not produce a new message
201+
ok 20 - subtests
202+
*** swim_test_slow_net: done ***
198203
*** main_f: done ***

test/unit/swim_test_ev.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,18 +177,24 @@ swim_timer_event_delete(struct swim_event *e)
177177
free(te);
178178
}
179179

180+
/** Create a new timer event. */
181+
static void
182+
swim_timer_event_new(struct ev_watcher *watcher, double delay);
183+
180184
/** Process a timer event and delete it. */
181185
static void
182186
swim_timer_event_process(struct swim_event *e, struct ev_loop *loop)
183187
{
184188
assert(e->type == SWIM_EVENT_TIMER);
185189
struct ev_watcher *w = ((struct swim_timer_event *) e)->watcher;
190+
struct ev_timer *t = (struct ev_timer *) w;
186191
swim_timer_event_delete(e);
187-
((struct ev_timer *) w)->at = 0;
192+
t->at = 0;
193+
if (t->repeat > 0)
194+
swim_timer_event_new(w, t->repeat);
188195
ev_invoke(loop, w, EV_TIMER);
189196
}
190197

191-
/** Create a new timer event. */
192198
static void
193199
swim_timer_event_new(struct ev_watcher *watcher, double delay)
194200
{

0 commit comments

Comments
 (0)