-
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
/
Copy pathrpl_parallel.cc
3635 lines (3259 loc) · 116 KB
/
rpl_parallel.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "mariadb.h"
#include "rpl_parallel.h"
#include "slave.h"
#include "rpl_mi.h"
#include "sql_parse.h"
#include "debug_sync.h"
#include "sql_repl.h"
#include "wsrep_mysqld.h"
#ifdef WITH_WSREP
#include "wsrep_trans_observer.h"
#endif
/*
Code for optional parallel execution of replicated events on the slave.
*/
/*
Maximum number of queued events to accumulate in a local free list, before
moving them to the global free list. There is additional a limit of how much
to accumulate based on opt_slave_parallel_max_queued.
*/
#define QEV_BATCH_FREE 200
struct rpl_parallel_thread_pool global_rpl_thread_pool;
static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi,
int err);
static void
register_wait_for_prior_event_group_commit(rpl_group_info *rgi,
rpl_parallel_entry *entry);
static int
rpt_handle_event(rpl_parallel_thread::queued_event *qev,
struct rpl_parallel_thread *rpt)
{
int err;
rpl_group_info *rgi= qev->rgi;
Relay_log_info *rli= rgi->rli;
THD *thd= rgi->thd;
Log_event *ev;
DBUG_ASSERT(qev->typ == rpl_parallel_thread::queued_event::QUEUED_EVENT);
ev= qev->ev;
#ifdef WITH_WSREP
if (wsrep_before_statement(thd))
{
WSREP_WARN("Parallel slave failed at wsrep_before_statement() hook");
return(1);
}
#endif /* WITH_WSREP */
thd->system_thread_info.rpl_sql_info->rpl_filter = rli->mi->rpl_filter;
ev->thd= thd;
safe_strcpy(rgi->event_relay_log_name_buf, sizeof(rgi->event_relay_log_name_buf),
qev->event_relay_log_name);
rgi->event_relay_log_name= rgi->event_relay_log_name_buf;
rgi->event_relay_log_pos= qev->event_relay_log_pos;
rgi->future_event_relay_log_pos= qev->future_event_relay_log_pos;
safe_strcpy(rgi->future_event_master_log_name, sizeof(rgi->future_event_master_log_name),
qev->future_event_master_log_name);
if (event_can_update_last_master_timestamp(ev))
rgi->last_master_timestamp= ev->when + ev->exec_time;
err= apply_event_and_update_pos_for_parallel(ev, thd, rgi);
rli->executed_entries++;
#ifdef WITH_WSREP
if (wsrep_after_statement(thd))
{
WSREP_WARN("Parallel slave failed at wsrep_after_statement() hook");
err= 1;
}
#endif /* WITH_WSREP */
/* ToDo: error handling. */
return err;
}
static void
handle_queued_pos_update(THD *thd, rpl_parallel_thread::queued_event *qev)
{
int cmp;
Relay_log_info *rli;
rpl_parallel_entry *e;
/*
Events that are not part of an event group, such as Format Description,
Stop, GTID List and such, are executed directly in the driver SQL thread,
to keep the relay log state up-to-date. But the associated position update
is done here, in sync with other normal events as they are queued to
worker threads.
*/
if ((thd->variables.option_bits & OPTION_BEGIN) &&
opt_using_transactions)
return;
/* Do not update position if an earlier event group caused an error abort. */
DBUG_ASSERT(qev->typ == rpl_parallel_thread::queued_event::QUEUED_POS_UPDATE);
rli= qev->rgi->rli;
e= qev->entry_for_queued;
if (e->stop_on_error_sub_id < (uint64)ULONGLONG_MAX ||
(e->force_abort && !rli->stop_for_until))
return;
mysql_mutex_lock(&rli->data_lock);
cmp= compare_log_name(rli->group_relay_log_name, qev->event_relay_log_name);
if (cmp < 0)
{
rli->group_relay_log_pos= qev->future_event_relay_log_pos;
strmake_buf(rli->group_relay_log_name, qev->event_relay_log_name);
} else if (cmp == 0 &&
rli->group_relay_log_pos < qev->future_event_relay_log_pos)
rli->group_relay_log_pos= qev->future_event_relay_log_pos;
cmp= compare_log_name(rli->group_master_log_name, qev->future_event_master_log_name);
if (cmp < 0)
{
safe_strcpy(rli->group_master_log_name, sizeof(rli->group_master_log_name),
qev->future_event_master_log_name);
rli->group_master_log_pos= qev->future_event_master_log_pos;
}
else if (cmp == 0
&& rli->group_master_log_pos < qev->future_event_master_log_pos)
rli->group_master_log_pos= qev->future_event_master_log_pos;
mysql_mutex_unlock(&rli->data_lock);
mysql_cond_broadcast(&rli->data_cond);
}
/*
Wait for any pending deadlock kills. Since deadlock kills happen
asynchronously, we need to be sure they will be completed before starting a
new transaction. Otherwise the new transaction might suffer a spurious kill.
*/
void
wait_for_pending_deadlock_kill(THD *thd, rpl_group_info *rgi)
{
PSI_stage_info old_stage;
mysql_mutex_lock(&thd->LOCK_wakeup_ready);
thd->set_time_for_next_stage();
thd->ENTER_COND(&thd->COND_wakeup_ready, &thd->LOCK_wakeup_ready,
&stage_waiting_for_deadlock_kill, &old_stage);
while (rgi->killed_for_retry == rpl_group_info::RETRY_KILL_PENDING)
mysql_cond_wait(&thd->COND_wakeup_ready, &thd->LOCK_wakeup_ready);
thd->EXIT_COND(&old_stage);
}
static void
finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
rpl_parallel_entry *entry, rpl_group_info *rgi)
{
THD *thd= rpt->thd;
wait_for_commit *wfc= &rgi->commit_orderer;
int err;
if (rgi->get_finish_event_group_called())
return;
thd->get_stmt_da()->set_overwrite_status(true);
if (unlikely(rgi->worker_error))
{
/*
In case a previous wait was killed, we need to re-register to be able to
repeat the wait.
And before doing that, we un-register any previous registration (in case
we got an error earlier and skipped waiting).
*/
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
mysql_mutex_lock(&entry->LOCK_parallel_entry);
register_wait_for_prior_event_group_commit(rgi, entry);
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
}
/*
Remove any left-over registration to wait for a prior commit to
complete. Normally, such wait would already have been removed at
this point by wait_for_prior_commit() called from within COMMIT
processing.
However, in case of MyISAM and no binlog, we might not have any commit
processing, and so we need to do the wait here, before waking up any
subsequent commits, to preserve correct order of event execution.
Also, in the error case we might have skipped waiting and thus need to
remove it explicitly. Or the wait might have been killed and we need to
repeat the registration and the wait.
It is important in the non-error case to do a wait, not just an
unregister. Because we might be last in a group-commit that is
replicated in parallel, and the following event will then wait
for us to complete and rely on this also ensuring that any other
event in the group has completed.
And in the error case, correct GCO lifetime relies on the fact that once
the last event group in the GCO has executed wait_for_prior_commit(),
all earlier event groups have also committed; this way no more
mark_start_commit() calls can be made and it is safe to de-allocate
the GCO.
Thus this final wait is done with kill ignored during the wait. This is
fine, at this point there is no active query or transaction to abort, and
the thread will continue as soon as earlier event groups complete.
Note though, that in the non-error case there is no guarantee that
finish_event_group() will be run in-order. For example, a successful
binlog group commit will wakeup all participating event groups
simultaneously so only thread scheduling will decide the order in which
finish_event_group() calls acquire LOCK_parallel_entry.
*/
err= wfc->wait_for_prior_commit(thd, false);
if (unlikely(err) && !rgi->worker_error)
signal_error_to_sql_driver_thread(thd, rgi, err);
thd->wait_for_commit_ptr= NULL;
/*
Calls to check_duplicate_gtid() must match up with
record_and_update_gtid() (or release_domain_owner() in error case). This
assertion tries to catch any missing release of the domain.
*/
DBUG_ASSERT(rgi->gtid_ignore_duplicate_state != rpl_group_info::GTID_DUPLICATE_OWNER);
mysql_mutex_lock(&entry->LOCK_parallel_entry);
/*
We need to mark that this event group started its commit phase, in case we
missed it before (otherwise we would deadlock the next event group that is
waiting for this). In most cases (normal DML), it will be a no-op.
*/
rgi->mark_start_commit_no_lock();
rgi->commit_orderer.wakeup_blocked= false;
if (entry->last_committed_sub_id < sub_id)
{
/*
Record that this event group has finished (eg. transaction is
committed, if transactional), so other event groups will no longer
attempt to wait for us to commit. Once we have increased
entry->last_committed_sub_id, no other threads will execute
register_wait_for_prior_commit() against us. Thus, by doing one
extra (usually redundant) wakeup_subsequent_commits() we can ensure
that no register_wait_for_prior_commit() can ever happen without a
subsequent wakeup_subsequent_commits() to wake it up.
We can race here with the next transactions, but that is fine, as
long as we check that we do not decrease last_committed_sub_id. If
this commit is done, then any prior commits will also have been
done and also no longer need waiting for.
*/
entry->last_committed_sub_id= sub_id;
if (entry->need_sub_id_signal)
mysql_cond_broadcast(&entry->COND_parallel_entry);
/* Now free any GCOs in which all transactions have committed. */
group_commit_orderer *tmp_gco= rgi->gco;
while (tmp_gco &&
(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id ||
tmp_gco->next_gco->wait_count > entry->count_committing_event_groups))
{
/*
We must not free a GCO before the wait_count of the following GCO has
been reached and wakeup has been sent. Otherwise we will lose the
wakeup and hang (there were several such bugs in the past).
The intention is that this is ensured already since we only free when
the last event group in the GCO has committed
(tmp_gco->last_sub_id <= sub_id). However, if we have a bug, we have
extra check on next_gco->wait_count to hopefully avoid hanging; we
have here an assertion in debug builds that this check does not in
fact trigger.
*/
DBUG_ASSERT(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id);
tmp_gco= tmp_gco->prev_gco;
}
while (tmp_gco)
{
group_commit_orderer *prev_gco= tmp_gco->prev_gco;
tmp_gco->next_gco->prev_gco= NULL;
rpt->loc_free_gco(tmp_gco);
tmp_gco= prev_gco;
}
}
/*
If this event group got error, then any following event groups that have
not yet started should just skip their group, preparing for stop of the
SQL driver thread.
*/
if (unlikely(rgi->worker_error) && entry->stop_on_error_sub_id > sub_id)
entry->stop_on_error_sub_id= sub_id;
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("hold_worker_on_schedule", {
if (entry->stop_on_error_sub_id < (uint64)ULONGLONG_MAX)
{
debug_sync_set_action(thd, STRING_WITH_LEN("now SIGNAL continue_worker"));
}
});
DBUG_EXECUTE_IF("rpl_parallel_simulate_wait_at_retry", {
if (rgi->current_gtid.seq_no == 1000) {
DBUG_ASSERT(entry->stop_on_error_sub_id == sub_id);
debug_sync_set_action(thd,
STRING_WITH_LEN("now WAIT_FOR proceed_by_1000"));
}
});
DBUG_EXECUTE_IF("hold_worker2_favor_worker3", {
if (rgi->current_gtid.seq_no == 2001) {
DBUG_ASSERT(!rgi->worker_error || entry->stop_on_error_sub_id == sub_id);
debug_sync_set_action(thd, STRING_WITH_LEN("now SIGNAL cont_worker3"));
}
});
#endif
if (rgi->killed_for_retry == rpl_group_info::RETRY_KILL_PENDING)
wait_for_pending_deadlock_kill(thd, rgi);
thd->clear_error();
thd->reset_killed();
/*
Would do thd->get_stmt_da()->set_overwrite_status(false) here, but
reset_diagnostics_area() already does that.
*/
thd->get_stmt_da()->reset_diagnostics_area();
wfc->wakeup_subsequent_commits(rgi->worker_error);
rgi->did_mark_start_commit= false;
rgi->set_finish_event_group_called(true);
}
static void
signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err)
{
rgi->worker_error= err;
DBUG_EXECUTE_IF("hold_worker2_favor_worker3", {
if (rgi->current_gtid.seq_no == 2002) {
debug_sync_set_action(thd, STRING_WITH_LEN("now WAIT_FOR cont_worker2"));
}});
rgi->cleanup_context(thd, true);
rgi->rli->abort_slave= true;
rgi->rli->stop_for_until= false;
mysql_mutex_lock(rgi->rli->relay_log.get_log_lock());
rgi->rli->relay_log.signal_relay_log_update();
mysql_mutex_unlock(rgi->rli->relay_log.get_log_lock());
}
static void
unlock_or_exit_cond(THD *thd, mysql_mutex_t *lock, bool *did_enter_cond,
PSI_stage_info *old_stage)
{
if (*did_enter_cond)
{
thd->EXIT_COND(old_stage);
*did_enter_cond= false;
}
else
mysql_mutex_unlock(lock);
}
static void
register_wait_for_prior_event_group_commit(rpl_group_info *rgi,
rpl_parallel_entry *entry)
{
mysql_mutex_assert_owner(&entry->LOCK_parallel_entry);
if (rgi->wait_commit_sub_id > entry->last_committed_sub_id)
{
/*
Register that the commit of this event group must wait for the
commit of the previous event group to complete before it may
complete itself, so that we preserve commit order.
*/
wait_for_commit *waitee=
&rgi->wait_commit_group_info->commit_orderer;
rgi->commit_orderer.register_wait_for_prior_commit(waitee);
}
}
/*
Do not start parallel execution of this event group until all prior groups
have reached the commit phase that are not safe to run in parallel with.
*/
static void
do_gco_wait(rpl_group_info *rgi, group_commit_orderer *gco,
bool *did_enter_cond, PSI_stage_info *old_stage)
{
THD *thd= rgi->thd;
rpl_parallel_entry *entry= rgi->parallel_entry;
uint64 wait_count;
mysql_mutex_assert_owner(&entry->LOCK_parallel_entry);
if (!gco->installed)
{
group_commit_orderer *prev_gco= gco->prev_gco;
if (prev_gco)
{
prev_gco->last_sub_id= gco->prior_sub_id;
prev_gco->next_gco= gco;
}
gco->installed= true;
}
wait_count= gco->wait_count;
if (wait_count > entry->count_committing_event_groups)
{
DEBUG_SYNC(thd, "rpl_parallel_start_waiting_for_prior");
thd->set_time_for_next_stage();
thd->ENTER_COND(&gco->COND_group_commit_orderer,
&entry->LOCK_parallel_entry,
&stage_waiting_for_prior_transaction_to_start_commit,
old_stage);
*did_enter_cond= true;
do
{
if (!rgi->worker_error && unlikely(thd->check_killed(1)))
{
DEBUG_SYNC(thd, "rpl_parallel_start_waiting_for_prior_killed");
thd->clear_error();
thd->get_stmt_da()->reset_diagnostics_area();
thd->send_kill_message();
slave_output_error_info(rgi, thd);
signal_error_to_sql_driver_thread(thd, rgi, 1);
/*
Even though we were killed, we need to continue waiting for the
prior event groups to signal that we can continue. Otherwise we
mess up the accounting for ordering. However, now that we have
marked the error, events will just be skipped rather than
executed, and things will progress quickly towards stop.
*/
}
mysql_cond_wait(&gco->COND_group_commit_orderer,
&entry->LOCK_parallel_entry);
} while (wait_count > entry->count_committing_event_groups);
}
}
static bool
do_stop_handling(rpl_group_info *rgi)
{
bool should_stop= false;
rpl_parallel_entry *entry= rgi->parallel_entry;
mysql_mutex_assert_owner(&entry->LOCK_parallel_entry);
if (unlikely(entry->force_abort) && rgi->gtid_sub_id > entry->stop_sub_id)
{
/*
We are stopping (STOP SLAVE), and this event group need not be applied
before we can safely stop. So return a flag that will cause us to skip,
rather than execute, the following events. Once all queued events have
been skipped, the STOP SLAVE is complete (for this thread).
*/
should_stop= true;
}
if (unlikely(entry->stop_on_error_sub_id <= rgi->wait_commit_sub_id))
{
rgi->worker_error= 1;
should_stop= true;
}
if (likely(!should_stop))
{
/*
Since we did not decide to stop, bump the largest_started_sub_id while
still holding LOCK_parallel_entry.
*/
if (rgi->gtid_sub_id > entry->largest_started_sub_id)
entry->largest_started_sub_id= rgi->gtid_sub_id;
}
return should_stop;
}
static bool
do_ftwrl_wait(rpl_group_info *rgi,
bool *did_enter_cond, PSI_stage_info *old_stage)
{
THD *thd= rgi->thd;
rpl_parallel_entry *entry= rgi->parallel_entry;
uint64 sub_id= rgi->gtid_sub_id;
bool aborted= false;
DBUG_ENTER("do_ftwrl_wait");
mysql_mutex_assert_owner(&entry->LOCK_parallel_entry);
/*
If a FLUSH TABLES WITH READ LOCK (FTWRL) is pending, check if this
transaction is later than transactions that have priority to complete
before FTWRL. If so, wait here so that FTWRL can proceed and complete
first.
(entry->pause_sub_id is ULONGLONG_MAX if no FTWRL is pending, which makes
this test false as required).
*/
if (unlikely(sub_id > entry->pause_sub_id))
{
thd->set_time_for_next_stage();
thd->ENTER_COND(&entry->COND_parallel_entry, &entry->LOCK_parallel_entry,
&stage_waiting_for_ftwrl,
(*did_enter_cond ? nullptr : old_stage));
*did_enter_cond= true;
do
{
if (entry->force_abort || rgi->worker_error)
{
aborted= true;
break;
}
if (unlikely(thd->check_killed()))
{
slave_output_error_info(rgi, thd);
signal_error_to_sql_driver_thread(thd, rgi, 1);
break;
}
mysql_cond_wait(&entry->COND_parallel_entry, &entry->LOCK_parallel_entry);
} while (sub_id > entry->pause_sub_id);
DBUG_EXECUTE_IF("delay_ftwrl_wait_gtid_0_x_100", {
if (rgi->current_gtid.domain_id == 0 &&
rgi->current_gtid.seq_no == 100) {
/*
Simulate delayed wakeup from the mysql_cond_wait(). To do this, we
need to have the LOCK_parallel_entry mutex released during the wait.
*/
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
debug_sync_set_action(thd,
STRING_WITH_LEN("now SIGNAL pause_wait_started WAIT_FOR pause_wait_continue"));
mysql_mutex_lock(&entry->LOCK_parallel_entry);
}
});
/*
We do not call EXIT_COND() here, as this will be done later by our
caller (since we set *did_enter_cond to true).
*/
}
DBUG_RETURN(aborted);
}
static int
pool_mark_busy(rpl_parallel_thread_pool *pool, THD *thd)
{
PSI_stage_info old_stage;
int res= 0;
bool did_enter_cond= false;
/*
Wait here while the queue is busy. This is done to make FLUSH TABLES WITH
READ LOCK work correctly, without incuring extra locking penalties in
normal operation. FLUSH TABLES WITH READ LOCK needs to lock threads in the
thread pool, and for this we need to make sure the pool will not go away
during the operation. The LOCK_rpl_thread_pool is not suitable for
this. It is taken by release_thread() while holding LOCK_rpl_thread; so it
must be released before locking any LOCK_rpl_thread lock, or a deadlock
can occur.
So we protect the infrequent operations of FLUSH TABLES WITH READ LOCK and
pool size changes with this condition wait.
*/
DBUG_EXECUTE_IF("mark_busy_mdev_22370",my_sleep(1000000););
mysql_mutex_lock(&pool->LOCK_rpl_thread_pool);
if (pool->busy)
{
if (thd)
{
thd->set_time_for_next_stage();
thd->ENTER_COND(&pool->COND_rpl_thread_pool, &pool->LOCK_rpl_thread_pool,
&stage_waiting_for_rpl_thread_pool, &old_stage);
did_enter_cond= true;
}
do
{
if (thd && unlikely(thd->check_killed()))
{
res= 1;
break;
}
mysql_cond_wait(&pool->COND_rpl_thread_pool, &pool->LOCK_rpl_thread_pool);
} while (pool->busy);
}
if (!res)
pool->busy= true;
if (did_enter_cond)
thd->EXIT_COND(&old_stage);
else
mysql_mutex_unlock(&pool->LOCK_rpl_thread_pool);
return res;
}
static void
pool_mark_not_busy(rpl_parallel_thread_pool *pool)
{
mysql_mutex_lock(&pool->LOCK_rpl_thread_pool);
DBUG_ASSERT(pool->busy);
pool->busy= false;
mysql_cond_broadcast(&pool->COND_rpl_thread_pool);
mysql_mutex_unlock(&pool->LOCK_rpl_thread_pool);
}
void
rpl_unpause_after_ftwrl(THD *thd)
{
uint32 i;
rpl_parallel_thread_pool *pool= &global_rpl_thread_pool;
DBUG_ENTER("rpl_unpause_after_ftwrl");
DBUG_ASSERT(pool->busy);
for (i= 0; i < pool->count; ++i)
{
rpl_parallel_entry *e;
rpl_parallel_thread *rpt= pool->threads[i];
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
if (!rpt->current_owner)
{
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
continue;
}
e= rpt->current_entry;
mysql_mutex_lock(&e->LOCK_parallel_entry);
rpt->pause_for_ftwrl = false;
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
/*
Do not change pause_sub_id if force_abort is set.
force_abort is set in case of STOP SLAVE.
Reason: If pause_sub_id is not changed and force_abort_is set,
any parallel slave thread waiting in do_ftwrl_wait() will
on wakeup return from do_ftwrl_wait() with 1. This will set
skip_event_group to 1 in handle_rpl_parallel_thread() and the
parallel thread will abort at once.
If pause_sub_id is changed, the code in handle_rpl_parallel_thread()
would continue to execute the transaction in the queue, which would
cause some transactions to be lost.
*/
if (!e->force_abort)
e->pause_sub_id= (uint64)ULONGLONG_MAX;
mysql_cond_broadcast(&e->COND_parallel_entry);
mysql_mutex_unlock(&e->LOCK_parallel_entry);
}
pool_mark_not_busy(pool);
DBUG_VOID_RETURN;
}
/*
.
Note: in case of error return, rpl_unpause_after_ftwrl() must _not_ be called.
*/
int
rpl_pause_for_ftwrl(THD *thd)
{
uint32 i;
rpl_parallel_thread_pool *pool= &global_rpl_thread_pool;
int err;
Dynamic_array<Master_info*> mi_arr(4, 4); // array of replication source mi:s
DBUG_ENTER("rpl_pause_for_ftwrl");
/*
While the count_pending_pause_for_ftwrl counter is non-zero, the pool
cannot be shutdown/resized, so threads are guaranteed to not disappear.
This is required to safely be able to access the individual threads below.
(We cannot lock an individual thread while holding LOCK_rpl_thread_pool,
as this can deadlock against release_thread()).
*/
if ((err= pool_mark_busy(pool, thd)))
DBUG_RETURN(err);
for (i= 0; i < pool->count; ++i)
{
PSI_stage_info old_stage;
rpl_parallel_entry *e;
rpl_parallel_thread *rpt= pool->threads[i];
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
if (!rpt->current_owner)
{
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
continue;
}
e= rpt->current_entry;
mysql_mutex_lock(&e->LOCK_parallel_entry);
/*
Setting the rpt->pause_for_ftwrl flag makes sure that the thread will not
de-allocate itself until signalled to do so by rpl_unpause_after_ftwrl().
*/
rpt->pause_for_ftwrl = true;
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
++e->need_sub_id_signal;
if (e->pause_sub_id == (uint64)ULONGLONG_MAX)
{
e->pause_sub_id= e->largest_started_sub_id;
DBUG_EXECUTE_IF("pause_for_ftwrl_wait", {
mysql_mutex_unlock(&e->LOCK_parallel_entry);
debug_sync_set_action(thd,
STRING_WITH_LEN("now "
"SIGNAL pause_ftwrl_waiting "
"WAIT_FOR pause_ftwrl_cont"));
mysql_mutex_lock(&e->LOCK_parallel_entry);
});
}
thd->set_time_for_next_stage();
thd->ENTER_COND(&e->COND_parallel_entry, &e->LOCK_parallel_entry,
&stage_waiting_for_ftwrl_threads_to_pause, &old_stage);
while (e->pause_sub_id < (uint64)ULONGLONG_MAX &&
e->last_committed_sub_id < e->pause_sub_id &&
!err)
{
if (unlikely(thd->check_killed()))
{
err= 1;
break;
}
mysql_cond_wait(&e->COND_parallel_entry, &e->LOCK_parallel_entry);
};
--e->need_sub_id_signal;
thd->EXIT_COND(&old_stage);
if (err)
break;
/*
Notify any source any domain waiting-for-master Start-Alter to give way.
*/
Master_info *mi= e->rli->mi;
bool found= false;
for (uint i= 0; i < mi_arr.elements() && !found; i++)
found= mi_arr.at(i) == mi;
if (!found)
{
mi_arr.append(mi);
start_alter_info *info=NULL;
mysql_mutex_lock(&mi->start_alter_list_lock);
List_iterator<start_alter_info> info_iterator(mi->start_alter_list);
while ((info= info_iterator++))
{
mysql_mutex_lock(&mi->start_alter_lock);
DBUG_ASSERT(info->state == start_alter_state::REGISTERED);
info->state= start_alter_state::ROLLBACK_ALTER;
info->direct_commit_alter= true;
mysql_cond_broadcast(&info->start_alter_cond);
mysql_mutex_unlock(&mi->start_alter_lock);
}
mysql_mutex_unlock(&mi->start_alter_list_lock);
}
}
if (err)
rpl_unpause_after_ftwrl(thd);
DBUG_RETURN(err);
}
#ifndef DBUG_OFF
static int
dbug_simulate_tmp_error(rpl_group_info *rgi, THD *thd)
{
if (rgi->current_gtid.domain_id == 0 && rgi->current_gtid.seq_no == 100 &&
rgi->retry_event_count == 4)
{
thd->clear_error();
thd->get_stmt_da()->reset_diagnostics_area();
my_error(ER_LOCK_DEADLOCK, MYF(0));
return 1;
}
return 0;
}
#endif
/*
If we detect a deadlock due to eg. storage engine locks that conflict with
the fixed commit order, then the later transaction will be killed
asynchroneously to allow the former to complete its commit.
In this case, we convert the 'killed' error into a deadlock error, and retry
the later transaction.
If we are doing optimistic parallel apply of transactions not known to be
safe, we convert any error to a deadlock error, but then at retry we will
wait for prior transactions to commit first, so that the retries can be
done non-speculative.
*/
static void
convert_kill_to_deadlock_error(rpl_group_info *rgi)
{
THD *thd= rgi->thd;
int err_code;
if (!thd->get_stmt_da()->is_error())
return;
err_code= thd->get_stmt_da()->sql_errno();
if ((rgi->speculation == rpl_group_info::SPECULATE_OPTIMISTIC &&
err_code != ER_PRIOR_COMMIT_FAILED) ||
((err_code == ER_QUERY_INTERRUPTED || err_code == ER_CONNECTION_KILLED) &&
rgi->killed_for_retry))
{
thd->clear_error();
my_error(ER_LOCK_DEADLOCK, MYF(0));
thd->reset_killed();
}
}
/*
Check if an event marks the end of an event group. Returns non-zero if so,
zero otherwise.
In addition, returns 1 if the group is committing, 2 if it is rolling back.
*/
static int
is_group_ending(Log_event *ev, Log_event_type event_type)
{
if (event_type == XID_EVENT || event_type == XA_PREPARE_LOG_EVENT)
return 1;
if (event_type == QUERY_EVENT) // COMMIT/ROLLBACK are never compressed
{
Query_log_event *qev = (Query_log_event *)ev;
if (qev->is_commit() ||
!strncmp(qev->query, STRING_WITH_LEN("XA COMMIT")) ||
!strncmp(qev->query, STRING_WITH_LEN("XA ROLLBACK")))
return 1;
if (qev->is_rollback())
return 2;
}
return 0;
}
static int
retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
rpl_parallel_thread::queued_event *orig_qev)
{
IO_CACHE rlog;
LOG_INFO linfo;
File fd= (File)-1;
const char *errmsg;
inuse_relaylog *ir= rgi->relay_log;
uint64 event_count;
uint64 events_to_execute= rgi->retry_event_count;
Relay_log_info *rli= rgi->rli;
int err;
ulonglong cur_offset, old_offset;
char log_name[FN_REFLEN];
THD *thd= rgi->thd;
rpl_parallel_entry *entry= rgi->parallel_entry;
ulong retries= 0;
Format_description_log_event *description_event= NULL;
do_retry:
event_count= 0;
err= 0;
errmsg= NULL;
#ifdef WITH_WSREP
DBUG_EXECUTE_IF("sync.wsrep_retry_event_group", {
const char act[]= "now "
"SIGNAL sync.wsrep_retry_event_group_reached "
"WAIT_FOR signal.wsrep_retry_event_group";
debug_sync_set_action(thd, STRING_WITH_LEN(act));
};);
#endif /* WITH_WSREP */
/*
If we already started committing before getting the deadlock (or other
error) that caused us to need to retry, we have already signalled
subsequent transactions that we have started committing. This is
potentially a problem, as now we will rollback, and if subsequent
transactions would start to execute now, they could see an unexpected
state of the database and get eg. key not found or duplicate key error.
However, to get a deadlock in the first place, there must have been
another earlier transaction that is waiting for us. Thus that other
transaction has _not_ yet started to commit, and any subsequent
transactions will still be waiting at this point.
So here, we decrement back the count of transactions that started
committing (if we already incremented it), undoing the effect of an
earlier mark_start_commit(). Then later, when the retry succeeds and we
commit again, we can do a new mark_start_commit() and eventually wake up
subsequent transactions at the proper time.
We need to do the unmark before the rollback, to be sure that the
transaction we deadlocked with will not signal that it started to commit
until after the unmark.
*/
DBUG_EXECUTE_IF("inject_mdev8302", { my_sleep(20000);});
rgi->unmark_start_commit();
DEBUG_SYNC(thd, "rpl_parallel_retry_after_unmark");
/*
We might get the deadlock error that causes the retry during commit, while
sitting in wait_for_prior_commit(). If this happens, we will have a
pending error in the wait_for_commit object. So clear this by
unregistering (and later re-registering) the wait.
*/
if(thd->wait_for_commit_ptr)
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
DBUG_EXECUTE_IF("inject_mdev8031", {
/* Simulate that we get deadlock killed at this exact point. */
slave_background_kill_request(thd);
});
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("rpl_parallel_simulate_wait_at_retry", {
if (rgi->current_gtid.seq_no == 1001) {
debug_sync_set_action(thd,
STRING_WITH_LEN("rpl_parallel_simulate_wait_at_retry WAIT_FOR proceed_by_1001"));
}
DEBUG_SYNC(thd, "rpl_parallel_simulate_wait_at_retry");
});
#endif
/*
We are still applying the event group, even though we will roll it back
and retry it. So for --gtid-ignore-duplicates, keep ownership of the
domain during the retry so another master connection will not try to take
over and duplicate apply the same event group (MDEV-33475).
*/
rgi->cleanup_context(thd, 1, 1 /* keep_domain_owner */);
wait_for_pending_deadlock_kill(thd, rgi);
thd->reset_killed();
thd->clear_error();
rgi->killed_for_retry = rpl_group_info::RETRY_KILL_NONE;
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("hold_worker2_favor_worker3", {
if (rgi->current_gtid.seq_no == 2003) {
debug_sync_set_action(thd,
STRING_WITH_LEN("now WAIT_FOR cont_worker3"));
}
});
#endif
/*
If we retry due to a deadlock kill that occurred during the commit step, we
might have already updated (but not committed) an update of table
mysql.gtid_slave_pos, and cleared the gtid_pending flag. Now we have
rolled back any such update, so we must set the gtid_pending flag back to
true so that we will do a new update when/if we succeed with the retry.
*/
rgi->gtid_pending= true;
mysql_mutex_lock(&rli->data_lock);
++rli->retried_trans;
++rpt->last_trans_retry_count;
statistic_increment(slave_retried_transactions, LOCK_status);
mysql_mutex_unlock(&rli->data_lock);
for (;;)
{
mysql_mutex_lock(&entry->LOCK_parallel_entry);
if (rgi->gtid_sub_id < entry->stop_on_error_sub_id ||
DBUG_IF("simulate_mdev_12746"))
{
register_wait_for_prior_event_group_commit(rgi, entry);
}
else
{
/*
A failure of a preceding "parent" transaction may not be
seen by the current one through its own worker_error.
Such induced error gets set by ourselves now.
*/
err= rgi->worker_error= 1;
my_error(ER_PRIOR_COMMIT_FAILED, MYF(0));
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
goto err;
}
mysql_mutex_unlock(&entry->LOCK_parallel_entry);
/*
Let us wait for all prior transactions to complete before trying again.
This way, we avoid repeatedly conflicting with and getting deadlock
killed by the same earlier transaction.
*/
if (!(err= thd->wait_for_prior_commit()))
{
rgi->speculation = rpl_group_info::SPECULATE_WAIT;
break;
}
convert_kill_to_deadlock_error(rgi);
if (!has_temporary_error(thd))