/
SQLiteNode.cpp
2280 lines (2117 loc) · 109 KB
/
SQLiteNode.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <libstuff/libstuff.h>
#include "SQLiteNode.h"
#include "SQLiteServer.h"
#include "SQLiteCommand.h"
// Introduction
// ------------
// SQLiteNode builds atop STCPNode and SQLite to provide a distributed transactional SQL database. The STCPNode base
// class establishes and maintains connections with all peers: if any connection fails, it forever attempts to
// re-establish. This frees the SQLiteNode layer to focus on the high-level distributed database state machine.
//
// FIXME: Handle the case where two nodes have conflicting databases. Should find where they fork, tag the affected
// accounts for manual review, and adopt the higher-priority database.
//
// FIXME: Master should detect whether any slaves fall out of sync for any reason, identify/tag affected accounts, and
// re-synchronize.
//
// FIXME: Add test to measure how long it takes for master to stabilize.
//
// FIXME: If master dies before sending ESCALATE_RESPONSE (or if slave dies before receiving it), then a command might
// have been committed to the database without notifying whoever initiated it. Perhaps have the caller identify
// each command with a unique command id, and verify inside the query that the command hasn't been executed yet?
#undef SLOGPREFIX
// Every log line from this file is prefixed with "{nodeName/STATE} " so it's clear which node and
// which FSM state produced it.
#define SLOGPREFIX "{" << name << "/" << SQLiteNode::stateNames[_state] << "} "
// Initializations for static vars.
// Default receive timeout for peer sockets (5 minutes of silence before we give up on a peer).
const uint64_t SQLiteNode::SQL_NODE_DEFAULT_RECV_TIMEOUT = STIME_US_PER_M * 5;
// While synchronizing we expect a steady stream of data, so we use a much shorter (1 minute) timeout.
const uint64_t SQLiteNode::SQL_NODE_SYNCHRONIZING_RECV_TIMEOUT = STIME_US_PER_M;
// Flag set when commits have been made (by other threads) that we haven't yet replicated to peers;
// _sendOutstandingTransactions() consumes and clears it.
atomic<bool> SQLiteNode::unsentTransactions(false);
// The highest transaction ID we've already broadcast to subscribed peers.
uint64_t SQLiteNode::_lastSentTransactionID = 0;
// Human-readable names for each node state; indexed by the state enum, so order matters.
const string SQLiteNode::stateNames[] = {"SEARCHING",
                                         "SYNCHRONIZING",
                                         "WAITING",
                                         "STANDINGUP",
                                         "MASTERING",
                                         "STANDINGDOWN",
                                         "SUBSCRIBING",
                                         "SLAVING"};
// Human-readable names for each write-consistency level; indexed by the ConsistencyLevel enum.
const string SQLiteNode::consistencyLevelNames[] = {"ASYNC",
                                                    "ONE",
                                                    "QUORUM"};
// Constructs a node, initializes its FSM in the SEARCHING state, and registers every peer listed in
// `peerList` (a comma-separated list of URIs, each optionally carrying params such as "nodeName").
SQLiteNode::SQLiteNode(SQLiteServer& server, SQLite& db, const string& name, const string& host,
                       const string& peerList, int priority, uint64_t firstTimeout, const string& version,
                       int quorumCheckpoint)
    : STCPNode(name, host, max(SQL_NODE_DEFAULT_RECV_TIMEOUT, SQL_NODE_SYNCHRONIZING_RECV_TIMEOUT)),
      _db(db), _commitState(CommitState::UNINITIALIZED), _server(server), _stateChangeCount(0)
{
    SASSERT(priority >= 0);
    _priority = priority;
    _state = SEARCHING;
    _syncPeer = nullptr;
    _masterPeer = nullptr;
    _stateTimeout = STimeNow() + firstTimeout;
    _version = version;
    _commitsSinceCheckpoint = 0;
    _quorumCheckpoint = quorumCheckpoint;
    // Kick off the state machine.
    _changeState(SEARCHING);
    // Register each configured peer. A peer's node name defaults to the domain of its host, unless an
    // explicit "nodeName" param was supplied in the peer URI.
    for (const string& peerEntry : SParseList(peerList)) {
        string peerHost;
        STable peerParams;
        SASSERT(SParseURIPath(peerEntry, peerHost, peerParams));
        auto nameIt = peerParams.find("nodeName");
        const string peerName = (nameIt != peerParams.end()) ? nameIt->second : SGetDomain(peerHost);
        addPeer(peerName, peerHost, peerParams);
    }
}
SQLiteNode::~SQLiteNode() {
    // Make sure it's a clean shutdown: warn (but don't abort) if we're destroyed while commands are
    // still escalated to a master, or while a commit is in progress — either implies lost work.
    SASSERTWARN(_escalatedCommandMap.empty());
    SASSERTWARN(!commitInProgress());
}
// Records a request to commit the current transaction at the given consistency level. This doesn't
// actually *do* anything: the next invocation of update() notices CommitState::WAITING and drives the
// actual distributed commit. It's only legal to call this when no other commit is in flight.
void SQLiteNode::startCommit(ConsistencyLevel consistency)
{
    const bool idle = (_commitState == CommitState::UNINITIALIZED)
                   || (_commitState == CommitState::SUCCESS)
                   || (_commitState == CommitState::FAILED);
    SASSERT(idle);
    _commitConsistency = consistency;
    _commitState = CommitState::WAITING;
}
void SQLiteNode::sendResponse(const SQLiteCommand& command)
{
Peer* peer = getPeerByID(command.initiatingPeerID);
SASSERT(peer);
// If it was a peer message, we don't need to wrap it in an escalation response.
SData escalate("ESCALATE_RESPONSE");
escalate["ID"] = command.id;
escalate.content = command.response.serialize();
_sendToPeer(peer, escalate);
}
// Starts a graceful shutdown with the given timeout (in microseconds). Calling this again while a
// graceful shutdown is already underway is a harmless no-op.
void SQLiteNode::beginShutdown(uint64_t usToWait) {
    if (gracefulShutdown()) {
        // Already shutting down; ignore the redundant request.
        return;
    }
    SINFO("Beginning graceful shutdown.");
    _gracefulShutdownTimeout.alarmDuration = usToWait;
    _gracefulShutdownTimeout.start();
}
// Returns true when no local work stands in the way of shutting down. Shutdown is blocked by any of:
// an open DB transaction, an in-flight distributed commit, or commands we've escalated to the master
// that are still awaiting responses.
bool SQLiteNode::_isNothingBlockingShutdown() {
    const bool blocked = _db.insideTransaction()
                      || commitInProgress()
                      || !_escalatedCommandMap.empty();
    return !blocked;
}
bool SQLiteNode::shutdownComplete() {
// First even see if we're shutting down
if (!gracefulShutdown())
return false;
// Next, see if we're timing out the graceful shutdown and killing non-gracefully
if (_gracefulShutdownTimeout.ringing()) {
SWARN("Graceful shutdown timed out, killing non gracefully.");
if (_escalatedCommandMap.size()) {
SWARN("Abandoned " << _escalatedCommandMap.size() << " escalated commands.");
for (auto& commandPair : _escalatedCommandMap) {
commandPair.second.response.methodLine = "500 Abandoned";
commandPair.second.complete = true;
_server.acceptCommand(move(commandPair.second), false);
}
_escalatedCommandMap.clear();
}
_changeState(SEARCHING);
return true;
}
// Not complete unless we're SEARCHING, SYNCHRONIZING, or WAITING
if (_state > WAITING) {
// Not in a shutdown state
SINFO("Can't graceful shutdown yet because state="
<< SQLiteNode::stateNames[_state] << ", commitInProgress=" << commitInProgress()
<< ", escalated=" << _escalatedCommandMap.size());
// If we end up with anything left in the escalated command map when we're trying to shut down, let's log it,
// so we can try and diagnose what's happening.
if (!_escalatedCommandMap.empty()) {
for (auto& cmd : _escalatedCommandMap) {
string name = cmd.first;
SQLiteCommand& command = cmd.second;
int64_t created = command.request.calcU64("commandExecuteTime");
int64_t elapsed = STimeNow() - created;
double elapsedSeconds = (double)elapsed / STIME_US_PER_S;
SINFO("Escalated command remaining at shutdown(" << name << "): " << command.request.methodLine
<< ". Created: " << command.request["commandExecuteTime"] << " (" << elapsedSeconds << "s ago)");
}
}
return false;
}
// If we have unsent data, not done
for (auto peer : peerList) {
if (peer->s && !peer->s->sendBufferEmpty()) {
// Still sending data
SINFO("Can't graceful shutdown yet because unsent data to peer '" << peer->name << "'");
return false;
}
}
// Finally, make sure nothing is blocking shutdown
if (_isNothingBlockingShutdown()) {
// Yes!
SINFO("Graceful shutdown is complete");
return true;
} else {
// Not done yet
SINFO("Can't graceful shutdown yet because waiting on commands: commitInProgress="
<< commitInProgress() << ", escalated=" << _escalatedCommandMap.size());
return false;
}
}
// Replicates to subscribed peers any transactions that were committed locally (by other threads) but
// not yet broadcast. Called from update() only when no distributed commit is in progress, so these
// broadcasts can't interleave with an in-flight transaction. Holds the global commit lock throughout.
void SQLiteNode::_sendOutstandingTransactions() {
    SQLITE_COMMIT_AUTOLOCK;
    // Make sure we have something to do.
    if (!unsentTransactions.load()) {
        return;
    }
    auto transactions = _db.getCommittedTransactions();
    for (auto& i : transactions) {
        uint64_t id = i.first;
        // Skip anything we've already broadcast.
        if (id <= _lastSentTransactionID) {
            continue;
        }
        string& query = i.second.first;
        string& hash = i.second.second;
        // Replicate as an ASYNC transaction: peers apply it without an approval round-trip.
        SData transaction("BEGIN_TRANSACTION");
        transaction["Command"] = "ASYNC";
        transaction["NewCount"] = to_string(id);
        transaction["NewHash"] = hash;
        transaction["ID"] = "ASYNC_" + to_string(id);
        transaction.content = query;
        _sendToAllPeers(transaction, true); // subscribed only
        for (auto peer : peerList) {
            // Clear the response flag from the last transaction
            (*peer)["TransactionResponse"].clear();
        }
        // Follow immediately with the matching commit; no approval is awaited for ASYNC.
        SData commit("COMMIT_TRANSACTION");
        commit["ID"] = transaction["ID"];
        commit["CommitCount"] = transaction["NewCount"];
        commit["Hash"] = hash;
        _sendToAllPeers(commit, true); // subscribed only
        _lastSentTransactionID = id;
        // Commits made by other threads are implicitly not quorum commits. We'll update our counter.
        _commitsSinceCheckpoint++;
    }
    // All caught up; clear the flag so the next caller can return early.
    unsentTransactions.store(false);
}
// Escalates a command to the current master for processing. If `forget` is set, the command is sent
// fire-and-forget; otherwise it's recorded in _escalatedCommandMap so the eventual ESCALATE_RESPONSE
// can be matched back up. If the master is standing down, the command is returned to the server to retry.
void SQLiteNode::escalateCommand(SQLiteCommand&& command, bool forget) {
    // We must have a master to escalate to. BUG FIX: this assert previously ran *after* _masterPeer was
    // dereferenced below, so a null master was a raw null dereference instead of a diagnosed assert.
    SASSERT(_masterPeer);
    // If the master is currently standing down, we won't escalate, we'll give the command back to the caller.
    if ((*_masterPeer)["State"] == "STANDINGDOWN") {
        SINFO("Asked to escalate command but master standing down, letting server retry.");
        _server.acceptCommand(move(command), false);
        return;
    }
    // Send this to the MASTER
    SASSERTEQUALS((*_masterPeer)["State"], "MASTERING");
    uint64_t elapsed = STimeNow() - command.request.calcU64("commandExecuteTime");
    SINFO("Escalating '" << command.request.methodLine << "' (" << command.id << ") to MASTER '" << _masterPeer->name
          << "' after " << elapsed / 1000 << " ms");
    // Create a command to send to our master.
    SData escalate("ESCALATE");
    escalate["ID"] = command.id;
    escalate.content = command.request.serialize();
    // Store the command as escalated, unless we intend to forget about it anyway.
    if (forget) {
        SINFO("Firing and forgetting command '" << command.request.methodLine << "' to master.");
    } else {
        command.escalationTimeUS = STimeNow();
        _escalatedCommandMap.emplace(command.id, move(command));
    }
    // And send to master.
    _sendToPeer(_masterPeer, escalate);
}
// Returns the request method line of every command currently escalated to the master, in map order.
list<string> SQLiteNode::getEscalatedCommandRequestMethodLines() {
    list<string> methodLines;
    for (const auto& commandPair : _escalatedCommandMap) {
        methodLines.push_back(commandPair.second.request.methodLine);
    }
    return methodLines;
}
// --------------------------------------------------------------------------
// State Machine
// --------------------------------------------------------------------------
// Here is a simplified state diagram showing the major state transitions:
//
// SEARCHING
// |
// SYNCHRONIZING
// |
// WAITING
// ___________/ \____________
// | |
// STANDINGUP SUBSCRIBING
// | |
// MASTERING SLAVING
// | |
// STANDINGDOWN |
// |___________ ____________|
// \ /
// SEARCHING
//
// In short, every node starts out in the SEARCHING state, where it simply tries
// to establish all its peer connections. Once done, each node SYNCHRONIZES with
// the freshest peer, meaning they download whatever "commits" they are
// missing. Then they WAIT until the highest priority node "stands up" to become
// the new "master". All other nodes then SUBSCRIBE and become "slaves". If the
// master "stands down", then all slaves unsubscribe and everybody goes back into
// the SEARCHING state and tries it all over again.
//
//
// State Transitions
// -----------------
// Each state transitions according to the following events and operates as follows:
bool SQLiteNode::update() {
// Process the database state machine
switch (_state) {
/// - SEARCHING: Wait for a period and try to connect to all known
/// peers. After a timeout, give up and go ahead with whoever
/// we were able to successfully connect to -- if anyone. The
/// logic for this state is as follows:
///
/// if( no peers configured ) goto MASTERING
/// if( !timeout ) keep waiting
/// if( no peers connected ) goto MASTERING
/// if( nobody has more commits than us ) goto WAITING
/// else send SYNCHRONIZE and goto SYNCHRONIZING
///
case SEARCHING: {
SASSERTWARN(!_syncPeer);
SASSERTWARN(!_masterPeer);
SASSERTWARN(_db.getUncommittedHash().empty());
// If we're trying to shut down, just do nothing
if (shutdownComplete())
return false; // Don't re-update
// If no peers, we're the master, unless we're shutting down.
if (peerList.empty()) {
// There are no peers, jump straight to mastering
SHMMM("No peers configured, jumping to MASTERING");
_changeState(MASTERING);
return true; // Re-update immediately
}
// How many peers have we logged in to?
int numFullPeers = 0;
int numLoggedInFullPeers = 0;
Peer* freshestPeer = nullptr;
for (auto peer : peerList) {
// Wait until all connected (or failed) and logged in
bool permaSlave = peer->params["Permaslave"] == "true";
bool loggedIn = peer->test("LoggedIn");
// Count how many full peers (non-permaslaves) we have
numFullPeers += !permaSlave;
// Count how many full peers are logged in
numLoggedInFullPeers += (!permaSlave) && loggedIn;
// Find the freshest peer
if (loggedIn) {
// The freshest peer is the one that has the most commits.
if (!freshestPeer || peer->calcU64("CommitCount") > freshestPeer->calcU64("CommitCount")) {
freshestPeer = peer;
}
}
}
// Keep searching until we connect to at least half our non-permaslave peers OR timeout
SINFO("Signed in to " << numLoggedInFullPeers << " of " << numFullPeers << " full peers (" << peerList.size()
<< " with permaslaves), timeout in " << (_stateTimeout - STimeNow()) / 1000
<< "ms");
if (((float)numLoggedInFullPeers < numFullPeers / 2.0) && (STimeNow() < _stateTimeout))
return false;
// We've given up searching; did we time out?
if (STimeNow() >= _stateTimeout)
SHMMM("Timeout SEARCHING for peers, continuing.");
// If no freshest (not connected to anyone), wait
if (!freshestPeer) {
// Unable to connect to anyone
SHMMM("Unable to connect to any peer, WAITING.");
_changeState(WAITING);
return true; // Re-update
}
// How does our state compare with the freshest peer?
SASSERT(freshestPeer);
uint64_t freshestPeerCommitCount = freshestPeer->calcU64("CommitCount");
if (freshestPeerCommitCount == _db.getCommitCount()) {
// We're up to date
SINFO("Synchronized with the freshest peer '" << freshestPeer->name << "', WAITING.");
_changeState(WAITING);
return true; // Re-update
}
// Are we fresher than the freshest peer?
if (freshestPeerCommitCount < _db.getCommitCount()) {
// Looks like we're the freshest peer overall
SINFO("We're the freshest peer, WAITING.");
_changeState(WAITING);
return true; // Re-update
}
// It has a higher commit count than us, synchronize.
SASSERT(freshestPeerCommitCount > _db.getCommitCount());
SASSERTWARN(!_syncPeer);
_updateSyncPeer();
if (_syncPeer) {
_sendToPeer(_syncPeer, SData("SYNCHRONIZE"));
} else {
SWARN("Updated to NULL _syncPeer when about to send SYNCHRONIZE. Going to WAITING.");
_changeState(WAITING);
return true; // Re-update
}
_changeState(SYNCHRONIZING);
return true; // Re-update
}
/// - SYNCHRONIZING: We only stay in this state while waiting for
/// the SYNCHRONIZE_RESPONSE. When we receive it, we'll enter
/// the WAITING state. Alternately, give up waiting after a
/// period and go SEARCHING.
///
case SYNCHRONIZING: {
SASSERTWARN(_syncPeer);
SASSERTWARN(!_masterPeer);
SASSERTWARN(_db.getUncommittedHash().empty());
// Nothing to do but wait
if (STimeNow() > _stateTimeout) {
// Give up on synchronization; reconnect that peer and go searching
SHMMM("Timed out while waiting for SYNCHRONIZE_RESPONSE, searching.");
_reconnectPeer(_syncPeer);
_syncPeer = nullptr;
_changeState(SEARCHING);
return true; // Re-update
}
break;
}
/// - WAITING: As the name implies, wait until something happens. The
/// logic for this state is as follows:
///
/// loop across "LoggedIn" peers to find the following:
/// - freshest peer (most commits)
/// - highest priority peer
/// - current master (might be STANDINGUP or STANDINGDOWN)
/// if( no peers logged in )
/// goto SEARCHING
/// if( a higher-priority MASTERING master exists )
/// send SUBSCRIBE and go SUBSCRIBING
/// if( the freshest peer has more commits than us )
/// goto SEARCHING
/// if( no master and we're the highest priority )
/// clear "StandupResponse" on all peers
/// goto STANDINGUP
///
case WAITING: {
SASSERTWARN(!_syncPeer);
SASSERTWARN(!_masterPeer);
SASSERTWARN(_db.getUncommittedHash().empty());
SASSERTWARN(_escalatedCommandMap.empty());
// If we're trying and ready to shut down, do nothing.
if (gracefulShutdown()) {
// Do we have an outstanding command?
if (1/* TODO: Commit in progress? */) {
// Nope! Let's just halt the FSM here until we shutdown so as to
// avoid potential confusion. (Technically it would be fine to continue
// the FSM, but it makes the logs clearer to just stop here.)
SINFO("Graceful shutdown underway and no queued commands, do nothing.");
return false; // No fast update
} else {
// We do have outstanding commands, even though a graceful shutdown
// has been requested. This is probably due to us previously being a master
// to which commands had been sent directly -- we got the signal to shutdown,
// and stood down immediately. All the slaves will re-escalate whatever
// commands they were waiting on us to process, so they're fine. But our own
// commands still need to be processed. We're no longer the master, so we
// can't do it. Rather, even though we're trying to do a graceful shutdown,
// we need to find and slave to the new master, and have it process our
// commands. Once the new master has processed our commands, then we can
// shut down gracefully.
SHMMM("Graceful shutdown underway but queued commands so continuing...");
}
}
// Loop across peers and find the highest priority and master
int numFullPeers = 0;
int numLoggedInFullPeers = 0;
Peer* highestPriorityPeer = nullptr;
Peer* freshestPeer = nullptr;
Peer* currentMaster = nullptr;
for (auto peer : peerList) {
// Make sure we're a full peer
if (peer->params["Permaslave"] != "true") {
// Verify we're logged in
++numFullPeers;
if (SIEquals((*peer)["LoggedIn"], "true")) {
// Verify we're still fresh
++numLoggedInFullPeers;
if (!freshestPeer || peer->calcU64("CommitCount") > freshestPeer->calcU64("CommitCount"))
freshestPeer = peer;
// See if it's the highest priority
if (!highestPriorityPeer || peer->calc("Priority") > highestPriorityPeer->calc("Priority"))
highestPriorityPeer = peer;
// See if it is currently the master (or standing up/down)
const string& peerState = (*peer)["State"];
if (SIEquals(peerState, "STANDINGUP") || SIEquals(peerState, "MASTERING") ||
SIEquals(peerState, "STANDINGDOWN")) {
// Found the current master
if (currentMaster)
PHMMM("Multiple peers trying to stand up (also '" << currentMaster->name
<< "'), let's hope they sort it out.");
currentMaster = peer;
}
}
}
}
// If there are no logged in peers, then go back to SEARCHING.
if (!highestPriorityPeer) {
// Not connected to any other peers
SHMMM("Configured to have peers but can't connect to any, re-SEARCHING.");
_changeState(SEARCHING);
return true; // Re-update
}
SASSERT(highestPriorityPeer);
SASSERT(freshestPeer);
// If there is already a master that is higher priority than us,
// subscribe -- even if we're not in sync with it. (It'll bring
// us back up to speed while subscribing.)
if (currentMaster && _priority < highestPriorityPeer->calc("Priority") &&
SIEquals((*currentMaster)["State"], "MASTERING")) {
// Subscribe to the master
SINFO("Subscribing to master '" << currentMaster->name << "'");
_masterPeer = currentMaster;
_masterVersion = (*_masterPeer)["Version"];
_sendToPeer(currentMaster, SData("SUBSCRIBE"));
_changeState(SUBSCRIBING);
return true; // Re-update
}
// No master to subscribe to, let's see if there's anybody else
// out there with commits we don't have. Might as well synchronize
// while waiting.
if (freshestPeer->calcU64("CommitCount") > _db.getCommitCount()) {
// Out of sync with a peer -- resynchronize
SHMMM("Lost synchronization while waiting; re-SEARCHING.");
_changeState(SEARCHING);
return true; // Re-update
}
// No master and we're in sync, perhaps everybody is waiting for us
// to stand up? If we're higher than the highest priority, and are
// connected to enough full peers to achieve quorum we should be
// master.
if (!currentMaster && numLoggedInFullPeers * 2 >= numFullPeers &&
_priority > highestPriorityPeer->calc("Priority")) {
// Yep -- time for us to stand up -- clear everyone's
// last approval status as they're about to send them.
SASSERT(_priority > 0); // Permaslave should never stand up
SINFO("No master and we're highest priority (over " << highestPriorityPeer->name << "), STANDINGUP");
for (auto peer : peerList) {
peer->erase("StandupResponse");
}
_changeState(STANDINGUP);
return true; // Re-update
}
// Keep waiting
SDEBUG("Connected to " << numLoggedInFullPeers << " of " << numFullPeers << " full peers (" << peerList.size()
<< " with permaslaves), priority=" << _priority);
break;
}
/// - STANDINGUP: We're waiting for peers to approve or deny our standup
/// request. The logic for this state is:
///
/// if( at least one peer has denied standup )
/// goto SEARCHING
/// if( everybody has responded and approved )
/// goto MASTERING
/// if( somebody hasn't responded but we're timing out )
/// goto SEARCHING
///
case STANDINGUP: {
SASSERTWARN(!_syncPeer);
SASSERTWARN(!_masterPeer);
SASSERTWARN(_db.getUncommittedHash().empty());
// Wait for everyone to respond
bool allResponded = true;
int numFullPeers = 0;
int numLoggedInFullPeers = 0;
if (gracefulShutdown()) {
SINFO("Shutting down while standing up, setting state to SEARCHING");
_changeState(SEARCHING);
return true; // Re-update
}
for (auto peer : peerList) {
// Check this peer; if not logged in, tacit approval
if (peer->params["Permaslave"] != "true") {
++numFullPeers;
if (SIEquals((*peer)["LoggedIn"], "true")) {
// Connected and logged in.
numLoggedInFullPeers++;
// Has it responded yet?
if (!peer->isSet("StandupResponse")) {
// At least one logged in full peer hasn't responded
allResponded = false;
} else if (!SIEquals((*peer)["StandupResponse"], "approve")) {
// It responded, but didn't approve -- abort
PHMMM("Refused our STANDUP (" << (*peer)["Reason"] << "), cancel and RESEARCH");
_changeState(SEARCHING);
return true; // Re-update
}
}
}
}
// If everyone's responded with approval and we form a majority, then finish standup.
bool majorityConnected = numLoggedInFullPeers * 2 >= numFullPeers;
if (allResponded && majorityConnected) {
// Complete standup
SINFO("All peers approved standup, going MASTERING.");
_changeState(MASTERING);
return true; // Re-update
}
// See if we're taking too long
if (STimeNow() > _stateTimeout) {
// Timed out
SHMMM("Timed out waiting for STANDUP approval; reconnect all and re-SEARCHING.");
_reconnectAll();
_changeState(SEARCHING);
return true; // Re-update
}
break;
}
/// - MASTERING / STANDINGDOWN : These are the states where the magic
/// happens. In both states, the node will execute distributed
/// transactions. However, new transactions are only
/// started in the MASTERING state (while existing transactions are
/// concluded in the STANDINGDOWN) state. The logic for this state
/// is as follows:
///
/// if( we're processing a transaction )
/// if( all subscribed slaves have responded/approved )
/// commit this transaction to the local DB
/// broadcast COMMIT_TRANSACTION to all subscribed slaves
/// send a STATE to show we've committed a new transaction
/// notify the caller that the command is complete
/// if( we're MASTERING and not processing a command )
/// if( there is another MASTER ) goto STANDINGDOWN
/// if( there is a higher priority peer ) goto STANDINGDOWN
/// if( a command is queued )
/// if( processing the command affects the database )
/// clear the TransactionResponse of all peers
/// broadcast BEGIN_TRANSACTION to subscribed slaves
/// if( we're standing down and all slaves have unsubscribed )
/// goto SEARCHING
///
case MASTERING:
case STANDINGDOWN: {
SASSERTWARN(!_syncPeer);
SASSERTWARN(!_masterPeer);
// NOTE: This block very carefully will not try and call _changeState() while holding SQLite::g_commitLock,
// because that could cause a deadlock when called by an outside caller!
// If there's no commit in progress, we'll send any outstanding transactions that exist. We won't send them
// mid-commit, as they'd end up as nested transactions interleaved with the one in progress.
if (!commitInProgress()) {
_sendOutstandingTransactions();
}
// This means we've started a distributed transaction and need to decide if we should commit it, which can mean
// waiting on peers to approve the transaction. We can do this even after we've begun standing down.
if (_commitState == CommitState::COMMITTING) {
// Loop across all peers configured to see how many are:
int numFullPeers = 0; // Num non-permaslaves configured
int numFullSlaves = 0; // Num full peers that are "subscribed"
int numFullResponded = 0; // Num full peers that have responded approve/deny
int numFullApproved = 0; // Num full peers that have approved
int numFullDenied = 0; // Num full peers that have denied
for (auto peer : peerList) {
// Check this peer to see if it's full or a permaslave
if (peer->params["Permaslave"] != "true") {
// It's a full peer -- is it subscribed, and if so, how did it respond?
++numFullPeers;
if ((*peer)["Subscribed"] == "true") {
// Subscribed, did it respond?
numFullSlaves++;
const string& response = (*peer)["TransactionResponse"];
if (response.empty()) {
continue;
}
numFullResponded++;
numFullApproved += SIEquals(response, "approve");
if (!SIEquals(response, "approve")) {
SWARN("Peer '" << peer->name << "' denied transaction.");
++numFullDenied;
} else {
SDEBUG("Peer '" << peer->name << "' has approved transaction.");
}
}
}
}
// Did we get a majority? This is important whether or not our consistency level needs it, as it will
// reset the checkpoint limit either way.
bool majorityApproved = (numFullApproved * 2 >= numFullPeers);
// Figure out if we have enough consistency
bool consistentEnough = false;
switch (_commitConsistency) {
case ASYNC:
// Always consistent enough if we don't care!
consistentEnough = true;
break;
case ONE:
// So long at least one full approved (if we have any peers, that is), we're good.
consistentEnough = !numFullPeers || (numFullApproved > 0);
break;
case QUORUM:
// This one requires a majority
consistentEnough = majorityApproved;
break;
default:
SERROR("Invalid write consistency.");
break;
}
// See if all active non-permaslaves have responded.
// NOTE: This can be true if nobody responds if there are no full slaves - this includes machines that
// should be slaves that are disconnected.
bool everybodyResponded = numFullResponded >= numFullSlaves;
// Record these for posterity
SDEBUG( "numFullPeers=" << numFullPeers
<< ", numFullSlaves=" << numFullSlaves
<< ", numFullResponded=" << numFullResponded
<< ", numFullApproved=" << numFullApproved
<< ", majorityApproved=" << majorityApproved
<< ", writeConsistency=" << consistencyLevelNames[_commitConsistency]
<< ", consistencyRequired=" << consistencyLevelNames[_commitConsistency]
<< ", consistentEnough=" << consistentEnough
<< ", everybodyResponded=" << everybodyResponded
<< ", commitsSinceCheckpoint=" << _commitsSinceCheckpoint);
// If anyone denied this transaction, roll this back. Alternatively, roll it back if everyone we're
// currently connected to has responded, but that didn't generate enough consistency. This could happen, in
// theory, if we were disconnected from enough of the cluster that we could no longer reach QUORUM, but
// this should have been detected earlier and forced us out of mastering.
// TODO: we might want to remove the `numFullDenied` condition here. A single failure shouldn't cause the
// entire cluster to break. Imagine a scenario where a slave disk was full, and every write operation
// failed with an sqlite3 error.
if (numFullDenied || (everybodyResponded && !consistentEnough)) {
// NOTE(review): when numFullDenied is non-zero this message is misleading -- the rollback was
// triggered by an explicit denial, not by "everybody responded but not consistent enough".
SINFO("Rolling back transaction because everybody currently connected responded "
"but not consistent enough. Num denied: " << numFullDenied << ". Slave write failure?");
// Undo our local, still-uncommitted transaction.
_db.rollback();
// Notify everybody to rollback. The ID matches the _lastSentTransactionID + 1 that was used when
// this transaction's BEGIN_TRANSACTION was broadcast.
SData rollback("ROLLBACK_TRANSACTION");
rollback.set("ID", _lastSentTransactionID + 1);
_sendToAllPeers(rollback, true); // true: Only to subscribed peers.
// Finished, but failed.
_commitState = CommitState::FAILED;
} else if (consistentEnough) {
// Commit this distributed transaction. Either we have quorum, or we don't need it.
SDEBUG("Committing current transaction because consistentEnough: " << _db.getUncommittedQuery());
uint64_t beforeCommit = STimeNow();
int result = _db.commit();
SINFO("SQLite::commit in SQLiteNode took " << ((STimeNow() - beforeCommit)/1000) << "ms.");
// SQLITE_BUSY_SNAPSHOT indicates a commit conflict: another writer committed since this
// transaction's snapshot was taken, so ours can't be applied.
if (result == SQLITE_BUSY_SNAPSHOT) {
_db.rollback();
// We already asked everyone to commit this (even if it was async), so we'll have to tell them to
// roll back.
SINFO("[performance] Conflict committing " << consistencyLevelNames[_commitConsistency]
<< " commit, rolling back.");
SData rollback("ROLLBACK_TRANSACTION");
rollback.set("ID", _lastSentTransactionID + 1);
_sendToAllPeers(rollback, true); // true: Only to subscribed peers.
// Finished, but failed.
_commitState = CommitState::FAILED;
} else {
// Hey, our commit succeeded! Record how long it took.
uint64_t beginElapsed, readElapsed, writeElapsed, prepareElapsed, commitElapsed, rollbackElapsed;
uint64_t totalElapsed = _db.getLastTransactionTiming(beginElapsed, readElapsed, writeElapsed,
prepareElapsed, commitElapsed, rollbackElapsed);
SINFO("Committed master transaction for '"
<< _db.getCommitCount() << " (" << _db.getCommittedHash() << "). "
<< _commitsSinceCheckpoint << " commits since quorum (consistencyRequired="
<< consistencyLevelNames[_commitConsistency] << "), " << numFullApproved << " of "
<< numFullPeers << " approved (" << peerList.size() << " total) in "
<< totalElapsed / 1000 << " ms ("
<< beginElapsed / 1000 << "+" << readElapsed / 1000 << "+"
<< writeElapsed / 1000 << "+" << prepareElapsed / 1000 << "+"
<< commitElapsed / 1000 << "+" << rollbackElapsed / 1000 << "ms)");
SINFO("[performance] Successfully committed " << consistencyLevelNames[_commitConsistency]
<< " transaction. Sending COMMIT_TRANSACTION to peers.");
SData commit("COMMIT_TRANSACTION");
commit.set("ID", _lastSentTransactionID + 1);
_sendToAllPeers(commit, true); // true: Only to subscribed peers.
// Clear the unsent transactions, we've sent them all (including this one); the return value is
// deliberately ignored.
_db.getCommittedTransactions();
// Update the last sent transaction ID to reflect that this is finished.
_lastSentTransactionID = _db.getCommitCount();
// If this was a quorum commit, we'll reset our counter, otherwise, we'll update it.
if (_commitConsistency == QUORUM) {
_commitsSinceCheckpoint = 0;
} else {
_commitsSinceCheckpoint++;
}
// Done!
_commitState = CommitState::SUCCESS;
}
} else {
// Not consistent enough, but not everyone's responded yet, so we'll wait.
SINFO("Waiting to commit. consistencyRequired=" << consistencyLevelNames[_commitConsistency]
<< ", commitsSinceCheckpoint=" << _commitsSinceCheckpoint);
// We're going to need to read from the network to finish this.
return false;
}
// We were committing, but now we're not. The only code path through here that doesn't reach this point
// is the 'return false' immediately above; everything else completes the transaction (even if it was
// a failed transaction), so we can safely unlock now.
SQLite::g_commitLock.unlock();
}
// If there's a transaction that's waiting, we'll start it. We do this *before* we check to see if we should
// stand down, and since we return true, we'll never stand down as long as we keep adding new transactions
// here. It's up to the server to stop giving us transactions to process if it wants us to stand down.
if (_commitState == CommitState::WAITING) {
// Lock the database. We'll unlock it when we complete in a future update cycle.
SQLite::g_commitLock.lock();
_commitState = CommitState::COMMITTING;
// Figure out how much consistency we need. Go with whatever the caller specified, unless we're over our
// checkpoint limit, in which case we escalate this commit to full QUORUM.
if (_commitsSinceCheckpoint >= _quorumCheckpoint) {
_commitConsistency = QUORUM;
}
SINFO("[performance] Beginning " << consistencyLevelNames[_commitConsistency] << " commit.");
// Now that we've grabbed the commit lock, we can safely clear out any outstanding transactions, no new
// ones can be added until we release the lock.
_sendOutstandingTransactions();
// We'll send the commit count to peers.
uint64_t commitCount = _db.getCommitCount();
// If there was nothing changed, then we shouldn't have anything to commit.
// Except that this is allowed right now.
// SASSERT(!_db.getUncommittedQuery().empty());
// There's no handling for a failed prepare. This should only happen if the DB has been corrupted or
// something catastrophic like that.
SASSERT(_db.prepare());
// Begin the distributed transaction
SData transaction("BEGIN_TRANSACTION");
SINFO("beginning distributed transaction for commit #" << commitCount + 1 << " ("
<< _db.getUncommittedHash() << ")");
transaction.set("NewCount", commitCount + 1);
transaction.set("NewHash", _db.getUncommittedHash());
// ASYNC transactions carry an "ASYNC_"-prefixed ID -- presumably so receivers can distinguish them
// from consistency-requiring transactions; confirm against the BEGIN_TRANSACTION handler.
if (_commitConsistency == ASYNC) {
transaction["ID"] = "ASYNC_" + to_string(_lastSentTransactionID + 1);
} else {
transaction.set("ID", _lastSentTransactionID + 1);
}
transaction.content = _db.getUncommittedQuery();
for (auto peer : peerList) {
// Clear the response flag from the last transaction
(*peer)["TransactionResponse"].clear();
}
// And send it to everyone who's subscribed.
uint64_t beforeSend = STimeNow();
_sendToAllPeers(transaction, true);
SINFO("SQLite::_sendToAllPeers in SQLiteNode took " << ((STimeNow() - beforeSend)/1000) << "ms.");
// We return `true` here to immediately re-update and thus commit this transaction immediately if it was
// asynchronous.
return true;
}
// Check to see if we should stand down. We'll finish any outstanding commits before we actually do.
if (_state == MASTERING) {
string standDownReason;
if (gracefulShutdown()) {
// Graceful shutdown. Set priority 1 and stand down so we'll re-connect to the new master and finish
// up our commands.
standDownReason = "Shutting down, setting priority 1 and STANDINGDOWN.";
_priority = 1;
} else {
// Loop across peers looking for a reason to stand down. Note that standDownReason is overwritten
// on each match, so if multiple peers qualify, the last one examined determines the logged reason.
for (auto peer : peerList) {
// Check this peer
if (SIEquals((*peer)["State"], "MASTERING")) {
// Hm... somehow we're in a multi-master scenario -- not good.
// Let's get out of this as soon as possible.
standDownReason = "Found another MASTER (" + peer->name + "), STANDINGDOWN to clean it up.";
} else if (SIEquals((*peer)["State"], "WAITING")) {
// We have a WAITING peer; is it waiting to STANDUP?
if (peer->calc("Priority") > _priority) {
// We've got a higher priority peer in the works; stand down so it can stand up.
standDownReason = "Found higher priority WAITING peer (" + peer->name
+ ") while MASTERING, STANDINGDOWN";
} else if (peer->calcU64("CommitCount") > _db.getCommitCount()) {
// It's got data that we don't, stand down so we can get it.
standDownReason = "Found WAITING peer (" + peer->name +
") with more data than us (we have " + SToStr(_db.getCommitCount()) +
"/" + _db.getCommittedHash() + ", it has " + (*peer)["CommitCount"] +
"/" + (*peer)["Hash"] + ") while MASTERING, STANDINGDOWN";
}
}
}
}
// Do we want to stand down, and can we?
if (!standDownReason.empty()) {
SHMMM(standDownReason);
_changeState(STANDINGDOWN);
SINFO("Standing down: " << standDownReason);
}
}
// At this point, we're no longer committing. We'll have returned false above, or we'll have completed any
// outstanding transaction, we can complete standing down if that's what we're doing.
if (_state == STANDINGDOWN) {
// See if we're done.
// We can only switch to SEARCHING if the server has no outstanding write work to do -- unless the
// stand-down timeout has rung, in which case we give up waiting on the server and switch anyway.
if (_standDownTimeOut.ringing()) {
SWARN("Timeout STANDINGDOWN, giving up on server and continuing.");
} else if (!_server.canStandDown()) {
// Try again on a later update cycle.
SINFO("Can't switch from STANDINGDOWN to SEARCHING yet, server prevented state change.");
return false;
}
// Standdown complete
SINFO("STANDDOWN complete, SEARCHING");
_changeState(SEARCHING);
// We're no longer waiting on responses from peers, we can re-update immediately and start becoming a
// slave node instead.
return true;
}
break;
}
/// - SUBSCRIBING: We're waiting for a SUBSCRIPTION_APPROVED from the
/// master. When we receive it, we'll go SLAVING. Otherwise, if we
/// timeout, go SEARCHING.
///
case SUBSCRIBING:
// While subscribing we should have no sync peer and no uncommitted local work, but a master must
// already have been chosen.
SASSERTWARN(!_syncPeer);
SASSERTWARN(_masterPeer);
SASSERTWARN(_db.getUncommittedHash().empty());
// Nothing to do but wait
if (STimeNow() > _stateTimeout) {
// Give up: drop the connection to the master, forget it, and start the search over.
SHMMM("Timed out waiting for SUBSCRIPTION_APPROVED, reconnecting to master and re-SEARCHING.");
_reconnectPeer(_masterPeer);
_masterPeer = nullptr;
_changeState(SEARCHING);
return true; // Re-update
}
break;
/// - SLAVING: This is where the other half of the magic happens. Most
/// nodes will (hopefully) spend 99.999% of their time in this state.
/// SLAVING nodes simply begin and commit transactions with the
/// following logic:
///
/// if( master steps down or disconnects ) goto SEARCHING
/// if( new queued commands ) send ESCALATE to master
///
case SLAVING:
// A slave must have a master, and should not be mid-synchronization.
SASSERTWARN(!_syncPeer);
SASSERT(_masterPeer);
// If graceful shutdown requested, stop slaving once there is
// nothing blocking shutdown. We stop listening for new commands