/
faulttest.go
174 lines (152 loc) · 4.02 KB
/
faulttest.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
// Copyright 2017 Factom Foundation
// Use of this source code is governed by the MIT
// license that can be found in the LICENSE file.
package engine
import (
"fmt"
"time"
"math/rand"
"os"
)
// waitToKill counts down a randomly chosen delay of 60 to 179 seconds and then
// sets *k to true. While counting down it writes the remaining time to stderr
// roughly every 30 seconds.
//
// The function blocks for the whole delay; faultTest runs it in its own
// goroutine and reads the flag from its main loop.
func waitToKill(k *bool) {
	const step = 30 // seconds between progress messages

	for remaining := rand.Int()%120 + 60; remaining > 0; remaining -= step {
		fmt.Fprintf(os.Stderr, " Will kill some servers in about %d seconds\n", remaining)

		// Sleep a full step, or only what is left on the final pass.
		nap := step
		if remaining < step {
			nap = remaining
		}
		time.Sleep(time.Duration(nap) * time.Second)
	}

	*k = true
}
// bringback waits a randomly chosen delay of 60 to 179 seconds and then turns
// the node's network back on by calling SetNetStateOff(false).
//
// The countdown is checked about every 30 seconds: if the node's net state is
// no longer off at one of those checks, bringback returns immediately without
// touching it. Each pass that keeps waiting logs the node name and the
// remaining time to stderr. The node may therefore come back before or after
// the rest of the network has reacted to its absence.
func bringback(f *FactomNode) {
	const step = 30 // seconds between checks

	for remaining := rand.Int()%120 + 60; remaining > 0; remaining -= step {
		if !f.State.GetNetStateOff() {
			// Already back online; nothing left to do.
			return
		}

		fmt.Fprintf(os.Stderr, " Bringing %s back in %d seconds.\n", f.State.FactomNodeName, remaining)

		// Sleep a full step, or only what is left on the final pass.
		nap := step
		if remaining < step {
			nap = remaining
		}
		time.Sleep(time.Duration(nap) * time.Second)
	}

	f.State.SetNetStateOff(false) // Bring this node back
}
// offlineReport writes a line to stderr every 20 seconds, for as long as
// *faulting is true, naming the nodes in fnodes whose net state is off.
// Nothing is written on a pass where the report carries no node names.
func offlineReport(faulting *bool) {
	const header = "Offline: "

	for *faulting {
		report := header
		for _, node := range fnodes {
			if !node.State.GetNetStateOff() {
				continue
			}
			report += fmt.Sprintf(" %s", node.State.FactomNodeName)
		}

		// Each offline node adds a space plus its name, so the report is
		// only printed once it has grown by more than one byte.
		if len(report) > len(header)+1 {
			fmt.Fprintln(os.Stderr, report)
		}

		time.Sleep(20 * time.Second)
	}
}
// faultTest runs the fault-injection loop for as long as *faulting is true.
//
// Each pass it:
//  1. collects the nodes in fnodes that are online, flagged as Leader, and not
//     behind the highest (height, minute) position seen so far;
//  2. advances the tracked height/minute to the furthest position any node
//     reports;
//  3. once every expected leader is healthy and the position has advanced,
//     starts a waitToKill countdown in a goroutine;
//  4. when that countdown has fired and the leaders are still all healthy,
//     turns off the network of randomly chosen leaders and starts a bringback
//     goroutine for each.
//
// It also starts offlineReport in the background. If fewer than three leaders
// are expected, it clears *faulting and returns.
func faultTest(faulting *bool) {
	var (
		armed     bool // set to true by waitToKill when its countdown ends
		countdown bool // true while a waitToKill countdown is outstanding
		expected  int  // largest FedServers count seen in a healthy leader's LeaderPL
		dbht      int  // highest LLeaderHeight seen so far
		minute    int  // highest CurrentMinute seen within dbht
		healthy   int  // leaders that passed the filter on the latest pass
	)

	go offlineReport(faulting)

	for *faulting {
		var leaders []*FactomNode
		prevHealthy := healthy
		healthy = 0

		// How many of the running nodes are leaders that are keeping up?
		for _, f := range fnodes {
			switch {
			case f.State.GetNetStateOff():
				continue
			case !f.State.Leader:
				continue
			case int(f.State.LLeaderHeight) < dbht:
				continue
			case int(f.State.LLeaderHeight) == dbht && int(f.State.CurrentMinute) < minute:
				continue
			}

			healthy++
			leaders = append(leaders, f)
			if pl := f.State.LeaderPL; pl != nil && len(pl.FedServers) > expected {
				expected = len(pl.FedServers)
			}
		}

		if prevHealthy != healthy {
			fmt.Fprintf(os.Stderr, "Of %d Leaders, we now have %d in working order.\n", expected, healthy)
		}

		// Advance the tracked position. Once any node reports a new height,
		// the minute is reset to 0 and stays there for the rest of this pass.
		prevDBHt, prevMinute := dbht, minute
		newBlock := false
		for _, f := range fnodes {
			if int(f.State.LLeaderHeight) > dbht {
				minute = 0
				dbht = int(f.State.LLeaderHeight)
				newBlock = true
			}
			if !newBlock && f.State.CurrentMinute > minute {
				minute = f.State.CurrentMinute
			}
		}

		// Start a countdown only when none is pending, every expected leader
		// is healthy, and the network has made progress since the last pass.
		progressed := dbht > prevDBHt || minute > prevMinute
		if !countdown && healthy >= expected && progressed {
			countdown = true
			go waitToKill(&armed)
		}

		// Can't run this test without at least three leaders.
		if expected < 3 {
			fmt.Fprint(os.Stderr, "Not enough leaders to run fault test\n")
			*faulting = false
			return
		}

		if !(armed && len(leaders) > 0 && healthy >= expected) {
			// Nothing to kill yet; poll again shortly.
			time.Sleep(1 * time.Second)
			continue
		}

		countdown = false
		armed = false

		// Wait some random amount of time (0 to 19 seconds) before striking.
		time.Sleep(time.Duration(rand.Int()%20) * time.Second)

		victims := 1
		limit := expected / 2
		if limit == 0 {
			limit = 1
		} else {
			victims = rand.Int()%limit + 1
		}
		// NOTE(review): the randomly drawn count above is discarded and
		// exactly one leader is killed per round. This looks like a
		// deliberate override; confirm before removing either half.
		victims = 1

		fmt.Fprintf(os.Stderr, "Killing %3d of %3d Leaders\n", victims, expected)

		// Draw random leaders until enough online ones have been turned off.
		for killed := 0; killed < victims; {
			target := leaders[rand.Int()%len(leaders)]
			if target.State.GetNetStateOff() {
				continue
			}

			fmt.Fprintf(os.Stderr, " >>>> Killing %10s %s\n",
				target.State.FactomNodeName,
				target.State.GetIdentityChainID().String()[4:16])
			target.State.SetNetStateOff(true)
			go bringback(target)
			killed++

			// Space the kills out by 0 to 39 seconds.
			time.Sleep(time.Duration(rand.Int()%40) * time.Second)
			totalServerFaults++
		}
	}
}