Skip to content

Commit 5e70797

Browse files
authored
chore: let testinfra try to connect again if ssh conn is lost (#1355)
* chore: let testinfra try to connect again if ssh conn is lost * fix: resolve file
1 parent 7d374fd commit 5e70797

File tree

1 file changed

+75
-40
lines changed

1 file changed

+75
-40
lines changed

testinfra/test_ami_nix.py

+75-40
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,14 @@
1212
from time import sleep
1313

1414
# if GITHUB_RUN_ID is not set, use a default value that includes the user and hostname
15-
RUN_ID = os.environ.get("GITHUB_RUN_ID", "unknown-ci-run-" + os.environ.get("USER", "unknown-user") + '@' + socket.gethostname())
16-
AMI_NAME = os.environ.get('AMI_NAME')
15+
RUN_ID = os.environ.get(
16+
"GITHUB_RUN_ID",
17+
"unknown-ci-run-"
18+
+ os.environ.get("USER", "unknown-user")
19+
+ "@"
20+
+ socket.gethostname(),
21+
)
22+
AMI_NAME = os.environ.get("AMI_NAME")
1723
postgresql_schema_sql_content = """
1824
ALTER DATABASE postgres SET "app.settings.jwt_secret" TO 'my_jwt_secret_which_is_not_so_secret';
1925
ALTER DATABASE postgres SET "app.settings.jwt_exp" TO 3600;
@@ -158,12 +164,12 @@
158164

159165
logger = logging.getLogger("ami-tests")
160166
handler = logging.StreamHandler()
161-
formatter = logging.Formatter(
162-
'%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
167+
formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s")
163168
handler.setFormatter(formatter)
164169
logger.addHandler(handler)
165170
logger.setLevel(logging.DEBUG)
166171

172+
167173
# scope='session' uses the same container for all the tests;
168174
# scope='function' uses a new container per test function.
169175
@pytest.fixture(scope="session")
@@ -232,7 +238,7 @@ def gzip_then_base64_encode(s: str) -> str:
232238
"Tags": [
233239
{"Key": "Name", "Value": "ci-ami-test-nix"},
234240
{"Key": "creator", "Value": "testinfra-ci"},
235-
{"Key": "testinfra-run-id", "Value": RUN_ID}
241+
{"Key": "testinfra-run-id", "Value": RUN_ID},
236242
],
237243
}
238244
],
@@ -264,48 +270,76 @@ def gzip_then_base64_encode(s: str) -> str:
264270
logger.warning("waiting for ssh to be available")
265271
sleep(10)
266272

267-
host = testinfra.get_host(
273+
def get_ssh_connection(instance_ip, ssh_identity_file, max_retries=10, retry_delay=5):
    """Return a testinfra host for ``ubuntu@instance_ip``, retrying on failure.

    Args:
        instance_ip: Address of the instance to reach over SSH.
        ssh_identity_file: Private key file passed to the paramiko backend.
        max_retries: Total number of attempts made before giving up.
        retry_delay: Seconds to sleep between two consecutive attempts.

    Returns:
        The host object produced by ``testinfra.get_host``.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (previously this
            fell through the loop and silently returned ``None``).
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    # NOTE(review): testinfra may defer opening the SSH session until the
    # first command is run -- confirm that connection failures actually
    # surface inside this try block.
    for attempt in range(1, max_retries + 1):
        try:
            return testinfra.get_host(
                # paramiko is an ssh backend
                f"paramiko://ubuntu@{instance_ip}?timeout=60",
                ssh_identity_file=ssh_identity_file,
            )
        except Exception:
            # Out of attempts: let the caller see the original error.
            if attempt == max_retries:
                raise
            logger.warning(
                "SSH connection attempt %d/%d failed, retrying in %ss ...",
                attempt,
                max_retries,
                retry_delay,
                exc_info=True,  # keep the failure cause in the CI log
            )
            sleep(retry_delay)
287+
288+
host = get_ssh_connection(
268289
# paramiko is an ssh backend
269-
f"paramiko://ubuntu@{instance.public_ip_address}?timeout=60",
270-
ssh_identity_file=temp_key.get_priv_key_file(),
290+
instance.public_ip_address,
291+
temp_key.get_priv_key_file(),
271292
)
272293

273-
def is_healthy(host) -> bool:
274-
cmd = host.run("sudo -u postgres /usr/bin/pg_isready -U postgres")
275-
if cmd.failed is True:
276-
logger.warning("pg not ready")
277-
return False
278-
279-
cmd = host.run(f"curl -sf -k --connect-timeout 30 --max-time 60 https://localhost:8085/health -H 'apikey: {supabase_admin_key}'")
280-
if cmd.failed is True:
281-
logger.warning("adminapi not ready")
282-
return False
283-
284-
cmd = host.run("curl -sf --connect-timeout 30 --max-time 60 http://localhost:3001/ready")
285-
if cmd.failed is True:
286-
logger.warning("postgrest not ready")
287-
return False
288-
289-
cmd = host.run("curl -sf --connect-timeout 30 --max-time 60 http://localhost:8081/health")
290-
if cmd.failed is True:
291-
logger.warning("gotrue not ready")
292-
return False
293-
294-
# TODO(thebengeu): switch to checking Envoy once it's the default.
295-
cmd = host.run("sudo kong health")
296-
if cmd.failed is True:
297-
logger.warning("kong not ready")
298-
return False
299-
300-
cmd = host.run("sudo fail2ban-client status")
301-
if cmd.failed is True:
302-
logger.warning("fail2ban not ready")
303-
return False
294+
def is_healthy(host, instance_ip, ssh_identity_file) -> bool:
    """Report whether every probed service on the instance answers successfully.

    Each probe is a shell command executed through ``host.run``. The probes
    run in order and the function stops at the first one that is not ready.

    Args:
        host: testinfra host used to run the probe commands.
        instance_ip: Instance address, forwarded to ``get_ssh_connection``
            when a probe raises.
        ssh_identity_file: Private key file, forwarded to
            ``get_ssh_connection`` when a probe raises.

    Returns:
        ``True`` when all probes succeed. ``False`` as soon as one probe
        reports failure, or when running a probe raises (after a reconnect
        has been attempted).
    """
    health_checks = [
        ("postgres", "sudo -u postgres /usr/bin/pg_isready -U postgres"),
        (
            "adminapi",
            f"curl -sf -k --connect-timeout 30 --max-time 60 https://localhost:8085/health -H 'apikey: {supabase_admin_key}'",
        ),
        (
            "postgrest",
            "curl -sf --connect-timeout 30 --max-time 60 http://localhost:3001/ready",
        ),
        (
            "gotrue",
            "curl -sf --connect-timeout 30 --max-time 60 http://localhost:8081/health",
        ),
        ("kong", "sudo kong health"),
        ("fail2ban", "sudo fail2ban-client status"),
    ]

    for service, command in health_checks:
        # Only the remote call is guarded; inspecting the result is not
        # expected to raise.
        try:
            cmd = host.run(command)
        except Exception:
            logger.warning(
                "Connection failed during %s check, attempting reconnect...",
                service,
                exc_info=True,  # keep the failure cause in the CI log
            )
            # NOTE(review): the caller keeps its own `host` reference, so the
            # object returned here cannot reach it; the result is therefore
            # not bound to a local. Confirm whether the caller should be
            # handed the new host instead.
            get_ssh_connection(instance_ip, ssh_identity_file)
            return False

        if cmd.failed is True:
            logger.warning("%s not ready", service)
            return False

    return True
306336

307337
while True:
308-
if is_healthy(host):
338+
if is_healthy(
339+
host=host,
340+
instance_ip=instance.public_ip_address,
341+
ssh_identity_file=temp_key.get_priv_key_file(),
342+
):
309343
break
310344
sleep(1)
311345

@@ -393,6 +427,7 @@ def test_postgrest_ending_apikey_query_parameter_is_removed(host):
393427
)
394428
assert res.ok
395429

430+
396431
# There would be an error if the empty key query parameter isn't removed,
397432
# since PostgREST treats empty key query parameters as malformed input.
398433
#

0 commit comments

Comments
 (0)