Skip to content

Commit

Permalink
Recovery: Implement checksum comparison to suspicious replica recoverer
Browse files Browse the repository at this point in the history
rucio#5334

Before a suspicious file which is the last remaining copy is declared bad, its checksum should be compared to verify that it is corrupt.
A new method is added to /core/replica.py which retrieves the reason for which a replica was declared bad. If the reason is related
to a checksum problem, then the replica is declared bad.
  • Loading branch information
ChristophAmes committed May 16, 2022
1 parent 1326078 commit 29261d6
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 1 deletion.
31 changes: 31 additions & 0 deletions lib/rucio/core/replica.py
Expand Up @@ -3920,6 +3920,37 @@ def get_suspicious_files(rse_expression, available_elsewhere, filter_=None, logg
return result


@read_session
def get_suspicious_reason(rse_id, scope, name, nattempts=0, logger=logging.log, session=None):
    """
    Returns the error message(s) which led to the replica(s) being declared suspicious.

    The bad-replicas table is filtered on the given RSE and DID, the matching
    rows are grouped by their reason, and only the reasons which were recorded
    more than ``nattempts`` times are returned.

    :param rse_id: ID of RSE.
    :param scope: Scope of the replica DID.
    :param name: Name of the replica DID.
    :param nattempts: Threshold on the number of rows sharing the same reason; a
                      reason is returned only if its count is strictly greater
                      than this value. Default value = 0.
    :param logger: Logging callable. Currently not used inside this function.
    :param session: The database session in use. Default value = None.
    :returns: List of dictionaries, one per distinct reason, each with the keys
              'scope', 'name', 'rse', 'rse_id' and 'reason'. Empty list if no
              reason passes the threshold.
    """
    # Alias for bad replicas
    bad_replicas_alias = aliased(models.BadReplicas, name='bad_replicas_alias')

    # Exactly one RSE and one DID are requested, so plain equality filters are
    # sufficient (no need for single-element IN clauses).
    query = session.query(bad_replicas_alias.scope,
                          bad_replicas_alias.name,
                          bad_replicas_alias.reason,
                          bad_replicas_alias.rse_id)\
        .filter(bad_replicas_alias.rse_id == rse_id,
                bad_replicas_alias.scope == scope,
                bad_replicas_alias.name == name)\
        .group_by(bad_replicas_alias.rse_id,
                  bad_replicas_alias.scope,
                  bad_replicas_alias.name,
                  bad_replicas_alias.reason)\
        .having(func.count() > nattempts)

    result = []
    rse_name = None
    for _, _, reason, row_rse_id in query.all():
        # Every row belongs to the same RSE (see the filter above), so the RSE
        # name is resolved once, and only if there is at least one row.
        if rse_name is None:
            rse_name = get_rse_name(rse_id=row_rse_id, session=session)
        result.append({'scope': scope, 'name': name, 'rse': rse_name, 'rse_id': row_rse_id, 'reason': reason})
    return result


@transactional_session
def set_tombstone(rse_id, scope, name, tombstone=OBSOLETE, session=None):
"""
Expand Down
Expand Up @@ -46,7 +46,7 @@
from rucio.core.heartbeat import live, die, sanity_check
from rucio.core.monitor import record_counter
from rucio.core.did import get_metadata
from rucio.core.replica import list_replicas, get_suspicious_files, add_bad_pfns, declare_bad_file_replicas
from rucio.core.replica import list_replicas, get_suspicious_files, add_bad_pfns, declare_bad_file_replicas, get_suspicious_reason
from rucio.core.rse_expression_parser import parse_expression

from rucio.core.vo import list_vos
Expand Down Expand Up @@ -286,6 +286,9 @@ def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, vo
files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key])
del recoverable_replicas[vo][rse_key][replica_key]
else:
suspicious_reason = get_suspicious_reason(recoverable_replicas[vo][rse_key][replica_key]["rse_id"], recoverable_replicas[vo][rse_key][replica_key]["scope"], recoverable_replicas[vo][rse_key][replica_key]["name"], nattempts)
# suspicious_reason = get_suspicious_reason([replica_key["rse_id"]], [replica_key["scope"]], [replica_key["name"]])[0]["reason"]
print("suspicious reason: ", suspicious_reason)
# Deal with replicas based on their metadata.
if file_metadata["datatype"] is None: # "None" type has no function "split()"
files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key])
Expand All @@ -296,6 +299,8 @@ def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, vo
if action == "ignore":
files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key])
elif action == "declare bad":
# suspicious_reason = get_suspicious_reason([replica_key["rse_id"]], [replica_key["scope"]], [replica_key["name"]])[0]["reason"])

files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key])
else:
logger(logging.WARNING, "RSE: %s, replica name %s, surl %s: Match for the metadata 'datatype' (%s) of replica found in json file, but no match for 'action' (%s)",
Expand Down Expand Up @@ -390,3 +395,5 @@ def stop():
Graceful exit.
"""
GRACEFUL_STOP.set()

# Only start a single daemon pass when this file is executed directly. An
# unconditional module-level call would also be executed every time the
# module is imported.
if __name__ == "__main__":
    run(once=True, sleep_time=0)

0 comments on commit 29261d6

Please sign in to comment.