From 09ba7e51eaaf300cc973648bd335f706696a065c Mon Sep 17 00:00:00 2001
From: Ihor Sokhan
Date: Wed, 30 Apr 2025 17:43:55 +0300
Subject: [PATCH 1/2] ability to sync preprints that have missing dois

---
 admin/management/views.py                    |  4 +++-
 admin/templates/management/commands.html     |  3 ++-
 osf/management/commands/sync_doi_metadata.py | 25 ++++++++++++++++----
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/admin/management/views.py b/admin/management/views.py
index 16057f147ca..525f0d8d64a 100644
--- a/admin/management/views.py
+++ b/admin/management/views.py
@@ -150,10 +150,12 @@ def post(self, request):
 
 class BulkResync(ManagementCommandPermissionView):
     def post(self, request):
+        missing_dois_only = request.POST.get('missing_preprint_dois_only', False)
         sync_doi_metadata.apply_async(kwargs={
             'modified_date': timezone.now(),
             'batch_size': None,
-            'dry_run': False
+            'dry_run': False,
+            'missing_preprint_dois_only': missing_dois_only
        })
         messages.success(request, 'Resyncing with CrossRef and DataCite! It will take some time.')
         return redirect(reverse('management:commands'))
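A note on the flag read in BulkResync above: assuming the control added in commands.html below is a checkbox, browsers submit it as the string 'on' when checked and omit the field entirely when unchecked, so missing_dois_only arrives as 'on' or the default False. Both behave correctly as truthy/falsy once passed through Celery to the task, though the kwarg is never a real boolean. If an explicit boolean were preferred, the coercion would look like this (hypothetical variant, not part of the patch):

    missing_dois_only = request.POST.get('missing_preprint_dois_only') == 'on'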
diff --git a/admin/templates/management/commands.html b/admin/templates/management/commands.html
index beaaf9cfb5d..93eeaf24c18 100644
--- a/admin/templates/management/commands.html
+++ b/admin/templates/management/commands.html
@@ -74,7 +74,7 @@
 [hunk body lost to HTML stripping; surviving context: "Ban spam users by regular expression"; one line replaced]
@@ -133,6 +133,7 @@
 [hunk body lost to HTML stripping; surviving context: "Resync with CrossRef and DataCite" and {% csrf_token %}; one line added]
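The two template hunks above lost their markup when this patch was rendered to text; only the surviving strings and the +/- markers remain. Judging from the view code, the second hunk adds the form control that BulkResync reads. A hypothetical reconstruction, where the input name is grounded in views.py but the element type and label text are guesses:

    {% csrf_token %}
    <label>
      <input type="checkbox" name="missing_preprint_dois_only">
      Sync only preprints with missing DOIs
    </label>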
diff --git a/osf/management/commands/sync_doi_metadata.py b/osf/management/commands/sync_doi_metadata.py
index 8002feeb961..d23a623ffee 100644
--- a/osf/management/commands/sync_doi_metadata.py
+++ b/osf/management/commands/sync_doi_metadata.py
@@ -5,7 +5,7 @@
 from django.contrib.contenttypes.models import ContentType
 from django.core.management.base import BaseCommand
 
-from osf.models import GuidMetadataRecord, Identifier, Registration
+from osf.models import GuidMetadataRecord, Identifier, Registration, Preprint
 from framework.celery_tasks import app
 
 logger = logging.getLogger(__name__)
@@ -14,8 +14,8 @@
 RATE_LIMIT_RETRY_DELAY = 60 * 5
 
 
-@app.task(name='osf.management.commands.sync_doi_metadata', max_retries=5, default_retry_delay=RATE_LIMIT_RETRY_DELAY)
-def sync_identifier_doi(identifier_id):
+@app.task(name='osf.management.commands.sync_doi_metadata', bind=True, acks_late=True, max_retries=5, default_retry_delay=RATE_LIMIT_RETRY_DELAY)
+def sync_identifier_doi(self, identifier_id):
     try:
         identifier = Identifier.objects.get(id=identifier_id)
         identifier.referent.request_identifier_update('doi')
@@ -23,17 +23,21 @@ def sync_identifier_doi(identifier_id):
         logger.info(f'Doi update for {identifier.value} complete')
     except Exception as err:
         logger.warning(f'[{err.__class__.__name__}] Doi update for {identifier.value} failed because of error: {err}')
-        sync_identifier_doi.retry(exc=err, countdown=RATE_LIMIT_RETRY_DELAY)
+        self.retry()
 
 
 @app.task(name='osf.management.commands.sync_doi_metadata_command', max_retries=5, default_retry_delay=RATE_LIMIT_RETRY_DELAY)
-def sync_doi_metadata(modified_date, batch_size=100, dry_run=True, sync_private=False, rate_limit=100):
+def sync_doi_metadata(modified_date, batch_size=100, dry_run=True, sync_private=False, rate_limit=100, missing_preprint_dois_only=False):
     identifiers = Identifier.objects.filter(
         category='doi',
         deleted__isnull=True,
         modified__lte=modified_date,
         object_id__isnull=False,
     )
+    if missing_preprint_dois_only:
+        sync_preprint_missing_dois.apply_async()
+        identifiers = identifiers.exclude(content_type=ContentType.objects.get_for_model(Preprint))
+
     if batch_size:
         identifiers = identifiers[:batch_size]
         rate_limit = batch_size if batch_size > rate_limit else rate_limit
@@ -55,6 +59,17 @@ def sync_doi_metadata(modified_date, batch_size=100, dry_run=True, sync_private=
             sync_identifier_doi.apply_async(kwargs={'identifier_id': identifier.id})
 
 
+@app.task(name='osf.management.commands.sync_preprint_missing_dois', bind=True, acks_late=True, max_retries=5, default_retry_delay=RATE_LIMIT_RETRY_DELAY)
+def sync_preprint_missing_dois(self):
+    preprints = Preprint.objects.filter(preprint_doi_created=None)
+    for preprint in preprints:
+        try:
+            preprint.request_identifier_update('doi', create=True)
+        except Exception as err:
+            logger.warning(f'[{err.__class__.__name__}] Doi creation failed for the preprint with id {preprint._id} because of error: {err}')
+            self.retry()
+
+
 @app.task(name='osf.management.commands.sync_doi_empty_metadata_dataarchive_registrations_command', max_retries=5, default_retry_delay=RATE_LIMIT_RETRY_DELAY)
 def sync_doi_empty_metadata_dataarchive_registrations(modified_date, batch_size=100, dry_run=True, sync_private=False, rate_limit=100):
     registrations_ids = list(
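Taken together, patch 1 threads the missing_preprint_dois_only flag from the admin form down to sync_doi_metadata: when set, preprint identifiers are excluded from the regular resync, and sync_preprint_missing_dois creates DOIs for every preprint whose preprint_doi_created is still NULL. Note that in this version the task contacts CrossRef for all such preprints inside one Celery task, and self.retry() on any single failure abandons the loop and re-queues the whole scan; the follow-up patch reworks exactly that. For reference, clicking the admin button is equivalent to this Django-shell call (a sketch mirroring the BulkResync view above):

    from django.utils import timezone
    from osf.management.commands.sync_doi_metadata import sync_doi_metadata

    # enqueue the resync; also create DOIs for preprints that never received one
    sync_doi_metadata.apply_async(kwargs={
        'modified_date': timezone.now(),
        'batch_size': None,
        'dry_run': False,
        'missing_preprint_dois_only': True,
    })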
From c0b2596a0287fc0d63197a73f05bf4b581026cf0 Mon Sep 17 00:00:00 2001
From: Ihor Sokhan
Date: Thu, 1 May 2025 17:50:17 +0300
Subject: [PATCH 2/2] run crossref request asynchronously for each preprint
 respecting rate limit

---
 osf/management/commands/sync_doi_metadata.py | 28 +++++++++++++-------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/osf/management/commands/sync_doi_metadata.py b/osf/management/commands/sync_doi_metadata.py
index d23a623ffee..e6b079100f3 100644
--- a/osf/management/commands/sync_doi_metadata.py
+++ b/osf/management/commands/sync_doi_metadata.py
@@ -35,7 +35,7 @@ def sync_doi_metadata(modified_date, batch_size=100, dry_run=True, sync_private=
         object_id__isnull=False,
     )
     if missing_preprint_dois_only:
-        sync_preprint_missing_dois.apply_async()
+        sync_preprint_missing_dois.apply_async(kwargs={'rate_limit': rate_limit})
         identifiers = identifiers.exclude(content_type=ContentType.objects.get_for_model(Preprint))
 
     if batch_size:
@@ -59,15 +59,25 @@ def sync_doi_metadata(modified_date, batch_size=100, dry_run=True, sync_private=
             sync_identifier_doi.apply_async(kwargs={'identifier_id': identifier.id})
 
 
-@app.task(name='osf.management.commands.sync_preprint_missing_dois', bind=True, acks_late=True, max_retries=5, default_retry_delay=RATE_LIMIT_RETRY_DELAY)
-def sync_preprint_missing_dois(self):
+@app.task(name='osf.management.commands.sync_preprint_missing_dois', max_retries=5, default_retry_delay=RATE_LIMIT_RETRY_DELAY)
+def sync_preprint_missing_dois(rate_limit):
     preprints = Preprint.objects.filter(preprint_doi_created=None)
-    for preprint in preprints:
-        try:
-            preprint.request_identifier_update('doi', create=True)
-        except Exception as err:
-            logger.warning(f'[{err.__class__.__name__}] Doi creation failed for the preprint with id {preprint._id} because of error: {err}')
-            self.retry()
+    for record_number, preprint in enumerate(preprints, 1):
+        # pause periodically so DOI creation stays under the CrossRef rate limit
+        if not record_number % rate_limit:
+            time.sleep(RATE_LIMIT_RETRY_DELAY)
+
+        async_request_identifier_update.apply_async(kwargs={'preprint_id': preprint._id})
+
+
+@app.task(name='osf.management.commands.async_request_identifier_update', bind=True, acks_late=True, max_retries=5, default_retry_delay=RATE_LIMIT_RETRY_DELAY)
+def async_request_identifier_update(self, preprint_id):
+    preprint = Preprint.load(preprint_id)
+    try:
+        preprint.request_identifier_update('doi', create=True)
+    except Exception as err:
+        logger.warning(f'[{err.__class__.__name__}] Doi creation failed for the preprint with id {preprint._id} because of error: {err}')
+        self.retry()
 
 
 @app.task(name='osf.management.commands.sync_doi_empty_metadata_dataarchive_registrations_command', max_retries=5, default_retry_delay=RATE_LIMIT_RETRY_DELAY)
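The throttle in the final version works on the dispatch side: sync_preprint_missing_dois enqueues one async_request_identifier_update task per preprint and sleeps RATE_LIMIT_RETRY_DELAY (60 * 5, i.e. five minutes) before every rate_limit-th enqueue, so with the rate_limit of 100 passed in from sync_doi_metadata at most about 100 creation requests enter the queue per five-minute window. Two caveats: the sleep caps the enqueue rate, not the rate at which workers actually hit CrossRef, and time.sleep relies on the module already having import time at the top, since the first lines of the import block fall outside every hunk shown here. The pattern in isolation, with throttled_dispatch as a hypothetical standalone helper:

    import time

    RATE_LIMIT_RETRY_DELAY = 60 * 5  # five minutes, as defined in the module

    def throttled_dispatch(items, rate_limit, enqueue):
        # enqueue items, pausing before every rate_limit-th one so that
        # roughly rate_limit items enter the queue per delay window
        for record_number, item in enumerate(items, 1):
            if not record_number % rate_limit:
                time.sleep(RATE_LIMIT_RETRY_DELAY)
            enqueue(item)

In the patch, enqueue corresponds to async_request_identifier_update.apply_async with the preprint's _id as its kwarg.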