# Study Inclusion Nanopublication Generator (ASReview Export)

This notebook generates nanopublications from ASReview screening results.

**Input:** `study_inclusion.json` generated by `asreview-to-nanopub.ipynb`

**Output:** One nanopub per included study (283 files for your review)

## Step 1: Configuration

In [15]:
# Path to your JSON configuration file from ASReview export
STUDY_INCLUSION_FILE = "../screening_results/study_inclusion.json"

# Output directory for generated nanopubs
OUTPUT_DIR = "nanopubs_study_inclusion"

# Generate all studies or just test with first N?
# Set to None for all, or an integer like 3 for testing
LIMIT_STUDIES = None  # e.g., 3 for first 3 only

## Step 2: Load Configuration

In [16]:
import json
from pathlib import Path

with open(STUDY_INCLUSION_FILE, 'r') as f:
    config = json.load(f)

# Extract metadata from ASReview export format
review_meta = config['review_metadata']
provenance = config['provenance']
studies = config['studies']

print(f"Review: {review_meta['title']}")
print(f"Screener: {review_meta['screener_name']} ({review_meta['screener_orcid']})")
print(f"Screening date: {review_meta['screening_date']}")
print(f"Tool: {review_meta['screening_tool']}")
print()
print(f"Studies: {len(studies)} included")
print()
print("Provenance chain:")
print(f"  PICO: {provenance['pico_nanopub']}")
print(f"  Search Strategy: {provenance['search_strategy_nanopub']}")
print(f"  Search Execution: {provenance['search_execution_nanopub']}")
print()
print("Sample studies:")
for i, study in enumerate(studies[:5]):
    label = study['label'][:60] + "..." if len(study['label']) > 60 else study['label']
    print(f"  {i+1}. {label}")
if len(studies) > 5:
    print(f"  ... and {len(studies) - 5} more")

Review: Quantum Computing Applications in Biodiversity Research
Screener: Anne Fouilloux (0000-0002-1784-2920)
Screening date: 2025-12-27
Tool: ASReview LAB v2.2

Studies: 283 included

Provenance chain:
  PICO: https://w3id.org/np/RA8B3ptXUOsN7obpkFGtA0FBmsh0OnID53wOsUIpSKTcg
  Search Strategy: https://w3id.org/np/RAEK3jctU2x3IKW174OTgmFH9zDygPiaD-vb4zGrD39A4
  Search Execution: https://w3id.org/np/RAMPy96eCLCXlGR9VvCVf6rJmpN_DlxxarMGm91_5n-O8

Sample studies:
  1. Joint Optimization of Radio and Computational Resources for ...
  2. Machine learning &amp; artificial intelligence in the quantu...
  3. The prospects of quantum computing in computational molecula...
  4. Quantum Machine Learning Applications in the Biomedical Doma...
  5. Quantum-Inspired Real-Time Optimization for 6G Networks: Opp...
  ... and 278 more


## Step 3: Setup Namespaces

In [17]:
from datetime import datetime, timezone
from rdflib import Graph, Dataset, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD, FOAF, PROV

# Define namespaces
NP = Namespace("http://www.nanopub.org/nschema#")
NPX = Namespace("http://purl.org/nanopub/x/")
NT = Namespace("https://w3id.org/np/o/ntemplate/")
ORCID = Namespace("https://orcid.org/")
SLV = Namespace("https://w3id.org/sciencelive/o/terms/")
DISCO = Namespace("http://rdf-vocabulary.ddialliance.org/discovery#")
DCT = Namespace("http://purl.org/dc/terms/")
TEMP_NP = Namespace("http://purl.org/nanopub/temp/np/")

# Template URIs
STUDY_INCLUSION_TEMPLATE = URIRef("https://w3id.org/np/RAivw_N13pxVoXRMP6Y3ErfA--Z011qMqwKccfiKVxF0w")
PROVENANCE_TEMPLATE = URIRef("https://w3id.org/np/RA7lSq6MuK_TIC6JMSHvLtee3lpLoZDOqLJCLXevnrPoU")
PUBINFO_TEMPLATE_1 = URIRef("https://w3id.org/np/RA0J4vUn_dekg-U1kK3AOEt02p9mT2WO03uGxLDec1jLw")
PUBINFO_TEMPLATE_2 = URIRef("https://w3id.org/np/RAukAcWHRDlkqxk7H2XNSegc1WnHI569INvNr-xdptDGI")

# Create output directory
Path(OUTPUT_DIR).mkdir(exist_ok=True)

print("✓ Namespaces configured")
print(f"✓ Output directory: {OUTPUT_DIR}/")

✓ Namespaces configured
✓ Output directory: nanopubs_study_inclusion/


## Step 4: Define Nanopub Generation Function

In [18]:
def create_study_inclusion_nanopub(study_data, systematic_review_uri, author_orcid, author_name):
    """
    Create a Study Inclusion nanopub for a single study.
    
    Args:
        study_data: dict with 'label' and 'uri' keys
        systematic_review_uri: URI of the systematic review/PICO nanopub
        author_orcid: ORCID ID of the screener
        author_name: Name of the screener
    
    Returns:
        tuple: (Dataset, trig_string)
    """
    ds = Dataset()
    ds.bind("this", "http://purl.org/nanopub/temp/np/")
    ds.bind("sub", TEMP_NP)
    ds.bind("np", NP)
    ds.bind("dct", DCT)
    ds.bind("nt", NT)
    ds.bind("npx", NPX)
    ds.bind("xsd", XSD)
    ds.bind("rdfs", RDFS)
    ds.bind("orcid", ORCID)
    ds.bind("prov", PROV)
    ds.bind("foaf", FOAF)
    ds.bind("slv", SLV)
    ds.bind("disco", DISCO)
    
    # URIs
    np_uri = URIRef("http://purl.org/nanopub/temp/np/")
    head_uri = TEMP_NP.Head
    assertion_uri = TEMP_NP.assertion
    provenance_uri = TEMP_NP.provenance
    pubinfo_uri = TEMP_NP.pubinfo
    study_uri = TEMP_NP.study
    author_uri = ORCID[author_orcid]
    
    # HEAD graph
    head = ds.graph(head_uri)
    head.add((np_uri, RDF.type, NP.Nanopublication))
    head.add((np_uri, NP.hasAssertion, assertion_uri))
    head.add((np_uri, NP.hasProvenance, provenance_uri))
    head.add((np_uri, NP.hasPublicationInfo, pubinfo_uri))
    
    # ASSERTION graph
    assertion = ds.graph(assertion_uri)
    assertion.add((study_uri, RDF.type, DISCO.Study))
    assertion.add((study_uri, RDFS.label, Literal(study_data['label'])))
    assertion.add((study_uri, DCT.source, URIRef(study_data['uri'])))
    assertion.add((URIRef(systematic_review_uri), SLV.includesStudy, study_uri))
    
    # PROVENANCE graph
    provenance = ds.graph(provenance_uri)
    provenance.add((assertion_uri, PROV.wasAttributedTo, author_uri))
    
    # PUBINFO graph
    pubinfo = ds.graph(pubinfo_uri)
    pubinfo.add((author_uri, FOAF.name, Literal(author_name)))
    
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
    pubinfo.add((np_uri, DCT.created, Literal(timestamp, datatype=XSD.dateTime)))
    pubinfo.add((np_uri, DCT.creator, author_uri))
    pubinfo.add((np_uri, DCT.license, URIRef("https://creativecommons.org/licenses/by/4.0/")))
    pubinfo.add((np_uri, NPX.introduces, study_uri))
    pubinfo.add((np_uri, NPX.wasCreatedAt, URIRef("https://nanodash.knowledgepixels.com/")))
    
    # Label (truncated)
    label = study_data['label'][:50] + "..." if len(study_data['label']) > 50 else study_data['label']
    pubinfo.add((np_uri, RDFS.label, Literal(label)))
    
    # Template references
    pubinfo.add((np_uri, NT.wasCreatedFromTemplate, STUDY_INCLUSION_TEMPLATE))
    pubinfo.add((np_uri, NT.wasCreatedFromProvenanceTemplate, PROVENANCE_TEMPLATE))
    pubinfo.add((np_uri, NT.wasCreatedFromPubinfoTemplate, PUBINFO_TEMPLATE_1))
    pubinfo.add((np_uri, NT.wasCreatedFromPubinfoTemplate, PUBINFO_TEMPLATE_2))
    
    return ds, ds.serialize(format='trig')

print("✓ Function defined")

✓ Function defined


## Step 5: Generate Nanopubs

In [19]:
# Determine which studies to process
if LIMIT_STUDIES is not None:
    studies_to_process = studies[:LIMIT_STUDIES]
else:
    studies_to_process = studies

print(f"Generating {len(studies_to_process)} nanopubs...")
print()

generated_files = []
skipped = 0

for idx, study in enumerate(studies_to_process):
    # Skip studies without valid URIs
    if not study.get('uri') or study['uri'].startswith('urn:study:'):
        skipped += 1
        continue
    
    ds, trig_output = create_study_inclusion_nanopub(
        study_data=study,
        systematic_review_uri=provenance['pico_nanopub'],
        author_orcid=review_meta['screener_orcid'],
        author_name=review_meta['screener_name']
    )
    
    # Generate filename
    filename = f"{OUTPUT_DIR}/study-inclusion-{idx+1:03d}.trig"
    Path(filename).write_text(trig_output)
    generated_files.append(filename)
    
    # Progress indicator (every 50 files)
    if (idx + 1) % 50 == 0:
        print(f"  Generated {idx + 1}/{len(studies_to_process)}...")

print()
print(f"✅ Generated: {len(generated_files)} nanopub files")
if skipped > 0:
    print(f"⚠️  Skipped: {skipped} studies without DOI/URL")

Generating 283 nanopubs...

  Generated 50/283...
  Generated 100/283...
  Generated 150/283...
  Generated 200/283...
  Generated 250/283...

✅ Generated: 238 nanopub files
⚠️  Skipped: 45 studies without DOI/URL


## Step 6: Display Sample Output

In [20]:
if generated_files:
    sample_file = Path(generated_files[0])
    print(f"Sample output ({sample_file}):")
    print("=" * 60)
    print(sample_file.read_text())

Sample output (nanopubs_study_inclusion/study-inclusion-001.trig):
@prefix dct: <http://purl.org/dc/terms/> .
@prefix disco: <http://rdf-vocabulary.ddialliance.org/discovery#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix np: <http://www.nanopub.org/nschema#> .
@prefix npx: <http://purl.org/nanopub/x/> .
@prefix nt: <https://w3id.org/np/o/ntemplate/> .
@prefix orcid: <https://orcid.org/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix slv: <https://w3id.org/sciencelive/o/terms/> .
@prefix sub: <http://purl.org/nanopub/temp/np/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

sub:assertion {
    <https://w3id.org/np/RA8B3ptXUOsN7obpkFGtA0FBmsh0OnID53wOsUIpSKTcg> slv:includesStudy sub:study .

    sub:study a disco:Study ;
        rdfs:label "Joint Optimization of Radio and Computational Resources for Multicell Mobile-Edge Computing" ;
        dct:source <https://doi.org/10.1109/tsipn.2015.2448520> .
}

sub:pro

## Step 7: Validate Sample

In [21]:
from nanopub import Nanopub, NanopubConf

conf = NanopubConf(use_test_server=True)

if generated_files:
    try:
        np_obj = Nanopub(rdf=Path(generated_files[0]), conf=conf)
        print(f"✅ Validation passed for {generated_files[0]}")
    except Exception as e:
        print(f"❌ Validation error: {e}")

✅ Validation passed for nanopubs_study_inclusion/study-inclusion-001.trig


## Step 8: Summary

In [22]:
print("=" * 60)
print("GENERATION COMPLETE")
print("=" * 60)
print(f"\nReview: {review_meta['title']}")
print(f"Screener: {review_meta['screener_name']}")
print(f"\nResults:")
print(f"  Total included studies: {len(studies)}")
print(f"  Nanopubs generated: {len(generated_files)}")
print(f"  Skipped (no DOI): {skipped}")
print(f"\nOutput files in: {OUTPUT_DIR}/")
print(f"\nProvenance chain:")
print(f"  1. PICO: {provenance['pico_nanopub']}")
print(f"  2. Search Strategy: {provenance['search_strategy_nanopub']}")
print(f"  3. Search Execution: {provenance['search_execution_nanopub']}")
print(f"  4. Study Inclusion: {len(generated_files)} nanopubs")
print("=" * 60)
print("\nNext: Run Step 9 to batch sign and publish (optional)")

GENERATION COMPLETE

Review: Quantum Computing Applications in Biodiversity Research
Screener: Anne Fouilloux

Results:
  Total included studies: 283
  Nanopubs generated: 238
  Skipped (no DOI): 45

Output files in: nanopubs_study_inclusion/

Provenance chain:
  1. PICO: https://w3id.org/np/RA8B3ptXUOsN7obpkFGtA0FBmsh0OnID53wOsUIpSKTcg
  2. Search Strategy: https://w3id.org/np/RAEK3jctU2x3IKW174OTgmFH9zDygPiaD-vb4zGrD39A4
  3. Search Execution: https://w3id.org/np/RAMPy96eCLCXlGR9VvCVf6rJmpN_DlxxarMGm91_5n-O8
  4. Study Inclusion: 238 nanopubs

Next: Run Step 9 to batch sign and publish (optional)


## Step 9: Batch Sign and Publish (Optional)

⚠️ **Warning:** This will publish all generated nanopubs. Uncomment only when ready.

In [23]:
# Batch Sign and Publish
PUBLISH = True  # Set to True when ready
USE_TEST_SERVER = False  # Set to True for testing

if PUBLISH:
    from nanopub import Nanopub, NanopubConf, load_profile
    
    profile = load_profile()
    print(f"Loaded profile: {profile.name}")
    
    conf = NanopubConf(profile=profile, use_test_server=USE_TEST_SERVER)
    
    published_uris = []
    errors = []
    
    for i, filename in enumerate(generated_files):
        try:
            np_obj = Nanopub(rdf=Path(filename), conf=conf)
            np_obj.sign()
            np_obj.publish()
            published_uris.append(np_obj.source_uri)
            
            if (i + 1) % 25 == 0:
                print(f"Published {i + 1}/{len(generated_files)}...")
        except Exception as e:
            errors.append((filename, str(e)))
    
    print(f"\n✅ Published: {len(published_uris)}")
    if errors:
        print(f"❌ Errors: {len(errors)}")
        for f, e in errors[:5]:
            print(f"   {f}: {e}")
    
    # Save URIs
    output = {
        "review": review_meta['title'],
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "total_published": len(published_uris),
        "nanopub_uris": published_uris
    }
    with open(f"{OUTPUT_DIR}/published_uris.json", 'w') as f:
        json.dump(output, f, indent=2)
    print(f"\n✓ Saved URIs to: {OUTPUT_DIR}/published_uris.json")
else:
    print("Publishing disabled. Set PUBLISH = True when ready.")

Loaded profile: Anne Fouilloux
Published 25/238...
Published 50/238...
Published 75/238...
Published 100/238...
Published 125/238...
Published 150/238...
Published 175/238...
Published 200/238...
Published 225/238...

✅ Published: 238

✓ Saved URIs to: nanopubs_study_inclusion/published_uris.json
