# Collect metadata and store it in a MongoDB instance 
This notebook assumes that you have a list of DOIs pointing to the datasets whose metadata you want to extract.

## Connect to MongoDB instance 


In [4]:
from pymongo import MongoClient

# Connection URI of the MongoDB server; adjust it to match your own deployment.
CONNECTION_STRING = "mongodb://127.0.0.1:27018"  # For local MongoDB

# Open a client and select the database and collection that will hold the
# metadata. MongoDB creates both lazily, i.e. on the first insert.
client = MongoClient(CONNECTION_STRING)
db = client.get_database("archaeology_metadata")
collection = db.get_collection("collection")


## Extract DOIs

In [None]:
import pandas as pd
# Load the dataset overview; the DOIs are read from its "dsPersistentId" column.
df = pd.read_csv('../data/archaeology_metadata.csv')

# Bracket indexing is used instead of attribute access so the column lookup
# can never be shadowed by a DataFrame attribute of the same name. Rows with
# no DOI are dropped so that no request is sent for a missing identifier.
dois = df["dsPersistentId"].dropna().tolist()

In [4]:
import requests

def get_json(doi, timeout=30):
    """
    Get JSON data of a dataset from the Archaeology Data Stations API.

    The dataset is requested from the ``/api/datasets/export`` endpoint using
    the ``OAI_ORE`` exporter. Request failures are reported on stdout and
    signalled to the caller by returning None instead of raising.

    :param doi: DOI of the dataset
    :param timeout: Timeout in seconds handed to ``requests.get``; without it
        a stalled server could block the calling thread indefinitely
    :return: JSON data of the dataset, or None if the request failed
    """
    url = "https://archaeology.datastations.nl/api/datasets/export"

    # Let requests build the query string so that the DOI is percent-encoded;
    # characters such as '&' or '#' in a DOI would otherwise corrupt the URL.
    params = {"exporter": "OAI_ORE", "persistentId": doi}

    try:
        response = requests.get(url, params=params, timeout=timeout)

        # Turn HTTP error statuses (4xx/5xx) into an exception
        response.raise_for_status()

        # Parse the JSON data
        return response.json()

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None





In [5]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_parallel(dois, batch_size=50, max_workers=5, delay=1):
    """
    Process DOIs in parallel using ThreadPoolExecutor.

    The DOIs are split into consecutive batches. Every DOI of a batch is
    fetched with ``get_json`` on a worker thread, and all documents that were
    fetched successfully are written to the module-level ``collection`` with
    a single ``insert_many`` call before the next batch starts.

    :param dois: Iterable of DOIs to process
    :param batch_size: Number of DOIs to process in each batch (must be >= 1)
    :param max_workers: Number of threads to use for parallel processing
    :param delay: Seconds to pause between two batches to respect API limits;
        no pause is made after the final batch
    :raises ValueError: If batch_size is smaller than 1
    """
    # A negative batch size would otherwise yield an empty range and make the
    # function silently do nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Materialise the input so that any iterable can be sliced into batches.
    dois = list(dois)
    if not dois:
        return

    # One pool is shared by all batches; the worker threads are reused.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for start in range(0, len(dois), batch_size):
            batch = dois[start:start + batch_size]

            # Submit tasks for all DOIs in the current batch
            future_to_doi = {executor.submit(get_json, doi): doi for doi in batch}

            results = []
            for future in as_completed(future_to_doi):
                doi = future_to_doi[future]
                try:
                    data = future.result()
                except Exception as e:
                    # A failing DOI must not abort the rest of the batch.
                    print(f"An error occurred while processing DOI {doi}: {e}")
                    continue
                if data is not None:  # Only add valid data
                    results.append(data)

            # Insert the batch into MongoDB
            if results:
                collection.insert_many(results)

            # Pause only when another batch is still to come.
            if delay > 0 and start + batch_size < len(dois):
                time.sleep(delay)

In [8]:
# Run the batch processing over every DOI loaded above, 50 DOIs per batch.
# Each batch is fetched by process_parallel's worker threads and its results
# are inserted into MongoDB before the next batch starts.
process_parallel(dois, batch_size=50)