# Use the DataLab API to Inject Data

In [None]:
import random
import warnings
from pathlib import Path
from datalab_api import DatalabClient, DuplicateItemError
import os

# You need to generate your own API key from the Datalab web interface
# and set it as an environment variable (DATALAB_API_KEY) before running this
# notebook, e.g. `export DATALAB_API_KEY=...` in the shell that launches Jupyter.
#
# SECURITY: never hard-code the key in this cell. Notebooks get shared and
# committed, and a key that has appeared in a saved notebook should be treated
# as leaked: revoke it in the Datalab web interface and generate a new one.
if not os.environ.get("DATALAB_API_KEY"):
    # Warn instead of raising so the remaining cells can still be defined;
    # the update path below reads this variable when it sends a request.
    warnings.warn(
        "DATALAB_API_KEY is not set; authenticated requests to Datalab are expected to fail."
    )

def import_labcup(filename: Path, client: DatalabClient, collection_id: str = None, update_if_exists: bool = False):
    """Import a ChemInventory xlsx export into datalab as 'starting_materials' items.

    Every row of the spreadsheet becomes one item. The row's ``Barcode`` is used
    as the item ID; rows without a barcode get a random ``XX``-prefixed ID. All
    other spreadsheet columns are passed through unchanged as item fields, after
    NaN/Inf values have been replaced with ``None`` so the payload is valid JSON.

    Migrated from the original version in the main datalab repository:
        https://github.com/the-grey-group/datalab/blob/43764fb494c2cc1bf9f7dc90c25594aeb79d5767/pydatalab/tasks.py#L350-L413

    Args:
        filename (Path): Path to the ChemInventory xlsx export file.
        client (DatalabClient): Datalab API client instance.
        collection_id (str, optional): Collection ID to use for the imported items. If None,
            an ID of the form ``labcup-import-DD-MM-YYYY`` is built from the current date.
        update_if_exists (bool, optional): If True, an item whose ID already exists is
            updated via a direct POST to ``<api url>/save-item`` (authenticated with the
            ``DATALAB_API_KEY`` environment variable) instead of being skipped.
            Defaults to False.

    Returns:
        None. Progress is printed per item, followed by a summary of the counters
        ``success`` (created), ``duplicate`` (ID already existed), ``updated``
        (duplicates that were updated) and ``failed`` (creation or update failed).

    Raises:
        ImportError: If pandas (or the Excel engine it needs) is not installed.

    Usage:
        import_labcup(Path("path/to/your/labcup_export.xlsx"), client, collection_id="my_collection", update_if_exists=True)

    """
    import math
    from datetime import datetime

    # stacklevel=2 attributes the warning to the caller, so it is shown under
    # Python's default filters when the helper is called from a notebook/script.
    warnings.warn(
        "This helper is deprecated in favour of the full-fledged plugin at: https://github.com/datalab-industries/datalab-cheminventory-plugin",
        DeprecationWarning,
        stacklevel=2,
    )

    def _generate_random_startingmaterial_id():
        """Return 'XX' + a random 15-length string for use as an id for starting materials
        that don't have a barcode.
        """
        return "XX" + "".join(random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=15))

    try:
        import pandas as pd

        inventory_df = pd.read_excel(filename)
    except ImportError as exc:
        raise ImportError(
            "Please install pandas + openpyxl to use this helper function, via `pip install datalab-api[cheminventory-helper]`"
        ) from exc

    # TODO: add variable to specify the item type, currently hardcoded to 'starting_materials'
    inventory_df["type"] = "starting_materials"
    inventory_df["item_id"] = inventory_df["Barcode"]
    # Fill missing barcodes with freshly generated random IDs.
    inventory_df["item_id"] = inventory_df["item_id"].fillna(
        inventory_df["item_id"].apply(lambda _: _generate_random_startingmaterial_id())
    )
    # Note that all other fields are inserted from the fields specified in inventory_df

    counts = {"success": 0, "duplicate": 0, "updated": 0, "failed": 0}

    # If collection_id is None, then we use the date as the name.
    if collection_id is None:
        collection_id = f"labcup-import-{datetime.now().strftime('%d-%m-%Y')}"

    def sanitize_for_json(obj):
        """Recursively replace NaNs/Infs with None in nested dicts and lists."""
        if isinstance(obj, dict):
            return {k: sanitize_for_json(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [sanitize_for_json(v) for v in obj]
        if isinstance(obj, float) and (math.isnan(obj) or math.isinf(obj)):
            return None
        return obj

    def update_existing_item(item_id: str, item_data: dict, base_url: str):
        """Update an existing item using a direct HTTP request; return True on success."""
        import httpx

        # Deliberately best-effort: any failure (missing API key, network error,
        # non-2xx response, unserialisable payload) is reported and turned into
        # False so that one bad row cannot abort the whole import.
        try:
            response = httpx.post(
                f"{base_url}/save-item",
                json={"item_id": item_id, "data": item_data},
                follow_redirects=True,
                headers={"DATALAB-API-KEY": os.environ["DATALAB_API_KEY"]},
                timeout=30.0,
            )
            response.raise_for_status()
            return True
        except Exception as e:
            print(f"Failed to update item {item_id}: {e}")
            return False

    # Resolve the API root once, outside the loop. Prefer the URL the client was
    # built with so that updates go to the same server as the creations.
    # NOTE(review): `datalab_api_url` is believed to be the attribute DatalabClient
    # stores its URL under -- confirm against the installed datalab-api version.
    # The hard-coded host is kept only as a last-resort fallback.
    base_url = (
        getattr(client, "datalab_api_url", None)
        or getattr(client, "base_url", None)
        or "http://sce-chem-c01894.chem.ed.ac.uk:5001"
    )
    base_url = str(base_url).rstrip("/")

    for raw_item in inventory_df.to_dict(orient="records"):
        # Clean up the item to ensure it's JSON-compliant.
        item = sanitize_for_json(raw_item)
        item_id = item["item_id"]
        try:
            client.create_item(item_id, item["type"], item, collection_id=collection_id)
        except DuplicateItemError:
            counts["duplicate"] += 1

            if not update_if_exists:
                print(f"Duplicate item found: {item_id}, but skipping entry as no update is requested.")
                continue

            # Try to update the existing item instead.
            print(f"Duplicate item found: {item_id}, attempting to update...")
            if update_existing_item(item_id, item, base_url):
                counts["updated"] += 1
                print(f"Successfully updated item: {item_id}")
            else:
                # A failed update is a failure, not a second duplicate.
                counts["failed"] += 1
                print(f"Failed to update duplicate item: {item_id}")
        except Exception as exc:
            counts["failed"] += 1
            print(f"Failed to import item: {item}. Error: {exc}")
        else:
            counts["success"] += 1
            print(f"Successfully imported item: {item_id}")

    print(f"Done: {counts=}")

In [17]:
# Import the CLEANED data into datalab: name the spreadsheet export and the
# client for the target deployment, then hand both to the import helper.
cleaned_export = "/home/ian/datalab-deployment/datalab/Chemicals_all_09_06_2025_updated.xlsx"
datalab_client = DatalabClient("http://sce-chem-c01894.chem.ed.ac.uk:5001")
import_labcup(filename=cleaned_export, client=datalab_client)



Duplicate item found: CH009014, attempting to update...
Successfully updated item: CH009014
Duplicate item found: CH009470, attempting to update...
Successfully updated item: CH009470
Duplicate item found: CH009053, attempting to update...
Successfully updated item: CH009053
Duplicate item found: CH009071, attempting to update...
Successfully updated item: CH009071
Duplicate item found: CH009017, attempting to update...
Successfully updated item: CH009017
Duplicate item found: CH080327, attempting to update...
Successfully updated item: CH080327
Duplicate item found: CH079910, attempting to update...
Successfully updated item: CH079910
Duplicate item found: CH079666, attempting to update...
Successfully updated item: CH079666
Duplicate item found: CH079665, attempting to update...
Successfully updated item: CH079665
Duplicate item found: CH077730, attempting to update...
Successfully updated item: CH077730
Duplicate item found: CH001479, attempting to update...
Successfully updated ite

KeyboardInterrupt: 

# Database Connection Troubleshooting

If we run into issues, it might be because the database is not running, or because it is listening on a different port than we expect.

In [8]:
# Additional troubleshooting - run these commands in a terminal
import subprocess
import os

def run_command(command, timeout=10):
    """Run a shell command and return its captured output.

    The command string is executed through the shell (``shell=True``), so shell
    features such as pipes work. Only pass trusted, hard-coded command strings:
    nothing is escaped.

    Args:
        command: Shell command line to execute.
        timeout: Maximum number of seconds to wait for the command to finish.
            Defaults to 10.

    Returns:
        A ``(stdout, stderr, returncode)`` tuple. This function does not raise:
        if the command times out the tuple is ``("", "Command timed out", 1)``,
        and if it cannot be run at all it is ``("", <error message>, 1)``.
    """
    try:
        result = subprocess.run(
            command, shell=True, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return "", "Command timed out", 1
    except Exception as e:
        # Diagnostic helper: report the problem to the caller instead of raising.
        return "", str(e), 1
    return result.stdout, result.stderr, result.returncode

# Host that the Datalab deployment is expected to be reachable on.
HOST = "sce-chem-c01894.chem.ed.ac.uk"

# Check if MongoDB is running locally.
# `systemctl status` does not need root; with `sudo` the command cannot prompt
# for a password from a notebook kernel and would fail, which would be reported
# as "not running" even when the service is up.
print("=== Checking if MongoDB is running locally ===")
stdout, stderr, code = run_command("systemctl status mongod")
if code == 0:
    print("MongoDB service status:")
    print(stdout[:500] + "..." if len(stdout) > 500 else stdout)
else:
    print("MongoDB service not found or not running locally")
    if stderr.strip():
        # Show why the check failed (unit missing, systemctl unavailable, timeout, ...).
        print(stderr)

# Check if port 27017 is listening.
print("\n=== Checking if port 27017 is in use ===")
stdout, stderr, code = run_command("netstat -tuln | grep 27017")
if stdout:
    print("Port 27017 status:")
    print(stdout)
elif stderr.strip():
    # No match AND an error message: the check itself failed (e.g. netstat is
    # not installed), so do not claim that the port is closed.
    print("Could not check port 27017:")
    print(stderr)
else:
    print("Port 27017 is not listening")

# Test DNS resolution.
print("\n=== Testing DNS resolution ===")
stdout, stderr, code = run_command(f"nslookup {HOST}")
if code == 0:
    print("DNS resolution successful:")
    print(stdout)
else:
    print("DNS resolution failed:")
    # The tool may report its failure on stdout rather than stderr, so fall
    # back to stdout when stderr is empty.
    print(stderr or stdout)

# Test ping.
print("\n=== Testing connectivity ===")
stdout, stderr, code = run_command(f"ping -c 3 {HOST}")
if code == 0:
    print("Ping successful:")
    print(stdout)
else:
    print("Ping failed:")
    # Same fallback as above: show whichever stream carries the diagnostics.
    print(stderr or stdout)

=== Checking if MongoDB is running locally ===
MongoDB service not found or not running locally

=== Checking if port 27017 is in use ===
Port 27017 is not listening

=== Testing DNS resolution ===
DNS resolution successful:
Server:		127.0.0.53
Address:	127.0.0.53#53

Name:	sce-chem-c01894.chem.ed.ac.uk
Address: 127.0.1.1



=== Testing connectivity ===
Ping successful:
PING sce-chem-c01894.chem.ed.ac.uk (127.0.1.1) 56(84) bytes of data.
64 bytes from sce-chem-c01894.chem.ed.ac.uk (127.0.1.1): icmp_seq=1 ttl=64 time=0.023 ms
64 bytes from sce-chem-c01894.chem.ed.ac.uk (127.0.1.1): icmp_seq=2 ttl=64 time=0.058 ms
64 bytes from sce-chem-c01894.chem.ed.ac.uk (127.0.1.1): icmp_seq=3 ttl=64 time=0.061 ms

--- sce-chem-c01894.chem.ed.ac.uk ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 2061ms
rtt min/avg/max/mdev = 0.023/0.047/0.061/0.017 ms

