In [1]:
import os
from openai import OpenAI
import json

In [None]:
from dotenv import load_dotenv, find_dotenv
from charset_normalizer import from_path
import os 
# Locate the project's .env file. find_dotenv() returns an empty string when no
# file is found, so the encoding detection below must not run on that result.
env_path = find_dotenv()

# Defaults used when there is no .env file to inspect.
encoding = "utf-8"
env_loaded = False

if env_path:
    # detect encoding so the file is decoded with the charset it was saved in
    result = from_path(env_path).best()
    encoding = result.encoding if result else "utf-8"
    print(f"Detected encoding: {encoding}")
    env_loaded = load_dotenv(env_path, encoding=encoding)
else:
    print("No .env file found; relying on variables already set in the environment.")

# Last expression: show whether a .env file was loaded (True / False).
env_loaded

Detected encoding: ascii


True

In [3]:
# Display the encoding name that was selected for reading the .env file.
encoding

'ascii'

In [2]:
# OpenRouter model identifier used for every completion request in this notebook.
model_name = "qwen/qwen3-32b"

# Fail fast on a missing key: with api_key=None the OpenAI client falls back to the
# OPENAI_API_KEY environment variable, which would send an unrelated credential to
# OpenRouter (or raise an error that names the wrong variable).
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to your .env file or environment."
    )

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openrouter_api_key,
)

In [7]:
# Few-shot prompt: instructions, the JSON-Schema of the expected answer, three worked
# examples, and the variable to decompose.
#
# The literal is a raw string so that the \" escapes inside the third example reach the
# model unchanged; in a normal string they collapse to bare quotes and the example is
# no longer valid JSON.
#
# The asymmetric-system branch of the schema declares every key it requires. It used to
# require hasNumerator/hasDenominator while forbidding undeclared keys, which made the
# branch unsatisfiable and the first and third examples invalid. It now requires
# "AsymmetricSystem" plus either the source/target pair or the numerator/denominator pair.
Prompt = r"""You are an ontology engineer.
Your task is to output **one** JSON object that satisfies the
JSON-Schema provided below.

▸ Copy *label* and *comment* verbatim from the user section.
▸ Do **NOT** introduce keys that are absent from the schema.
▸ Every value must respect the declared JSON type
  (e.g. hasProperty is a string, hasConstraint is an array, …).
▸ Reply with the JSON object only — no markdown fences, no narration.

### JSON-Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://example.org/schemas/iadopt-variable.json",
  "title": "I-ADOPT decomposed variable",
  "description": "Single variable expressed in the compact JSON layout used for LLM-driven decomposition.",
  "type": "object",

  "required": ["label", "comment", "hasProperty", "hasObjectOfInterest"],

  "properties": {
    "label": {
      "type": "string",
      "description": "Variable name."
    },
    "comment": {
      "type": "string",
      "description": "Definition / description."
    },
    "hasProperty": {
      "type": "string",
      "description": "Property being observed (e.g. 'distance', 'temperature')."
    },
    "hasStatisticalModifier": {
      "type": "string",
      "description": "Statistical qualifier (e.g. 'maximum', 'latest')."
    },
    "hasMatrix": {
      "$ref": "#/$defs/entityOrSystem",
      "description": "The *Matrix* (Entity or System) in which the observation takes place."
    },
    "hasObjectOfInterest": {
      "$ref": "#/$defs/entityOrSystem",
      "description": "The main *Object of Interest* (Entity or System) that is being characterised."
    },
    "hasContextObject": {
      "$ref": "#/$defs/entityOrSystem",
      "description": "Additional background Entity or System that gives context to the Object of Interest."
    },
    "hasConstraint": {
      "type": "array",
      "description": "List of constraints, each with a label and the target it applies to.",
      "items": {
        "type": "object",
        "required": ["label", "on"],
        "properties": {
          "label": {
            "type": "string",
            "description": "Constraint label."
          },
          "on": {
            "type": "string",
            "description": "What the constraint applies to."
          }
        },
        "additionalProperties": false
      },
      "minItems": 1
    }
  },

  "additionalProperties": false,

  "$defs": {
    "entityOrSystem": {
      "description": "Either a simple label or a structured system.",
      "oneOf": [
        { "type": "string" },
        {
          "$comment": "Asymmetric system",
          "type": "object",
          "required": ["AsymmetricSystem"],
          "anyOf": [
            { "required": ["hasSource", "hasTarget"] },
            { "required": ["hasNumerator", "hasDenominator"] }
          ],
          "properties": {
            "AsymmetricSystem": { "type": "string" },
            "hasSource":        { "type": "string" },
            "hasTarget":        { "type": "string" },
            "hasNumerator":     { "type": "string" },
            "hasDenominator":   { "type": "string" }
          },
          "additionalProperties": false
        },
        {
          "$comment": "Symmetric system",
          "type": "object",
          "required": ["SymmetricSystem", "hasPart"],
          "properties": {
            "SymmetricSystem": { "type": "string" },
            "hasPart": {
              "type": "array",
              "items": { "type": "string" },
              "minItems": 1
            }
          },
          "additionalProperties": false
        }
      ]
    }
  }
}

### Examples (valid against the same schema)
{
  "label": "Distance to nearest neighbour habitat patch",
  "comment": "This variable is part of the EBV Connectivity of terrestrial ecosystem habitat types and helps to measure the degree of connection of EUNIS habitats within a landscape, in terms of their spatial distribution. https://github.com/EuropaBON/EBV-Descriptions/wiki/Terrestrial-Connectivity-of-terrestrial-ecosystem-habitat-types",
  "hasProperty": "distance",
  "hasObjectOfInterest": {
    "AsymmetricSystem": "habitat patch system",
    "hasSource": "habitat patch",
    "hasTarget": "habitat patch"
  },
  "hasConstraint": [
    {
      "label": "nearest neighbour",
      "on": "hasTarget: habitat patch"
    }
  ]
}

{
  "label": "Weight specific-ingestion Carbon rate at 15 °C",
  "comment": "The amount of carbon consumed by an organism at non-limiting concentration of food relative to the individual dry weight measured at 15°C. It is expressed as μg C mg DW−1 h−1",
  "hasProperty": "mass flow rate",
  "hasMatrix": "organism",
  "hasObjectOfInterest": "Carbon",
  "hasConstraint": [
    {
      "label": "weight-specific",
      "on": "mass flow rate"
    },
    {
      "label": "dry",
      "on": "organism"
    },
    {
      "label": "at non-limiting conditions",
      "on": "Carbon"
    },
    {
      "label": "at 15°C temperature",
      "on": "mass flow rate"
    },
    {
      "label": "due to ingestion",
      "on": "mass flow rate"
    }
  ]
}

{
  "label": "Mass flux of carbon into soil from vegetation due to senescence",
  "comment": "In accordance with common usage in geophysical disciplines, \"flux\" implies per unit area, called \"flux density\" in physics. \"Vegetation\" means any living plants e.g. trees, shrubs, grass. The specification of a physical process by the phrase \"due_to_\" process means that the quantity named is a single term in a sum of terms which together compose the general quantity named by omitting the phrase. The term \"senescence\" means loss of living biomass excluding plant death, e.g. leaf drop and other seasonal effects. The term refers to changes in the whole plant and is not confined only to leaf drop.",
  "hasProperty": "mass flux",
  "hasMatrix": {
    "AsymmetricSystem": "from vegetation to soil",
    "hasSource": "vegetation",
    "hasTarget": "soil"
  },
  "hasObjectOfInterest": "carbon",
  "hasConstraint": [
    {
      "label": "due to senescence",
      "on": "mass flux"
    }
  ]
}

### Variable to decompose
label: Date of last PCR-test
comment: date of last COVID-19 PCR test performed by a certified laboratory

### Expected output
*(only the JSON object)*
"""

In [8]:
# Ask the model for one decomposition of the variable described in `Prompt`.
# OpenRouter's optional attribution headers (HTTP-Referer / X-Title) are left out:
# the unfilled template placeholders would otherwise be sent verbatim with the request.
completion = client.chat.completions.create(
    model=model_name,
    temperature=0.5,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": Prompt,
                }
            ],
        }
    ],
)

In [9]:
# Extract the JSON object from the model reply and pretty-print it.
reply_text = completion.choices[0].message.content
if not reply_text:
    raise ValueError("The model reply has no text content to parse.")

# Reasoning models can prepend a <think>…</think> trace, and a reply may be wrapped in
# markdown fences despite the prompt; decode the first JSON object after any trace
# instead of requiring the whole reply to be bare JSON.
answer_text = reply_text.rpartition("</think>")[2]
object_start = answer_text.find("{")
if object_start < 0:
    raise ValueError(f"No JSON object found in model reply: {reply_text!r}")

parsed = json.JSONDecoder().raw_decode(answer_text, object_start)[0]
print(json.dumps(parsed, indent=4))

{
    "label": "Date of last PCR-test",
    "comment": "date of last COVID-19 PCR test performed by a certified laboratory",
    "hasProperty": "date",
    "hasObjectOfInterest": "PCR-test",
    "hasConstraint": [
        {
            "label": "last",
            "on": "PCR-test"
        }
    ],
    "hasMatrix": "certified laboratory"
}


In [10]:
## Ground Truth
# Reference decomposition for "Date of last PCR-test". It is bound to a name so it can
# be compared with the model output (`parsed`) instead of being discarded after display.
ground_truth = {
  "label": "Date of last PCR-test",
  "comment": "date of last COVID-19 PCR test performed by a certified laboratory",
  "hasProperty": "date",
  "hasMatrix": "person",
  "hasObjectOfInterest": "COVID-19 PCR test",
  "hasStatisticalModifier": "latest",
  "hasConstraint": [
    {
      "label": "performed by a certified laboratory",
      "on": "COVID-19 PCR test"
    }
  ]
}

# Last expression: keep showing the dict as the cell output.
ground_truth


{'label': 'Date of last PCR-test',
 'comment': 'date of last COVID-19 PCR test performed by a certified laboratory',
 'hasProperty': 'date',
 'hasMatrix': 'person',
 'hasObjectOfInterest': 'COVID-19 PCR test',
 'hasStatisticalModifier': 'latest',
 'hasConstraint': [{'label': 'performed by a certified laboratory',
   'on': 'COVID-19 PCR test'}]}

In [61]:
# Note M1 GPU support is experimental, see Thinc issue #792
# !pip install -U pip setuptools wheel
# !pip install -U 'spacy[apple]'
# !python -m spacy download en_core_web_trf
# !pip install spacy[transformers]


In [66]:
import spacy
from spacyfishing import EntityFishing

# Load the `en_core_web_sm` spaCy pipeline and append the "entityfishing" component,
# configured with the service base URL given below.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("entityfishing", config={"api_ef_base": "http://nerd.huma-num.fr/nerd/service"})

# NOTE(review): `term` is not assigned in any cell visible in this notebook — confirm
# where it is supposed to come from; on a fresh kernel this line raises NameError.
doc = nlp(term)
# Print every entity with its label and the `kb_qid` / `url_wikidata` extension attributes.
for ent in doc.ents:
    print(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata)


In [62]:
# Inspect the entities found on `doc`; the recorded output is an empty tuple.
doc.ents

()

In [77]:
import requests

# Disambiguation endpoint of the entity-fishing service queried by the helper below.
ENTITY_FISHING_API = "http://nerd.huma-num.fr/nerd/service/disambiguate"

def get_entities_with_uris(text: str, lang: str = "en") -> dict:
    """Map each linked mention in `text` to its Wikidata page URL.

    Posts `text` and `lang` as a JSON body to ENTITY_FISHING_API (60 s timeout) and
    reads the "entities" list of the JSON response.

    Args:
        text: Text to send for disambiguation.
        lang: Value sent under the "language" key of the request body.

    Returns:
        Dict keyed by each entity's "rawName"; values are
        "https://www.wikidata.org/wiki/<wikidataId>". Entities without a truthy
        "wikidataId" are skipped, and a later entity with the same "rawName"
        replaces an earlier one.

    Raises:
        requests.HTTPError: Propagated from `raise_for_status()`.
    """
    response = requests.post(
        ENTITY_FISHING_API,
        json={"text": text, "language": lang},
        timeout=60,
    )
    response.raise_for_status()

    # Pair every returned entity with its (possibly missing) Wikidata id.
    candidates = (
        (item, item.get("wikidataId"))
        for item in response.json().get("entities", [])
    )
    return {
        item["rawName"]: f"https://www.wikidata.org/wiki/{wikidata_id}"
        for item, wikidata_id in candidates
        if wikidata_id
    }

# Example
# Run the helper on a one-sentence input and print the resulting name -> URL mapping.
text = "Soil is a matrix."
print(get_entities_with_uris(text))


{'Soil': 'https://www.wikidata.org/wiki/Q36133'}


In [75]:
import requests
# Raw request to the disambiguation endpoint, used to inspect the complete response body.
payload = {
    "text": "Soil CO₂ flux' measures the flux (a property) of carbon dioxide (CO₂) emitted from soil (a matrix) under environmental conditions (a context object).",
    "language": "en"
}
# Without a timeout, requests waits indefinitely on an unresponsive server; 60 s matches
# the limit used by get_entities_with_uris.
r = requests.post("http://nerd.huma-num.fr/nerd/service/disambiguate", json=payload, timeout=60)
print(r.status_code, r.text)


200 {"software": "entity-fishing", "version": "0.0.6", "date": "2025-10-27T11:16:02.139Z", "runtime": 32, "nbest": false, "text": "Soil CO₂ flux' measures the flux (a property) of carbon dioxide (CO₂) emitted from soil (a matrix) under environmental conditions (a context object).", "language": {"lang":"en", "conf": 1.0}, "global_categories": [{"weight" : 0.049180327868852486, "source" : "wikipedia-en", "category" : "Propellants", "page_id" : 1095504}, {"weight" : 0.016393442622950827, "source" : "wikipedia-en", "category" : "Natural materials", "page_id" : 2959852}, {"weight" : 0.049180327868852486, "source" : "wikipedia-en", "category" : "Gaseous signaling molecules", "page_id" : 45654458}], "entities": [{ "rawName" : "Soil", "offsetStart" : 0, "offsetEnd" : 4, "confidence_score":0.5141, "wikipediaExternalRef": 37738, "wikidataId" : "Q36133", "domains" : [ "Environment", "Gas" ]  }, { "rawName" : "CO₂", "offsetStart" : 5, "offsetEnd" : 8, "confidence_score":0.6705, "wikipediaExternalR

In [4]:
import spacy

# Sample sentence containing person and place names, used to try out entity linking.
text_en = "Victor Hugo and Honoré de Balzac are French writers who lived in Paris."

nlp_model_en = spacy.load("en_core_web_sm")

# NOTE(review): no `api_ef_base` config is passed here, unlike the other
# entity-fishing cell in this notebook, so presumably the component's default service
# URL is used. The recorded run of this cell ended in
# "IndexError: list index out of range" — confirm whether the endpoint is the cause.
nlp_model_en.add_pipe("entityfishing")

doc_en = nlp_model_en(text_en)

# Print one tuple per entity: text, label, and the linker's extension attributes.
for ent in doc_en.ents:
        print((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score))

IndexError: list index out of range

In [1]:
import json
import pathlib

# Folder holding the individual JSON documents to merge.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
INPUT_DIR = pathlib.Path(
    "/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/data/Json_preferred/test_set"
)

# The merged file is written next to the input folder.
OUTPUT_FILE = INPUT_DIR.parent / "test_set_combined.json"

# Sorting the glob result keeps the order of the combined list stable between runs.
combined = [
    json.loads(source_path.read_text(encoding="utf-8"))
    for source_path in sorted(INPUT_DIR.glob("*.json"))
]

# Store all documents as a single JSON array, keeping non-ASCII characters readable.
with OUTPUT_FILE.open("w", encoding="utf-8") as out_file:
    json.dump(combined, out_file, indent=2, ensure_ascii=False)

print(f"Combined {len(combined)} files into {OUTPUT_FILE}")


Combined 102 files into /Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/data/Json_preferred/test_set_combined.json
