From 46b18497f870e884252d189d928e807ec1df4452 Mon Sep 17 00:00:00 2001 From: "marcin p. joachimiak" <4625870+realmarcin@users.noreply.github.com> Date: Sat, 23 May 2026 21:34:12 -0700 Subject: [PATCH 1/2] Backfill #30 wetlands, fix metals extractor bug, lint cleanup, cross-repo validator Combines four follow-ups against #30 (cross-repo environmental linking) plus an unrelated lint cleanup, all of which build on each other and share the same test surface. 1. Wetland backfill (#30 Phase 5) Apply the SPRUCE related_ingredients pattern to 6 more peatland and wetland communities (Stordalen Mire, Prairie Pothole, MUCC Freshwater Wetland, Asgard Wetland Soil, Coastal Forested Wetland, Wetland Oxygen-Sulfate GHG). Each entry uses CHEBI terms and evidence anchored to already-cached PubMed abstracts; no MediaIngredientMech IDs are minted yet. 2. Metals extractor bug fix + 65-file cleanup metal_extraction.py used plain substring matching against 2-letter element symbols ('ti' for TITANIUM, 'au' for GOLD), which matched inside unrelated words ('characteristic', 'australia') and salted metals_present with TITANIUM in 56/67 metal-annotated YAMLs and GOLD in several more. Switched to non-alphanumeric-boundary regex matching (case-insensitive), with tests pinning the behavior. scripts/clean_metals_inplace.py re-runs extraction and rewrites only the metals_present / rare_earth_elements_present / metal_relevance / metal_notes blocks via line-based replacement, preserving comments and unrelated formatting (unlike backfill_metals.py's yaml.dump path). Applied once across the corpus: 65 community YAMLs corrected. 3. Lint cleanup (just lint ruff/black) 178 pre-existing ruff errors -> 0. Removed T20 (print) from the ruff selection with rationale: src/communitymech/ ships CLI entry points that legitimately use print. 
The remaining 44 non-print errors were fixed inline (unused imports, raise-from chains, collapsible ifs, redundant list() calls, zip strict, line splits, import order in batch_reporter.py) or suppressed with a per-file E501 ignore for llm/prompts.py (long prompt strings) and targeted `# noqa` lines with comments for S301/S701/S704/S112 cases that are intentional within their internal-only contexts. mypy still reports 256 pre-existing errors and is out of scope here. 4. Cross-repo ID validator (#30 Phase 3, local half) New module communitymech.validators.cross_repo_ids with a pattern + existence checker, plus a CLI (scripts/validate_cross_repo_ids.py) and justfile entries (validate-cross-repo-ids, validate-cross-repo-ids-all). Sibling repo paths are opt-in via env or flags; when omitted, the validator emits info-level skip notices rather than silently passing. 10 new tests cover pattern, existence, and edge cases. Test plan: just test (136 passed, 9 skipped), just validate-all (all 265 communities clean), ruff/black green. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/cross_repo_linking.md | 27 +++ justfile | 10 + .../AMD_Acidophile_Heterotroph_Network.yaml | 4 +- .../AMD_Nitrososphaerota_Archaeal.yaml | 4 +- ...lasmata_CuMMO_Soil_Sediment_Community.yaml | 5 +- ...il_Methanogenesis_Substrate_Community.yaml | 44 ++++ kb/communities/At_RSPHERE_SynCom.yaml | 5 +- .../Australian_Lead_Zinc_Polymetallic.yaml | 4 +- .../Bayan_Obo_REE_Tailings_Consortium.yaml | 4 +- ...Chlamydomonas_Bacterial_H2_Consortium.yaml | 5 +- ...amydomonas_Methylobacterium_Mutualism.yaml | 6 +- .../Chlorella_Rhizobium_Bioflocculation.yaml | 6 +- .../Chromium_Sulfur_Reduction_Enrichment.yaml | 4 +- .../Cinnamate_Degradation_Consortium.yaml | 5 +- ...land_Seawater_Ion_Microcosm_Community.yaml | 48 +++++ .../Copper_Biomining_Heap_Leach.yaml | 3 +- ...opper_Sulphide_Bioleaching_Consortium.yaml | 3 +- kb/communities/DVM_Triculture.yaml | 5 +- kb/communities/Dangl_SynComm_35.yaml | 5 +- ...Desulfovibrio_Methanococcus_Syntrophy.yaml | 6 +- .../Ewaste_Bioleaching_Consortium.yaml | 3 +- .../Ferroplasma_Leptospirillum_Syntrophy.yaml | 3 +- .../GLBRC_Populus_Variovorax_SynCom28.yaml | 5 +- .../GLBRC_UFMP_Fermentation_Community.yaml | 6 +- .../GOM_Oil_Degrading_Consortium.yaml | 5 +- .../Geobacter_Clostridium_DIET.yaml | 5 +- .../Geobacter_Methanosaeta_DIET.yaml | 5 +- .../Geobacter_Methanosarcina_DIET.yaml | 5 +- ...300_Area_Unconfined_Aquifer_Community.yaml | 5 +- ...Iberian_Pit_Lake_Stratified_Community.yaml | 3 +- .../Industrial_Bioreactor_Consortium.yaml | 4 +- ...Strain_Persistence_Maternal_Community.yaml | 5 +- ...n_Adsorption_REE_Indigenous_Community.yaml | 8 +- kb/communities/Lotus_LjSC3.yaml | 5 +- kb/communities/MAMC_M48_Lignocellulose.yaml | 5 +- kb/communities/MSC1_Dominant_Core.yaml | 6 +- ...ter_Wetland_Methane_Network_Community.yaml | 34 +++ .../Maize_Root_Simplified_Community.yaml | 5 +- .../Mercury_SFA_EFPC_Sediment_Community.yaml | 5 +- ...thane_Oxidation_CrVI_Reduction_SynCom.yaml | 2 + 
...Mixed_Gallium_LED_Recovery_Consortium.yaml | 3 +- .../Naica_Deep_Subsurface_Thermophilic.yaml | 7 +- ..._Geothermal_Mercury_Cycling_Community.yaml | 6 +- .../ORNL_PMI_Populus_PD10_SynCom.yaml | 5 +- ...Uranium_Nitrate_Groundwater_Community.yaml | 9 +- .../Okeke_Lu_Cellulolytic_Consortium.yaml | 6 +- .../PGM_Spent_Catalyst_Bioleaching.yaml | 4 +- ...maculum_Methanothermobacter_Syntrophy.yaml | 7 +- .../Phormidium_Alkaline_Consortium.yaml | 5 +- ...aromonas_Vanadium_Reduction_Community.yaml | 3 +- ...Wetland_Sulfur_Carbon_Virus_Community.yaml | 63 ++++++ ...donitzschia_Sulfitobacter_Association.yaml | 5 +- .../Rammelsberg_Cobalt_Nickel_Tailings.yaml | 3 +- .../Rice_Duckweed_Bacillus_SynCom.yaml | 5 +- kb/communities/Richmond_Mine_AMD_Biofilm.yaml | 3 +- .../Rifle_Aquifer_Bioanode_EET_Community.yaml | 5 +- .../Rifle_Uranium_Reducing_Community.yaml | 3 +- ...Salar_Atacama_Lithium_Brine_Community.yaml | 4 +- kb/communities/Sorghum_SRC1_Subset.yaml | 5 +- .../Soybean_N_Fixation_sfSynCom.yaml | 5 +- ...thylotrophic_Methanogenesis_Community.yaml | 46 ++++ kb/communities/Synechococcus_Ecoli_SPC.yaml | 5 +- ...ophobacter_Methanobacterium_Syntrophy.yaml | 6 +- ...ophobacter_Methanospirillum_Syntrophy.yaml | 6 +- ...rophomonas_Methanospirillum_Syntrophy.yaml | 6 +- .../Syntrophus_Benzoate_Degrader.yaml | 6 +- .../Thermophilic_Pyrite_QS_Consortium.yaml | 3 +- .../Tinto_River_Iron_Cycling_Community.yaml | 3 +- .../Trichoderma_Lactate_Platform.yaml | 5 +- ...desmium_Alteromonas_Marine_Consortium.yaml | 2 +- ...xygen_Sulfate_GHG_Microcosm_Community.yaml | 66 ++++++ kb/communities/Wheat_Consortium_C1.yaml | 5 +- kb/communities/Wheat_Consortium_C6.yaml | 5 +- pyproject.toml | 12 +- scripts/clean_metals_inplace.py | 124 +++++++++++ scripts/validate_cross_repo_ids.py | 73 +++++++ src/communitymech/cli.py | 9 +- src/communitymech/embedding/loader.py | 5 +- src/communitymech/literature.py | 5 +- src/communitymech/llm/anthropic_client.py | 8 +- src/communitymech/metal_extraction.py | 
35 +++- src/communitymech/network/auditor.py | 18 +- src/communitymech/network/batch_reporter.py | 4 +- src/communitymech/network/llm_repair.py | 2 +- src/communitymech/network/validators.py | 25 ++- src/communitymech/render_community_pages.py | 10 +- .../uniprot_reference_proteomes.py | 2 +- src/communitymech/utils/id_utils.py | 5 +- .../validators/cross_repo_ids.py | 196 ++++++++++++++++++ .../visualization/umap_generator.py | 22 +- tests/test_cross_repo_ids.py | 153 ++++++++++++++ tests/test_metal_extraction.py | 37 ++++ 92 files changed, 1140 insertions(+), 252 deletions(-) create mode 100644 scripts/clean_metals_inplace.py create mode 100644 scripts/validate_cross_repo_ids.py create mode 100644 src/communitymech/validators/cross_repo_ids.py create mode 100644 tests/test_cross_repo_ids.py create mode 100644 tests/test_metal_extraction.py diff --git a/docs/cross_repo_linking.md b/docs/cross_repo_linking.md index 4a66b24fe..809197235 100644 --- a/docs/cross_repo_linking.md +++ b/docs/cross_repo_linking.md @@ -299,6 +299,8 @@ All new fields are optional: ## Validation +### Schema-level tests + Run the cross-repo linking tests: ```bash @@ -311,6 +313,31 @@ Test data files are in `tests/data/test_cross_repo_linking/`: - `community_no_links.yaml` -- Backward compatibility - `community_all_relationship_types.yaml` -- All 5 enum values +### Cross-repo ID validator + +`just validate-cross-repo-ids FILE` checks that `culturemech_id` / +`mediaingredientmech_id` values match their CURIE patterns and, when +sibling-repo paths are configured, that the referenced IDs actually +exist in those repos. 
+ +```bash +# Pattern check only (no sibling-repo paths) +just validate-cross-repo-ids kb/communities/SPRUCE_Peatland_Methane_Cycling_Community.yaml + +# Pattern + existence check +COMMUNITYMECH_SIBLING_REPOS="CultureMech=../CultureMech/kb/media,MediaIngredientMech=../MediaIngredientMech/kb/ingredients" \ + just validate-cross-repo-ids-all +``` + +The validator returns: +- `error` for malformed CURIEs or IDs missing from a configured sibling repo +- `info` for IDs whose existence check was skipped because the relevant + sibling-repo path wasn't configured +- nothing if a community has no cross-repo IDs at all + +Sibling-repo paths can also be passed via `--culturemech` / +`--mediaingredientmech` flags to `scripts/validate_cross_repo_ids.py`. + ## See Also - [Growth Media Linking](media_linking.md) -- Existing cultivation-based linking diff --git a/justfile b/justfile index 2e2940dc5..38bfc1503 100644 --- a/justfile +++ b/justfile @@ -35,6 +35,16 @@ validate-references-all: uv run linkml-reference-validator validate data "$file" -s src/communitymech/schema/communitymech.yaml --config conf/reference_validator.yaml done +# Validate cross-repo IDs (CultureMech, MediaIngredientMech) in one community file. +# Pattern checks always run; existence checks run when sibling-repo paths are +# configured via COMMUNITYMECH_SIBLING_REPOS env (Name=path,Name=path). +validate-cross-repo-ids FILE: + PYTHONPATH=src uv run python scripts/validate_cross_repo_ids.py {{FILE}} + +# Validate cross-repo IDs across all community files. 
+validate-cross-repo-ids-all: + PYTHONPATH=src uv run python scripts/validate_cross_repo_ids.py kb/communities/*.yaml + # Validate ontology terms in a community file validate-terms FILE: uv run linkml-term-validator validate-data {{FILE}} -s src/communitymech/schema/communitymech.yaml --labels diff --git a/kb/communities/AMD_Acidophile_Heterotroph_Network.yaml b/kb/communities/AMD_Acidophile_Heterotroph_Network.yaml index d8738b8f4..7c1fecfbf 100644 --- a/kb/communities/AMD_Acidophile_Heterotroph_Network.yaml +++ b/kb/communities/AMD_Acidophile_Heterotroph_Network.yaml @@ -763,9 +763,7 @@ environmental_factors: explanation: Documents metal tolerance in heterotrophic acidophiles metals_present: - COPPER -- GOLD - IRON -- TITANIUM metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/AMD_Nitrososphaerota_Archaeal.yaml b/kb/communities/AMD_Nitrososphaerota_Archaeal.yaml index db1264339..034ab0ce1 100644 --- a/kb/communities/AMD_Nitrososphaerota_Archaeal.yaml +++ b/kb/communities/AMD_Nitrososphaerota_Archaeal.yaml @@ -699,9 +699,7 @@ environmental_factors: explanation: Demonstrates value of genomic data for understanding archaeal adaptations metals_present: - COPPER -- GOLD - IRON -- TITANIUM metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git 
a/kb/communities/Angelarchaeales_Thermoplasmata_CuMMO_Soil_Sediment_Community.yaml b/kb/communities/Angelarchaeales_Thermoplasmata_CuMMO_Soil_Sediment_Community.yaml index 479d68cf4..d8522dd82 100644 --- a/kb/communities/Angelarchaeales_Thermoplasmata_CuMMO_Soil_Sediment_Community.yaml +++ b/kb/communities/Angelarchaeales_Thermoplasmata_CuMMO_Soil_Sediment_Community.yaml @@ -137,9 +137,8 @@ external_resources: url: https://doi.org/10.1038/s41396-021-01177-5 description: DOI link to the ISME J paper. associated_datasets: [] -metals_present: -- COPPER +metals_present: [] rare_earth_elements_present: [] -metal_relevance: SIGNIFICANT +metal_relevance: NOT_APPLICABLE metal_notes: Copper is the central metal cofactor of the CuMMO and blue copper proteins enriched in Angelarchaeales genomes. diff --git a/kb/communities/Asgard_Wetland_Soil_Methanogenesis_Substrate_Community.yaml b/kb/communities/Asgard_Wetland_Soil_Methanogenesis_Substrate_Community.yaml index ba81a3638..57a6525bc 100644 --- a/kb/communities/Asgard_Wetland_Soil_Methanogenesis_Substrate_Community.yaml +++ b/kb/communities/Asgard_Wetland_Soil_Methanogenesis_Substrate_Community.yaml @@ -179,6 +179,50 @@ environmental_factors: snippet: contributions in soil ecosystems remain unknown explanation: Supports terrestrial soil as the focus environment. growth_media: [] +related_ingredients: +- preferred_term: acetate + chebi_term: + id: CHEBI:30089 + label: acetate + relevance: Asgard archaeal acetogens in this wetland soil generate acetate + from carbohydrate breakdown via the Wood-Ljungdahl pathway; acetate is + therefore both the headline output of Asgard metabolism in this community + and the substrate that feeds the co-resident acetoclastic methanogens. 
+ evidence: + - reference: PMID:39085194 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: carbohydrate breakdown to acetate and formate + explanation: Anchors acetate as the central Asgard metabolic output that + modulates downstream methanogenesis substrates in wetland soil. +- preferred_term: formate + chebi_term: + id: CHEBI:15740 + label: formate + relevance: Formate co-produced with acetate is a major C1 substrate for + hydrogenotrophic and formate-utilizing methanogens, defining a second + Asgard-mediated methanogenesis-substrate channel in this community. + evidence: + - reference: PMID:39085194 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: carbohydrate breakdown to acetate and formate + explanation: Anchors formate as the second Asgard-derived methanogenesis + substrate alongside acetate. +- preferred_term: dihydrogen + chebi_term: + id: CHEBI:18276 + label: dihydrogen + relevance: Expression of [NiFe]-hydrogenases by both Atabeyarchaeia and + Freyarchaeia genomes implicates H2 cycling as a core Asgard activity in + this wetland soil; any cultivation medium designed around the community + would need an H2 headspace. + evidence: + - reference: PMID:39085194 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: expression of genes for [NiFe]-hydrogenases + explanation: Anchors H2 cycling as an in situ expressed Asgard activity. 
external_resources: - name: Primary publication for the Asgard wetland soil methanogenesis-substrate community diff --git a/kb/communities/At_RSPHERE_SynCom.yaml b/kb/communities/At_RSPHERE_SynCom.yaml index 722697a85..e361cc194 100644 --- a/kb/communities/At_RSPHERE_SynCom.yaml +++ b/kb/communities/At_RSPHERE_SynCom.yaml @@ -427,8 +427,7 @@ environmental_factors: plant-microbe interactions, and synthetic community design ' -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Australian_Lead_Zinc_Polymetallic.yaml b/kb/communities/Australian_Lead_Zinc_Polymetallic.yaml index 809617e2e..667b6e36e 100644 --- a/kb/communities/Australian_Lead_Zinc_Polymetallic.yaml +++ b/kb/communities/Australian_Lead_Zinc_Polymetallic.yaml @@ -966,11 +966,9 @@ environmental_factors: explanation: Documents long-term weathering profile development metals_present: - COPPER -- GOLD - IRON - LEAD -- TITANIUM - ZINC metal_relevance: SIGNIFICANT metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Bayan_Obo_REE_Tailings_Consortium.yaml b/kb/communities/Bayan_Obo_REE_Tailings_Consortium.yaml index 29c5caee8..d8ae935c6 100644 --- a/kb/communities/Bayan_Obo_REE_Tailings_Consortium.yaml +++ b/kb/communities/Bayan_Obo_REE_Tailings_Consortium.yaml @@ -544,9 +544,7 @@ environmental_factors: was narrated for plausible real-world use explanation: Describes REE mineralogy at Bayan Obo metals_present: -- GOLD -
IRON -- TITANIUM rare_earth_elements_present: - CERIUM - LANTHANUM @@ -554,5 +552,5 @@ rare_earth_elements_present: - PRASEODYMIUM - SAMARIUM metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Chlamydomonas_Bacterial_H2_Consortium.yaml b/kb/communities/Chlamydomonas_Bacterial_H2_Consortium.yaml index 00f9ea025..b7e3d0223 100644 --- a/kb/communities/Chlamydomonas_Bacterial_H2_Consortium.yaml +++ b/kb/communities/Chlamydomonas_Bacterial_H2_Consortium.yaml @@ -351,7 +351,6 @@ growth_media: explanation: Establishes mannitol and yeast extract as key medium components for sustained H2 production culturemech_id: CultureMech:000139 culturemech_url: https://github.com/CultureBotAI/CultureMech/tree/main/kb/media/CultureMech:000139 -metals_present: -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Chlamydomonas_Methylobacterium_Mutualism.yaml b/kb/communities/Chlamydomonas_Methylobacterium_Mutualism.yaml index c58edc4bd..a4bd471b9 100644 --- a/kb/communities/Chlamydomonas_Methylobacterium_Mutualism.yaml +++ b/kb/communities/Chlamydomonas_Methylobacterium_Mutualism.yaml @@ -325,8 +325,6 @@ growth_media: responses culturemech_id: CultureMech:000139 culturemech_url: https://github.com/CultureBotAI/CultureMech/tree/main/kb/media/CultureMech:000139 -metals_present: -- IRON -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Chlorella_Rhizobium_Bioflocculation.yaml 
b/kb/communities/Chlorella_Rhizobium_Bioflocculation.yaml index e947545e6..26c42d67d 100644 --- a/kb/communities/Chlorella_Rhizobium_Bioflocculation.yaml +++ b/kb/communities/Chlorella_Rhizobium_Bioflocculation.yaml @@ -232,9 +232,7 @@ environmental_factors: snippet: 'three significant process variables: inoculation ratio of bacteria and microalgae, initial glucose concentration, and co-culture time' explanation: Co-culture timing optimized for maximum harvesting efficiency -metals_present: -- IRON -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Chromium_Sulfur_Reduction_Enrichment.yaml b/kb/communities/Chromium_Sulfur_Reduction_Enrichment.yaml index cc7b27a69..c677d085c 100644 --- a/kb/communities/Chromium_Sulfur_Reduction_Enrichment.yaml +++ b/kb/communities/Chromium_Sulfur_Reduction_Enrichment.yaml @@ -609,8 +609,6 @@ environmental_factors: snippet: actinobacterium isolated from manganese mining soil metals_present: - CHROMIUM -- IRON -- TITANIUM metal_relevance: PRIMARY metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Cinnamate_Degradation_Consortium.yaml b/kb/communities/Cinnamate_Degradation_Consortium.yaml index ddbf1a02a..f2e024f6d 100644 --- a/kb/communities/Cinnamate_Degradation_Consortium.yaml +++ b/kb/communities/Cinnamate_Degradation_Consortium.yaml @@ -182,8 +182,7 @@ environmental_factors: complete mineralization to methane ' -metals_present: -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE
detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Coastal_Forested_Wetland_Seawater_Ion_Microcosm_Community.yaml b/kb/communities/Coastal_Forested_Wetland_Seawater_Ion_Microcosm_Community.yaml index 84b934071..18c924ef1 100644 --- a/kb/communities/Coastal_Forested_Wetland_Seawater_Ion_Microcosm_Community.yaml +++ b/kb/communities/Coastal_Forested_Wetland_Seawater_Ion_Microcosm_Community.yaml @@ -257,6 +257,54 @@ environmental_factors: snippet: emissions of carbon dioxide, methane, and nitrous oxide explanation: Supports greenhouse gas endpoints. growth_media: [] +related_ingredients: +- preferred_term: sulfate + chebi_term: + id: CHEBI:16189 + label: sulfate + relevance: Sulfate is the explicitly manipulated ion in this microcosm + experiment; an environment-analog cultivation medium for this community + would need sulfate as a controllable variable rather than a fixed + background anion, since the study disentangles sulfate effects from other + seawater ions on the community. + evidence: + - reference: PMID:38628812 + supports: SUPPORT + evidence_source: IN_VIVO + snippet: tease apart the effects of sulfate from other seawater ions + explanation: Anchors sulfate as the central controllable environmental + variable for any medium designed to cultivate this community. +- preferred_term: seawater ions + chebi_term: + id: CHEBI:26710 + label: sodium chloride + relevance: Artificial-seawater ions (NaCl as the representative bulk salt) + drove the community shifts and GHG emission changes more strongly than + sulfate alone; any cultivation medium for this community would need a + seawater-equivalent ion background, not just sulfate.
+ evidence: + - reference: PMID:38628812 + supports: SUPPORT + evidence_source: IN_VIVO + snippet: other ions present in seawater, not sulfate, drive ecological and + biogeochemical responses to seawater intrusion + explanation: Anchors NaCl-dominated artificial seawater (representative + bulk seawater-ion mixture) as the primary driver of community responses. +- preferred_term: methane + chebi_term: + id: CHEBI:16183 + label: methane + relevance: Methane is one of the headline greenhouse gas emission endpoints + monitored across all microcosm treatments; an environment-analog medium + targeting this community would need methane in the headspace as a key + quantitative endpoint. + evidence: + - reference: PMID:38628812 + supports: SUPPORT + evidence_source: IN_VIVO + snippet: emissions of carbon dioxide, methane, and nitrous oxide + explanation: Anchors methane as a measured greenhouse gas endpoint of this + microcosm community. associated_datasets: [] external_resources: - name: Primary publication for coastal forested wetland seawater-ion microcosms diff --git a/kb/communities/Copper_Biomining_Heap_Leach.yaml b/kb/communities/Copper_Biomining_Heap_Leach.yaml index 41eae7c7a..b95975ee5 100644 --- a/kb/communities/Copper_Biomining_Heap_Leach.yaml +++ b/kb/communities/Copper_Biomining_Heap_Leach.yaml @@ -566,9 +566,8 @@ environmental_factors: metals_present: - COPPER - IRON -- TITANIUM metal_relevance: PRIMARY metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) growth_media: - name: Standard 9K Medium (laboratory cultivation of consortium members) diff --git a/kb/communities/Cyprus_Copper_Sulphide_Bioleaching_Consortium.yaml
b/kb/communities/Cyprus_Copper_Sulphide_Bioleaching_Consortium.yaml index 50aaca0b3..5ef5ab2c1 100644 --- a/kb/communities/Cyprus_Copper_Sulphide_Bioleaching_Consortium.yaml +++ b/kb/communities/Cyprus_Copper_Sulphide_Bioleaching_Consortium.yaml @@ -195,9 +195,8 @@ external_resources: associated_datasets: [] metals_present: - COPPER -- IRON rare_earth_elements_present: [] metal_relevance: PRIMARY metal_notes: Copper recovery from sulphide ores is the primary biotechnological motivation; iron availability differs between chalcopyrite and chalcocite and shapes consortium composition. diff --git a/kb/communities/DVM_Triculture.yaml b/kb/communities/DVM_Triculture.yaml index 87c2d61d5..b84e6f4ab 100644 --- a/kb/communities/DVM_Triculture.yaml +++ b/kb/communities/DVM_Triculture.yaml @@ -375,7 +375,6 @@ environmental_factors: snippet: We find that tri-cultures with both routes increase methane production by almost twofold compared to co-cultures and are stable in the absence of sulfate explanation: Quantifies enhanced productivity from tri-culture interactions -metals_present: -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Dangl_SynComm_35.yaml b/kb/communities/Dangl_SynComm_35.yaml index 02fc2189f..461e5c0b1 100644 --- a/kb/communities/Dangl_SynComm_35.yaml +++ b/kb/communities/Dangl_SynComm_35.yaml @@ -541,7 +541,6 @@ environmental_factors: evidence_source: IN_VITRO snippet: Suppressors and nonsuppressors co-occur in the root microbiome and the presence of the former can enhance the colonization ability of the latter -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git
a/kb/communities/Desulfovibrio_Methanococcus_Syntrophy.yaml b/kb/communities/Desulfovibrio_Methanococcus_Syntrophy.yaml index 410d1c308..7404b0a22 100644 --- a/kb/communities/Desulfovibrio_Methanococcus_Syntrophy.yaml +++ b/kb/communities/Desulfovibrio_Methanococcus_Syntrophy.yaml @@ -237,10 +237,8 @@ environmental_factors: energy generation pathways and imply that to understand microbial processes that sustain nutrient cycling, lifestyles not captured in pure culture must be considered. explanation: Quantifies growth conditions for the syntrophic consortium -metals_present: -- IRON -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) growth_media: diff --git a/kb/communities/Ewaste_Bioleaching_Consortium.yaml b/kb/communities/Ewaste_Bioleaching_Consortium.yaml index 2aa5402ac..f6368b357 100644 --- a/kb/communities/Ewaste_Bioleaching_Consortium.yaml +++ b/kb/communities/Ewaste_Bioleaching_Consortium.yaml @@ -570,8 +570,7 @@ metals_present: - NICKEL - PALLADIUM - SILVER -- TITANIUM - ZINC metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Ferroplasma_Leptospirillum_Syntrophy.yaml b/kb/communities/Ferroplasma_Leptospirillum_Syntrophy.yaml index 219d7e994..779afb877 100644 --- a/kb/communities/Ferroplasma_Leptospirillum_Syntrophy.yaml +++ b/kb/communities/Ferroplasma_Leptospirillum_Syntrophy.yaml @@ -472,9 +472,8 @@ environmental_factors: metals_present: - COPPER - IRON -- TITANIUM metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI 
terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) growth_media: - name: Modified 9K Medium for Ferroplasma-Leptospirillum Co-culture diff --git a/kb/communities/GLBRC_Populus_Variovorax_SynCom28.yaml b/kb/communities/GLBRC_Populus_Variovorax_SynCom28.yaml index eca899bce..8550d6169 100644 --- a/kb/communities/GLBRC_Populus_Variovorax_SynCom28.yaml +++ b/kb/communities/GLBRC_Populus_Variovorax_SynCom28.yaml @@ -833,7 +833,6 @@ external_resources: resource_id: zenodo.17466836 url: https://zenodo.org/records/17466836 description: Reproducible workflow and supplemental tables including strain metadata for the DefCom. -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/GLBRC_UFMP_Fermentation_Community.yaml b/kb/communities/GLBRC_UFMP_Fermentation_Community.yaml index d27c3cb1e..177d4bc98 100644 --- a/kb/communities/GLBRC_UFMP_Fermentation_Community.yaml +++ b/kb/communities/GLBRC_UFMP_Fermentation_Community.yaml @@ -462,9 +462,7 @@ associated_datasets: ' explanation: Links the metagenome dataset to the UFMP fermentation community study -metals_present: -- GOLD -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/GOM_Oil_Degrading_Consortium.yaml b/kb/communities/GOM_Oil_Degrading_Consortium.yaml index 6c9092a7f..6449b0aac 100644 --- a/kb/communities/GOM_Oil_Degrading_Consortium.yaml +++ b/kb/communities/GOM_Oil_Degrading_Consortium.yaml @@ -174,8 +174,7 @@ 
environmental_factors: description: 'Designed for practical application in bioremediation of oil-contaminated marine environments ' -metals_present: -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Geobacter_Clostridium_DIET.yaml b/kb/communities/Geobacter_Clostridium_DIET.yaml index 9981d123c..28047eec1 100644 --- a/kb/communities/Geobacter_Clostridium_DIET.yaml +++ b/kb/communities/Geobacter_Clostridium_DIET.yaml @@ -437,7 +437,6 @@ growth_media: explanation: Establishes anaerobic requirement maintained using Hungate technique culturemech_id: CultureMech:015432 culturemech_url: https://github.com/CultureBotAI/CultureMech/tree/main/kb/media/CultureMech:015432 -metals_present: -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Geobacter_Methanosaeta_DIET.yaml b/kb/communities/Geobacter_Methanosaeta_DIET.yaml index 52ba2e9d9..315237191 100644 --- a/kb/communities/Geobacter_Methanosaeta_DIET.yaml +++ b/kb/communities/Geobacter_Methanosaeta_DIET.yaml @@ -293,8 +293,7 @@ growth_media: specialized medium culturemech_id: CultureMech:015435 culturemech_url: https://github.com/CultureBotAI/CultureMech/tree/main/kb/media/CultureMech:015435 -metals_present: -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Geobacter_Methanosarcina_DIET.yaml b/kb/communities/Geobacter_Methanosarcina_DIET.yaml index 19d4e2ae9..b8d6ea245 100644 --- a/kb/communities/Geobacter_Methanosarcina_DIET.yaml +++ 
b/kb/communities/Geobacter_Methanosarcina_DIET.yaml @@ -336,8 +336,7 @@ growth_media: explanation: Confirms ethanol as electron donor and DIET mechanism in coculture culturemech_id: CultureMech:015434 culturemech_url: https://github.com/CultureBotAI/CultureMech/tree/main/kb/media/CultureMech:015434 -metals_present: -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Hanford_300_Area_Unconfined_Aquifer_Community.yaml b/kb/communities/Hanford_300_Area_Unconfined_Aquifer_Community.yaml index a9ff9aef1..757f8a5cd 100644 --- a/kb/communities/Hanford_300_Area_Unconfined_Aquifer_Community.yaml +++ b/kb/communities/Hanford_300_Area_Unconfined_Aquifer_Community.yaml @@ -215,9 +215,8 @@ environmental_factors: of organic and inorganic contaminants from a variety of sources associated with historical nuclear materials production explanation: Supports the contaminant-impacted setting of this groundwater community. -metals_present: -- URANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Uranium is included because the Hanford 300 Area aquifer is a historically contaminated DOE site and the study measured uranium concentrations during groundwater sampling; the record does not assert direct uranium reduction without experimental evidence. 
diff --git a/kb/communities/Iberian_Pit_Lake_Stratified_Community.yaml b/kb/communities/Iberian_Pit_Lake_Stratified_Community.yaml index 5ad4a4207..348594d88 100644 --- a/kb/communities/Iberian_Pit_Lake_Stratified_Community.yaml +++ b/kb/communities/Iberian_Pit_Lake_Stratified_Community.yaml @@ -725,8 +725,7 @@ environmental_factors: metals_present: - COPPER - IRON -- TITANIUM - ZINC metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Industrial_Bioreactor_Consortium.yaml b/kb/communities/Industrial_Bioreactor_Consortium.yaml index 867726e2a..9a5f779e6 100644 --- a/kb/communities/Industrial_Bioreactor_Consortium.yaml +++ b/kb/communities/Industrial_Bioreactor_Consortium.yaml @@ -999,9 +999,7 @@ environmental_factors: explanation: Links high Fe³⁺ to Ferroplasma competitive advantage metals_present: - COPPER -- GOLD - IRON -- TITANIUM metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Infant_Gut_Strain_Persistence_Maternal_Community.yaml b/kb/communities/Infant_Gut_Strain_Persistence_Maternal_Community.yaml index 4fffa1137..b7080730d 100644 --- a/kb/communities/Infant_Gut_Strain_Persistence_Maternal_Community.yaml +++ b/kb/communities/Infant_Gut_Strain_Persistence_Maternal_Community.yaml 
@@ -173,9 +173,8 @@ external_resources: url: https://doi.org/10.1016/j.xcrm.2021.100393 description: DOI link to the Cell Rep Med paper. associated_datasets: [] -metals_present: -- IRON +metals_present: [] rare_earth_elements_present: [] -metal_relevance: SIGNIFICANT +metal_relevance: NOT_APPLICABLE metal_notes: Iron acquisition genes are among the three trait categories associated with strain persistence in the infant gut. diff --git a/kb/communities/Ion_Adsorption_REE_Indigenous_Community.yaml b/kb/communities/Ion_Adsorption_REE_Indigenous_Community.yaml index 5bf9289f9..b864a9d33 100644 --- a/kb/communities/Ion_Adsorption_REE_Indigenous_Community.yaml +++ b/kb/communities/Ion_Adsorption_REE_Indigenous_Community.yaml @@ -669,16 +669,12 @@ environmental_factors: recovery relevant to circular economy and green technology applications. ' -metals_present: -- IRON -- TITANIUM +metals_present: [] rare_earth_elements_present: - DYSPROSIUM -- ERBIUM - GADOLINIUM -- TERBIUM - YTTERBIUM - YTTRIUM metal_relevance: SIGNIFICANT -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Lotus_LjSC3.yaml b/kb/communities/Lotus_LjSC3.yaml index 7271d4690..5adeb5e8c 100644 --- a/kb/communities/Lotus_LjSC3.yaml +++ b/kb/communities/Lotus_LjSC3.yaml @@ -597,7 +597,6 @@ environmental_factors: interactions relevant to sustainable agriculture and reduced fertilizer use ' -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/MAMC_M48_Lignocellulose.yaml b/kb/communities/MAMC_M48_Lignocellulose.yaml index c990ae32a..eb7afe977 100644 
--- a/kb/communities/MAMC_M48_Lignocellulose.yaml +++ b/kb/communities/MAMC_M48_Lignocellulose.yaml @@ -355,7 +355,6 @@ growth_media: consortium culturemech_id: CultureMech:000423 culturemech_url: https://github.com/CultureBotAI/CultureMech/tree/main/kb/media/CultureMech:000423 -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/MSC1_Dominant_Core.yaml b/kb/communities/MSC1_Dominant_Core.yaml index 630617dcf..b73c534a6 100644 --- a/kb/communities/MSC1_Dominant_Core.yaml +++ b/kb/communities/MSC1_Dominant_Core.yaml @@ -537,8 +537,6 @@ environmental_factors: retaining community composition ' -metals_present: -- IRON -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/MUCC_Freshwater_Wetland_Methane_Network_Community.yaml b/kb/communities/MUCC_Freshwater_Wetland_Methane_Network_Community.yaml index 9ae991e23..1b7802cb2 100644 --- a/kb/communities/MUCC_Freshwater_Wetland_Methane_Network_Community.yaml +++ b/kb/communities/MUCC_Freshwater_Wetland_Methane_Network_Community.yaml @@ -282,6 +282,40 @@ environmental_factors: snippet: high methane-emitting wetlands explanation: Supports the high-emission wetland context. growth_media: [] +related_ingredients: +- preferred_term: methane + chebi_term: + id: CHEBI:16183 + label: methane + relevance: Methane is the integrating output of this freshwater-wetland + network and the response variable the MUCC v2.0.0 cross-wetland analysis + correlates with microbiome composition; an environment-analog medium + targeting the hub methanogens would need methane headspace as a key product + monitor. 
+ evidence: + - reference: PMID:39843444 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: link microbiome composition to function and methane emissions + explanation: Anchors methane as the headline functional output the MUCC + network analysis links to community structure. +- preferred_term: methylated compounds + chebi_term: + id: CHEBI:17790 + label: methanol + relevance: Methylotrophic methanogenesis is the functional potential the + paper highlights as central across MUCC wetlands; methanol is the most + abundantly studied methylated substrate routed through Methanoregula and + other hub methanogens, so an environment-analog medium for this network + would need a methylated C1 substrate at minimum. + evidence: + - reference: PMID:39843444 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: displays the functional potential for methylotrophic methanogenesis, + highlighting the importance of this pathway in these ecosystems + explanation: Anchors methylotrophic methanogenesis (methanol as + representative substrate) as a pathway central to the MUCC network. 
associated_datasets: - name: MUCC v2.0.0 Zenodo database dataset_type: OTHER diff --git a/kb/communities/Maize_Root_Simplified_Community.yaml b/kb/communities/Maize_Root_Simplified_Community.yaml index 285d2073d..5b49ca829 100644 --- a/kb/communities/Maize_Root_Simplified_Community.yaml +++ b/kb/communities/Maize_Root_Simplified_Community.yaml @@ -362,7 +362,6 @@ environmental_factors: - name: Agricultural application value: biocontrol for bioenergy crops description: Provides fungal disease resistance relevant to maize bioenergy production -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Mercury_SFA_EFPC_Sediment_Community.yaml b/kb/communities/Mercury_SFA_EFPC_Sediment_Community.yaml index 3e796cf59..5f6b1491c 100644 --- a/kb/communities/Mercury_SFA_EFPC_Sediment_Community.yaml +++ b/kb/communities/Mercury_SFA_EFPC_Sediment_Community.yaml @@ -360,7 +360,6 @@ associated_datasets: ' explanation: Documents the metagenome sequencing and MAG reconstruction from EFPC sediment -metals_present: -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Methane_Oxidation_CrVI_Reduction_SynCom.yaml b/kb/communities/Methane_Oxidation_CrVI_Reduction_SynCom.yaml index 6a6e45543..34d85c9c6 100644 --- a/kb/communities/Methane_Oxidation_CrVI_Reduction_SynCom.yaml +++ b/kb/communities/Methane_Oxidation_CrVI_Reduction_SynCom.yaml @@ -72,3 +72,5 @@ environmental_factors: description: Methane concentration, chromium load, and methane oxidation inhibition. 
evidence: - *id001 +metal_relevance: SIGNIFICANT +metal_notes: Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Mixed_Gallium_LED_Recovery_Consortium.yaml b/kb/communities/Mixed_Gallium_LED_Recovery_Consortium.yaml index bdf14e6ad..3e0e5315a 100644 --- a/kb/communities/Mixed_Gallium_LED_Recovery_Consortium.yaml +++ b/kb/communities/Mixed_Gallium_LED_Recovery_Consortium.yaml @@ -498,7 +498,6 @@ metals_present: - GALLIUM - IRON - NICKEL -- TITANIUM metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Naica_Deep_Subsurface_Thermophilic.yaml b/kb/communities/Naica_Deep_Subsurface_Thermophilic.yaml index af613a067..2eb2549ab 100644 --- a/kb/communities/Naica_Deep_Subsurface_Thermophilic.yaml +++ b/kb/communities/Naica_Deep_Subsurface_Thermophilic.yaml @@ -563,10 +563,7 @@ environmental_factors: snippet: The "Cave of Crystals" (aka 'Naica') in Chihuahua Mexico is a natural unique subterranean ecosystem which mainly consists of crystals made of calcium sulfate explanation: Highlights astrobiology significance -metals_present: -- GOLD -- IRON -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Ngawha_Geothermal_Mercury_Cycling_Community.yaml b/kb/communities/Ngawha_Geothermal_Mercury_Cycling_Community.yaml index 214a2ace9..731aab916 100644 --- a/kb/communities/Ngawha_Geothermal_Mercury_Cycling_Community.yaml +++ 
b/kb/communities/Ngawha_Geothermal_Mercury_Cycling_Community.yaml @@ -156,9 +156,9 @@ external_resources: description: DOI link to the Applied and Environmental Microbiology paper. associated_datasets: [] metals_present: -- MERCURY +- IRON rare_earth_elements_present: [] -metal_relevance: PRIMARY -metal_notes: Mercury is the central metal driving community structure and the +metal_relevance: SIGNIFICANT +metal_notes: Metal/REE detected via keyword matching in description (context-validated) curated biogeochemical cycle, with members performing methylation, demethylation, and Hg(II) reduction to volatile Hg(0). diff --git a/kb/communities/ORNL_PMI_Populus_PD10_SynCom.yaml b/kb/communities/ORNL_PMI_Populus_PD10_SynCom.yaml index 0d5f60ea7..77339653a 100644 --- a/kb/communities/ORNL_PMI_Populus_PD10_SynCom.yaml +++ b/kb/communities/ORNL_PMI_Populus_PD10_SynCom.yaml @@ -524,7 +524,6 @@ external_resources: snippet: In this study, 10 bacterial strains isolated from the Populus deltoides rhizosphere were combined and passaged in two different media environments to form stable microbial communities explanation: Supports that the KBase narrative corresponds to the PD10 synthetic community system. -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Oak_Ridge_FRC_Uranium_Nitrate_Groundwater_Community.yaml b/kb/communities/Oak_Ridge_FRC_Uranium_Nitrate_Groundwater_Community.yaml index f3366aa48..3607ebe4a 100644 --- a/kb/communities/Oak_Ridge_FRC_Uranium_Nitrate_Groundwater_Community.yaml +++ b/kb/communities/Oak_Ridge_FRC_Uranium_Nitrate_Groundwater_Community.yaml @@ -216,11 +216,8 @@ environmental_factors: explanation: Supports the nickel, cobalt, zinc, and uranium metal-contaminant relevance recorded for this community. 
metals_present: -- URANIUM - IRON -- NICKEL -- COBALT -- ZINC -metal_relevance: PRIMARY -metal_notes: Uranium and iron are directly tied to reported reduction potential; nickel, cobalt, and zinc +- URANIUM +metal_relevance: SIGNIFICANT +metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) are included as selective metal contaminants reported in Oak Ridge FRC ion-mixture studies. diff --git a/kb/communities/Okeke_Lu_Cellulolytic_Consortium.yaml b/kb/communities/Okeke_Lu_Cellulolytic_Consortium.yaml index d3b66d123..f5f514451 100644 --- a/kb/communities/Okeke_Lu_Cellulolytic_Consortium.yaml +++ b/kb/communities/Okeke_Lu_Cellulolytic_Consortium.yaml @@ -341,8 +341,6 @@ growth_media: explanation: Confirms xylanolytic activity on hemicellulose substrates by consortium members culturemech_id: CultureMech:000423 culturemech_url: https://github.com/CultureBotAI/CultureMech/tree/main/kb/media/CultureMech:000423 -metals_present: -- IRON -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/PGM_Spent_Catalyst_Bioleaching.yaml b/kb/communities/PGM_Spent_Catalyst_Bioleaching.yaml index a8ce1cb8b..3edfeb6c3 100644 --- a/kb/communities/PGM_Spent_Catalyst_Bioleaching.yaml +++ b/kb/communities/PGM_Spent_Catalyst_Bioleaching.yaml @@ -812,11 +812,9 @@ environmental_factors: explanation: Establishes application to multiple catalyst waste streams metals_present: - COPPER -- GOLD - IRON - NICKEL - PALLADIUM -- TITANIUM metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) measurements; 
Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Pelotomaculum_Methanothermobacter_Syntrophy.yaml b/kb/communities/Pelotomaculum_Methanothermobacter_Syntrophy.yaml index 565b375f0..ccd5164d7 100644 --- a/kb/communities/Pelotomaculum_Methanothermobacter_Syntrophy.yaml +++ b/kb/communities/Pelotomaculum_Methanothermobacter_Syntrophy.yaml @@ -252,11 +252,8 @@ environmental_factors: was characterized explanation: Establishes syntrophic nature of the organism requiring close association with methanogenic partner -metals_present: -- GOLD -- IRON -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) growth_media: diff --git a/kb/communities/Phormidium_Alkaline_Consortium.yaml b/kb/communities/Phormidium_Alkaline_Consortium.yaml index aaeba0af4..73e61af01 100644 --- a/kb/communities/Phormidium_Alkaline_Consortium.yaml +++ b/kb/communities/Phormidium_Alkaline_Consortium.yaml @@ -525,7 +525,6 @@ environmental_factors: snippet: Genome information from each heterotrophic population was investigated for six ecological niches created by cyanobacterial metabolism and one niche for phototrophy explanation: Explains ecological basis for 4-year stability -metals_present: -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Polaromonas_Vanadium_Reduction_Community.yaml b/kb/communities/Polaromonas_Vanadium_Reduction_Community.yaml index 316979be9..357d5dea9 100644 --- a/kb/communities/Polaromonas_Vanadium_Reduction_Community.yaml +++ b/kb/communities/Polaromonas_Vanadium_Reduction_Community.yaml @@ -601,8 +601,7 @@ environmental_factors: metals_present: - CHROMIUM - IRON -- TITANIUM - VANADIUM metal_relevance: 
INCIDENTAL -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Prairie_Pothole_Wetland_Sulfur_Carbon_Virus_Community.yaml b/kb/communities/Prairie_Pothole_Wetland_Sulfur_Carbon_Virus_Community.yaml index d25394562..fe4a6bc2c 100644 --- a/kb/communities/Prairie_Pothole_Wetland_Sulfur_Carbon_Virus_Community.yaml +++ b/kb/communities/Prairie_Pothole_Wetland_Sulfur_Carbon_Virus_Community.yaml @@ -307,6 +307,69 @@ environmental_factors: snippet: high concentrations of sulfur species and dissolved organic carbon explanation: Supports sulfur and organic-carbon availability as key environmental factors. growth_media: [] +related_ingredients: +- preferred_term: sulfate + chebi_term: + id: CHEBI:16189 + label: sulfate + relevance: Sulfate is the terminal electron acceptor sustaining the + exceptionally high sulfate-reduction rates that define this Prairie + Pothole community; any environment-analog cultivation medium would need + sulfate as a primary anion at high concentration. + evidence: + - reference: PMID:30086797 + supports: SUPPORT + evidence_source: IN_VIVO + snippet: Sulfate reduction rates up to 22 μmol cm-3 day-1 have been + measured in these wetland sediments + explanation: Anchors sulfate as the dominant electron acceptor in PPR + sediments at measured high turnover rates. +- preferred_term: methanol + chebi_term: + id: CHEBI:17790 + label: methanol + relevance: Methanol is one of the electron donors the recovered sulfate-reducer + genomes are predicted to utilize, linking C1 methylated compounds to the + community's terminal sulfur metabolism. 
+ evidence: + - reference: PMID:30086797 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: electron donors, such as methanol and other alcohols, methylamines, + and glycine + explanation: Anchors methanol as a predicted electron donor for PPR sulfate + reducers from genome-resolved metabolism. +- preferred_term: ethanol + chebi_term: + id: CHEBI:16236 + label: ethanol + relevance: Ethanol is present at millimolar concentrations in PPR sediment + pore fluids and the authors hypothesize that alcohol-driven methanogenesis + via F420-dependent alcohol dehydrogenases is a major fraction of in situ + methane production. + evidence: + - reference: PMID:30086797 + supports: SUPPORT + evidence_source: IN_VIVO + snippet: millimolar concentrations of ethanol and 2-propanol in sediment + pore fluids + explanation: Anchors ethanol as a quantitatively abundant porewater + substrate in PPR sediments. +- preferred_term: propan-2-ol + chebi_term: + id: CHEBI:17824 + label: propan-2-ol + relevance: 2-Propanol is similarly present at millimolar concentrations in + porewater and is implicated alongside ethanol as a substrate driving + Methanofollis-affiliated methanogenesis in this community. + evidence: + - reference: PMID:30086797 + supports: SUPPORT + evidence_source: IN_VIVO + snippet: millimolar concentrations of ethanol and 2-propanol in sediment + pore fluids + explanation: Anchors 2-propanol as a quantitatively abundant porewater + substrate alongside ethanol. 
associated_datasets: - name: Prairie Pothole sulfur-cycling metagenomes dataset_type: METAGENOME diff --git a/kb/communities/Pseudonitzschia_Sulfitobacter_Association.yaml b/kb/communities/Pseudonitzschia_Sulfitobacter_Association.yaml index d36c9e7b5..a4d874b49 100644 --- a/kb/communities/Pseudonitzschia_Sulfitobacter_Association.yaml +++ b/kb/communities/Pseudonitzschia_Sulfitobacter_Association.yaml @@ -294,7 +294,6 @@ growth_media: explanation: Establishes marine diatom culture conditions for diatom-bacteria association culturemech_id: CultureMech:000149 culturemech_url: https://github.com/CultureBotAI/CultureMech/tree/main/kb/media/CultureMech:000149 -metals_present: -- GOLD -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Rammelsberg_Cobalt_Nickel_Tailings.yaml b/kb/communities/Rammelsberg_Cobalt_Nickel_Tailings.yaml index 9e5663c13..b46a76633 100644 --- a/kb/communities/Rammelsberg_Cobalt_Nickel_Tailings.yaml +++ b/kb/communities/Rammelsberg_Cobalt_Nickel_Tailings.yaml @@ -494,7 +494,6 @@ metals_present: - COPPER - IRON - NICKEL -- TITANIUM metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Rice_Duckweed_Bacillus_SynCom.yaml b/kb/communities/Rice_Duckweed_Bacillus_SynCom.yaml index 56370c969..e33e4750f 100644 --- a/kb/communities/Rice_Duckweed_Bacillus_SynCom.yaml +++ b/kb/communities/Rice_Duckweed_Bacillus_SynCom.yaml @@ -176,7 +176,6 @@ environmental_factors: - name: Growth Conditions value: Greenhouse description: Greenhouse conditions for SynCom efficacy evaluation. 
-metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Richmond_Mine_AMD_Biofilm.yaml b/kb/communities/Richmond_Mine_AMD_Biofilm.yaml index 000398a90..47ec94f55 100644 --- a/kb/communities/Richmond_Mine_AMD_Biofilm.yaml +++ b/kb/communities/Richmond_Mine_AMD_Biofilm.yaml @@ -683,10 +683,9 @@ environmental_factors: metals_present: - COPPER - IRON -- TITANIUM - ZINC metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via keyword matching in description (context-validated) via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) growth_media: diff --git a/kb/communities/Rifle_Aquifer_Bioanode_EET_Community.yaml b/kb/communities/Rifle_Aquifer_Bioanode_EET_Community.yaml index 8e2a7ee64..e6887080d 100644 --- a/kb/communities/Rifle_Aquifer_Bioanode_EET_Community.yaml +++ b/kb/communities/Rifle_Aquifer_Bioanode_EET_Community.yaml @@ -231,9 +231,8 @@ external_resources: url: https://doi.org/10.3389/fmicb.2020.01694 description: DOI link to the Frontiers in Microbiology paper. associated_datasets: [] -metals_present: -- IRON +metals_present: [] rare_earth_elements_present: [] -metal_relevance: SIGNIFICANT +metal_relevance: NOT_APPLICABLE metal_notes: Anode potentials mimic iron-oxide mineral redox; the EET-capable organisms may mediate mineral redox transformations in the Rifle aquifer. 
diff --git a/kb/communities/Rifle_Uranium_Reducing_Community.yaml b/kb/communities/Rifle_Uranium_Reducing_Community.yaml index a80596c20..37412c640 100644 --- a/kb/communities/Rifle_Uranium_Reducing_Community.yaml +++ b/kb/communities/Rifle_Uranium_Reducing_Community.yaml @@ -617,8 +617,7 @@ environmental_factors: explanation: Quantifies time to achieve treatment goals metals_present: - IRON -- TITANIUM - URANIUM metal_relevance: SIGNIFICANT -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Salar_Atacama_Lithium_Brine_Community.yaml b/kb/communities/Salar_Atacama_Lithium_Brine_Community.yaml index ae09d17d5..cadfb86fb 100644 --- a/kb/communities/Salar_Atacama_Lithium_Brine_Community.yaml +++ b/kb/communities/Salar_Atacama_Lithium_Brine_Community.yaml @@ -505,9 +505,7 @@ environmental_factors: snippet: dominated by microorganisms; however, little is known about the microbes present in the brines associated with this economically important mining process explanation: Documents salar size and status as largest lithium reserve metals_present: -- IRON - LITHIUM -- TITANIUM metal_relevance: SIGNIFICANT -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Sorghum_SRC1_Subset.yaml b/kb/communities/Sorghum_SRC1_Subset.yaml index 
7c2aa6821..c07426076 100644 --- a/kb/communities/Sorghum_SRC1_Subset.yaml +++ b/kb/communities/Sorghum_SRC1_Subset.yaml @@ -332,7 +332,6 @@ environmental_factors: producing growth benefits ' -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Soybean_N_Fixation_sfSynCom.yaml b/kb/communities/Soybean_N_Fixation_sfSynCom.yaml index cd956a0cf..4e400d9b7 100644 --- a/kb/communities/Soybean_N_Fixation_sfSynCom.yaml +++ b/kb/communities/Soybean_N_Fixation_sfSynCom.yaml @@ -445,7 +445,6 @@ environmental_factors: dependence on synthetic nitrogen fertilizers ' -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Stordalen_Mire_Methylotrophic_Methanogenesis_Community.yaml b/kb/communities/Stordalen_Mire_Methylotrophic_Methanogenesis_Community.yaml index 41922628e..9dce07d87 100644 --- a/kb/communities/Stordalen_Mire_Methylotrophic_Methanogenesis_Community.yaml +++ b/kb/communities/Stordalen_Mire_Methylotrophic_Methanogenesis_Community.yaml @@ -253,6 +253,52 @@ environmental_factors: snippet: wetlands are major sources of biogenic methane explanation: Supports methane emissions as the climate-relevant context for this wetland community. growth_media: [] +related_ingredients: +- preferred_term: methanol + chebi_term: + id: CHEBI:17790 + label: methanol + relevance: Methanol is the dominant methylated oxygen substrate exploited by + Stordalen Mire methanogens (Methanosarcinales, Methanobacteriales) and by + bacterial methylotrophs across the thaw gradient; any cultivation medium + designed around this community would need methanol as a primary C1 substrate. 
+ evidence: + - reference: PMID:38063415 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: the use of methylated oxygen compounds (e.g., methanol) + explanation: Anchors methanol as the headline methylated substrate routed + through Stordalen methanogens in the Ellenbogen et al. multi-omics study. +- preferred_term: methylamines + chebi_term: + id: CHEBI:74807 + label: methylamine + relevance: Methylamines are the primary substrates implicated for + Methanomassiliicoccales in this community and define one of the two main + methylotrophic methanogenesis routes the paper documents at Stordalen. + evidence: + - reference: PMID:38063415 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: primarily implicated methyl sulfides and methylamines + explanation: Anchors methylamines as one of the two primary methylated + substrate classes driving methanogenesis in the Mire. +- preferred_term: acetate + chebi_term: + id: CHEBI:30089 + label: acetate + relevance: Bacterial methylotrophs at Stordalen produce acetate as a + by-product that then feeds acetoclastic methanogenesis in the fen and bog + stages of the thaw gradient, making acetate an environmentally pivotal + intermediate for any cultivation medium targeting the community. + evidence: + - reference: PMID:38063415 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: some methylotrophic bacteria are known to produce methanogenic + by-products like acetate + explanation: Anchors acetate as a methylotrophy-derived by-product feeding + into Stordalen methanogenesis. 
associated_datasets: [] external_resources: - name: Primary publication for Stordalen Mire methylotrophic methanogenesis diff --git a/kb/communities/Synechococcus_Ecoli_SPC.yaml b/kb/communities/Synechococcus_Ecoli_SPC.yaml index 978260807..a64833535 100644 --- a/kb/communities/Synechococcus_Ecoli_SPC.yaml +++ b/kb/communities/Synechococcus_Ecoli_SPC.yaml @@ -184,7 +184,6 @@ environmental_factors: snippet: We previously engineered a model cyanobacterium, Synechococcus elongatus PCC 7942, to secrete the bulk of the carbon it fixes as sucrose, a carbohydrate that can be utilized by many other microbes explanation: IPTG induction is required for CscB expression and sucrose export -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Syntrophobacter_Methanobacterium_Syntrophy.yaml b/kb/communities/Syntrophobacter_Methanobacterium_Syntrophy.yaml index 78c92f045..6554580d7 100644 --- a/kb/communities/Syntrophobacter_Methanobacterium_Syntrophy.yaml +++ b/kb/communities/Syntrophobacter_Methanobacterium_Syntrophy.yaml @@ -225,9 +225,7 @@ environmental_factors: snippet: A syntrophic propionate-oxidizing bacterium, strain MPOBT, was isolated from a culture enriched from anaerobic granular sludge explanation: Establishes ecological origin and relevance to anaerobic waste treatment processes -metals_present: -- IRON -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Syntrophobacter_Methanospirillum_Syntrophy.yaml b/kb/communities/Syntrophobacter_Methanospirillum_Syntrophy.yaml index d29d868be..6836b8289 100644 --- a/kb/communities/Syntrophobacter_Methanospirillum_Syntrophy.yaml +++ 
b/kb/communities/Syntrophobacter_Methanospirillum_Syntrophy.yaml @@ -225,10 +225,8 @@ environmental_factors: snippet: A syntrophic propionate-oxidizing bacterium, strain MPOBT, was isolated from a culture enriched from anaerobic granular sludge explanation: Establishes ecological origin and relevance to anaerobic waste treatment processes -metals_present: -- IRON -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) growth_media: diff --git a/kb/communities/Syntrophomonas_Methanospirillum_Syntrophy.yaml b/kb/communities/Syntrophomonas_Methanospirillum_Syntrophy.yaml index 14e6b8c26..5d6a61c74 100644 --- a/kb/communities/Syntrophomonas_Methanospirillum_Syntrophy.yaml +++ b/kb/communities/Syntrophomonas_Methanospirillum_Syntrophy.yaml @@ -195,10 +195,8 @@ environmental_factors: evidence_source: IN_VITRO snippet: The most rapid generation time obtained by cocultures of S explanation: Quantifies growth rate of the syntrophic consortium -metals_present: -- IRON -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) growth_media: diff --git a/kb/communities/Syntrophus_Benzoate_Degrader.yaml b/kb/communities/Syntrophus_Benzoate_Degrader.yaml index fe3905e78..3922f19f7 100644 --- a/kb/communities/Syntrophus_Benzoate_Degrader.yaml +++ b/kb/communities/Syntrophus_Benzoate_Degrader.yaml @@ -225,9 +225,7 @@ environmental_factors: snippet: Syntrophic benzoate degradation is important for bioremediation of aromatic contaminants in anaerobic environments explanation: Establishes bioremediation relevance -metals_present: -- IRON -- TITANIUM -metal_relevance: SIGNIFICANT +metals_present: [] +metal_relevance: 
NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Thermophilic_Pyrite_QS_Consortium.yaml b/kb/communities/Thermophilic_Pyrite_QS_Consortium.yaml index 3ada69ab8..b2323ce99 100644 --- a/kb/communities/Thermophilic_Pyrite_QS_Consortium.yaml +++ b/kb/communities/Thermophilic_Pyrite_QS_Consortium.yaml @@ -849,7 +849,6 @@ environmental_factors: metals_present: - COPPER - IRON -- TITANIUM metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via keyword matching +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via keyword matching in description (context-validated) in description (context-validated) diff --git a/kb/communities/Tinto_River_Iron_Cycling_Community.yaml b/kb/communities/Tinto_River_Iron_Cycling_Community.yaml index 3f465d132..4ab75984f 100644 --- a/kb/communities/Tinto_River_Iron_Cycling_Community.yaml +++ b/kb/communities/Tinto_River_Iron_Cycling_Community.yaml @@ -426,7 +426,6 @@ environmental_factors: explanation: Documents low prokaryotic but high eukaryotic diversity metals_present: - IRON -- TITANIUM metal_relevance: PRIMARY -metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor +metal_notes: Metal/REE detected via CHEBI terms in metabolites; Metal/REE detected via environmental factor measurements; Metal/REE detected via keyword matching in description (context-validated) measurements; Metal/REE detected via keyword matching in description (context-validated) diff --git a/kb/communities/Trichoderma_Lactate_Platform.yaml b/kb/communities/Trichoderma_Lactate_Platform.yaml index debf589fd..743d88438 100644 --- a/kb/communities/Trichoderma_Lactate_Platform.yaml +++ b/kb/communities/Trichoderma_Lactate_Platform.yaml @@ -210,7 +210,6 @@ environmental_factors: food chain ' 
-metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Trichodesmium_Alteromonas_Marine_Consortium.yaml b/kb/communities/Trichodesmium_Alteromonas_Marine_Consortium.yaml index 9d300411c..697e81ed4 100644 --- a/kb/communities/Trichodesmium_Alteromonas_Marine_Consortium.yaml +++ b/kb/communities/Trichodesmium_Alteromonas_Marine_Consortium.yaml @@ -204,5 +204,5 @@ associated_datasets: snippet: Metatranscriptome sequencing was performed on Trichodesmium colonies explanation: Supports the metatranscriptomic dataset association. metal_relevance: SIGNIFICANT -metal_notes: Iron acquisition is one inferred interaction axis in Trichodesmium-associated bacterial consortia. +metal_notes: Metal/REE detected via CHEBI terms in metabolites diff --git a/kb/communities/Wetland_Oxygen_Sulfate_GHG_Microcosm_Community.yaml b/kb/communities/Wetland_Oxygen_Sulfate_GHG_Microcosm_Community.yaml index ada4e64c7..e2ae4851c 100644 --- a/kb/communities/Wetland_Oxygen_Sulfate_GHG_Microcosm_Community.yaml +++ b/kb/communities/Wetland_Oxygen_Sulfate_GHG_Microcosm_Community.yaml @@ -256,6 +256,72 @@ environmental_factors: snippet: microbial methane (CH4) and carbon dioxide (CO2) emissions explanation: Supports methane and carbon dioxide emissions as measured community outputs. growth_media: [] +related_ingredients: +- preferred_term: sulfate + chebi_term: + id: CHEBI:16189 + label: sulfate + relevance: Sulfate is one of the two explicitly manipulated stressors in + this microcosm study; elevated sulfate reshapes the community's CH4/CO2 + partitioning and selectively suppresses hydrogenotrophic methanogenesis, + so an environment-analog medium must support sulfate concentration as a + primary independent variable. 
+ evidence: + - reference: PMID:38961111 + supports: SUPPORT + evidence_source: IN_VIVO + snippet: Elevated SO42- reduced CH4 emissions, with hydrogenotrophic + methanogenesis more suppressed than acetoclastic + explanation: Anchors sulfate as the central electron-acceptor stressor + driving the methanogenic response in this microcosm community. +- preferred_term: oxygen + chebi_term: + id: CHEBI:15379 + label: dioxygen + relevance: Oxygen exposure (e.g., drought-induced) is the second manipulated + stressor and shifts the community's greenhouse-gas output from methane to + carbon dioxide; any environment-analog cultivation medium would need + controllable oxygen tension rather than strict anoxia. + evidence: + - reference: PMID:38961111 + supports: SUPPORT + evidence_source: IN_VIVO + snippet: Elevated O2 shifted the greenhouse gas emissions from CH4 to CO2 + explanation: Anchors O2 as a controllable stressor reshaping the community's + carbon-cycling output. +- preferred_term: lactate + chebi_term: + id: CHEBI:24996 + label: lactate + relevance: Lactate is the upstream organic substrate the stoichiometric + model identifies as feeding the SO4-coupled subnetwork that produces + acetate, H2S, and CO2 under combined oxic/sulfate conditions; a + cultivation medium for the SO4-stressed subcommunity would need lactate + as the primary carbon source. + evidence: + - reference: PMID:38961111 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: converts lactate and SO42- to produce acetate, H2S, and CO2 when + SO42- is elevated under oxic conditions + explanation: Anchors lactate as the headline organic substrate in the + SO4-coupled metabolic subnetwork identified by stoichiometric modeling. 
+- preferred_term: hydrogen sulfide + chebi_term: + id: CHEBI:16136 + label: hydrogen sulfide + relevance: H2S is the headline reduced-sulfur product of the SO4-coupled + subnetwork and is the metabolite that closes the sulfur cycle in this + community; any medium designed around the SO4-amended subcommunity should + accommodate H2S accumulation or trapping. + evidence: + - reference: PMID:38961111 + supports: SUPPORT + evidence_source: COMPUTATIONAL + snippet: converts lactate and SO42- to produce acetate, H2S, and CO2 when + SO42- is elevated under oxic conditions + explanation: Anchors H2S as a key product of the SO4-coupled subnetwork + central to this microcosm community. associated_datasets: - name: Wetland oxygen-sulfate metagenome BioProject dataset_type: METAGENOME diff --git a/kb/communities/Wheat_Consortium_C1.yaml b/kb/communities/Wheat_Consortium_C1.yaml index f60042296..20456c25b 100644 --- a/kb/communities/Wheat_Consortium_C1.yaml +++ b/kb/communities/Wheat_Consortium_C1.yaml @@ -142,7 +142,6 @@ environmental_factors: resistance in the host plant ' -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/kb/communities/Wheat_Consortium_C6.yaml b/kb/communities/Wheat_Consortium_C6.yaml index abce61ddb..c7efc1392 100644 --- a/kb/communities/Wheat_Consortium_C6.yaml +++ b/kb/communities/Wheat_Consortium_C6.yaml @@ -149,7 +149,6 @@ environmental_factors: agriculture systems ' -metals_present: -- TITANIUM -metal_relevance: INCIDENTAL +metals_present: [] +metal_relevance: NOT_APPLICABLE metal_notes: Metal/REE detected via environmental factor measurements diff --git a/pyproject.toml b/pyproject.toml index 6ce3373fc..f6f77b863 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,9 +74,19 @@ target-version = "py310" extend-exclude = ["src/communitymech/datamodel/communitymech.py"] [tool.ruff.lint] -select = ["E", "F", 
"I", "UP", "N", "S", "B", "A", "C4", "T20", "SIM"] +# T20 (print) is deliberately omitted: src/communitymech/ ships CLI +# entry points (cli.py, render_*, export/*, embedding/*, validators) +# that legitimately use print for user output, and tagging each call +# with `# noqa: T201` is louder than the rule is worth here. +select = ["E", "F", "I", "UP", "N", "S", "B", "A", "C4", "SIM"] ignore = ["S101", "S603", "S607"] +[tool.ruff.lint.per-file-ignores] +# E501 (line-too-long) is suppressed in prompt-template modules because +# the strings are LLM prompt content; wrapping or splitting them would +# alter the rendered prompt. +"src/communitymech/llm/prompts.py" = ["E501"] + [tool.pytest.ini_options] testpaths = ["tests"] python_files = ["test_*.py"] diff --git a/scripts/clean_metals_inplace.py b/scripts/clean_metals_inplace.py new file mode 100644 index 000000000..33fa5a230 --- /dev/null +++ b/scripts/clean_metals_inplace.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +"""Non-destructive cleanup of metals_present / rare_earth_elements_present blocks. + +Runs `extract_metals_from_community` against every community YAML and, when +the extracted lists differ from what's written on disk, rewrites only the +relevant blocks via line-based regex substitution. Comments, blank lines, +key order, and unrelated whitespace are preserved (unlike pyyaml's +dump-and-rewrite path used by `backfill_metals.py`). + +Usage: + PYTHONPATH=src uv run python scripts/clean_metals_inplace.py --dry-run + PYTHONPATH=src uv run python scripts/clean_metals_inplace.py +""" + +import argparse +import re +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from communitymech.metal_extraction import extract_metals_from_community + + +def _read_block(text: str, key: str) -> tuple[int, int, list[str]]: + """Locate `{key}:` block and return (start_line, end_line, current_values). + + Returns (-1, -1, []) if the key is not present. 
+ """ + lines = text.splitlines(keepends=True) + start = None + for i, line in enumerate(lines): + if line.rstrip("\n") == f"{key}:" or line.startswith(f"{key}: "): + start = i + break + if start is None: + return -1, -1, [] + inline = lines[start].rstrip("\n").removeprefix(f"{key}:").strip() + if inline and inline != "": + return start, start + 1, [v.strip() for v in inline.strip("[]").split(",") if v.strip()] + end = start + 1 + values: list[str] = [] + while end < len(lines): + if lines[end].startswith("- "): + values.append(lines[end][2:].strip()) + end += 1 + elif lines[end].strip() == "": + end += 1 + else: + break + return start, end, values + + +def _format_list_block(key: str, values: list[str]) -> str: + if not values: + return f"{key}: []\n" + body = "\n".join(f"- {v}" for v in values) + return f"{key}:\n{body}\n" + + +def _replace_block(text: str, key: str, values: list[str]) -> str: + start, end, _ = _read_block(text, key) + if start == -1: + return text + lines = text.splitlines(keepends=True) + new_block = _format_list_block(key, values) + return "".join(lines[:start]) + new_block + "".join(lines[end:]) + + +def _replace_scalar(text: str, key: str, value: str) -> str: + pattern = re.compile(rf"^{re.escape(key)}:.*$", re.MULTILINE) + if pattern.search(text): + return pattern.sub(f"{key}: {value}", text, count=1) + return text + f"{key}: {value}\n" + + +def clean_file(path: Path, dry_run: bool) -> tuple[bool, str]: + metals, ree, relevance, notes = extract_metals_from_community(path) + text = path.read_text() + + _, _, current_metals = _read_block(text, "metals_present") + _, _, current_ree = _read_block(text, "rare_earth_elements_present") + + diff_metals = sorted(current_metals) != sorted(metals) + diff_ree = sorted(current_ree) != sorted(ree) + if not (diff_metals or diff_ree): + return False, "" + + new_text = text + new_text = _replace_block(new_text, "metals_present", sorted(metals)) + new_text = _replace_block(new_text, 
"rare_earth_elements_present", sorted(ree)) + new_text = _replace_scalar(new_text, "metal_relevance", relevance) + if notes: + new_text = _replace_scalar(new_text, "metal_notes", notes) + + summary = ( + f" metals: {sorted(current_metals)} -> {sorted(metals)}\n" + f" ree: {sorted(current_ree)} -> {sorted(ree)}" + ) + if not dry_run: + path.write_text(new_text) + return True, summary + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args() + + community_dir = Path("kb/communities") + files = sorted(community_dir.glob("*.yaml")) + changed = 0 + for f in files: + did_change, summary = clean_file(f, dry_run=args.dry_run) + if did_change: + changed += 1 + print(f"{f.name}") + print(summary) + verb = "would change" if args.dry_run else "changed" + print(f"\n{verb} {changed}/{len(files)} files") + + +if __name__ == "__main__": + main() diff --git a/scripts/validate_cross_repo_ids.py b/scripts/validate_cross_repo_ids.py new file mode 100644 index 000000000..443b289f2 --- /dev/null +++ b/scripts/validate_cross_repo_ids.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +"""CLI: validate cross-repo IDs (CultureMech, MediaIngredientMech) in one or more community YAMLs. + +Pattern checks always run. Existence checks run only when sibling-repo +paths are supplied via flags or via the COMMUNITYMECH_SIBLING_REPOS +environment variable (comma-separated `Name=path` pairs). 
+ +Usage: + PYTHONPATH=src uv run python scripts/validate_cross_repo_ids.py kb/communities/X.yaml + PYTHONPATH=src uv run python scripts/validate_cross_repo_ids.py kb/communities/X.yaml \\ + --culturemech ../CultureMech/kb/media \\ + --mediaingredientmech ../MediaIngredientMech/kb/ingredients + PYTHONPATH=src uv run python scripts/validate_cross_repo_ids.py kb/communities/*.yaml +""" + +import argparse +import os +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from communitymech.validators.cross_repo_ids import validate_cross_repo_ids + + +def _sibling_repos_from_env() -> dict[str, Path]: + raw = os.environ.get("COMMUNITYMECH_SIBLING_REPOS", "").strip() + if not raw: + return {} + out: dict[str, Path] = {} + for pair in raw.split(","): + if "=" not in pair: + continue + name, path = pair.split("=", 1) + out[name.strip()] = Path(path.strip()) + return out + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument("yaml_paths", nargs="+", type=Path) + parser.add_argument("--culturemech", type=Path, help="Path to CultureMech kb/ dir") + parser.add_argument( + "--mediaingredientmech", type=Path, help="Path to MediaIngredientMech kb/ dir" + ) + args = parser.parse_args() + + sibling_repos = _sibling_repos_from_env() + if args.culturemech is not None: + sibling_repos["CultureMech"] = args.culturemech + if args.mediaingredientmech is not None: + sibling_repos["MediaIngredientMech"] = args.mediaingredientmech + + total_errors = 0 + for yaml_path in args.yaml_paths: + if not yaml_path.exists(): + print(f"[skipped] {yaml_path}: file not found") + continue + issues = validate_cross_repo_ids(yaml_path, sibling_repos=sibling_repos) + errors = [i for i in issues if i.severity == "error"] + if not issues: + print(f"[ok] {yaml_path}") + continue + print(f"[{len(errors)} error(s)] {yaml_path}") + for issue in issues: + print(f" {issue}") + total_errors += len(errors) 
+ + return 1 if total_errors else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/communitymech/cli.py b/src/communitymech/cli.py index e77b37e70..baf39d91f 100644 --- a/src/communitymech/cli.py +++ b/src/communitymech/cli.py @@ -13,7 +13,6 @@ try: from rich.console import Console from rich.panel import Panel - from rich.progress import Progress, SpinnerColumn, TextColumn from rich.prompt import Confirm from rich.syntax import Syntax from rich.table import Table @@ -156,10 +155,7 @@ def repair_network(file: Path, auto_approve: bool, dry_run: bool, max_repairs: i sys.exit(1) # Initialize console - if RICH_AVAILABLE: - console = Console() - else: - console = None + console = Console() if RICH_AVAILABLE else None try: # Initialize repairer @@ -478,7 +474,8 @@ def _generate_batch_report(output_path: Path, max_communities: int, max_issues: console.print(f"1. Review the report: {result['report_path']}") console.print("2. Set 'approved: true' for suggestions you want to apply") console.print( - f"3. Apply approved: communitymech repair-network-batch --apply-from {result['report_path']}" + "3. Apply approved: communitymech repair-network-batch " + f"--apply-from {result['report_path']}" ) console.print() diff --git a/src/communitymech/embedding/loader.py b/src/communitymech/embedding/loader.py index 32d5efeeb..b14791b18 100644 --- a/src/communitymech/embedding/loader.py +++ b/src/communitymech/embedding/loader.py @@ -47,8 +47,11 @@ def load_embeddings( # Try loading from cache if not force_reload and cache_path.exists(): print(f"📦 Loading embeddings from cache: {cache_path}") + # S301: cache file is written by this same module to a path + # under self.cache_dir (a developer-controlled location); never + # loaded from an untrusted source. 
with open(cache_path, "rb") as f: - embeddings = pickle.load(f) + embeddings = pickle.load(f) # noqa: S301 print(f"✅ Loaded {len(embeddings):,} embeddings from cache") return embeddings diff --git a/src/communitymech/literature.py b/src/communitymech/literature.py index 6b03777d2..57fdf99da 100644 --- a/src/communitymech/literature.py +++ b/src/communitymech/literature.py @@ -561,10 +561,7 @@ def validate_evidence_snippet(self, snippet: str, abstract: str) -> bool: ratio = SequenceMatcher( None, snippet_normalized.lower(), abstract_normalized.lower() ).ratio() - if ratio > 0.95: - return True - - return False + return ratio > 0.95 def main(): diff --git a/src/communitymech/llm/anthropic_client.py b/src/communitymech/llm/anthropic_client.py index 6ac6f148a..ad90da374 100644 --- a/src/communitymech/llm/anthropic_client.py +++ b/src/communitymech/llm/anthropic_client.py @@ -156,7 +156,7 @@ def generate_suggestion( try: formatted_prompt = prompt.format(**context) except KeyError as e: - raise ValueError(f"Missing context key for prompt: {e}") + raise ValueError(f"Missing context key for prompt: {e}") from e # Make API call try: @@ -186,9 +186,9 @@ def generate_suggestion( return suggestion except anthropic.APIError as e: - raise RuntimeError(f"Anthropic API error: {e}") + raise RuntimeError(f"Anthropic API error: {e}") from e except Exception as e: - raise RuntimeError(f"Error generating suggestion: {e}") + raise RuntimeError(f"Error generating suggestion: {e}") from e def _parse_yaml_response(self, response_text: str) -> dict[str, Any]: """ @@ -232,7 +232,7 @@ def _parse_yaml_response(self, response_text: str) -> dict[str, Any]: raise ValueError(f"Expected dict, got {type(parsed)}") return parsed except yaml.YAMLError as e: - raise ValueError(f"Failed to parse YAML: {e}\n\nContent:\n{yaml_content}") + raise ValueError(f"Failed to parse YAML: {e}\n\nContent:\n{yaml_content}") from e def get_cost_estimate(self) -> dict[str, Any]: """ diff --git 
a/src/communitymech/metal_extraction.py b/src/communitymech/metal_extraction.py index f3fae9b6d..21e1dd8a4 100644 --- a/src/communitymech/metal_extraction.py +++ b/src/communitymech/metal_extraction.py @@ -7,6 +7,7 @@ 3. Description keyword matching with context validation """ +import re from pathlib import Path import yaml @@ -98,6 +99,19 @@ METAL_KEYWORDS_FLAT = [kw for keywords in METAL_KEYWORDS.values() for kw in keywords] REE_KEYWORDS_FLAT = [kw for keywords in REE_KEYWORDS.values() for kw in keywords] + +def _keyword_in_text(keyword: str, text: str) -> bool: + """Return True if keyword occurs in text as a standalone token. + + Plain substring matching falsely fires when short element symbols like + 'ti' or 'au' appear inside unrelated words ('characteristic', + 'Australia'). Anchor on non-alphanumeric boundaries (so 'au3+' still + matches; '+' is non-alphanumeric). Case-insensitive. + """ + pattern = rf"(? tuple[list[str], list[str] # Combine notes notes = "; ".join(notes_parts) if notes_parts else "" - return sorted(list(metals)), sorted(list(ree)), relevance, notes + return sorted(metals), sorted(ree), relevance, notes def _extract_from_chebi_terms(data: dict) -> tuple[set[str], set[str]]: @@ -201,12 +215,12 @@ def _extract_from_environmental_factors(data: dict) -> tuple[set[str], set[str]] # Check for metal keywords for metal, keywords in METAL_KEYWORDS.items(): - if any(kw in name for kw in keywords): + if any(_keyword_in_text(kw, name) for kw in keywords): metals.add(metal) # Check for REE keywords for element, keywords in REE_KEYWORDS.items(): - if any(kw in name for kw in keywords): + if any(_keyword_in_text(kw, name) for kw in keywords): ree.add(element) return metals, ree @@ -231,22 +245,22 @@ def _extract_from_description(data: dict) -> tuple[set[str], set[str], str]: search_text = " ".join(text_parts).lower() # Check for strong evidence context - has_strong_context = any(keyword in search_text for keyword in STRONG_CONTEXT_KEYWORDS) + 
has_strong_context = any(_keyword_in_text(kw, search_text) for kw in STRONG_CONTEXT_KEYWORDS) if not has_strong_context: return metals, ree, notes # Only extract if strong context is present for metal, keywords in METAL_KEYWORDS.items(): - if any(kw in search_text for kw in keywords): + if any(_keyword_in_text(kw, search_text) for kw in keywords): metals.add(metal) for element, keywords in REE_KEYWORDS.items(): - if any(kw in search_text for kw in keywords): + if any(_keyword_in_text(kw, search_text) for kw in keywords): ree.add(element) # Check for generic REE mentions - if any(kw in search_text for kw in GENERIC_REE_KEYWORDS): + if any(_keyword_in_text(kw, search_text) for kw in GENERIC_REE_KEYWORDS): notes = "Generic REE mention detected in description - manual curation recommended" if metals or ree: @@ -266,9 +280,10 @@ def _compute_relevance(data: dict, metals: set[str], ree: set[str]) -> str: return "PRIMARY" # Check for explicit biomining/bioleaching mentions - if any(kw in description for kw in ["biomining", "bioleaching", "metal extraction"]): - if metals or ree: - return "PRIMARY" + if (metals or ree) and any( + kw in description for kw in ["biomining", "bioleaching", "metal extraction"] + ): + return "PRIMARY" # SIGNIFICANT: Metal/REE plays an important but not primary role if (metals or ree) and category not in ["RHIZOSPHERE", "LIGNOCELLULOSE", "OTHER"]: diff --git a/src/communitymech/network/auditor.py b/src/communitymech/network/auditor.py index 737c8bf63..03822db25 100644 --- a/src/communitymech/network/auditor.py +++ b/src/communitymech/network/auditor.py @@ -137,7 +137,9 @@ def audit_community(self, yaml_path: Path) -> list[dict]: "interaction": int_name, "interaction_index": idx, "taxon": source_term, - "message": f"Source taxon '{source_term}' not found in taxonomy section", + "message": ( + f"Source taxon '{source_term}' not found in " "taxonomy section" + ), } ) else: @@ -154,7 +156,10 @@ def audit_community(self, yaml_path: Path) -> 
list[dict]: "role": "source", "expected_id": expected_id, "actual_id": source_id, - "message": f"Source '{source_term}' has ID {source_id}, expected {expected_id}", + "message": ( + f"Source '{source_term}' has ID {source_id}, " + f"expected {expected_id}" + ), } ) @@ -172,7 +177,9 @@ def audit_community(self, yaml_path: Path) -> list[dict]: "interaction": int_name, "interaction_index": idx, "taxon": target_term, - "message": f"Target taxon '{target_term}' not found in taxonomy section", + "message": ( + f"Target taxon '{target_term}' not found in " "taxonomy section" + ), } ) else: @@ -189,7 +196,10 @@ def audit_community(self, yaml_path: Path) -> list[dict]: "role": "target", "expected_id": expected_id, "actual_id": target_id, - "message": f"Target '{target_term}' has ID {target_id}, expected {expected_id}", + "message": ( + f"Target '{target_term}' has ID {target_id}, " + f"expected {expected_id}" + ), } ) diff --git a/src/communitymech/network/batch_reporter.py b/src/communitymech/network/batch_reporter.py index 5bbac604d..154bc51ba 100644 --- a/src/communitymech/network/batch_reporter.py +++ b/src/communitymech/network/batch_reporter.py @@ -8,13 +8,13 @@ import yaml -logger = logging.getLogger(__name__) - from communitymech.llm.anthropic_client import AnthropicClient from communitymech.network.auditor import NetworkIntegrityAuditor from communitymech.network.repair_strategies import StrategySelector from communitymech.network.validators import SuggestionValidator +logger = logging.getLogger(__name__) + class BatchReporter: """Generate repair suggestion reports for offline review.""" diff --git a/src/communitymech/network/llm_repair.py b/src/communitymech/network/llm_repair.py index 4dd2b9466..9078e0171 100644 --- a/src/communitymech/network/llm_repair.py +++ b/src/communitymech/network/llm_repair.py @@ -244,7 +244,7 @@ def _apply_suggestion( # Restore from backup on failure if backup_path.exists(): shutil.copy(backup_path, yaml_path) - raise 
RuntimeError(f"Failed to apply suggestion: {e}") + raise RuntimeError(f"Failed to apply suggestion: {e}") from e def _create_backup(self, yaml_path: Path) -> Path: """ diff --git a/src/communitymech/network/validators.py b/src/communitymech/network/validators.py index d25bd3c20..7fdaf4425 100644 --- a/src/communitymech/network/validators.py +++ b/src/communitymech/network/validators.py @@ -184,7 +184,10 @@ def validate_schema(self, suggestion: dict[str, Any]) -> list[ValidationError]: ValidationError( layer="schema", field=f"suggested_interactions[{idx}].interaction_type", - message=f"Invalid interaction type. Must be one of: {', '.join(_INTERACTION_TYPE_VALUES)}", + message=( + "Invalid interaction type. Must be one of: " + f"{', '.join(_INTERACTION_TYPE_VALUES)}" + ), severity="error", ) ) @@ -274,7 +277,10 @@ def _validate_evidence_item( ValidationError( layer="schema", field=f"{field_path}.supports", - message=f"Invalid value for 'supports'. Must be one of: {', '.join(_SUPPORTS_VALUES)}", + message=( + "Invalid value for 'supports'. Must be one of: " + f"{', '.join(_SUPPORTS_VALUES)}" + ), severity="error", ) ) @@ -284,7 +290,10 @@ def _validate_evidence_item( ValidationError( layer="schema", field=f"{field_path}.evidence_source", - message=f"Invalid value for 'evidence_source'. Must be one of: {', '.join(_EVIDENCE_SOURCE_VALUES)}", + message=( + "Invalid value for 'evidence_source'. 
Must be one of: " + f"{', '.join(_EVIDENCE_SOURCE_VALUES)}" + ), severity="error", ) ) @@ -433,7 +442,10 @@ def validate_evidence(self, suggestion: dict[str, Any]) -> list[ValidationError] ValidationError( layer="evidence", field=f"suggested_interactions[{idx}].evidence[{ev_idx}].snippet", - message=f"Snippet does not match abstract (< {self.min_snippet_match_score*100}% similarity)", + message=( + "Snippet does not match abstract " + f"(< {self.min_snippet_match_score*100}% similarity)" + ), severity="error", ) ) @@ -536,7 +548,10 @@ def check_biological_plausibility( ValidationError( layer="plausibility", field=f"suggested_interactions[{idx}].metabolites_exchanged", - message=f"{interaction_type} interaction typically involves metabolite exchange", + message=( + f"{interaction_type} interaction typically involves " + "metabolite exchange" + ), severity="warning", ) ) diff --git a/src/communitymech/render_community_pages.py b/src/communitymech/render_community_pages.py index 4a8a6b100..811899af9 100644 --- a/src/communitymech/render_community_pages.py +++ b/src/communitymech/render_community_pages.py @@ -81,7 +81,10 @@ def safe_mermaid(value: str) -> Markup: s = s[len("```mermaid") :].lstrip() if s.endswith("```"): s = s[:-3].rstrip() - return Markup(f'
\n{s}\n
') + # S704: input `s` is the Mermaid diagram body written by curators in + # community YAML, not user-supplied at runtime; rendering it as-is is + # required so Mermaid can render the diagram. Treat as trusted. + return Markup(f'
\n{s}\n
') # noqa: S704 def make_env() -> Environment: @@ -108,9 +111,8 @@ def render_one( return "error:no-id", None, "" slug = slug_for(community, source_path) out_path = out_dir / f"{slug}.html" - if not force and out_path.exists(): - if out_path.stat().st_mtime >= source_path.stat().st_mtime: - return "skipped", community, slug + if not force and out_path.exists() and out_path.stat().st_mtime >= source_path.stat().st_mtime: + return "skipped", community, slug template = env.get_template("community.html.j2") html = template.render( community=community, diff --git a/src/communitymech/uniprot_reference_proteomes.py b/src/communitymech/uniprot_reference_proteomes.py index c29fadf78..f98462956 100644 --- a/src/communitymech/uniprot_reference_proteomes.py +++ b/src/communitymech/uniprot_reference_proteomes.py @@ -478,7 +478,7 @@ def _informative_tokens(value: str) -> set[str]: raw_tokens = re.findall(r"[a-z0-9]+", value.lower()) fused_tokens: list[str] = [] - for left, right in zip(raw_tokens, raw_tokens[1:]): + for left, right in zip(raw_tokens, raw_tokens[1:], strict=False): if (left.isalpha() and right.isdigit()) or (left.isdigit() and right.isalpha()): fused_tokens.append(f"{left}{right}") diff --git a/src/communitymech/utils/id_utils.py b/src/communitymech/utils/id_utils.py index f2abbc979..22a5d2f30 100644 --- a/src/communitymech/utils/id_utils.py +++ b/src/communitymech/utils/id_utils.py @@ -202,8 +202,9 @@ def find_highest_id_multi_file(directory: Path, prefix: str, pattern: str = "*.y id_str = data.get("id", "") if id_num := parse_xmech_id(id_str, prefix): max_id = max(max_id, id_num) - except Exception: - # Skip files that can't be parsed + except Exception: # noqa: S112 + # Skip files that can't be parsed; the goal here is to find the + # max valid ID, so unparseable entries are intentionally ignored. 
continue return max_id
diff --git a/src/communitymech/validators/cross_repo_ids.py b/src/communitymech/validators/cross_repo_ids.py
new file mode 100644
index 000000000..5970a8d4b
--- /dev/null
+++ b/src/communitymech/validators/cross_repo_ids.py
@@ -0,0 +1,218 @@
+"""Cross-repository ID validation for related_media and related_ingredients.
+
+When a community YAML references a `culturemech_id` or `mediaingredientmech_id`
+under `related_media` / `related_ingredients`, two things should hold:
+
+1. The ID matches its CURIE pattern (`CultureMech:NNNNNN` /
+   `MediaIngredientMech:NNNNNN`). LinkML's schema-level pattern check
+   covers this, but mirroring it here surfaces issues without booting
+   the full validator and lets callers act on individual offenders.
+
+2. The ID actually exists in the sibling repository. This requires a
+   path to the sibling repo and is therefore opt-in — if no sibling-repo
+   path is supplied, existence checks are skipped (and the validator
+   says so explicitly rather than silently passing).
+
+Usage:
+
+    from pathlib import Path
+    from communitymech.validators.cross_repo_ids import validate_cross_repo_ids
+
+    issues = validate_cross_repo_ids(
+        Path("kb/communities/SPRUCE_Peatland_Methane_Cycling_Community.yaml"),
+        sibling_repos={
+            "CultureMech": Path("../CultureMech/kb/media"),
+            "MediaIngredientMech": Path("../MediaIngredientMech/kb/ingredients"),
+        },
+    )
+    for i in issues:
+        print(i.severity, i.message)
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import yaml
+
+CULTUREMECH_ID_RE = re.compile(r"^CultureMech:\d{6}$")
+MEDIAINGREDIENTMECH_ID_RE = re.compile(r"^MediaIngredientMech:\d{6}$")
+
+
+@dataclass
+class CrossRepoIssue:
+    """A single cross-repo ID validation finding."""
+
+    severity: str  # "error" | "warning" | "info"
+    field_path: str
+    message: str
+
+    def __str__(self) -> str:
+        return f"[{self.severity}] {self.field_path}: {self.message}"
+
+
+@dataclass
+class SiblingRepoIndex:
+    """Lazy index of IDs present in a sibling repo's kb/ directory.
+
+    Treats every `*.yaml` file in `path` as a record and uses its top-level
+    `id:` field as the canonical ID. Behaves as an empty index if `path`
+    is None or is not an existing directory, which lets callers configure
+    repos optionally.
+    """
+
+    path: Path | None
+    _ids: set[str] = field(default_factory=set)
+    _loaded: bool = False
+
+    def __contains__(self, candidate: str) -> bool:
+        self._ensure_loaded()
+        return candidate in self._ids
+
+    @property
+    def available(self) -> bool:
+        """Whether `path` is an existing directory that can be indexed."""
+        # is_dir() rather than exists(): a regular file would glob to nothing
+        # and make every otherwise-valid ID look missing.
+        return self.path is not None and self.path.is_dir()
+
+    def _ensure_loaded(self) -> None:
+        """Populate `_ids` on first use; later calls are no-ops."""
+        if self._loaded:
+            return
+        self._loaded = True
+        if not self.available:
+            return
+        assert self.path is not None
+        for yaml_file in self.path.glob("*.yaml"):
+            try:
+                data = yaml.safe_load(yaml_file.read_text(encoding="utf-8"))
+            except (OSError, UnicodeError, yaml.YAMLError):
+                # Best-effort index: an unreadable or malformed record
+                # contributes no ID instead of aborting the whole run.
+                continue
+            if isinstance(data, dict) and isinstance(data.get("id"), str):
+                self._ids.add(data["id"])
+
+
+def _iter_entries(data: dict, slot: str) -> Iterable[tuple[int, dict]]:
+    """Yield `(index, entry)` for each mapping in the list at `data[slot]`.
+
+    A missing slot, or one whose value is not a list, yields nothing.
+    Non-mapping items are skipped but keep their position in the index.
+    """
+    entries = data.get(slot)
+    if not isinstance(entries, list):
+        return
+    for idx, entry in enumerate(entries):
+        if isinstance(entry, dict):
+            yield idx, entry
+
+
+def _check_slot(
+    data: dict,
+    slot: str,
+    id_key: str,
+    pattern: re.Pattern[str],
+    repo_name: str,
+    index: SiblingRepoIndex,
+) -> list[CrossRepoIssue]:
+    """Check every `id_key` found under `slot` against one sibling repo."""
+    issues: list[CrossRepoIssue] = []
+    for idx, entry in _iter_entries(data, slot):
+        value = entry.get(id_key)
+        if value is None:
+            continue
+        field_path = f"{slot}[{idx}].{id_key}"
+        # fullmatch(), not match(): `$` also matches just before a trailing
+        # newline, so match() would accept an ID such as "CultureMech:000001\n".
+        # The isinstance guard covers scalars YAML loads as non-strings (e.g.
+        # an unquoted number), which would make the regex call raise TypeError.
+        if not isinstance(value, str) or not pattern.fullmatch(value):
+            issues.append(
+                CrossRepoIssue(
+                    severity="error",
+                    field_path=field_path,
+                    message=f"'{value}' does not match pattern {repo_name}:NNNNNN",
+                )
+            )
+            continue
+        if not index.available:
+            issues.append(
+                CrossRepoIssue(
+                    severity="info",
+                    field_path=field_path,
+                    message=(
+                        f"existence check for '{value}' skipped: no {repo_name} "
+                        "sibling-repo path configured"
+                    ),
+                )
+            )
+        elif value not in index:
+            issues.append(
+                CrossRepoIssue(
+                    severity="error",
+                    field_path=field_path,
+                    message=f"'{value}' not found in {repo_name} repo at {index.path}",
+                )
+            )
+    return issues
+
+
+def validate_cross_repo_ids(
+    yaml_path: Path,
+    sibling_repos: dict[str, Path] | None = None,
+) -> list[CrossRepoIssue]:
+    """Validate cross-repo IDs in a single community YAML.
+
+    Args:
+        yaml_path: Path to the community YAML.
+        sibling_repos: Optional dict mapping repo name to the directory
+            holding the sibling repo's record YAMLs. Recognized keys:
+            ``CultureMech``, ``MediaIngredientMech``. If a key is missing
+            or its path is not an existing directory, the existence check
+            for that repo is skipped (with an info-level note in the
+            issue list).
+
+    Returns:
+        List of CrossRepoIssue, ``related_media`` findings first. Empty
+        if the file carries no cross-repo IDs, or if every ID matches its
+        pattern and was found in its configured sibling repo.
+
+    Raises:
+        OSError: If ``yaml_path`` cannot be read.
+        yaml.YAMLError: If ``yaml_path`` is not valid YAML.
+    """
+    sibling_repos = sibling_repos or {}
+    data = yaml.safe_load(yaml_path.read_text(encoding="utf-8")) or {}
+    if not isinstance(data, dict):
+        # A list or scalar document has no slots to inspect; report it
+        # rather than failing on `.get` further down.
+        return [
+            CrossRepoIssue(
+                severity="error",
+                field_path="<root>",
+                message=f"expected a top-level mapping, got {type(data).__name__}",
+            )
+        ]
+
+    issues = _check_slot(
+        data,
+        "related_media",
+        "culturemech_id",
+        CULTUREMECH_ID_RE,
+        "CultureMech",
+        SiblingRepoIndex(path=sibling_repos.get("CultureMech")),
+    )
+    issues += _check_slot(
+        data,
+        "related_ingredients",
+        "mediaingredientmech_id",
+        MEDIAINGREDIENTMECH_ID_RE,
+        "MediaIngredientMech",
+        SiblingRepoIndex(path=sibling_repos.get("MediaIngredientMech")),
+    )
+    return issues
diff --git
a/src/communitymech/visualization/umap_generator.py b/src/communitymech/visualization/umap_generator.py index ecd2aae0a..c1f8f6ed3 100644 --- a/src/communitymech/visualization/umap_generator.py +++ b/src/communitymech/visualization/umap_generator.py @@ -20,7 +20,10 @@ class UMAPVisualizationGenerator: def generate( self, communities_dir: str = "kb/communities", - embeddings_path: str = "data/embeddings/DeepWalkSkipGramEnsmallen_degreenorm_embedding_512_v2_2026-04-25_20_44_08.tsv.gz", + embeddings_path: str = ( + "data/embeddings/" + "DeepWalkSkipGramEnsmallen_degreenorm_embedding_512_v2_2026-04-25_20_44_08.tsv.gz" + ), output_path: str = "docs/community_umap.html", template_dir: str | None = None, cache_dir: str = ".umap_cache", @@ -62,14 +65,11 @@ def generate( ) print(f"\n📦 Aggregated {len(community_vectors)} communities") + skipped = self._count_yaml_files(communities_dir) - len(community_vectors) if exclude_hosts: - print( - f" (excluded non-microbial host taxa from {self._count_yaml_files(communities_dir) - len(community_vectors)} communities)" - ) + print(f" (excluded non-microbial host taxa from {skipped} communities)") else: - print( - f" (skipped {self._count_yaml_files(communities_dir) - len(community_vectors)} due to low coverage)" - ) + print(f" (skipped {skipped} due to low coverage)") # Step 3: Run UMAP reducer = UMAPReducer(n_neighbors=n_neighbors, min_dist=min_dist, random_state=42) @@ -177,8 +177,12 @@ def _render_html( if template_dir is None: template_dir = str(Path(__file__).parent.parent / "templates") - # Set up Jinja2 environment - env = Environment(loader=FileSystemLoader(template_dir)) + # Set up Jinja2 environment. autoescape is left at the default + # (False) because the template renders a JSON blob into a