From 7c555f659c7995dd3b944d05cf5e6e311b2be94d Mon Sep 17 00:00:00 2001 From: "marcin p. joachimiak" <4625870+realmarcin@users.noreply.github.com> Date: Mon, 25 May 2026 19:27:32 -0700 Subject: [PATCH 1/2] Fix broken literature_enhanced imports in two writer scripts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/add_evidence_source.py and scripts/intelligent_snippet_fixer.py both import EnhancedLiteratureFetcher from communitymech.literature_enhanced — a module that was never committed to git (only a stale .pyc was shadowing the missing source locally). Both scripts have raised ModuleNotFoundError on import for as long as anyone has tried to run them; the recent writer-conversion PR #87 flagged this as a pre-existing problem. Swap to LiteratureFetcher from communitymech.literature, which exposes the same fetch_pubmed_abstract + fetch_paper surface plus a richer DOI fallback chain (CrossRef → PubMed via DOI lookup → PMC full-text → OpenAlex → Semantic Scholar → Europe PMC → publisher meta-tag scrape) that subsumes what fetch_abstract_for_doi did. API differences: - fetch_paper returns an (abstract, pdf_url) tuple, not a dict; call sites now tuple-unpack the result. - LiteratureFetcher.fetch_paper has no download_pdf kwarg (the older version's flag was a no-op in the LiteratureFetcher pipeline; the PDF URL is just returned alongside the abstract). - The title is no longer available as a separate field. In add_evidence_source.py's guess_evidence_source classifier the title was simply joined with the snippet and abstract via filter(None, …) anyway; losing it degrades classification marginally (PubMed abstracts include the title in the abstract text, so PMID references are unaffected). If richer DOI classification is needed later, LiteratureFetcher.fetch_doi_metadata() returns CrossRef metadata with a title field. After this change, both scripts import and run their initialization paths cleanly. pytest tests/ still passes (136 passed, 9 skipped). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/add_evidence_source.py | 37 +++++++++++++++++----------- scripts/intelligent_snippet_fixer.py | 20 +++++++++------ 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/scripts/add_evidence_source.py b/scripts/add_evidence_source.py index 0c43bc61..5354caee 100644 --- a/scripts/add_evidence_source.py +++ b/scripts/add_evidence_source.py @@ -26,7 +26,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "src")) -from communitymech.literature_enhanced import EnhancedLiteratureFetcher +from communitymech.literature import LiteratureFetcher from communitymech.curate.curation_event import record_curation_event from communitymech.validation.write_validated import ( @@ -39,10 +39,13 @@ class EvidenceSourceAdder: """Add evidence_source to evidence items""" def __init__(self): - self.fetcher = EnhancedLiteratureFetcher( - cache_dir=".literature_cache", - use_fallback_pdf=False - ) + # Previously imported a sibling EnhancedLiteratureFetcher class that + # was never committed to the repo; the LiteratureFetcher in + # communitymech.literature exposes the same fetch_pubmed_abstract + + # fetch_paper surface (plus a richer DOI fallback chain through + # CrossRef / PMC / OpenAlex / Semantic Scholar / Europe PMC) which + # is what these scripts actually need. + self.fetcher = LiteratureFetcher(cache_dir=".literature_cache") self.stats = { 'total_evidence': 0, 'already_has_source': 0, @@ -148,12 +151,14 @@ def process_yaml( # Try to fetch abstract for better classification abstract = None - title = None + title = None # LiteratureFetcher.fetch_paper returns + # (abstract, pdf_url); the title is embedded + # in PubMed abstracts and can be pulled from + # CrossRef metadata via fetch_doi_metadata() + # if richer classification is needed later. 
try: - paper = self.fetcher.fetch_paper(reference, download_pdf=False) - abstract = paper.get('abstract') - title = paper.get('title') - except: + abstract, _ = self.fetcher.fetch_paper(reference) + except Exception: pass # Guess evidence source @@ -221,12 +226,14 @@ def process_yaml( reference = ev.get('reference', '') abstract = None - title = None + title = None # LiteratureFetcher.fetch_paper returns + # (abstract, pdf_url); the title is embedded + # in PubMed abstracts and can be pulled from + # CrossRef metadata via fetch_doi_metadata() + # if richer classification is needed later. try: - paper = self.fetcher.fetch_paper(reference, download_pdf=False) - abstract = paper.get('abstract') - title = paper.get('title') - except: + abstract, _ = self.fetcher.fetch_paper(reference) + except Exception: pass guessed_source = self.guess_evidence_source( diff --git a/scripts/intelligent_snippet_fixer.py b/scripts/intelligent_snippet_fixer.py index 2733b2dc..a72d006b 100755 --- a/scripts/intelligent_snippet_fixer.py +++ b/scripts/intelligent_snippet_fixer.py @@ -25,7 +25,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "src")) from communitymech.curate.curation_event import record_curation_event -from communitymech.literature_enhanced import EnhancedLiteratureFetcher +from communitymech.literature import LiteratureFetcher from communitymech.validation.write_validated import ( ValidationFailedError, write_validated_community, @@ -59,7 +59,12 @@ class IntelligentSnippetFixer: """Intelligent snippet fixer with context-aware abstract analysis.""" def __init__(self, verbose: bool = False): - self.fetcher = EnhancedLiteratureFetcher() + # Previously imported a sibling EnhancedLiteratureFetcher class + # that was never committed; LiteratureFetcher exposes the same + # fetch_pubmed_abstract + fetch_paper surface plus a richer DOI + # fallback chain (CrossRef / PMC / OpenAlex / Semantic Scholar / + # Europe PMC) which subsumes what fetch_abstract_for_doi did. 
+ self.fetcher = LiteratureFetcher() self.verbose = verbose def extract_relevant_sentences( @@ -210,12 +215,13 @@ def suggest_snippets_for_evidence( if reference.upper().startswith("PMID:"): pmid = reference.replace("PMID:", "").replace("pmid:", "").strip() abstract = self.fetcher.fetch_pubmed_abstract(pmid) - elif "doi" in reference.lower() or reference.startswith("10."): - doi = reference.replace("doi:", "").replace("https://doi.org/", "").strip() - abstract = self.fetcher.fetch_abstract_for_doi(doi) else: - paper = self.fetcher.fetch_paper(reference, download_pdf=False) - abstract = paper.get("abstract") + # fetch_paper auto-detects PMID vs DOI and runs the full + # DOI fallback chain (CrossRef → PMID via DOI lookup → PMC + # full-text → OpenAlex → Semantic Scholar → Europe PMC → + # publisher meta-tag scrape). Returns (abstract, pdf_url); + # we don't need the pdf here. + abstract, _ = self.fetcher.fetch_paper(reference) if not abstract: if self.verbose: From 9d599d53de16e34f6c970d8aa315b67d6c579b04 Mon Sep 17 00:00:00 2001 From: "marcin p. joachimiak" <4625870+realmarcin@users.noreply.github.com> Date: Mon, 25 May 2026 19:32:24 -0700 Subject: [PATCH 2/2] Address Copilot review: drop dead title param from guess_evidence_source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot flagged that title was assigned None and then passed through guess_evidence_source as a parameter that the classifier merged into its keyword-matching text via filter(None, ...). With title always None the parameter was dead code that just clutters the call sites. Remove the title parameter from guess_evidence_source and from both caller blocks. 
PubMed abstracts already embed the title in the abstract text (so PMID-driven classification is unchanged), and CrossRef titles for DOI references are available via LiteratureFetcher.fetch_doi_metadata() if richer classification is wanted later — that's now a clear future-work hook rather than a hard-coded-None pretense. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/add_evidence_source.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/scripts/add_evidence_source.py b/scripts/add_evidence_source.py index 5354caee..664bf250 100644 --- a/scripts/add_evidence_source.py +++ b/scripts/add_evidence_source.py @@ -81,13 +81,12 @@ def guess_evidence_source( self, snippet: str, abstract: str = None, - title: str = None, community_origin: str = None ) -> Optional[str]: """Guess evidence source using heuristics""" # Combine text for keyword matching - text = ' '.join(filter(None, [snippet, abstract, title])).lower() + text = ' '.join(filter(None, [snippet, abstract])).lower() # Check for review first (highest specificity) if any(kw in text for kw in self.review_keywords): @@ -150,12 +149,11 @@ def process_yaml( reference = ev.get('reference', '') # Try to fetch abstract for better classification + # Title is not threaded into the classifier — PubMed + # abstracts already embed the title, and CrossRef + # titles for DOIs are available via fetch_doi_metadata() + # if richer classification is wanted later. abstract = None - title = None # LiteratureFetcher.fetch_paper returns - # (abstract, pdf_url); the title is embedded - # in PubMed abstracts and can be pulled from - # CrossRef metadata via fetch_doi_metadata() - # if richer classification is needed later. 
try: abstract, _ = self.fetcher.fetch_paper(reference) except Exception: @@ -163,7 +161,7 @@ def process_yaml( # Guess evidence source guessed_source = self.guess_evidence_source( - snippet, abstract, title, community_origin + snippet, abstract, community_origin ) if auto_mode and guessed_source: @@ -225,19 +223,18 @@ def process_yaml( snippet = ev.get('snippet', '') reference = ev.get('reference', '') + # Title is not threaded into the classifier — PubMed + # abstracts already embed the title, and CrossRef + # titles for DOIs are available via fetch_doi_metadata() + # if richer classification is wanted later. abstract = None - title = None # LiteratureFetcher.fetch_paper returns - # (abstract, pdf_url); the title is embedded - # in PubMed abstracts and can be pulled from - # CrossRef metadata via fetch_doi_metadata() - # if richer classification is needed later. try: abstract, _ = self.fetcher.fetch_paper(reference) except Exception: pass guessed_source = self.guess_evidence_source( - snippet, abstract, title, community_origin + snippet, abstract, community_origin ) if auto_mode and guessed_source: