<a href="https://colab.research.google.com/github/AnamariaVLR/noura-rag/blob/main/NOURA_RAG_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# NOURA - Cell 1: Verify environment
# Emits two fixed status lines so the user can see the kernel is executing cells.
print("NOURA is starting...", "Python ready", sep="\n")

NOURA is starting...
Python ready


In [2]:
# NOURA - Cell 2: Scoring methodology (NOURA core IP)

# Base weight (0-1) for each evidence source type, listed strongest first.
#   base_weight                 - starting weight used by evaluate_evidence().
#   requires_independence_check - when True, evaluate_evidence() applies
#                                 INDUSTRY_FUNDING_PENALTY to industry-funded studies.
#   requires_dose_adjustment    - present only on sources whose rating is excluded
#                                 unless it was adjusted for product concentration.
# Every entry carries requires_independence_check explicitly so the schema is uniform.
EVIDENCE_HIERARCHY = {
    "systematic_review_meta_analysis": {"base_weight": 1.00, "requires_independence_check": True},
    "rct":                             {"base_weight": 0.85, "requires_independence_check": True},
    "regulatory_opinion":              {"base_weight": 0.75, "requires_independence_check": False},
    "observational_cohort":            {"base_weight": 0.60, "requires_independence_check": True},
    "ewg_hazard":                      {"base_weight": 0.50, "requires_independence_check": False,
                                        "requires_dose_adjustment": True},
    "cosing_regional":                 {"base_weight": 0.45, "requires_independence_check": False},
    "in_vitro":                        {"base_weight": 0.30, "requires_independence_check": False},
    "clinical_case":                   {"base_weight": 0.15, "requires_independence_check": False},
    "expert_opinion":                  {"base_weight": 0.10, "requires_independence_check": True},
}

# Fraction of the weight removed from an industry-funded study
# (evaluate_evidence multiplies the weight by 1 - INDUSTRY_FUNDING_PENALTY).
INDUSTRY_FUNDING_PENALTY = 0.20
# Health scores below this value receive the "HIGHER RISK" verdict.
HEALTH_HARD_BLOCK = 50
# NOTE(review): not referenced by the scoring code in this part of the notebook;
# presumably the threshold for flagging a planet/environment score - confirm.
PLANET_FLAG_THRESHOLD = 40

# Upper bounds applied to the health score when the evidence base is narrow.
# calculate_health_score() currently reads "only_in_vitro_or_case" and
# "only_regulatory_strong"; the other two keys are defined but not yet consumed there.
SUFFICIENCY_CAPS = {
    "only_in_vitro_or_case":   60,
    "only_regulatory_strong":  70,
    "only_regulatory_weak":    50,
    "single_rct":              80,
}

# Minimum evidence type and number of studies per product category and claim.
# NOTE(review): not read by the scoring functions visible in this notebook section.
CATEGORY_CLAIM_REQUIREMENTS = {
    "skincare": {
        "hydration":   {"min_evidence": "rct", "min_studies": 1},
        "anti_aging":  {"min_evidence": "rct", "min_studies": 2},
        "brightening": {"min_evidence": "observational_cohort", "min_studies": 1},
        "acne":        {"min_evidence": "rct", "min_studies": 2},
    }
}

print("Scoring methodology loaded")
print(f"Evidence sources defined: {len(EVIDENCE_HIERARCHY)}")
print(f"Health hard block threshold: {HEALTH_HARD_BLOCK}")

Scoring methodology loaded
Evidence sources defined: 9
Health hard block threshold: 50


In [3]:
# NOURA - Cell 3: Scoring engine

def evaluate_evidence(source_type, industry_funded=False, dose_adjusted=True):
    """Weight a single piece of evidence according to EVIDENCE_HIERARCHY.

    Args:
        source_type: Key into EVIDENCE_HIERARCHY (e.g. "rct", "in_vitro").
        industry_funded: When True, source types flagged with
            "requires_independence_check" lose INDUSTRY_FUNDING_PENALTY of
            their weight.
        dose_adjusted: When False, source types flagged with
            "requires_dose_adjustment" are given a weight of 0.

    Returns:
        dict with "source_type", "weight" (rounded to 3 decimals; 0 for an
        unknown source type) and "flags" (list of human-readable notes).
    """
    spec = EVIDENCE_HIERARCHY.get(source_type)
    if spec is None:
        return {"weight": 0, "source_type": source_type, "flags": [f"Unknown source type: {source_type}"]}

    weight = spec["base_weight"]
    flags = []

    if industry_funded and spec.get("requires_independence_check"):
        weight = weight * (1 - INDUSTRY_FUNDING_PENALTY)
        # Derive the percentage from the constant so the message cannot drift from it.
        flags.append(f"Industry-funded study: weight reduced {INDUSTRY_FUNDING_PENALTY:.0%}")

    # Driven by the config flag rather than a hard-coded source name.
    if spec.get("requires_dose_adjustment") and not dose_adjusted:
        weight = 0
        if source_type == "ewg_hazard":
            flags.append("EWG score excluded: not adjusted for actual product concentration")
        else:
            flags.append(f"{source_type} excluded: not adjusted for actual product concentration")

    return {"source_type": source_type, "weight": round(weight, 3), "flags": flags}


def calculate_health_score(evaluated_evidence, prohibited=False):
    """Turn a list of weighted evidence items into a 0-100 health score and verdict.

    Args:
        evaluated_evidence: List of dicts as returned by evaluate_evidence();
            each item must provide "source_type" and "weight".
        prohibited: When True the ingredient is scored 0 regardless of evidence.

    Returns:
        dict with "score" (number, or None when there is no evidence),
        "verdict", "flag" (explanatory note or None) and "evidence_situation".
    """
    if prohibited:
        return {
            "score": 0,
            "verdict": "HIGHER RISK",
            "flag": "Ingredient prohibited by regulatory authority",
            "evidence_situation": "regulatory_block"
        }

    if not evaluated_evidence:
        return {
            "score": None,
            "verdict": "INSUFFICIENT DATA",
            "flag": "No scientific evidence retrieved for this ingredient",
            "evidence_situation": "no_evidence"
        }

    source_types = [e["source_type"] for e in evaluated_evidence]
    only_lab = all(t in {"in_vitro", "clinical_case"} for t in source_types)
    only_regulatory = all(t in {"regulatory_opinion", "cosing_regional"} for t in source_types)

    # Average the three highest weights (fewer if fewer exist) so that weak
    # studies do not drag down a strong evidence base.
    top_weights = sorted((e["weight"] for e in evaluated_evidence), reverse=True)[:3]
    score_raw = round((sum(top_weights) / len(top_weights)) * 100, 1)

    # Apply sufficiency caps; the number quoted in the flag is read from
    # SUFFICIENCY_CAPS so the message always matches the cap actually applied.
    if only_lab:
        cap = SUFFICIENCY_CAPS["only_in_vitro_or_case"]
        score = min(score_raw, cap)
        flag = f"Health score capped at {cap} — only lab-based evidence retrieved; human clinical data insufficient"
        situation = "only_lab"
    elif only_regulatory:
        cap = SUFFICIENCY_CAPS["only_regulatory_strong"]
        score = min(score_raw, cap)
        flag = f"Health score capped at {cap} — regulatory approval present but no clinical studies retrieved"
        situation = "only_regulatory"
    else:
        score = score_raw
        flag = None
        situation = "sufficient"

    # Assign verdict. The hard block is checked first, so the 41 floor below
    # only has an effect when HEALTH_HARD_BLOCK is set lower than 41.
    if score < HEALTH_HARD_BLOCK:
        verdict = "HIGHER RISK"
    elif score >= 71:
        verdict = "WELL SUPPORTED"
    elif score >= 41:
        verdict = "LIMITED SUPPORT"
    else:
        verdict = "HIGHER RISK"

    return {
        "score": score,
        "verdict": verdict,
        "flag": flag,
        "evidence_situation": situation
    }

print("Scoring engine loaded")

Scoring engine loaded


In [12]:
# NOURA - Cell 4: PubMed connection with abstract retrieval
import requests
import time

def _parse_pubmed_abstracts(xml_payload, abstract_chars=500):
    """Extract abstracts and sample-size mentions from an efetch XML payload.

    Returns a pair of dicts keyed by PMID string: (abstracts, sample_sizes).
    Both are empty when the payload is not parseable XML, so the caller can
    continue without abstracts.
    """
    import re
    import xml.etree.ElementTree as ET

    abstracts = {}
    sample_sizes = {}

    try:
        root = ET.fromstring(xml_payload)
    except ET.ParseError:
        return abstracts, sample_sizes

    # Accept thousands separators ("1,200 patients") and require the noun to
    # end at a word boundary so "men" does not match inside "menopausal".
    size_pattern = re.compile(
        r'\b(\d[\d,]*)\s*(?:patients|participants|subjects|women|men|volunteers|individuals)\b',
        re.IGNORECASE,
    )

    for article in root.findall(".//PubmedArticle"):
        pmid_el = article.find(".//PMID")
        if pmid_el is None or not pmid_el.text:
            continue
        pmid = pmid_el.text.strip()

        # itertext() also collects text nested inside inline markup (<i>, <sub>, ...),
        # which a plain .text lookup would cut off.
        sections = ["".join(el.itertext()).strip() for el in article.findall(".//AbstractText")]
        abstract = " ".join(section for section in sections if section)
        if abstract:
            abstracts[pmid] = abstract[:abstract_chars]

        sizes = [int(match.replace(",", "")) for match in size_pattern.findall(abstract)]
        if sizes:
            # Several counts may be mentioned; the largest is kept.
            sample_sizes[pmid] = max(sizes)

    return abstracts, sample_sizes


def search_pubmed(ingredient, max_results=10):
    """Search PubMed for titles mentioning an ingredient and collect study metadata.

    Makes three E-utilities calls (esearch for IDs, efetch for abstracts,
    esummary for title/date), pausing one second before each.

    Args:
        ingredient: Search term; matched against article titles.
        max_results: Maximum number of PubMed IDs to request.

    Returns:
        dict with "ingredient", "studies_found" and "studies" (list of dicts
        with "id", "title", "abstract", "sample_size", "year", "source",
        "pubmed_url"). When a request fails or the response is unusable, the
        dict has zero studies and an additional "error" message.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    request_timeout = 30  # seconds; without it a stalled connection blocks the notebook forever

    def failure(message):
        return {"ingredient": ingredient, "studies_found": 0, "studies": [], "error": message}

    time.sleep(1)

    # Step 1: Search for IDs
    try:
        search_response = requests.get(
            f"{base_url}esearch.fcgi",
            params={
                "db": "pubmed",
                "term": ingredient + "[Title]",
                "retmax": max_results,
                "retmode": "json",
                "sort": "relevance"
            },
            timeout=request_timeout,
        )
        search_data = search_response.json()
    except (requests.RequestException, ValueError) as exc:
        return failure(f"PubMed search request failed: {exc}")

    if "esearchresult" not in search_data:
        return failure("PubMed rate limit — try again in 30 seconds")

    ids = search_data["esearchresult"].get("idlist", [])

    if not ids:
        return {"ingredient": ingredient, "studies_found": 0, "studies": []}

    id_param = ",".join(ids)

    time.sleep(1)

    # Step 2: Fetch full details including abstract (best effort: a failure
    # here only means the studies are returned without abstracts)
    try:
        fetch_response = requests.get(
            f"{base_url}efetch.fcgi",
            params={
                "db": "pubmed",
                "id": id_param,
                "retmode": "xml",
                "rettype": "abstract"
            },
            timeout=request_timeout,
        )
        xml_payload = fetch_response.content
    except requests.RequestException:
        xml_payload = b""

    time.sleep(1)

    # Step 3: Also fetch summary for metadata
    try:
        summary_response = requests.get(
            f"{base_url}esummary.fcgi",
            params={"db": "pubmed", "id": id_param, "retmode": "json"},
            timeout=request_timeout,
        )
        summary_data = summary_response.json()
    except (requests.RequestException, ValueError) as exc:
        return failure(f"PubMed summary request failed: {exc}")

    if "result" not in summary_data:
        return failure("PubMed summary response did not contain any results")

    # Step 4: Parse XML for abstracts
    abstracts, sample_sizes = _parse_pubmed_abstracts(xml_payload)

    # Step 5: Build studies list
    studies = []
    for uid in ids:
        article = summary_data["result"].get(uid, {})
        if isinstance(article, dict) and "title" in article:
            studies.append({
                "id": uid,
                "title": article.get("title", ""),
                "abstract": abstracts.get(uid, ""),
                "sample_size": sample_sizes.get(uid, None),
                "year": article.get("pubdate", "")[:4],
                "source": "PubMed",
                "pubmed_url": f"https://pubmed.ncbi.nlm.nih.gov/{uid}/"
            })

    return {"ingredient": ingredient, "studies_found": len(studies), "studies": studies}

print("PubMed connection with abstract retrieval ready")

PubMed connection with abstract retrieval ready


In [5]:
# NOURA - Cell 5: Evidence classifier

def classify_evidence_type(title):
    """Guess the evidence type of a study from keywords in its title.

    Categories are tested from strongest to weakest evidence and the first
    match wins. Titles that match nothing fall back to "in_vitro".

    Args:
        title: Study title.

    Returns:
        One of the EVIDENCE_HIERARCHY keys "systematic_review_meta_analysis",
        "rct", "observational_cohort", "regulatory_opinion", "in_vitro" or
        "clinical_case".
    """
    import re

    title_lower = title.lower()

    def has_phrase(*phrases):
        # Plain substring match; also lets stems such as "epidemiolog" work.
        return any(phrase in title_lower for phrase in phrases)

    def has_acronym(*acronyms):
        # Acronyms must stand alone (optionally pluralised); as bare
        # substrings they fire inside unrelated words, e.g. "rct" in "infarction".
        return any(re.search(rf"\b{re.escape(acronym)}s?\b", title_lower) for acronym in acronyms)

    if has_phrase("meta-analysis", "systematic review", "cochrane"):
        return "systematic_review_meta_analysis"

    if has_acronym("rct") or has_phrase("randomized", "controlled trial", "double-blind", "clinical trial"):
        return "rct"

    if has_phrase("cohort", "observational", "prospective", "retrospective", "epidemiolog"):
        return "observational_cohort"

    if has_acronym("efsa", "fda") or has_phrase("guideline", "regulatory", "safety assessment", "final report"):
        return "regulatory_opinion"

    if has_phrase("review", "overview", "narrative", "update", "current evidence",
                  "mechanisms of action", "mechanistic", "applications of"):
        return "observational_cohort"  # Reviews treated as observational — higher than in_vitro

    if has_phrase("in vitro", "cell culture", "in-vitro"):
        return "in_vitro"

    if has_phrase("case report", "case study"):
        return "clinical_case"

    return "in_vitro"

print("Evidence classifier loaded")

Evidence classifier loaded


In [6]:
# NOURA - Cell 6: Full evaluation pipeline

def noura_evaluate(ingredient, category="skincare"):
    """Run the title-based NOURA pipeline for one ingredient and print the report.

    Searches PubMed, classifies each study by title, weights it, computes the
    health score and prints a formatted assessment.

    Args:
        ingredient: Ingredient name passed to search_pubmed().
        category: Product category; only used in the report heading here.

    Returns:
        The dict produced by calculate_health_score().
    """

    # Step 1: Search PubMed
    pubmed_results = search_pubmed(ingredient, max_results=10)
    studies = pubmed_results.get("studies", [])

    # Step 2: Classify and evaluate each study
    evaluated = []
    evidence_count = {}

    for study in studies:
        source_type = classify_evidence_type(study["title"])
        ev = evaluate_evidence(source_type)
        ev["study_title"] = study["title"][:80]
        ev["year"] = study["year"]
        evaluated.append(ev)
        evidence_count[source_type] = evidence_count.get(source_type, 0) + 1

    # Step 3: Calculate score
    result = calculate_health_score(evaluated)

    # Step 4: Build evidence summary
    evidence_str = " + ".join([f"{v} {k.replace('_', ' ')}"
                                for k, v in evidence_count.items()])

    # A missing score (no evidence) is shown as N/A rather than "None/100".
    score_display = "N/A" if result["score"] is None else f"{result['score']}/100"

    # Step 5: Display NOURA assessment
    print(f"NOURA Health Assessment: {ingredient.title()} ({category})")
    print("=" * 65)
    print(f"Score:   {score_display}")
    print(f"Verdict: {result['verdict']}")
    print()
    print(f"Studies retrieved:  {pubmed_results.get('studies_found', len(studies))} (PubMed)")
    print(f"Evidence types:     {evidence_str if evidence_str else 'None'}")
    print()

    # Surface search failures so "no studies" is not mistaken for "no evidence exists".
    if pubmed_results.get("error"):
        print(f"Warning: {pubmed_results['error']}")
        print()

    if result['flag']:
        print(f"Note: {result['flag']}")
        print()

    print("Evidence breakdown:")
    for e in evaluated:
        print(f"  [{e['year']}] {e['source_type'].replace('_', ' ')} "
              f"(weight: {e['weight']}) — {e['study_title']}...")

    print()
    print("What would you like next?")
    print("  - View full source links")
    print("  - Compare with alternatives")
    print("  - Check regulatory status")
    print("  - Assess another ingredient")
    print("=" * 65)
    print()

    return result


# Test: run the title-based pipeline on two ingredients. Each call queries PubMed
# through search_pubmed(), so the cell needs network access and its output depends
# on what PubMed returns at run time. The dict returned by the last call is what
# the notebook shows as the cell's output value.
noura_evaluate("niacinamide")
noura_evaluate("parabens", category="skincare")

NOURA Health Assessment: Niacinamide (skincare)
Score:   65.0/100
Verdict: LIMITED SUPPORT

Studies retrieved:  10 (PubMed)
Evidence types:     4 observational cohort + 5 in vitro + 1 regulatory opinion

Evidence breakdown:
  [2024] observational cohort (weight: 0.6) — Niacinamide: a review on dermal delivery strategies and clinical evidence....
  [2014] observational cohort (weight: 0.6) — Niacinamide - mechanisms of action and its topical use in dermatology....
  [2005] in vitro (weight: 0.3) — Niacinamide: A B vitamin that improves aging facial skin appearance....
  [2002] in vitro (weight: 0.3) — The effect of niacinamide on reducing cutaneous pigmentation and suppression of ...
  [2021] observational cohort (weight: 0.6) — Cosmeceutical Aptitudes of Niacinamide: A Review....
  [2006] in vitro (weight: 0.3) — The effect of 2% niacinamide on facial sebum production....
  [2024] observational cohort (weight: 0.6) — Mechanistic Insights into the Multiple Functions of Niacinamide: Ther

{'score': 73.3,
 'verdict': 'WELL SUPPORTED',
 'flag': None,
 'evidence_situation': 'sufficient'}

In [21]:
# NOURA - Cell 7: Abstract-aware evidence direction classifier

def classify_evidence_direction(title, abstract=""):
    """
    Uses both title AND abstract to determine evidence direction.
    Abstract gives far more signal than title alone.

    Counts how many distinct concern keywords and safety keywords occur in the
    lower-cased text and returns "CONCERN", "SAFETY" or "NEUTRAL" (tie).

    Args:
        title: Study title (None is treated as empty).
        abstract: Study abstract (None is treated as empty).
    """
    import re

    full_text = f"{title or ''} {abstract or ''}".lower()

    concern_keywords = [
        "risk", "harm", "toxic", "toxicity", "hazard",
        "disrupt", "disruption", "endocrine", "carcinogen",
        "linked to cancer", "associated with cancer", "cancer risk",
        "causes cancer", "cancer development",
        "adverse", "negative effect", "damage", "impair",
        "banned", "restricted", "unsafe",
        "controversial", "breast cancer", "estrogenic",
        "inflammation", "sensitiz", "allerg", "diabes",
        "reproductive toxicity", "genotoxic", "mutagenic",
        "significant increase in risk", "associated with risk",
        "harmful", "dangerous", "prohibited"
    ]

    safety_keywords = [
        "safe", "safety assessment", "well tolerated", "no adverse",
        "no significant adverse", "approved", "permitted", "gras",
        "efficacious", "significant improvement", "effective treatment",
        "beneficial", "protective", "no toxicity observed",
        "no evidence of harm", "clinically proven",
        "significant reduction in", "improvement in skin",
        "recommended", "widely used safely",
        "explored for treatment", "potential treatment",
        "therapeutic application", "used to treat",
        "treatment of cancer", "against cancer",
        "skin brightening", "anti-ageing", "anti-aging",
        "skin barrier", "skin care", "dermal benefits"
    ]

    # Short keywords that are also fragments of unrelated words: "harm" occurs
    # in "pharmaceutical", "safe" in "unsafe", "risk" in "asterisk". They must
    # start a word; the acronym "gras" must be a whole word ("grasp", "grass").
    word_start_only = {"risk", "harm", "safe"}
    whole_word_only = {"gras"}

    def find_spans(keyword, text):
        escaped = re.escape(keyword)
        if keyword in whole_word_only:
            pattern = rf"\b{escaped}\b"
        elif keyword in word_start_only:
            pattern = rf"\b{escaped}"
        else:
            pattern = escaped  # substring match keeps stems and e.g. "cytotoxic" working
        return [match.span() for match in re.finditer(pattern, text)]

    # Count safety keywords and blank out their matches, so that negated
    # phrases such as "no toxicity observed" or "no adverse" are not counted a
    # second time as concerns via the "toxic"/"adverse" keywords they contain.
    safety_score = 0
    masked_chars = list(full_text)
    for keyword in safety_keywords:
        spans = find_spans(keyword, full_text)
        if spans:
            safety_score += 1
        for start, end in spans:
            masked_chars[start:end] = " " * (end - start)
    concern_text = "".join(masked_chars)

    concern_score = sum(1 for keyword in concern_keywords if find_spans(keyword, concern_text))

    if concern_score > safety_score:
        return "CONCERN"
    elif safety_score > concern_score:
        return "SAFETY"
    else:
        return "NEUTRAL"


def classify_evidence_type(title, abstract=""):
    """
    Uses both title and abstract for better evidence type classification.
    """
    full_text = (title + " " + abstract).lower()

    if any(w in full_text for w in ["meta-analysis", "systematic review", "cochrane"]):
        return "systematic_review_meta_analysis"

    elif any(w in full_text for w in ["randomized", "randomised", "rct",
                                       "controlled trial", "double-blind",
                                       "double blind", "placebo-controlled"]):
        return "rct"

    elif any(w in full_text for w in ["cohort", "prospective", "retrospective",

SyntaxError: incomplete input (1301756171.py, line 64)

In [22]:
# NOURA - Cell 7: Abstract-aware evidence direction classifier

def classify_evidence_direction(title, abstract=""):
    """
    Uses both title AND abstract to determine evidence direction.
    Abstract gives far more signal than title alone.

    Counts how many distinct concern keywords and safety keywords occur in the
    lower-cased text and returns "CONCERN", "SAFETY" or "NEUTRAL" (tie).

    Args:
        title: Study title (None is treated as empty).
        abstract: Study abstract (None is treated as empty).
    """
    import re

    full_text = f"{title or ''} {abstract or ''}".lower()

    concern_keywords = [
        "risk", "harm", "toxic", "toxicity", "hazard",
        "disrupt", "disruption", "endocrine", "carcinogen",
        "linked to cancer", "associated with cancer", "cancer risk",
        "causes cancer", "cancer development",
        "adverse", "negative effect", "damage", "impair",
        "banned", "restricted", "unsafe",
        "controversial", "breast cancer", "estrogenic",
        "inflammation", "sensitiz", "allerg", "diabes",
        "reproductive toxicity", "genotoxic", "mutagenic",
        "significant increase in risk", "associated with risk",
        "harmful", "dangerous", "prohibited"
    ]

    safety_keywords = [
        "safe", "safety assessment", "well tolerated", "no adverse",
        "no significant adverse", "approved", "permitted", "gras",
        "efficacious", "significant improvement", "effective treatment",
        "beneficial", "protective", "no toxicity observed",
        "no evidence of harm", "clinically proven",
        "significant reduction in", "improvement in skin",
        "recommended", "widely used safely",
        "explored for treatment", "potential treatment",
        "therapeutic application", "used to treat",
        "treatment of cancer", "against cancer",
        "skin brightening", "anti-ageing", "anti-aging",
        "skin barrier", "skin care", "dermal benefits"
    ]

    # Short keywords that are also fragments of unrelated words: "harm" occurs
    # in "pharmaceutical", "safe" in "unsafe", "risk" in "asterisk". They must
    # start a word; the acronym "gras" must be a whole word ("grasp", "grass").
    word_start_only = {"risk", "harm", "safe"}
    whole_word_only = {"gras"}

    def find_spans(keyword, text):
        escaped = re.escape(keyword)
        if keyword in whole_word_only:
            pattern = rf"\b{escaped}\b"
        elif keyword in word_start_only:
            pattern = rf"\b{escaped}"
        else:
            pattern = escaped  # substring match keeps stems and e.g. "cytotoxic" working
        return [match.span() for match in re.finditer(pattern, text)]

    # Count safety keywords and blank out their matches, so that negated
    # phrases such as "no toxicity observed" or "no adverse" are not counted a
    # second time as concerns via the "toxic"/"adverse" keywords they contain.
    safety_score = 0
    masked_chars = list(full_text)
    for keyword in safety_keywords:
        spans = find_spans(keyword, full_text)
        if spans:
            safety_score += 1
        for start, end in spans:
            masked_chars[start:end] = " " * (end - start)
    concern_text = "".join(masked_chars)

    concern_score = sum(1 for keyword in concern_keywords if find_spans(keyword, concern_text))

    if concern_score > safety_score:
        return "CONCERN"
    elif safety_score > concern_score:
        return "SAFETY"
    else:
        return "NEUTRAL"


def classify_evidence_type(title, abstract=""):
    """
    Uses both title and abstract for better evidence type classification.

    Categories are tested from strongest to weakest evidence and the first
    match wins. Text that matches nothing falls back to "in_vitro".

    Args:
        title: Study title (None is treated as empty).
        abstract: Study abstract (None is treated as empty).

    Returns:
        One of "systematic_review_meta_analysis", "rct",
        "observational_cohort", "regulatory_opinion", "in_vitro" or
        "clinical_case".
    """
    import re

    full_text = f"{title or ''} {abstract or ''}".lower()

    def has_phrase(*phrases):
        # Plain substring match; also lets stems such as "epidemiolog" work.
        return any(phrase in full_text for phrase in phrases)

    def has_acronym(*acronyms):
        # Acronyms must stand alone (optionally pluralised); as bare substrings
        # they fire inside unrelated words, e.g. "rct" in "infarction" or
        # "gras" in "grasses" — far more likely now that abstracts are scanned.
        return any(re.search(rf"\b{re.escape(acronym)}s?\b", full_text) for acronym in acronyms)

    if has_phrase("meta-analysis", "systematic review", "cochrane"):
        return "systematic_review_meta_analysis"

    if has_acronym("rct") or has_phrase("randomized", "randomised",
                                         "controlled trial", "double-blind",
                                         "double blind", "placebo-controlled"):
        return "rct"

    if has_phrase("cohort", "prospective", "retrospective",
                  "epidemiolog", "population-based"):
        return "observational_cohort"

    if has_acronym("efsa", "fda", "gras") or has_phrase("guideline", "regulatory", "safety assessment",
                                                         "final report", "sccs opinion", "approved by"):
        return "regulatory_opinion"

    # Reviews are scored as observational evidence.
    if has_phrase("review", "overview", "narrative review",
                  "current evidence", "mechanisms of action",
                  "comprehensive review", "literature review"):
        return "observational_cohort"

    if has_phrase("in vitro", "cell culture", "in-vitro",
                  "cell line", "hek293", "keratinocyte"):
        return "in_vitro"

    if has_phrase("case report", "case series"):
        return "clinical_case"

    return "in_vitro"


print("Abstract-aware classifier loaded")

Abstract-aware classifier loaded


In [23]:
# NOURA - Cell 9: Updated full pipeline with abstract-aware scoring

def noura_evaluate_v2(ingredient, category="skincare"):
    """Run the abstract-aware NOURA pipeline for one ingredient and print the report.

    Searches PubMed, classifies each study's evidence type and direction from
    title plus abstract, weights it, scores the set with
    calculate_direction_aware_score() and prints a formatted assessment.

    Args:
        ingredient: Ingredient name passed to search_pubmed().
        category: Product category; only used in the report heading here.

    Returns:
        The dict produced by calculate_direction_aware_score().
    """

    # Step 1: Search PubMed
    pubmed_results = search_pubmed(ingredient, max_results=10)
    studies = pubmed_results.get("studies", [])

    # Step 2: Classify evidence type and direction using title + abstract
    evaluated = []
    evidence_count = {}
    direction_count = {"SAFETY": 0, "CONCERN": 0, "NEUTRAL": 0}

    for study in studies:
        abstract = study.get("abstract", "")
        source_type = classify_evidence_type(study["title"], abstract)
        direction = classify_evidence_direction(study["title"], abstract)
        ev = evaluate_evidence(source_type)
        ev["study_title"] = study["title"][:80]
        ev["year"] = study["year"]
        ev["direction"] = direction
        ev["pubmed_url"] = study.get("pubmed_url", "")
        ev["sample_size"] = study.get("sample_size", None)
        ev["abstract"] = abstract[:500]
        evaluated.append(ev)
        evidence_count[source_type] = evidence_count.get(source_type, 0) + 1
        direction_count[direction] += 1

    # Step 3: Calculate direction-aware score
    result = calculate_direction_aware_score(evaluated)

    # Step 4: Build evidence summary
    evidence_str = " + ".join([f"{v} {k.replace('_', ' ')}"
                                for k, v in evidence_count.items()])

    # A missing score (no evidence) is shown as N/A rather than "None/100".
    score_display = "N/A" if result["score"] is None else f"{result['score']}/100"

    # Step 5: Display NOURA assessment
    print(f"NOURA Health Assessment: {ingredient.title()} ({category})")
    print("=" * 65)
    print(f"Score:   {score_display}")
    print(f"Verdict: {result['verdict']}")
    print()
    print(f"Studies retrieved:  {pubmed_results.get('studies_found', len(studies))} (PubMed)")
    print(f"Evidence types:     {evidence_str if evidence_str else 'None'}")
    print(f"Evidence direction: {direction_count['SAFETY']} safety | "
          f"{direction_count['CONCERN']} concern | "
          f"{direction_count['NEUTRAL']} neutral")
    print()

    # Surface search failures so "no studies" is not mistaken for "no evidence exists".
    if pubmed_results.get("error"):
        print(f"Warning: {pubmed_results['error']}")
        print()

    if result['flag']:
        # Remove any leading "|" separator left on the flag text.
        flag_clean = result['flag'].strip().lstrip("|").strip()
        print(f"Note: {flag_clean}")
        print()

    print("Evidence breakdown:")
    for e in evaluated:
        sample_info = f" | n={e['sample_size']}" if e.get("sample_size") else ""
        print(f"  [{e['year']}] [{e['direction']}] "
              f"{e['source_type'].replace('_', ' ')} "
              f"(weight: {e['weight']}){sample_info}")
        print(f"           {e['study_title']}...")
        if e['abstract']:
            print(f"           Abstract: {e['abstract']}...")
        print()

    # Only the first three studies are linked here.
    print("Source links:")
    for e in evaluated[:3]:
        print(f"  {e['pubmed_url']}")

    print()
    print("What would you like next?")
    print("  - View all source links")
    print("  - Compare with alternatives")
    print("  - Check regulatory status")
    print("  - Assess another ingredient")
    print("=" * 65)
    print()

    return result


# Test: run the abstract-aware pipeline on two ingredients. Each call queries
# PubMed through search_pubmed(), so the cell needs network access and its output
# depends on what PubMed returns at run time. The dict returned by the last call
# is what the notebook shows as the cell's output value.
noura_evaluate_v2("niacinamide")
print()
noura_evaluate_v2("parabens")

NOURA Health Assessment: Niacinamide (skincare)
Score:   79.3/100
Verdict: WELL SUPPORTED

Studies retrieved:  10 (PubMed)
Evidence types:     3 observational cohort + 6 in vitro + 1 regulatory opinion
Evidence direction: 5 safety | 0 concern | 5 neutral

Evidence breakdown:
  [2024] [SAFETY] observational cohort (weight: 0.6)
           Niacinamide: a review on dermal delivery strategies and clinical evidence....
           Abstract: Niacinamide, an active form of vitamin B3, is recognised for its significant dermal benefits including skin brightening, anti-ageing properties and the protection of the skin barrier. Its widespread incorporation into cosmetic products, ranging from cleansers to serums, is attributed to its safety profile and proven efficacy. Recently, topical niacinamide has also been explored for other pharmaceutical applications, including skin cancers. Therefore, a fundamental understanding of the skin perme...

  [2014] [SAFETY] observational cohort (weight: 0.6)
   

{'score': 0,
 'verdict': 'HIGHER RISK',
 'flag': 'Majority of retrieved studies raise safety concerns (9 concern vs 0 safety studies)',
 'evidence_situation': 'sufficient',
 'concern_count': 9,
 'safety_count': 0,
 'neutral_count': 1}