1. 🔧 Installations (if running in Colab or Jupyter)

In [None]:
# 1. 🔧 Installations (if running in Colab or Jupyter)
!pip install dspy-ai pydantic beautifulsoup4 requests numpy pandas

2. 📥 Imports

In [None]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List
from pydantic import BaseModel, Field
import dspy
from dspy.adapters import XMLAdapter
import warnings
warnings.filterwarnings("ignore")

3. 🌐 DSPy LLM Configuration (LongCat Chat)

In [None]:
# Read the API key from the environment so it is never stored in the notebook.
api_key = os.environ.get("LongCat_API_KEY")
if not api_key:
    # Fail fast: without a key every later LM call fails inside a try/except
    # and the notebook would finish "successfully" with empty results.
    raise RuntimeError(
        "Environment variable LongCat_API_KEY is not set; "
        "export it before running this notebook."
    )

# Register the LongCat chat model (OpenAI-compatible endpoint) as the default
# LM and use the XML adapter for formatting prompts and parsing outputs.
dspy.settings.configure(
    lm=dspy.LM(
        model="openai/LongCat-Flash-Chat",
        api_key=api_key,
        api_base="https://api.longcat.chat/openai/v1",
        # NOTE(review): `task` is passed through as an extra LM keyword
        # argument — confirm the endpoint accepts it.
        task="text-generation"
    ),
    adapter=XMLAdapter()
)

# Browser-like User-Agent sent with every scraping request.
HEADERS = {'User-Agent': 'Mozilla/5.0'}

4. 🌐 Scrape Text from 10 URLs

In [None]:
# Source pages to scrape; each URL becomes one record in `data`.
urls = [
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    "https://www.nature.com/articles/d41586-025-03353-5",
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india"
]

def scrape_text(url, max_chars=5000):
    """Download a page and return the joined text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        max_chars: Maximum number of characters of text to return.

    Returns:
        The paragraph text truncated to ``max_chars`` characters, or an empty
        string when the request or parsing fails (the error is printed).
    """
    try:
        res = requests.get(url, headers=HEADERS, timeout=10)
        # Treat 4xx/5xx responses as failures so that error pages
        # (e.g. "access denied") are not passed on as article text.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        text = ' '.join(p.text for p in soup.find_all('p'))
        return text[:max_chars]
    except Exception as e:
        # Best effort: one unreachable page must not abort the whole run.
        print(f"Error scraping {url}: {e}")
        return ""

# One record per URL: the source link plus whatever text could be scraped from it.
data = [dict(link=page_url, text=scrape_text(page_url)) for page_url in urls]

In [None]:
# Preview the first scraped record (its 'link' and 'text').
data[0]

5. 🧠 Entity Extraction using DSPy

In [None]:
# Schema for one extracted entity: its surface text plus a semantic type label.
# Instances are read back through `.entity` and `.attr_type` when the results
# are filtered and exported.
class EntityWithAttr(BaseModel):
    entity: str = Field(description="The named entity")
    attr_type: str = Field(description="The semantic type (e.g., Drug, Concept, Person)")

# DSPy signature for entity extraction: one paragraph of text in, a list of
# EntityWithAttr items out.
# NOTE(review): documented with comments rather than a class docstring because
# DSPy uses a signature's docstring as the task instructions — confirm before
# adding one.
class ExtractEntities(dspy.Signature):
    paragraph: str = dspy.InputField()
    entities: List[EntityWithAttr] = dspy.OutputField()

# Predictor that runs the ExtractEntities signature against the configured LM.
extractor = dspy.Predict(ExtractEntities)

# Extract entities for every scraped page and store them under d['entities'].
# Entity names / types longer than this many characters are discarded.
MAX_ENTITY_FIELD_LEN = 40

for d in data:
    # A failed scrape leaves 'text' empty; skip the LM call instead of asking
    # the model to extract entities from nothing.
    if not d['text'].strip():
        d['entities'] = []
        continue
    try:
        output = extractor(paragraph=d['text'])
        d['entities'] = [
            ent for ent in (output.entities or [])
            # Blank names would produce empty CSV tags and unnamed diagram nodes.
            if 0 < len(ent.entity.strip()) <= MAX_ENTITY_FIELD_LEN
            and len(ent.attr_type.strip()) <= MAX_ENTITY_FIELD_LEN
        ]
    except Exception as exc:
        # Best effort: a failure on one page must not stop the other pages.
        print(f"Entity extraction failed: {exc}")
        d['entities'] = []

In [None]:
# Preview the entities kept for the first page.
data[0]['entities']

6. 🧹 Deduplication with Confidence Feedback Loop

In [None]:
# DSPy signature for LM-based deduplication: a list of entities in, a
# deduplicated list plus a float confidence out.
# NOTE(review): documented with comments rather than a class docstring because
# DSPy uses a signature's docstring as the task instructions — confirm before
# adding one.
class DeduplicateEntities(dspy.Signature):
    items: List[EntityWithAttr] = dspy.InputField()
    deduplicated: List[EntityWithAttr] = dspy.OutputField()
    confidence: float = dspy.OutputField()

# Predictor for the signature above, wrapped in dspy.ChainOfThought.
dedup_predictor = dspy.ChainOfThought(DeduplicateEntities)

def deduplicate_with_lm(items: List[EntityWithAttr], batch_size=10, target_confidence=0.91, max_attempts=3):
    """Deduplicate entities with the LM, one batch at a time.

    Each batch is sent to ``dedup_predictor``. A result is accepted as soon as
    its reported confidence reaches ``target_confidence``. The number of tries
    per batch is bounded; if none reaches the target, the highest-confidence
    result is used, or the unmodified batch when the LM returned no list.

    Batches are processed independently, so duplicates that fall into
    different batches are not merged.

    Args:
        items: Entities to deduplicate.
        batch_size: Number of entities sent to the LM per call.
        target_confidence: Confidence at which a result is accepted at once.
        max_attempts: Maximum LM calls per batch (at least one is made).

    Returns:
        The concatenated per-batch results; ``[]`` for empty input.
    """
    if not items:
        return []

    attempts = max(1, max_attempts)

    def _process_batch(batch):
        best_confidence, best_result = -1.0, None
        for attempt in range(attempts):
            call_kwargs = {}
            if attempt:
                # Identical requests can be answered from DSPy's cache, which
                # would repeat the same low-confidence result; vary the
                # sampling temperature on retries to obtain a fresh sample.
                call_kwargs["config"] = {"temperature": 0.7 + 0.05 * attempt}
            pred = dedup_predictor(items=batch, **call_kwargs)
            confidence = pred.confidence or 0.0
            if pred.deduplicated is None:
                continue
            if confidence >= target_confidence:
                return list(pred.deduplicated)
            if confidence > best_confidence:
                best_confidence, best_result = confidence, list(pred.deduplicated)

        if best_result is None:
            # Nothing usable came back: keep the entities rather than lose them.
            return list(batch)
        print(
            f"Deduplication confidence {best_confidence:.2f} is below target "
            f"{target_confidence}; using best attempt."
        )
        return best_result

    results = []
    for i in range(0, len(items), batch_size):
        batch = items[i:i+batch_size]
        results.extend(_process_batch(batch))
    return results

# Run LM deduplication for each page; on any failure fall back to an empty list.
for record in data:
    try:
        unique_entities = deduplicate_with_lm(record['entities'])
    except Exception as err:
        print(f"Deduplicated Entity extraction failed: {err}")
        unique_entities = []
    record['deduplicated_entities'] = unique_entities

In [None]:
# Preview the deduplicated entities for the first page.
data[0]['deduplicated_entities']

7. 🔗 Relation Extraction + Mermaid Diagram Generation

In [None]:
# Schema for one relation triple (subject, predicate, object), all plain strings.
class Relation(BaseModel):
    subj: str = Field(description="subject")
    pred: str = Field(description="predicate")
    obj: str = Field(description="object")

# DSPy signature for relation extraction: a paragraph and a list of entity
# names in, a list of Relation triples plus a float confidence out.
# NOTE(review): documented with comments rather than a class docstring because
# DSPy uses a signature's docstring as the task instructions — confirm before
# adding one.
class ExtractRelations(dspy.Signature):
    paragraph: str = dspy.InputField()
    entities: List[str] = dspy.InputField()
    relations: List[Relation] = dspy.OutputField()
    confidence: float = dspy.OutputField()

# Predictor for the signature above, wrapped in dspy.ChainOfThought.
rel_predictor = dspy.ChainOfThought(ExtractRelations)

def triples_to_mermaid(triples: List["Relation"], entity_list: List[str], max_label_len=40):
    """Render relation triples as a Mermaid ``flowchart LR`` definition.

    A triple is drawn when its subject or its object matches one of the known
    entities (case-insensitive, ignoring surrounding whitespace).

    Args:
        triples: Objects exposing string attributes ``subj``, ``pred``, ``obj``.
            ``None`` is treated as an empty list.
        entity_list: Names of the known entities.
        max_label_len: Maximum edge-label length; longer predicates are cut
            and end in ``...``.

    Returns:
        The diagram source as a string: the header line followed by one
        indented edge line per drawn triple.
    """
    import re  # local import so this cell does not rely on the imports cell

    def _clean(s):
        # Collapse whitespace (including newlines) so each edge stays on one line.
        return " ".join(s.split())

    def _id(s):
        # Node ids may only contain word characters; anything else becomes "_".
        ident = _clean(s).replace(" ", "_").replace("(", "").replace(")", "").replace("-", "_")
        ident = re.sub(r"\W", "_", ident)
        return ident or "node"

    def _escape(s):
        # Mermaid entity codes for the characters that terminate a quoted
        # node text (") or an edge label (|).
        return s.replace('"', "#quot;").replace("|", "#124;")

    entity_set = {_clean(e).lower() for e in entity_list}
    lines = ["flowchart LR"]

    for t in triples or []:
        subj, obj = _clean(t.subj), _clean(t.obj)
        if not subj or not obj:
            continue  # an edge needs two named endpoints
        if subj.lower() in entity_set or obj.lower() in entity_set:
            label = _clean(t.pred)
            if len(label) > max_label_len:
                label = label[:max_label_len - 3] + "..."
            # An empty label would render as "-->||", which is not valid syntax.
            arrow = f"-->|{_escape(label)}|" if label else "-->"
            lines.append(
                f'    {_id(subj)}["{_escape(subj)}"] {arrow} {_id(obj)}["{_escape(obj)}"]'
            )

    return '\n'.join(lines)

Generate Mermaid Diagrams

In [None]:
os.makedirs("mermaid", exist_ok=True)

# Confidence at which a relation set is accepted immediately, and the maximum
# number of LM calls per page (the loop must not spin forever below the target).
RELATION_TARGET_CONFIDENCE = 0.91
RELATION_MAX_ATTEMPTS = 3

# enumerate() gives each record its 1-based position directly; data.index(d)
# rescans the list and returns the first *equal* record.
for page_no, d in enumerate(data, start=1):
    entity_list = [e.entity for e in d['deduplicated_entities']]
    if not entity_list:
        continue

    best_confidence, best_relations = -1.0, None
    for attempt in range(RELATION_MAX_ATTEMPTS):
        call_kwargs = {}
        if attempt:
            # Identical requests can be answered from DSPy's cache; vary the
            # sampling temperature on retries to obtain a fresh sample.
            call_kwargs["config"] = {"temperature": 0.7 + 0.05 * attempt}
        try:
            pred = rel_predictor(paragraph=d['text'], entities=entity_list, **call_kwargs)
        except Exception as exc:
            # Same best-effort policy as the extraction and deduplication steps.
            print(f"Relation extraction failed for {d['link']}: {exc}")
            continue
        confidence = pred.confidence or 0.0
        if pred.relations is not None and confidence > best_confidence:
            best_confidence, best_relations = confidence, pred.relations
        if best_confidence >= RELATION_TARGET_CONFIDENCE:
            break

    if best_relations is None:
        print(f"No relations extracted for {d['link']}; skipping diagram.")
        continue
    if best_confidence < RELATION_TARGET_CONFIDENCE:
        print(
            f"Relation confidence {best_confidence:.2f} for {d['link']} is below "
            f"target {RELATION_TARGET_CONFIDENCE}; writing best attempt."
        )

    mermaid = triples_to_mermaid(best_relations, entity_list)
    # Explicit UTF-8: entity names may contain non-ASCII characters.
    with open(f"mermaid/mermaid_{page_no}.md", "w", encoding="utf-8") as f:
        f.write(mermaid)

8. 📊 CSV Export of Deduplicated Entities

In [None]:
os.makedirs("CSV", exist_ok=True)

# Fixed column order; also guarantees a header row when no entities were found
# (an empty list of rows would otherwise produce a CSV without any columns).
CSV_COLUMNS = ['link', 'tag', 'tag_type']

# One row per (page, deduplicated entity) pair.
rows = [
    {'link': d['link'], 'tag': e.entity, 'tag_type': e.attr_type}
    for d in data
    for e in (d.get('deduplicated_entities') or [])
]

pd.DataFrame(rows, columns=CSV_COLUMNS).to_csv("CSV/Output.csv", index=False)

9. 🧾 Summary & Observations

✅ What This Notebook Does:

- Scrapes 10 web pages using BeautifulSoup
- Uses DSPy + Pydantic to extract structured entity data
- Deduplicates extracted entities with an LLM-based pass that self-reports a confidence score
- Creates Mermaid diagram output for visual relationship mapping
- Saves the deduplicated entities to `CSV/Output.csv`

---

📌 Notes:

- LLM entity extraction is not 100% deterministic — your output might vary run-to-run.
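- Deduplication and relation extraction already check an LLM-reported confidence score (target 0.91); you can add similar scoring to entity extraction, or filter by entity type, if needed.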
- Mermaid diagrams can be extended to show richer relationships (e.g., weights, categories).

---

💬 Improvements (Optional):

- Add a confidence threshold for filtering the entity-extraction output as well
- Build a Streamlit app to visualize results
- Use fuzzy string matching for smarter deduplication
