# In [None]:
# --------------------------------
from neo4j import GraphDatabase
import feedparser

# newspaper needs lxml_html_clean for cleaning HTML
try:
    from newspaper import Article
except ImportError:
    raise ImportError(
        "Missing required module for newspaper HTML cleaning.\n"
        "Install with: pip install newspaper3k lxml_html_clean"
    )

# LangChain imports
from langchain.schema import Document
try:
    from langchain_experimental.graphs.neo4j import Neo4jGraph, Neo4jGraphRetriever
except ImportError:
    raise ImportError(
        "Missing langchain_experimental package or its experimental graphs module.\n"
        "Install with: pip install langchain-experimental"
    )
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool, initialize_agent, AgentType

# Connection settings.
# Each value can be overridden with an environment variable of the same name;
# the literals are only local-development fallbacks, so real credentials never
# have to be written into the notebook itself.
import os

NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PWD = os.environ.get("NEO4J_PWD", "your_password")
# NOTE(review): "text-embedding-ada-002" is an OpenAI model name; confirm that
# a model with this exact tag is actually served by the Ollama instance.
OLLAMA_EMBED_MODEL = os.environ.get(
    "OLLAMA_EMBED_MODEL", "ollama/text-embedding-ada-002"
)

# Create Neo4j driver (shared by every function below that opens a session)
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PWD))

# In [None]:
# %%
# Step 3: News‐Fetch Function
# ---------------------------
def fetch_latest_articles(industry: str, limit: int = 5) -> list[dict]:
    """
    Query the Google News RSS search feed for `industry` and return up to
    `limit` articles.

    Each dict contains: title, url, snippet, published.

    Args:
        industry: Free-text search term; it is URL-encoded before being
            placed in the feed URL.
        limit: Maximum number of feed entries to process. Values <= 0
            yield an empty list.

    Returns:
        One dict per processed entry. `snippet` is the first 300 characters
        of the downloaded article text (followed by "..." only when the text
        was actually truncated) or "" when the article could not be
        downloaded/parsed. `published` and `title` are "" when the feed entry
        does not carry them. Entries without a link are skipped.
    """
    import logging
    from urllib.parse import quote_plus

    max_snippet_chars = 300
    log = logging.getLogger(__name__)

    # quote_plus still maps spaces to "+", and additionally escapes characters
    # such as "&", "#" and "?" that would otherwise corrupt the query string.
    feed_url = f"https://news.google.com/rss/search?q={quote_plus(industry)}"
    feed = feedparser.parse(feed_url)

    results = []
    # max(..., 0): a negative limit would otherwise slice from the end.
    for entry in feed.entries[:max(limit, 0)]:
        link = entry.get("link")
        if not link:
            continue

        text = ""
        try:
            art = Article(link)
            art.download()
            art.parse()
            text = art.text or ""
        except Exception:
            # Per-article boundary: one unreachable or unparsable page must
            # not abort the whole batch. The article is kept with an empty
            # snippet and the failure is logged with its traceback.
            log.warning(
                "Could not download/parse %s; storing it without a snippet",
                link,
                exc_info=True,
            )

        snippet = text[:max_snippet_chars]
        if len(text) > max_snippet_chars:
            snippet += "..."

        results.append({
            "title":     entry.get("title", ""),
            "url":       link,
            "snippet":   snippet,
            "published": entry.get("published", ""),
        })
    return results

# %%
# Step 4: Ingest Article into Neo4j
# ----------------------------------
def ingest_article(industry: str, info: dict):
    """
    Upsert one article into Neo4j and link it to its industry.

    The industry is stored as a `Class` node keyed by `name`. The article is
    stored as an `Article` node keyed by `url`, so ingesting the same URL
    again updates the existing node instead of creating a duplicate. The
    `ABOUT` relationship from the article to the class is merged as well.

    Args:
        industry: Value for the `name` property of the `Class` node.
        info: Article dict with the keys `title`, `url`, `snippet` and
            `published`.

    Raises:
        KeyError: If `info` lacks one of the required keys.
    """
    query = """
    MERGE (c:Class {name: $industry})
    MERGE (a:Article {url: $url})
    SET a.title = $title,
        a.snippet = $snippet,
        a.published = $published
    MERGE (a)-[:ABOUT]->(c)
    """
    params = {
        "industry":  industry,
        "title":     info["title"],
        "url":       info["url"],
        "snippet":   info["snippet"],
        "published": info["published"],
    }
    with driver.session() as sess:
        sess.run(query, params)
# %%
# Step 5: Combined Fetch & Ingest Tool
# -------------------------------------
def fetch_and_ingest(industry: str) -> str:
    """
    Download the newest articles for `industry` and store each one in Neo4j.

    Returns a one-line, human-readable summary stating how many articles
    were written.
    """
    ingested = 0
    for article in fetch_latest_articles(industry):
        ingest_article(industry, article)
        ingested += 1
    return f"Ingested {ingested} articles for '{industry}' into Neo4j."

# %%
# Step 6: Load Documents from Neo4j for Vector Index
# ---------------------------------------------------
def load_docs_for_embeddings() -> list[Document]:
    """
    Build one Document per Class node for the vector index.

    The page content is the node's description (empty when missing), a blank
    line, and then the snippets of every Article linked to it via ABOUT,
    separated by "---" lines. The class name is stored under the `industry`
    metadata key.
    """
    query = """
    MATCH (c:Class)
      OPTIONAL MATCH (a:Article)-[:ABOUT]->(c)
    RETURN c.name AS industry, c.description AS desc, collect(a.snippet) AS snippets
    """
    with driver.session() as session:
        # Materialise the list inside the `with` block, while the session
        # (and therefore the result stream) is still open.
        return [
            Document(
                page_content="\n\n".join(
                    [row["desc"] or "", "\n---\n".join(row["snippets"])]
                ),
                metadata={"industry": row["industry"]},
            )
            for row in session.run(query)
        ]

# %%
# Step 7: Build Vector Store
# --------------------------
# The index is a snapshot of the graph at this point: articles ingested later
# (e.g. by the agent's fetch tool) are not searchable until it is rebuilt.
all_docs = load_docs_for_embeddings()
if not all_docs:
    # FAISS cannot build an index from zero documents; fail with an
    # actionable message instead of an opaque error from inside the library.
    raise RuntimeError(
        "No Class nodes found in Neo4j, so there is nothing to embed. "
        "Ingest data first (e.g. fetch_and_ingest('Automotive')) and re-run "
        "this cell."
    )
emb = OllamaEmbeddings(model=OLLAMA_EMBED_MODEL)
vector_store = FAISS.from_documents(all_docs, emb)

# (Optional) persist index
# vector_store.save_local("industry_articles_faiss")

# %%
# Step 8: Configure Retrievers
# ----------------------------
# Graph Retriever
# Opens a LangChain-side connection to the same Neo4j instance the driver uses.
# NOTE(review): LangChain's published Neo4jGraph takes `url=` rather than
# `uri=` — confirm against the class actually imported above.
graph = Neo4jGraph(uri=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PWD)
# Configured for Class nodes and ABOUT relationships, depth 2, 5 results.
# NOTE(review): confirm that Neo4jGraphRetriever and these keyword arguments
# exist in the installed langchain_experimental version.
graph_retriever = Neo4jGraphRetriever(
    graph=graph,
    node_label="Class",
    relationship_type="ABOUT",  # or your custom edge type
    traversal_depth=2,
    top_k=5
)

# Semantic Retriever
# Wraps the FAISS store built in Step 7; each query asks for k=5 results.
semantic_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

# %%
# Step 9: Wrap Tools & Initialize Agent
# --------------------------------------
# The three capabilities exposed to the agent. Each Tool pairs a name and a
# natural-language description with the callable to invoke.
tools = [
    Tool(
        name="fetch_news",
        # Writes fresh articles into Neo4j and returns a summary string.
        func=fetch_and_ingest,
        description="Fetch and ingest latest news articles for an industry"
    ),
    Tool(
        name="graph_search",
        # NOTE(review): confirm the retriever really exposes
        # `get_relevant_nodes` and what input it expects.
        func=graph_retriever.get_relevant_nodes,
        description="Traverse industry graph for related nodes"
    ),
    Tool(
        name="semantic_search",
        # Looks up documents in the FAISS-backed retriever from Step 8.
        func=semantic_retriever.get_relevant_documents,
        description="Find semantically relevant docs via FAISS"
    ),
]

# temperature=0 is passed to the chat model; the agent type is zero-shot ReAct
# and verbose=True is enabled on the agent.
llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
agent = initialize_agent(
    tools, llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

# %%
# Step 10: Run Example Queries
# -----------------------------
# 1) Ingest fresh news for 'Automotive'
# The prompt names the `fetch_news` tool directly; whether that tool is
# actually invoked is decided by the agent.
print(agent.run("fetch_news Automotive"))

# 2) Ask a hybrid question
# No tool is named here, so the agent chooses among the tools registered in
# Step 9. The answer is kept in `resp` for further inspection.
resp = agent.run("What do you know about the 'Renewable Energy' industry and its top two-hop neighbors?")
print(resp)
