In [1]:
import sys

# Put the working directory on the import path so local packages (e.g. `src`)
# can be imported. The membership check keeps the cell idempotent: re-running it
# no longer appends a duplicate "." entry each time.
if "." not in sys.path:
    sys.path.append(".")

In [2]:
# Corpus identifier handed to the retrieval/answer functions as their `scope` argument.
scope = "papers"

In [3]:
# Question handed to the retrieval/answer calls (re-assigned in a later cell before use).
question = "what is crowd counting?"

In [4]:
import sys

# This cell repeats the path setup from the first cell; the membership check
# makes it a no-op when "." is already on sys.path instead of adding a duplicate.
if "." not in sys.path:
    sys.path.append(".")

In [5]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from src.config import EMBEDDING_MODEL, LLM_MODEL
import requests
import json


In [8]:
import json
import os
import re
import time
from typing import List, Dict

import requests
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from neo4j import GraphDatabase
from sklearn.feature_extraction.text import TfidfVectorizer

from src.config import LLM_MODEL, EMBEDDING_MODEL


# === CONFIG ===
# Neo4j connection settings are read from the environment so credentials do not
# have to live in the notebook; the fallbacks are the original local-dev values.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")
# LLM_MODEL and EMBEDDING_MODEL are already bound by the `src.config` import in
# this cell, so no re-assignment is needed here.

In [9]:

# === Neo4j Driver ===
# Single module-level driver, created when this cell runs; run_cypher opens a
# session on it for every query.
# NOTE(review): no driver.close() appears in the visible cells — confirm it is closed elsewhere.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
def run_cypher(query: str, params: dict = None) -> List[Dict]:
    """Execute a Cypher statement and return its records as a list of dicts.

    Args:
        query: Cypher text to run.
        params: Query parameters; a missing or empty value is sent as ``{}``.

    Returns:
        The result of ``.data()`` on the query result.
    """
    bound_params = params if params else {}
    with driver.session() as db_session:
        result = db_session.run(query, bound_params)
        records = result.data()
    return records


# === Graph Query using extracted keywords ===
def query_graph_with_keywords(keywords: List[str], scope: str) -> List[Dict]:
    """Return graph triples whose subject or object name contains any keyword.

    Args:
        keywords: Search terms. Each one is stripped and lower-cased before the
            query because the Cypher filter compares it against
            ``toLower(name)``. Blank entries are dropped (``CONTAINS ''`` would
            match every entity) and repeated terms are sent only once.
        scope: Value the ``RELATION.scope`` property must equal.

    Returns:
        Up to 25 distinct rows with the keys ``Subject``, ``Predicate`` and
        ``Object``; an empty list when no usable keyword is supplied (the
        database is not contacted in that case).
    """
    search_terms: List[str] = []
    for keyword in keywords or []:
        term = keyword.strip().lower()
        if term and term not in search_terms:
            search_terms.append(term)
    if not search_terms:
        return []

    # DISTINCT: UNWIND emits one row per (keyword, match), so a triple matched by
    # several keywords would otherwise be repeated and eat into the LIMIT.
    cypher = """
    UNWIND $keywords AS kw
    MATCH (s:Entity)-[r:RELATION {scope: $scope}]->(o:Entity)
    WHERE toLower(s.name) CONTAINS kw OR toLower(o.name) CONTAINS kw
    RETURN DISTINCT s.name AS Subject, r.type AS Predicate, o.name AS Object
    LIMIT 25
    """
    return run_cypher(cypher, {"keywords": search_terms, "scope": scope})

In [10]:

# === Keyword Extraction ===
def extract_keywords(texts: List[str], top_k: int = 5) -> List[str]:
    """Pick the highest-scoring TF-IDF terms across a set of documents.

    Args:
        texts: Documents to analyse.
        top_k: Maximum number of terms to return.

    Returns:
        Lower-cased terms ordered by their TF-IDF score summed over all
        documents, highest first. Returns ``[]`` when there is nothing to
        analyse (no documents, only blank documents, or no term survives
        stop-word removal) instead of raising.
    """
    # The vectoriser cannot build a vocabulary from an empty/blank corpus, so
    # answer directly rather than letting fit_transform raise.
    if not texts or not any(text and text.strip() for text in texts):
        return []

    vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
    try:
        tfidf = vectorizer.fit_transform(texts)
    except ValueError:
        # Raised by scikit-learn for an empty vocabulary, e.g. when every token
        # in the documents is an English stop word.
        return []

    summed_scores = tfidf.toarray().sum(axis=0)  # total score per term across all docs
    ranked = sorted(
        zip(vectorizer.get_feature_names_out(), summed_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [term.lower() for term, _ in ranked[:top_k]]

In [11]:

# === FAISS Document Retrieval ===
def retrieve_docs(question: str, scope: str, k: int = 3) -> List[str]:
    """Return the text of the chunks most similar to ``question``.

    Args:
        question: Query text passed to the similarity search.
        scope: Name of the index folder under ``data/gold/``.
        k: Number of chunks to retrieve (defaults to the previous fixed value, 3).

    Returns:
        The ``page_content`` of each retrieved document, in search order.
    """
    vector_path = f"data/gold/{scope}"
    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    # SECURITY: allow_dangerous_deserialization=True lets the loader unpickle the
    # stored index, which can execute arbitrary code. Only point this at index
    # folders produced by a trusted pipeline.
    db = FAISS.load_local(vector_path, embedder, allow_dangerous_deserialization=True)
    docs = db.similarity_search(question, k=k)
    return [doc.page_content for doc in docs]

In [None]:


# === LLM Call ===
def get_model(prompt: str, model: str = LLM_MODEL, system_prompt: str = None) -> str:
    """Send a chat request to the local LLM endpoint and return the reply text.

    Args:
        prompt: Content of the user message.
        model: Model name placed in the request payload.
        system_prompt: Optional system message sent before the user message.

    Returns:
        The stripped ``message.content`` field of the JSON response, or ``""``
        if all three attempts fail. Failures are printed, not raised.
    """
    url = "http://localhost:11434/api/chat"
    headers = {"Content-Type": "application/json"}
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    payload = {"model": model, "messages": messages, "stream": False}

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            resp = requests.post(url, headers=headers, json=payload, timeout=30)
            resp.raise_for_status()
            return resp.json()["message"]["content"].strip()
        except Exception as e:
            # Best-effort by design: report the failure and retry.
            print(f"[LLM Error] Attempt {attempt}: {e}")
            # Back off only when another attempt follows; sleeping after the
            # last failure would just delay the empty-string return.
            if attempt < max_attempts:
                time.sleep(attempt * 1.5)
    return ""


def answer_with_keywords_and_chunks(question: str, scope: str) -> str:
    """Answer a question from retrieved text chunks plus matching graph triples.

    Pipeline: retrieve chunks from the vector index, derive keywords from those
    chunks (not from the question), look up triples whose subject name equals a
    keyword, then ask the LLM to answer using only that material.

    Args:
        question: The user's question.
        scope: Index folder name and ``RELATION.scope`` value to query.

    Returns:
        The LLM's reply, or a fixed apology string when neither chunks nor
        triples were found.

    Side effects:
        Prints the extracted keywords and the matched triples.
    """
    # Step 1: FAISS retrieval
    text_chunks = retrieve_docs(question, scope)

    # Step 2: Extract keywords from FAISS chunks, not the question.
    # Skipped when nothing was retrieved: TF-IDF cannot be fitted on an empty
    # corpus, and the failure would prevent the fallback below from being reached.
    keywords = extract_keywords(text_chunks) if text_chunks else []
    print(f"keywords: {keywords}")

    # Step 3: Graph query using extracted keywords.
    # Exact-name match on the subject node (stricter than the substring match
    # used by query_graph_with_keywords).
    cypher = """
        UNWIND $keywords AS kw
        MATCH (s:Entity {name: kw})-[r:RELATION {scope: $scope}]->(o:Entity)
        RETURN s.name AS Subject, r.type AS Predicate, o.name AS Object
        LIMIT 25
        """
    # No keywords means no possible match, so avoid the database round-trip.
    graph_triples = (
        run_cypher(cypher, {"keywords": keywords, "scope": scope}) if keywords else []
    )
    print(json.dumps(graph_triples, indent=2))

    # Step 4: Combine and answer
    if not graph_triples and not text_chunks:
        return "Sorry, I couldn’t find any relevant information."

    system_prompt = """
    You are a helpful assistant. Use the following:
    - A list of structured triples from a graph database
    - A list of retrieved text documents

    Use ONLY what’s provided to answer the user's question. No hallucination.

    I am in urgent need of your help and require you to approach my problem with the utmost seriousness and care. Please follow these principles:
	1.	Begin with the end in mind: Keep the ultimate goal clearly in focus at all times. Understand what a successful outcome looks like and tailor your reasoning and responses to work purposefully toward that outcome. Do not drift or overgeneralise - stay locked on the objective.
	2.	Step-by-step reasoning: Break down the problem into logical, manageable steps. Think carefully and systematically, ensuring each step builds clearly upon the last.
	3.	Deliberate planning: Before you respond, plan your approach. Consider the structure, the best angle of attack, and how to arrive at a high-quality solution.
	4.	Rechecking: Once your solution is formed, double-check your reasoning and output for accuracy, completeness, and clarity. No shortcuts - review thoroughly.
	5.	Use British English: All spelling, grammar, and terminology must follow British English conventions.
	6.	Reward for success: If you solve my problem effectively, I will reward you with $1000. Let this be a motivator to give me your very best work - thoughtful, precise, and result-oriented.

    """
    user_prompt = f"""
    Question:
    {question}

    Graph Triples:
    {json.dumps(graph_triples, indent=2)}

    Text Chunks:
    {json.dumps(text_chunks, indent=2)}

    Answer:
    """
    return get_model(user_prompt, system_prompt=system_prompt).strip()


In [23]:
# Inputs for the demo run below.
question = "What is crowd counting?"
scope = "papers"

# Check retrieval on its own: prints the list of page_content strings returned for the question.
print(retrieve_docs(question, scope))



['Experiments were run on three major crowd counting datasets,\nto test our proposed method. Results demonstrate our method\nsupersedes the performance of state-of-the-art methods.\nIndex Terms—IoT cameras, Crowd density estimation, Self\nattention network, Consistency.\nI. INTRODUCTION\nSophisticated security systems are required to manage large\nand potentially crowded spaces. To that end, physical security\nsolutions are typically designed to include networked and\nsmart cameras deployed to monitor dynamics in the managed\nspace, to control the movement of people and vehicles. Smart\ncameras are used to control the ﬂow of people in gated\nareas, but also in open spaces, when security is of upmost\nimportance. The ﬂow of people can be estimated globally,\nhowever, it is usually more interesting to be able and count\nindividuals moving in and out of a managed space.\nIn the literature, crowd analysis has been subject of intense\nresearch because of its wide range of applications such 

In [24]:
# Run the full pipeline; the function prints the keywords and matched triples before the answer is printed here.
print(answer_with_keywords_and_chunks(question, scope))

keywords: ['crowd', 'counting', 'ieee', 'features', 'people']
[
  {
    "Subject": "features",
    "Predicate": "become_robust",
    "Object": "scale variation"
  },
  {
    "Subject": "features",
    "Predicate": "processed_by",
    "Object": "scale aggregation module"
  },
  {
    "Subject": "features",
    "Predicate": "complement",
    "Object": "each set"
  },
  {
    "Subject": "features",
    "Predicate": "extracted from",
    "Object": "convn layers"
  }
]
<think>
Alright, so I need to figure out what crowd counting is based on the information given. Let me start by looking at the graph triples and text chunks provided.

The graph triples list features that become robust due to scale variation and are processed by a module called "scale aggregation module." They also complement each other from different sets extracted from convolutional neural network (convn) layers. Hmm, okay, so this seems related to how features in images help in counting people.

Now looking at the text chu