In [2]:
import os
import subprocess
import shutil
from pathlib import Path

# ─── CONFIGURATION ──────────────────────────────────────────────────────────

# (1) List of Git repository URLs to clone.
#     You can also load these from a text file if you prefer.
GIT_URLS = [
    "https://github.com/sktime/sktime",
    "https://github.com/karask/python-bitcoin-utils"
]

# (2) Base directories for raw clones and for the “normalized” output:
BASE_CLONE_DIR = Path("repos")               # where repos get cloned
BASE_NORMALIZED_DIR = Path("normalized_repos")  # where filtered code goes

# (3) Directories to skip entirely (relative to each repo root). 
#     Any folder whose name matches one of these will be ignored.
IGNORED_DIR_NAMES = {
    ".git",
    "node_modules",
    "venv",
    "__pycache__",
    "build",
    "dist",
    ".idea",
    ".vscode",
    ".pytest_cache",
    # add others (e.g. “target” for Maven/Gradle, “.gradle”, etc.) as needed
}

# (4) File‐extensions to keep (i.e., source code files).
#     Adapt this list to the languages in your repos.
CODE_EXTENSIONS = {
    ".py",   # Python
    # ".java", # Java
    # ".js",   # JavaScript
    # ".ts",   # TypeScript
    # ".go",   # Go
    # ".cpp",  # C++
    # ".c",    # C
    # ".h",    # C/C++ headers
    # ".cs",   # C#
    # ".rs",   # Rust
    # ".php",  # PHP
    # ".rb",   # Ruby
}

# ─── FUNCTIONS ────────────────────────────────────────────────────────────────

def clone_repos(git_urls, dest_dir):
    """
    Clone each Git URL in git_urls into dest_dir.
    If the folder already exists, skip cloning.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)
    for url in git_urls:
        repo_name = url.rstrip("/").split("/")[-1].removesuffix(".git")
        repo_path = dest_dir / repo_name

        if repo_path.exists():
            print(f"[SKIP] Already cloned: {repo_name}")
            continue

        print(f"[CLONING] {url} → {repo_path}")
        try:
            subprocess.run(
                ["git", "clone", "--depth", "1", url, str(repo_path)],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            print(f"[OK] Cloned {repo_name}")
        except subprocess.CalledProcessError as e:
            print(f"[ERROR] Failed to clone {url}: {e.stderr.decode().strip()}")


def should_ignore_dir(dir_name):
    """
    Return True if dir_name is in our IGNORED_DIR_NAMES list.
    """
    return dir_name in IGNORED_DIR_NAMES


def normalize_repo(src_repo_path: Path, dst_base_path: Path):
    """
    Walk through src_repo_path, skipping any folder listed in IGNORED_DIR_NAMES.
    Copy only files whose extension is in CODE_EXTENSIONS into the parallel
    folder under dst_base_path / <repo_name> / <relative_path>.
    """
    repo_name = src_repo_path.name
    dst_repo_root = dst_base_path / repo_name
    print(f"\n[PROCESS] Normalizing: {repo_name}")
    for root, dirs, files in os.walk(src_repo_path):
        root_path = Path(root)

        # 1) Prune ignored subfolders in-place:
        #    Modify dirs[:] so that walk() won’t descend into them.
        dirs[:] = [d for d in dirs if not should_ignore_dir(d)]
        # Now `dirs` no longer contains any IGNORE names, so walk() skips them.

        # 2) For each file in this kept folder, check extension:
        for fname in files:
            ext = Path(fname).suffix.lower()
            if ext not in CODE_EXTENSIONS:
                continue  # skip non‐code files

            # Compute source path and destination path:
            rel_path = root_path.relative_to(src_repo_path)
            src_file = root_path / fname
            dst_folder = dst_repo_root / rel_path
            dst_folder.mkdir(parents=True, exist_ok=True)

            dst_file = dst_folder / fname
            shutil.copy2(src_file, dst_file)
            print(f"  • Copied: {rel_path / fname}")

    print(f"[DONE] Normalized files for {repo_name} → {dst_repo_root}")


def main():
    # Step 1: Clone all repos into BASE_CLONE_DIR/
    clone_repos(GIT_URLS, BASE_CLONE_DIR)

    # Step 2: For each cloned repo, run the “normalize” pass:
    BASE_NORMALIZED_DIR.mkdir(parents=True, exist_ok=True)

    for repo_dir in BASE_CLONE_DIR.iterdir():
        if not repo_dir.is_dir():
            continue
        # Normalize this repository:
        normalize_repo(repo_dir, BASE_NORMALIZED_DIR)


if __name__ == "__main__":
    main()


[CLONING] https://github.com/sktime/sktime → repos\sktime
[OK] Cloned sktime
[CLONING] https://github.com/karask/python-bitcoin-utils → repos\python-bitcoin-utils
[OK] Cloned python-bitcoin-utils

[PROCESS] Normalizing: python-bitcoin-utils
  • Copied: setup.py
  • Copied: bitcoinutils\bech32.py
  • Copied: bitcoinutils\block.py
  • Copied: bitcoinutils\constants.py
  • Copied: bitcoinutils\hdwallet.py
  • Copied: bitcoinutils\keys.py
  • Copied: bitcoinutils\proxy.py
  • Copied: bitcoinutils\ripemd160.py
  • Copied: bitcoinutils\schnorr.py
  • Copied: bitcoinutils\script.py
  • Copied: bitcoinutils\setup.py
  • Copied: bitcoinutils\transactions.py
  • Copied: bitcoinutils\utils.py
  • Copied: bitcoinutils\__init__.py
  • Copied: docs\conf.py
  • Copied: examples\block_parse.py
  • Copied: examples\create_and_mine_block.py
  • Copied: examples\create_non_std_tx.py
  • Copied: examples\create_p2sh_csv_p2pkh_address.py
  • Copied: examples\hd_keys.py
  • Copied: examples\keys_addresses.p

In [None]:
import os
import ast
from pathlib import Path
from neo4j import GraphDatabase

# ─── CONFIGURATION ────────────────────────────────────────────────────────────

# (1) Where your normalized repositories live:
NORMALIZED_BASE = Path("normalized_repos")

# (2) Neo4j connection info (adjust as needed):
NEO4J_URI="neo4j+s://edaa4b0b.databases.neo4j.io"
NEO4J_USERNAME="neo4j"
NEO4J_PASSWORD="QbKbT-bffZwN3f7qKD6QwjRgw7sHu8Lnz1kM_Aj6c3Q"
NEO4J_DATABASE="neo4j"

# ─── HELPERS FOR AST PARSING ───────────────────────────────────────────────────

def get_module_name(repo_root: Path, file_path: Path) -> str:
    """
    Given:
      repo_root = normalized_repos/repoA
      file_path = normalized_repos/repoA/subpkg/helper.py
    Return the Python module string: "repoA.subpkg.helper"
    """
    rel = file_path.relative_to(repo_root).with_suffix("")  # e.g. subpkg/helper
    parts = [repo_root.name] + list(rel.parts)             # ["repoA", "subpkg", "helper"]
    return ".".join(parts)

def collect_python_files(normalized_base: Path):
    """
    Walk through each repo folder in `normalized_base` and return a list of tuples:
      [
        (repo_name(str), file_path(Path), module_name(str)), 
        ...
      ]
    """
    all_files = []
    for repo_dir in normalized_base.iterdir():
        if not repo_dir.is_dir():
            continue
        repo_name = repo_dir.name
        for root, _, files in os.walk(repo_dir):
            root_path = Path(root)
            for fname in files:
                if fname.lower().endswith(".py"):
                    fpath = root_path / fname
                    module_name = get_module_name(repo_dir, fpath)
                    all_files.append((repo_name, fpath, module_name))
    return all_files

def parse_defs_and_imports(file_path: Path):
    """
    Parse a .py file and return three lists/dicts:
      - functions:   { func_name: ast.FunctionDef node }
      - classes:     { class_name: ast.ClassDef node }
      - imports:     [ (imported_module_str, lineno) , ... ]  
                      e.g. if file has `import subpkg.helper as h`, record ("subpkg.helper", lineno)
                      or  if file has `from subpkg.foo import bar`, record ("subpkg.foo.bar", lineno)
    """
    with open(file_path, "r", encoding="utf-8") as f:
        source = f.read()

    tree = ast.parse(source, filename=str(file_path))
    funcs = {}
    classes = {}
    imports = []

    

    for node in ast.walk(tree):
        # ─── Collect function defs ────────────────────────────────────────────────
        if isinstance(node, ast.FunctionDef):
            funcs[node.name] = node

        # ─── Collect class defs ───────────────────────────────────────────────────
        elif isinstance(node, ast.ClassDef):
            classes[node.name] = node

        # ─── Collect import statements ────────────────────────────────────────────
        elif isinstance(node, ast.Import):
            for alias in node.names:
                # e.g. "import subpkg.helper as h" → record "subpkg.helper"
                imports.append((alias.name, node.lineno))

        elif isinstance(node, ast.ImportFrom):
            module = node.module or ""
            for alias in node.names:
                # e.g. "from subpkg import helper" → record "subpkg.helper"
                full_name = f"{module}.{alias.name}" if module else alias.name
                imports.append((full_name, node.lineno))

    return funcs, classes, imports

def collect_call_edges(func_node: ast.FunctionDef):
    """
    Given an ast.FunctionDef, collect all simple calls of the form:
      foo(...)
    or
      self.foo(...)
    This returns a set of call-names (strings). We’ll only link a CALLS edge
    if we later find that name in the same file’s function map.
    """
    calls = set()

    class _CallVisitor(ast.NodeVisitor):
        def visit_Call(self, node: ast.Call):
            # If it’s a bare name, e.g. foo(...)
            if isinstance(node.func, ast.Name):
                calls.add(node.func.id)
            # If it’s an attribute, e.g. self.foo(...)
            elif isinstance(node.func, ast.Attribute):
                if isinstance(node.func.attr, str):
                    calls.add(node.func.attr)
            self.generic_visit(node)

    _CallVisitor().visit(func_node)
    return calls

def collect_inheritance_edges(class_node: ast.ClassDef):
    """
    For a ClassDef, return a list of parent‐class-names (strings). Only simple names:
      class Child(ParentA, ParentB): ...
    or
      class Child(module.ParentC): ...
    We’ll try to link “INHERITS” to any Class in our graph with a matching name.
    """
    bases = []
    for base in class_node.bases:
        if isinstance(base, ast.Name):
            bases.append(base.id)  # e.g. ParentA
        elif isinstance(base, ast.Attribute):
            # e.g. module.ParentC
            name_parts = []
            cur = base
            while isinstance(cur, ast.Attribute):
                name_parts.append(cur.attr)
                cur = cur.value
            if isinstance(cur, ast.Name):
                name_parts.append(cur.id)
                full = ".".join(reversed(name_parts))
                bases.append(full)
        # other complexities (e.g. subscripts, calls) are skipped
    return bases

# ─── BUILD A REPO‐WIDE INDEX MAPS FOR RESOLUTION ───────────────────────────────

def build_repo_index(all_files):
    """
    From the list of all_files = [(repo_name, Path, module_name), ...], build:
      - module_to_file: { "repoA.subpkg.helper": Path(...) }
      - file_to_module: { Path(...): "repoA.subpkg.helper" }
    """
    module_to_file = {}
    file_to_module = {}
    for repo_name, fpath, module_name in all_files:
        module_to_file[module_name] = fpath
        file_to_module[fpath] = module_name
    return module_to_file, file_to_module

# ─── NEO4J WRAPPER ────────────────────────────────────────────────────────────

class Neo4jGraphBuilder:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def run(self, query: str, params: dict = None, return_results: bool = False):
        with self.driver.session() as sess:
            if params:
                result = sess.run(query, params)
            else:
                result = sess.run(query)
            if return_results:
                return list(result)
        # For write queries, just execute and return nothing
    def create_repo_node(self, repo_name: str):
        """
        MERGE (r:Repo {name: $repo_name})
        """
        self.run(
            """
            MERGE (r:Repo {name: $repo_name})
            """,
            {"repo_name": repo_name}
        )

    def create_file_node(self, repo_name: str, file_path: str, module_name: str):
        """
        MERGE (f:File {path: $file_path})
        WITH f
        MATCH (r:Repo {name: $repo_name})
        MERGE (r)-[:CONTAINS]->(f)
        SET f.module = $module_name, f.repo = $repo_name
        """
        self.run(
            """
            MERGE (f:File {path: $file_path})
            ON CREATE SET f.module = $module_name, f.repo = $repo_name
            """,
            {
                "file_path": file_path,
                "module_name": module_name,
                "repo_name": repo_name
            }
        )
        # Link Repo→File
        self.run(
            """
            MATCH (r:Repo {name: $repo_name}), (f:File {path: $file_path})
            MERGE (r)-[:CONTAINS]->(f)
            """,
            {"repo_name": repo_name, "file_path": file_path}
        )

    def create_function_node(self, repo_name: str, file_path: str,
                             module_name: str, func_name: str, lineno: int):
        """
        MERGE (fn:Function {qualified_name: $qualified_name})
        SET fn.name = $func_name, fn.lineno = $lineno, fn.file = $file_path, fn.repo = $repo_name
        WITH fn
        MATCH (f:File {path: $file_path})
        MERGE (f)-[:DECLARES_FUNCTION]->(fn)
        """
        qualified_name = f"{module_name}.{func_name}"
        self.run(
            """
            MERGE (fn:Function {qualified_name: $qualified_name})
            ON CREATE SET fn.name = $func_name, fn.lineno = $lineno, fn.file = $file_path, fn.repo = $repo_name
            """,
            {
                "qualified_name": qualified_name,
                "func_name": func_name,
                "lineno": lineno,
                "file_path": file_path,
                "repo_name": repo_name
            }
        )
        # Link File→Function
        self.run(
            """
            MATCH (fn:Function {qualified_name: $qualified_name}), (f:File {path: $file_path})
            MERGE (f)-[:DECLARES_FUNCTION]->(fn)
            """,
            {
                "qualified_name": qualified_name,
                "file_path": file_path
            }
        )

    def create_class_node(self, repo_name: str, file_path: str,
                          module_name: str, class_name: str, lineno: int):
        """
        MERGE (cl:Class {qualified_name: $qualified_name})
        SET cl.name = $class_name, cl.lineno = $lineno, cl.file = $file_path, cl.repo = $repo_name
        WITH cl
        MATCH (f:File {path: $file_path})
        MERGE (f)-[:DECLARES_CLASS]->(cl)
        """
        qualified_name = f"{module_name}.{class_name}"
        self.run(
            """
            MERGE (cl:Class {qualified_name: $qualified_name})
            ON CREATE SET cl.name = $class_name, cl.lineno = $lineno, cl.file = $file_path, cl.repo = $repo_name
            """,
            {
                "qualified_name": qualified_name,
                "class_name": class_name,
                "lineno": lineno,
                "file_path": file_path,
                "repo_name": repo_name
            }
        )
        # Link File→Class
        self.run(
            """
            MATCH (cl:Class {qualified_name: $qualified_name}), (f:File {path: $file_path})
            MERGE (f)-[:DECLARES_CLASS]->(cl)
            """,
            {
                "qualified_name": qualified_name,
                "file_path": file_path
            }
        )

    def create_import_edge(self, from_file: str, to_file: str):
        """
        MATCH (src:File {path: $from_file}), (dst:File {path: $to_file})
        MERGE (src)-[:IMPORTS]->(dst)
        """
        self.run(
            """
            MATCH (src:File {path: $from_file}), (dst:File {path: $to_file})
            MERGE (src)-[:IMPORTS]->(dst)
            """,
            {"from_file": from_file, "to_file": to_file}
        )

    def create_call_edge(self, from_func: str, to_func: str):
        """
        MATCH (f1:Function {qualified_name: $from_func}), (f2:Function {qualified_name: $to_func})
        MERGE (f1)-[:CALLS]->(f2)
        """
        self.run(
            """
            MATCH (f1:Function {qualified_name: $from_func}), (f2:Function {qualified_name: $to_func})
            MERGE (f1)-[:CALLS]->(f2)
            """,
            {"from_func": from_func, "to_func": to_func}
        )

    def create_inherits_edge(self, child_cls: str, parent_cls: str):
        """
        MATCH (c:Class {qualified_name: $child_cls}), (p:Class {qualified_name: $parent_cls})
        MERGE (c)-[:INHERITS]->(p)
        """
        self.run(
            """
            MATCH (c:Class {qualified_name: $child_cls}), (p:Class {qualified_name: $parent_cls})
            MERGE (c)-[:INHERITS]->(p)
            """,
            {"child_cls": child_cls, "parent_cls": parent_cls}
        )


In [27]:

    # 1) Collect all Python files in all normalized repos:
all_files = collect_python_files(NORMALIZED_BASE)
    # all_files: [ (repo_name, Path("/.../repoA/foo.py"), "repoA.foo"), … ]

    # 2) Build module<->file maps across the entire dataset:
module_to_file, file_to_module = build_repo_index(all_files)

# 3) Connect to Neo4j:
graph = Neo4jGraphBuilder(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)

# 4) Create Repo nodes up‐front:
unique_repos = sorted({ repo for (repo, _, _) in all_files })

for  repo_name in unique_repos:
     graph.create_repo_node(repo_name)

    

In [7]:
# 5) For each .py file, parse and create File/Function/Class nodes:
    #    Also record per‐file function_names & class_names for later resolution.
per_file_funcs = {}     # { Path("/.../foo.py") : set(["funcA", "funcB", …]) }
per_file_classes = {}   # { Path("/.../bar.py") : set(["ClassA", "ClassB", …]) }

for repo_name, fpath, module_name in all_files:
    file_str = str(fpath)
    # 5a) Create the File node and link to Repo:
    graph.create_file_node(repo_name, file_str, module_name)

    # 5b) Parse AST for defs and imports:
    funcs, classes, imports = parse_defs_and_imports(fpath)
    per_file_funcs[fpath] = set(funcs.keys())
    per_file_classes[fpath] = set(classes.keys())

    # 5c) Create Function nodes (with edges File→Function):
    for func_name, node in funcs.items():
            graph.create_function_node(
                repo_name=repo_name,
                file_path=file_str,
                module_name=module_name,
                func_name=func_name,
                lineno=node.lineno
            )

    # 5d) Create Class nodes (with edges File→Class):
    for class_name, node in classes.items():
        graph.create_class_node(
            repo_name=repo_name,
            file_path=file_str,
            module_name=module_name,
            class_name=class_name,
            lineno=node.lineno
        )

    # 6) SECOND PASS: Create IMPORTS, CALLS, and INHERITS edges.
for repo_name, fpath, module_name in all_files:
    file_str = str(fpath)
    funcs, classes, imports = parse_defs_and_imports(fpath)

    # 6a) IMPORTS edges:
    for full_modname, _lineno in imports:
            # Try to resolve full_modname to a file in our normalized_repos:
            # E.g. if full_modname=="repoA.subpkg.helper", module_to_file[...] exists.
            if full_modname in module_to_file:
                target_path = module_to_file[full_modname]
                graph.create_import_edge(file_str, str(target_path))
            # Otherwise, skip (external-library or built-in)

    # 6b) CALLS edges (only if the callee is in the same file):
    for func_name, node in funcs.items():
        from_qualified = f"{module_name}.{func_name}"
        called_names = collect_call_edges(node)
        for callee in called_names:
            # If the callee name exists in this same file’s function set:
            if callee in per_file_funcs[fpath]:
                to_qualified = f"{module_name}.{callee}"
                graph.create_call_edge(from_qualified, to_qualified)
            # else: skip cross‐file call resolution (would require deeper analysis)

    # 6c) INHERITS edges (link a Class to its parent if parent is in any repo):
    for class_name, node in classes.items():
        child_qual = f"{module_name}.{class_name}"
        parents = collect_inheritance_edges(node)  # e.g. ["ParentA", "pkg.BaseClass"]
        for parent in parents:
            # Two possibilities for parent resolution:
            #  - parent is a local class in this same file
            #  - parent is a class defined elsewhere in our dataset, so we search all module paths
            # We’ll do a simple search: look for any Class whose qualified_name endswith f".{parent}"
            # (NOTE: this may match multiple results if there are name collisions)
            query = """
            MATCH (p:Class)
            WHERE p.qualified_name ENDS WITH $suffix
            RETURN p.qualified_name AS qname
            """
            # Materialize all matching records into a Python list first:
            records = list(graph.run(query, {"suffix": f".{parent}"}))
            for record in records:
                parent_qname = record["qname"]
                graph.create_inherits_edge(child_qual, parent_qname)

    graph.close()
    print("\n✅ Finished building Python‐code graph into Neo4j.")



✅ Finished building Python‐code graph into Neo4j.


  with self.driver.session() as sess:


TypeError: 'NoneType' object is not iterable

In [None]:
import requests

response = requests.get("https://french.braidpool.net:11435",
                        verify="C:/Users/keshav/Downloads/rootCA.crt",
                        auth=('keshav', 'T6wJXf36pS9Izdjy'))
print(response.content)

b'Ollama is running'


In [3]:
import requests
from requests.auth import HTTPBasicAuth

def use_ollama_llm(text: str):
    url = "https://french.braidpool.net:11435/api/embed"
    auth = HTTPBasicAuth("keshav", "T6wJXf36pS9Izdjy")
    data = {
        "model": "mahonzhan/bge-code-v1",
        "input": text,
    }
    response = requests.post(url, json=data, auth=auth, verify="C:/Users/keshav/Downloads/rootCA.crt")

    return response.json()['embeddings']

In [9]:
def use_ollama_llm_normal(context: str, question: str, model: str = "qwen3:14b"):
    url = "https://french.braidpool.net:11435/api/generate"
    auth = HTTPBasicAuth("keshav", "T6wJXf36pS9Izdjy")
    data = {
        "model": model,
        "system_prompt": context,
        "prompt": question,
        "stream": False
    }
    response = requests.post(url, json=data, auth=auth, verify=False)

    return response.json()['response']

In [100]:
print(use_ollama_llm_normal("Hello, how are you?", "Hello, how are you?"))



<think>
Okay, the user greeted me with "Hello, how are you?" I need to respond appropriately. First, I should acknowledge their greeting. Since I'm an AI, I don't have feelings, but I can express that I'm here to help. Maybe add a friendly emoji to keep it warm. Then, ask them how they're doing to keep the conversation going. Keep it simple and positive. Let me check if that makes sense. Yeah, that should work. Make sure there's no markdown and the response is natural.
</think>

Hello! I'm great, thank you for asking! 😊 How are you today? I'm here and ready to help with anything you need!


In [98]:
texts = ["First sentence.", "Second sentence.", "Third sentence."]
embeddings = [use_ollama_llm(text) for text in texts]
print(embeddings)

[[[-0.0034917847, 0.0032866402, -0.0032140934, -0.014283796, -0.0014642003, -0.016631711, 0.0022007187, -0.0025887492, -0.0046965, 0.0038079962, -0.002552822, -0.022914026, -0.002655776, 0.00030838896, 0.0006372391, -0.0027045726, 0.005751366, 0.017819963, 0.0032527568, -0.008371747, 0.0036593755, 0.0050342227, -0.014303696, -0.0020244983, -0.0010629746, -0.00089740014, -0.008360119, -0.004763837, -0.0053643333, 0.0039991094, 0.0058296192, 0.007613646, -0.0014087782, 0.015906658, 0.002783705, 0.0073660105, -0.0017175172, 0.0036108692, -0.0066793524, -0.010861308, -0.00018159734, -0.009018729, -0.0061074416, -0.009455501, -0.0031793143, 0.00034984024, -0.0022949874, 0.0064598876, 0.0068487884, -0.009286705, -0.0014139402, 0.006066126, 0.006784024, 0.0032816054, 0.0026838211, 0.0035185253, -0.0003400133, -0.0012364232, -0.007142598, 0.004401064, 0.002922662, 0.0013684604, 0.012161481, -0.005085791, -0.0108408, -0.0014782388, -0.0067688483, 0.013669897, -0.0005067728, -0.011917586, -0.003

In [103]:
import weaviate
from weaviate.classes.init import Auth


weaviate_url = "sxfou7uostmqhdla3a.c0.asia-southeast1.gcp.weaviate.cloud"
weaviate_api_key = "bDdwQW9odUoxNHM0QU1MU19KRlg2VDE3VGhiN1B2NUtBRk95cHRhOUpMRnFHMVMyeXMyVVdNMnlYSEhrPV92MjAw"

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

print(client.is_ready())



True


In [46]:
from weaviate.classes.config import Property, DataType


collection_name = "CodeNode"

# Delete the class if it already exists (for testing)
# if client.collections.exists(collection_name):
#     client.collections.delete(collection_name)

# client.collections.create(
#     collection_name,
#     properties=[
#         Property(name="qualified_name", data_type=DataType.TEXT),
#         Property(name="type", data_type=DataType.TEXT),
#         Property(name="module", data_type=DataType.TEXT),
#         Property(name="code", data_type=DataType.TEXT),
#     ],
#     # No vectorizer_config if you supply your own vectors
# )

In [14]:
def fetch_code_nodes_with_location(driver):
    with driver.session() as session:
        result = session.run("""
            MATCH (n)
            WHERE n:Function OR n:Class
            RETURN n.qualified_name AS qualified_name,
                   n.name AS name,
                   n.lineno AS lineno,
                   n.file AS file,
                   labels(n)[0] AS type
        """)
        return [record.data() for record in result]
        

In [70]:
import ast

def extract_code_snippet(file_path, node_name, node_type):
    with open(file_path, "r", encoding="utf-8") as f:
        source = f.read()
    tree = ast.parse(source, filename=str(file_path))
    for node in ast.walk(tree):
        if node_type == "Function" and isinstance(node, ast.FunctionDef) and node.name == node_name:
            return ast.get_source_segment(source, node)
        elif node_type == "Class" and isinstance(node, ast.ClassDef) and node.name == node_name:
            return ast.get_source_segment(source, node)
        elif node_type == "Module" and isinstance(node, ast.Module) and node.name == node_name:
            return ast.get_source_segment(source, node)
        elif node_type == "Import" and isinstance(node, ast.Import) and node.name == node_name:
            return ast.get_source_segment(source, node)
    return None  # Not found

In [104]:
from neo4j import GraphDatabase

NEO4J_URI="neo4j+s://edaa4b0b.databases.neo4j.io"
NEO4J_USERNAME="neo4j"
NEO4J_PASSWORD="QbKbT-bffZwN3f7qKD6QwjRgw7sHu8Lnz1kM_Aj6c3Q"
NEO4J_DATABASE="neo4j"

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [18]:
import uuid
import weaviate.classes as wvc # Make sure this import is present

# collection_name should be defined, e.g., collection_name = "CodeNode"

for idx, node in enumerate(fetch_code_nodes_with_location(driver)):
    # if idx > 10: break  # Limit for testing
    
    qualified_name = node["qualified_name"] # Get qualified_name early for the check

    # Check if the node already exists in Weaviate
    try:
        response = client.collections.get(collection_name).query.fetch_objects(
            filters=wvc.query.Filter.by_property("qualified_name").equal(qualified_name),
            limit=1 # We only need to know if at least one exists
        )
        
        if len(response.objects) > 0:
            print(f"Skipping {qualified_name}: already exists in Weaviate.")
            continue # Skip to the next node
            
    except Exception as e:
        print(f"Error checking Weaviate for {qualified_name}: {e}")
        # Optionally, decide if you want to stop or continue if the check fails
        # For now, let's continue to the next node if the check itself fails
        continue

    # If the node does not exist, proceed with extraction, embedding, and insertion
    file_path = node["file"]
    node_name = node["name"]
    node_type = node["type"]
    code = extract_code_snippet(file_path, node_name, node_type)
    
    if not code:
        print(f"Skipping {qualified_name}: no code snippet extracted.")
        continue
        
    try:
        emb_response = use_ollama_llm(code) # Renamed to emb_response to avoid conflict
        
        # Handle different embedding response structures
        if isinstance(emb_response, dict) and "embedding" in emb_response:
            emb = emb_response["embedding"]
        elif isinstance(emb_response, list) and len(emb_response) > 0 and isinstance(emb_response[0], (float, int)): # Simpler check for flat list
             emb = emb_response
        elif isinstance(emb_response, list) and len(emb_response) == 1 and isinstance(emb_response[0], list): # Nested list
            emb = emb_response[0]
        else:
            print(f"Skipping {qualified_name}: Unexpected embedding format from Ollama: {type(emb_response)}")
            continue

        # Validate embedding
        if not (isinstance(emb, list) and len(emb) == 1536):
            actual_len = len(emb) if isinstance(emb, list) else "N/A"
            print(f"Skipping {qualified_name}: embedding has wrong length {actual_len} or type {type(emb)}. Expected list of 1536.")
            continue
            
        # print(f"Embedding for {qualified_name}: {str(emb[:5])[:-1]}..., len: {len(emb)}") # Debug print

    except Exception as e:
        print(f"Embedding failed for {qualified_name}: {e}")
        continue

    # Insert into Weaviate
    try:
        client.collections.get(collection_name).data.insert(
            properties={
                "qualified_name": qualified_name,
                "type": node_type,
                "module": node.get("module", ""),
                "code": code,
            },
            vector=emb,
            uuid=str(uuid.uuid4()) # Generate a new UUID for each new object
        )
        print(f"Inserted: {qualified_name}")
    except Exception as e:
        print(f"Failed to insert {qualified_name} into Weaviate: {e}")



Skipping python-bitcoin-utils.bitcoinutils.bech32.bech32_polymod: already exists in Weaviate.
Skipping python-bitcoin-utils.bitcoinutils.bech32.bech32_hrp_expand: already exists in Weaviate.
Skipping python-bitcoin-utils.bitcoinutils.bech32.bech32_verify_checksum: already exists in Weaviate.
Skipping python-bitcoin-utils.bitcoinutils.bech32.bech32_create_checksum: already exists in Weaviate.
Skipping python-bitcoin-utils.bitcoinutils.bech32.bech32_encode: already exists in Weaviate.
Skipping python-bitcoin-utils.bitcoinutils.bech32.bech32_decode: already exists in Weaviate.
Skipping python-bitcoin-utils.bitcoinutils.bech32.convertbits: already exists in Weaviate.
Skipping python-bitcoin-utils.bitcoinutils.bech32.decode: already exists in Weaviate.
Skipping python-bitcoin-utils.bitcoinutils.bech32.encode: already exists in Weaviate.
Skipping python-bitcoin-utils.bitcoinutils.bech32.Encoding: already exists in Weaviate.
Skipping python-bitcoin-utils.bitcoinutils.block.__init__: already e

In [60]:
# Make sure you have APOC procedures enabled in your Neo4j instance for the apoc.do.when call.

def get_node_details(driver, qualified_name):
    with driver.session() as session:
        query = """
        MATCH (n {qualified_name: $qname})
        // 'n.file' is the property storing the file path for Function/Class nodes
        // as per your Neo4jGraphBuilder script.
        WITH n, labels(n)[0] AS node_type, n.file AS node_file_path 

        OPTIONAL MATCH (parent_class:Class)-[:DECLARES_FUNCTION]->(n) WHERE node_type = 'Function'
        OPTIONAL MATCH (parent_class)-[:DECLARES_FUNCTION]->(sibling_function:Function) WHERE node_type = 'Function' AND n <> sibling_function
        OPTIONAL MATCH (n)-[:CALLS]->(func_called:Function) WHERE node_type = 'Function'
        OPTIONAL MATCH (func_calling:Function)-[:CALLS]->(n) WHERE node_type = 'Function'

        OPTIONAL MATCH (n)-[:DECLARES_FUNCTION]->(declared_method:Function) WHERE node_type = 'Class' // Methods declared by the class
        OPTIONAL MATCH (declared_method)-[:CALLS]->(class_method_calls:Function) WHERE node_type = 'Class' // Calls made by methods of this class
        OPTIONAL MATCH (caller_to_class_method:Function)-[:CALLS]->(declared_method) WHERE node_type = 'Class' // Functions calling methods of this class

        // Fetch import references based on your (File)-[:IMPORTS]->(File) structure
        // The target File's 'module' property is what we want.
        CALL apoc.do.when(node_file_path IS NOT NULL AND trim(node_file_path) <> "",
          'MATCH (source_file:File {path: $nfp})-[:IMPORTS]->(imported_file:File) RETURN collect(imported_file.module) AS imports_list',
          'RETURN [] AS imports_list',
          {nfp: node_file_path}
        ) YIELD value AS import_data

        RETURN
            node_type,
            node_file_path, 
            import_data.imports_list AS file_import_references, // This now directly gives the list of module strings
            parent_class.qualified_name AS func_parent_class,
            collect(DISTINCT sibling_function.qualified_name) AS func_sibling_methods,
            collect(DISTINCT func_called.qualified_name) AS func_calls,
            collect(DISTINCT func_calling.qualified_name) AS func_called_by,
            collect(DISTINCT declared_method.qualified_name) AS class_declared_methods,
            collect(DISTINCT class_method_calls.qualified_name) AS class_calls_by_methods,
            collect(DISTINCT caller_to_class_method.qualified_name) AS class_called_by_callers_to_methods
        """
        result = session.run(query, {"qname": qualified_name}).single()

        if not result:
            return {"error": "Node not found"}

        node_type = result["node_type"]
        
        # The import_data.imports_list from Cypher is already the list of strings we want.
        file_imports_list = result["file_import_references"] if result["file_import_references"] else []
        
        details = {
            "node_type": node_type,
            "file_path": result["node_file_path"], # Changed from file_path to node_file_path to match Cypher
            "file_import_references": file_imports_list, 
            "parent_class": None,
            "methods_or_siblings": [],
            "calls": [],
            "called_by": []
        }

        if node_type == "Function":
            details["parent_class"] = result["func_parent_class"]
            details["methods_or_siblings"] = [s for s in result["func_sibling_methods"] if s] 
            details["calls"] = [c for c in result["func_calls"] if c]
            details["called_by"] = [cb for cb in result["func_called_by"] if cb]
        elif node_type == "Class":
            # For a class, its "methods_or_siblings" are its declared methods
            details["methods_or_siblings"] = [m for m in result["class_declared_methods"] if m]
            # Calls made by any method of this class
            details["calls"] = [c for c in result["class_calls_by_methods"] if c] 
            # Functions that call any method of this class
            details["called_by"] = [cb for cb in result["class_called_by_callers_to_methods"] if cb]
        else:
            return {"error": f"Unknown node type: {node_type} for {qualified_name}"}
            
        return details

In [49]:
from weaviate.classes.query import MetadataQuery

def semantic_search_weaviate(query, embed_fn, weaviate_client, collection_name, top_k=5):
    query_vec = embed_fn(query)
    if isinstance(query_vec, list) and len(query_vec) == 1 and isinstance(query_vec[0], list):
        query_vec = query_vec[0]
    print(query_vec[:2], len(query_vec))
    assert isinstance(query_vec, list) and len(query_vec) == 1536, "Embedding must be 1536-dimensional"

    collection = weaviate_client.collections.get(collection_name)
    results = collection.query.near_vector(
        near_vector=query_vec,
        limit=top_k,
        return_metadata=MetadataQuery(distance=True)
    )

    return results.objects


In [63]:
query = "How to create and mine blocks?"
results = semantic_search_weaviate(
    query=query,
    embed_fn=use_ollama_llm,
    weaviate_client=client,
    collection_name= "CodeNode",
    top_k=7
)

[-0.0034530205, 0.0040467414] 1536


In [64]:
import weaviate.classes as wvc # Ensure this is imported

def get_code_from_weaviate_by_qname(weaviate_client, collection_name, qualified_name):
    """
    Fetches the code snippet of a node from Weaviate given its qualified_name.
    """
    try:
        collection = weaviate_client.collections.get(collection_name)
        response = collection.query.fetch_objects(
            filters=wvc.query.Filter.by_property("qualified_name").equal(qualified_name),
            limit=1,
            return_properties=["code"]  # We only need the 'code' property
        )
        if response.objects:
            return response.objects[0].properties.get("code")
    except Exception as e:
        print(f"Error fetching code for {qualified_name} from Weaviate: {e}")
    return None

In [72]:
# Your existing loop:
for res in results: # Assuming 'results' are from your Weaviate semantic search
    props = res.properties
    qname = props['qualified_name']
    code_snippet = props.get('code', 'N/A') # From Weaviate
    node_type_weaviate = props.get('type', 'N/A') # Type from Weaviate
    distance = res.metadata.distance if res.metadata and hasattr(res.metadata, 'distance') else 'N/A'
    
    print(f"Found Node (from Weaviate): {qname}")
    print(f"Type (from Weaviate): {node_type_weaviate}")
    if isinstance(distance, float):
        print(f"Distance: {distance:.4f}")
    # print(f"Code Snippet:\n{code_snippet}\n") # Uncomment if you want to see the code

    # Fetch details from Neo4j using the new function
    # IMPORTANT: Make sure you are calling the correct, updated function here.
    # I previously suggested `get_node_details_with_imports`.
    # If you named your function `get_node_details` in the notebook, ensure it's the latest version.
    
    # Let's assume your updated function is called get_node_details_with_imports
    node_details = get_node_details(driver, qname) 
    
    print(f"--- Neo4j Details for {qname} ---") # Header for Neo4j info
    if "error" not in node_details:
        print(f"  Node Type (from Neo4j): {node_details.get('node_type', 'N/A')}")
        print(f"  File Path: {node_details.get('file_path', 'N/A')}")
        print(f"  File Import References: {node_details.get('file_import_references', [])}") # New field

        if node_details.get('node_type') == 'Function':
            print(f"  Parent Class: {node_details.get('parent_class', 'N/A')}")
            print(f"  Sibling Methods: {node_details.get('methods_or_siblings', [])}")
        elif node_details.get('node_type') == 'Class':
            print(f"  Declared Methods: {node_details.get('methods_or_siblings', [])}")
        
        # These are common to both Function and Class (for Class, it's calls by/to its methods)
        print(f"  Calls: {node_details.get('calls', [])}")
        print(f"  Called By: {node_details.get('called_by', [])}")
    else:
        print(f"  Error fetching Neo4j details: {node_details['error']}")
    print("---------------------------------------\n") # Footer for each item

Found Node (from Weaviate): python-bitcoin-utils.examples.create_and_mine_block.main
Type (from Weaviate): Function
Distance: 0.0565
--- Neo4j Details for python-bitcoin-utils.examples.create_and_mine_block.main ---
  Node Type (from Neo4j): Function
  File Path: normalized_repos\python-bitcoin-utils\examples\create_and_mine_block.py
  File Import References: []
  Parent Class: None
  Sibling Methods: []
  Calls: []
  Called By: []
---------------------------------------

Found Node (from Weaviate): sktime.sktime.libs.pykalman.tests.test_standard.data
Type (from Weaviate): Function
Distance: 0.0634
--- Neo4j Details for sktime.sktime.libs.pykalman.tests.test_standard.data ---
  Node Type (from Neo4j): Function
  File Path: normalized_repos\sktime\sktime\libs\pykalman\tests\test_standard.py
  File Import References: []
  Parent Class: None
  Sibling Methods: []
  Calls: []
  Called By: []
---------------------------------------

Found Node (from Weaviate): python-bitcoin-utils.examples.

In [106]:
# Ensure get_code_from_weaviate_by_qname is defined in your notebook
# import weaviate.classes as wvc # if not already imported

def format_rag_context(weaviate_results, neo4j_driver, get_relationships_fn, 
                       weaviate_client, collection_name, 
                       max_related_items=2, max_imports_to_expand=1):
    combined_context = []
    for i, res_obj in enumerate(weaviate_results):
        props = res_obj.properties
        qname = props.get('qualified_name', 'N/A')
        code_snippet = props.get('code', 'N/A')
        
        context_item = f"Context Item {i+1} (Retrieved Semantically from Weaviate):\n"
        context_item += f"  Qualified Name: {qname}\n"
        context_item += f"  Type (from Weaviate): {props.get('type', 'N/A')}\n"
        if res_obj.metadata and hasattr(res_obj.metadata, 'distance') and isinstance(res_obj.metadata.distance, float):
            context_item += f"  Relevance Score (distance): {res_obj.metadata.distance:.4f}\n"
        context_item += f"  Code Snippet:\n```python\n{code_snippet}\n```\n"

        relationships_data = get_relationships_fn(neo4j_driver, qname) # Using your improved function
        
        if "error" in relationships_data:
            context_item += f"  Error fetching Neo4j relationships: {relationships_data['error']}\n"
        else:
            neo4j_node_type = relationships_data.get("node_type", "Unknown")
            context_item += f"  Relationships (from Neo4j - Node Type: {neo4j_node_type}):\n"

            if neo4j_node_type == "Function":
                if relationships_data.get("parent_class"):
                    context_item += f"    - Part of Class: {relationships_data['parent_class']}\n"
                if relationships_data.get("methods_or_siblings"):
                    context_item += f"    - Sibling Methods: {', '.join(relationships_data['methods_or_siblings'])}\n"
            
            elif neo4j_node_type == "Class":
                if relationships_data.get("methods_or_siblings"): 
                    context_item += f"    - Declared Methods: {', '.join(relationships_data['methods_or_siblings'])}\n"
                
                imports = relationships_data.get("imports", [])
                if imports:
                    context_item += f"    - Imports from file ({relationships_data.get('file_path', 'N/A')}):\n"
                    expanded_imports_count = 0
                    for imp_detail in imports:
                        module_name = imp_detail.get('module', '')
                        item_name = imp_detail.get('name', '') # Specific item imported
                        alias = imp_detail.get('alias', '')
                        
                        import_str_display = "      - "
                        potential_qname_for_code = None

                        if module_name and item_name and item_name != '*': # e.g., from foo.bar import Baz
                            import_str_display += f"from {module_name} import {item_name}"
                            potential_qname_for_code = f"{module_name}.{item_name}"
                        elif module_name and not item_name: # e.g., import foo.bar
                            import_str_display += f"import {module_name}"
                            potential_qname_for_code = module_name # The module itself might be a class/func
                        elif module_name and item_name == '*': # e.g., from foo.bar import *
                            import_str_display += f"from {module_name} import *"
                            # potential_qname_for_code remains None, hard to resolve '*'
                        else: # Fallback for other cases or if parsing was incomplete
                            import_str_display += f"Import: module='{module_name}', name='{item_name}'"

                        if alias:
                            import_str_display += f" as {alias}"
                        context_item += import_str_display + "\n"

                        # Attempt to fetch code for the imported item (e.g., a base class)
                        # Heuristic: often base classes are capitalized or explicitly named 'Base...'
                        if potential_qname_for_code and expanded_imports_count < max_imports_to_expand and \
                           ( (item_name and item_name[0].isupper()) or \
                             (not item_name and module_name.split('.')[-1][0].isupper()) or \
                             ("Base" in potential_qname_for_code or "Abstract" in potential_qname_for_code) ):
                            context_item += f"        Attempting to fetch code for imported item: {potential_qname_for_code}\n"
                            imported_code = get_code_from_weaviate_by_qname(weaviate_client, collection_name, potential_qname_for_code)
                            if imported_code:
                                context_item += f"        Code for {potential_qname_for_code}:\n```python\n{imported_code}\n```\n"
                                expanded_imports_count += 1
                            else:
                                context_item += f"        (Code not found in Weaviate for {potential_qname_for_code})\n"
                    if not imports:
                         context_item += "      (No specific imports listed for this file in Neo4j)\n"

            # Common relationships: Calls and Called By
            called_fns = relationships_data.get("calls", [])
            if called_fns:
                context_item += f"    - {'Calls (by methods)' if neo4j_node_type == 'Class' else 'Calls'}:\n"
                for idx, called_qname in enumerate(called_fns[:max_related_items]):
                    # Fetch code for called_qname (already implemented in your previous version)
                    called_code = get_code_from_weaviate_by_qname(weaviate_client, collection_name, called_qname)
                    context_item += f"      - Name: {called_qname}\n"
                    if called_code:
                        context_item += f"        Code:\n```python\n{called_code}\n```\n"
                    else:
                        context_item += f"        (Code not found for {called_qname})\n"
                if len(called_fns) > max_related_items:
                    context_item += f"      - ... and {len(called_fns) - max_related_items} more.\n"

            calling_fns = relationships_data.get("called_by", [])
            if calling_fns:
                context_item += f"    - {'Called By (functions calling methods)' if neo4j_node_type == 'Class' else 'Called By'}:\n"
                for idx, calling_qname in enumerate(calling_fns[:max_related_items]):
                    # Fetch code for calling_qname (already implemented)
                    calling_code = get_code_from_weaviate_by_qname(weaviate_client, collection_name, calling_qname)
                    context_item += f"      - Name: {calling_qname}\n"
                    if calling_code:
                        context_item += f"        Code:\n```python\n{calling_code}\n```\n"
                    else:
                        context_item += f"        (Code not found for {calling_qname})\n"
                if len(calling_fns) > max_related_items:
                     context_item += f"      - ... and {len(calling_fns) - max_related_items} more.\n"
        
        combined_context.append(context_item)
        combined_context.append("---\n") 

    return "\n".join(combined_context)


def construct_llm_prompt(rag_context):
    """
    Constructs the final prompt for the LLM.
    """
    prompt = "Retrieved Context:\n"
    prompt += "------------------\n"
    prompt += rag_context
    return prompt


rag_context_str = format_rag_context(results, driver, get_code_relationships, client, "CodeNode", max_related_items=3)

# 3. Construct the final prompt for your LLM
final_prompt_for_llm = construct_llm_prompt(rag_context_str)
final_prompt_for_llm = "\n\nUser Query: " + query + "\n\n" + final_prompt_for_llm
print(final_prompt_for_llm)




User Query: How to create and mine blocks?

Retrieved Context:
------------------
Context Item 1 (Retrieved Semantically from Weaviate):
  Qualified Name: python-bitcoin-utils.examples.create_and_mine_block.main
  Type (from Weaviate): Function
  Relevance Score (distance): 0.0565
  Code Snippet:
```python
def main():

    # mock transaction details, this transaction would be the first transaction
    # of the block after the coinbase transaction
    tx_details = {
        "txid": "00000a2d1a9e29116b539b85b6e893213b1ed95a08b7526a8d59a4b088fc6571",
        "version": 1,
        "locktime": 0,
        "vin": [
            {
            "txid": "2e4843d552ca9487efd9e69c0359f05375b7de5449eb49510d17a25bb5b15ec0",
            "vout": 1,
            "prevout": {
                "scriptpubkey": "512065fd3d423ea46a70505248db989e7302bfbbdd64ee4193dd9a59f69894f0de48",
                "scriptpubkey_asm": "OP_PUSHNUM_1 OP_PUSHBYTES_32 65fd3d423ea46a70505248db989e7302bfbbdd64ee4193dd9a59f69894f0de

In [74]:
# Cell 1: Installation (run this once if you don't have the library)
# !pip install -U sentence-transformers -q

# Cell 2: Imports and Model Loading
from sentence_transformers import CrossEncoder

# Load a pre-trained cross-encoder model
# Models like 'cross-encoder/ms-marco-MiniLM-L-6-v2' are good for general relevance.
cross_encoder_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
try:
    reranker_model = CrossEncoder(cross_encoder_model_name)
    print(f"Cross-encoder model '{cross_encoder_model_name}' loaded successfully.")
except Exception as e:
    print(f"Error loading cross-encoder model: {e}")
    print("Please ensure you have an internet connection, the model name is correct, and consider running '!pip install sentence-transformers'.")
    reranker_model = None # Set to None if loading fails

  _unclosed_resource_warn(self)
  _deprecation_warn(
  _unclosed_resource_warn(self)
            Please make sure to close the connection using `client.close()`.
  for attr in assigned:
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Cross-encoder model 'cross-encoder/ms-marco-MiniLM-L-6-v2' loaded successfully.


In [107]:
# Cell 3: Re-ranking function
def rerank_with_cross_encoder(query, weaviate_results, cross_encoder, top_n_final=5):
    """
    Reranks a list of documents based on a query using a cross-encoder model.

    Args:
        query (str): The user's query.
        weaviate_results (list): A list of Weaviate result objects. 
                                 Each object must have a 'properties' attribute,
                                 which is a dict containing at least a 'code' field.
        cross_encoder (CrossEncoder): The loaded CrossEncoder model.
        top_n_final (int): The number of top documents to return after reranking.

    Returns:
        list: A new list of Weaviate result objects, sorted by the cross-encoder scores,
              containing up to top_n_final documents.
    """
    if not cross_encoder:
        print("Reranker model not loaded. Returning original top N results without reranking.")
        return weaviate_results[:top_n_final]
    if not weaviate_results:
        return []

    # Create pairs of [query, document_text] for the cross-encoder
    # We'll use the 'code' property from the Weaviate result objects
    pairs = []
    valid_docs_for_reranking = [] # Keep track of docs that actually go into pairs

    for doc_obj in weaviate_results:
        code_snippet = doc_obj.properties.get('code')
        if code_snippet and isinstance(code_snippet, str) and code_snippet.strip():
            pairs.append([query, code_snippet])
            valid_docs_for_reranking.append(doc_obj)
        else:
            # Optionally, handle documents without code snippets or with empty ones
            q_name = doc_obj.properties.get('qualified_name', 'Unknown QName')
            # print(f"Warning: Document '{q_name}' has no/empty code snippet, skipping for reranking.")
            pass


    if not pairs:
        # print("No valid (query, code_snippet) pairs to rerank from the provided documents.")
        # Fallback: return top_n_final from original results if no valid pairs
        return weaviate_results[:top_n_final] 

    # Get scores from the cross-encoder
    scores = cross_encoder.predict(pairs, show_progress_bar=False)

    # Combine valid documents with their new scores
    scored_documents = []
    for i, doc_obj in enumerate(valid_docs_for_reranking):
        # We'll add the score to a temporary structure for sorting.
        # It's not ideal to modify the Weaviate object directly unless it's a copy.
        scored_documents.append({'obj': doc_obj, 'rerank_score': scores[i]})

    # Sort documents by the new cross-encoder scores in descending order
    scored_documents.sort(key=lambda x: x['rerank_score'], reverse=True)

    # Extract the original Weaviate objects in the new order
    reranked_objects = [item['obj'] for item in scored_documents]
    
    return reranked_objects[:top_n_final]

In [108]:
from weaviate.classes.query import MetadataQuery # Already there
import weaviate.classes as wvc # For Filter, if you were to add it later, good practice to have


def semantic_search_weaviate(
    weaviate_client, 
    collection_name, 
    query_text, # Changed from 'query' to 'query_text' for clarity
    limit, # Changed from top_k to limit for consistency
    embedding_fn # Function to get embedding
):
    """
    Performs semantic search in Weaviate.

    Args:
        weaviate_client: The Weaviate client instance.
        collection_name (str): The name of the Weaviate collection.
        query_text (str): The user's query text.
        limit (int): The maximum number of results to return.
        embedding_fn (callable): Function that takes text and returns an embedding vector.
        ollama_api_url (str): URL for the Ollama API.
        ollama_model (str): Ollama model name for embeddings.
        ollama_headers (dict): Headers for Ollama API request.

    Returns:
        list: A list of Weaviate result objects.
    """
    # Get the embedding for the query text
    # Ensure your embedding_fn is correctly called with all its necessary parameters
    query_vec = embedding_fn(
        query_text
    )

    # Your existing embedding validation logic
    if isinstance(query_vec, list) and len(query_vec) == 1 and isinstance(query_vec[0], list) and len(query_vec[0]) > 1:
        query_vec = query_vec[0]
    
    # print(f"Query vector (first 2 dims, length): {query_vec[:2]}, {len(query_vec)}") # For debugging
    
    # Assuming your embeddings are 768-dimensional as per previous discussions.
    # Adjust if your Ollama model produces different dimensions (e.g., 1536, 4096).
    # The checkpoint mentioned 768 or 1536. Let's make it more flexible or check schema.
    # For now, I'll comment out the hardcoded assert to avoid breaking if you switch models.
    # A better approach would be to fetch expected dimension from Weaviate schema if possible.
    # assert isinstance(query_vec, list) and len(query_vec) == 768, \ # Or 1536, etc.
    #        f"Embedding must be of the configured dimension. Got: {len(query_vec)}"

    if not query_vec or not isinstance(query_vec, list) or not all(isinstance(x, (int, float)) for x in query_vec):
        print(f"Error: Invalid query vector generated for query: '{query_text}'. Vector: {query_vec}")
        return []

    collection = weaviate_client.collections.get(collection_name)
    
    # Define the properties you want Weaviate to return
    # These should match the properties needed by downstream functions (reranker, context formatter)
    properties_to_return = [
        "qualified_name", 
        "code", 
        "module"
    ]

    try:
        results = collection.query.near_vector(
            near_vector=query_vec,
            limit=limit, # Use the passed limit
            return_metadata=MetadataQuery(distance=True), # Already there, good
            return_properties=properties_to_return  # Explicitly ask for properties
        )
        return results.objects
    except Exception as e:
        print(f"Error during Weaviate semantic search for query '{query_text}': {e}")
        # print(f"Query vector that caused error (first 5 dims): {query_vec[:5]}") # For debugging
        return []


In [112]:
# Cell 4: Updated Main Workflow (example)

COLLECTION_NAME = "CodeNode"
# Example User Query
user_query = "how to create and mine blocks?" 

# 1. Initial Semantic Search from Weaviate
#    Retrieve more candidates than your final desired K for the LLM.
initial_retrieve_k = 15 # Number of candidates to fetch initially
print(f"Step 1: Performing initial semantic search for top {initial_retrieve_k} candidates...")

# Make sure your semantic_search_weaviate function can accept a 'limit'
# and returns all necessary properties for the reranker and context formatter.
initial_weaviate_results = semantic_search_weaviate(
    weaviate_client=client,  # Your Weaviate client
    collection_name=COLLECTION_NAME, # Your collection name
    query_text=user_query,
    limit=initial_retrieve_k,
    embedding_fn=use_ollama_llm,
)

if not initial_weaviate_results:
    print("No initial results from Weaviate. Cannot proceed with reranking.")
else:
    print(f"Retrieved {len(initial_weaviate_results)} initial candidates from Weaviate.")
    # for i, res_obj in enumerate(initial_weaviate_results):
    #     print(f"  Initial {i+1}: {res_obj.properties.get('qualified_name')} (Dist: {res_obj.metadata.distance if res_obj.metadata else 'N/A'})")


    # 2. Rerank the initial results using the Cross-Encoder
    final_top_k = 5  # Number of candidates to pass to LLM after reranking
    print(f"\nStep 2: Reranking top {len(initial_weaviate_results)} candidates down to {final_top_k}...")
    
    reranked_weaviate_results = rerank_with_cross_encoder(
        query=user_query,
        weaviate_results=initial_weaviate_results,
        cross_encoder=reranker_model, # The loaded model
        top_n_final=final_top_k
    )
    
    print(f"Re-ranked down to {len(reranked_weaviate_results)} candidates.")
    # for i, res_obj in enumerate(reranked_weaviate_results):
        # The rerank_score is not directly attached to res_obj here,
        # but you could modify rerank_with_cross_encoder to return scores if needed for inspection.
        # print(f"  Reranked {i+1}: {res_obj.properties.get('qualified_name')}")


    # 3. Format RAG context using the reranked results
    print(f"\nStep 3: Formatting RAG context for top {len(reranked_weaviate_results)} reranked candidates...")
    rag_context_str = format_rag_context( # Your existing context formatting function
        weaviate_results=reranked_weaviate_results, 
        neo4j_driver=driver, 
        get_relationships_fn=get_node_details, # Your Neo4j details function
        weaviate_client=client,
        collection_name=COLLECTION_NAME,
        max_related_items=2, 
        max_imports_to_expand=1 
    )

    # 4. Construct final LLM prompt
    print("\nStep 4: Constructing final prompt for LLM...")
    final_prompt_for_llm = construct_llm_prompt(rag_context_str)
    final_prompt_for_llm = "User Input Query: " + user_query + "\n\n" + final_prompt_for_llm
    print("\n--- Final RAG Context for LLM (using reranked results) ---")
    print(final_prompt_for_llm)
    
    # Next, you would send final_prompt_for_llm to your generative LLM.

Step 1: Performing initial semantic search for top 15 candidates...
Retrieved 15 initial candidates from Weaviate.

Step 2: Reranking top 15 candidates down to 5...
Re-ranked down to 5 candidates.

Step 3: Formatting RAG context for top 5 reranked candidates...

Step 4: Constructing final prompt for LLM...

--- Final RAG Context for LLM (using reranked results) ---
User Input Query: how to create and mine blocks?

Retrieved Context:
------------------
Context Item 1 (Retrieved Semantically from Weaviate):
  Qualified Name: python-bitcoin-utils.examples.create_and_mine_block.mine_block
  Type (from Weaviate): N/A
  Relevance Score (distance): 0.0653
  Code Snippet:
```python
def mine_block(block_header_bytes, target_hex):
    """
    Mine a block by iterating through nonce values.

    Args:
        block_header_bytes (bytes): The 80-byte block header with a placeholder nonce.
        target_hex (str): The difficulty target as a hex string (256-bit number).

    Returns:
        (int, b

In [116]:
system_template_for_rag = (
    "You are an expert code generation assistant. You will be provided with context of relevant code snippets. Use them to generate final code working solution based on user query. You need not output anything else."
    
)
final = system_template_for_rag + "\n\n" + final_prompt_for_llm
llm_response = use_ollama_llm_normal(
    system_template_for_rag,
    final,
    model="qwen3:32b"
)

# 4. Print the response
print(llm_response)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)