In [41]:
# sentence transformers
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

## Llama CPP

In [42]:
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.ollama import Ollama

# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

# llm = LlamaCPP(
#     # You can pass in the URL to a GGML model to download it automatically
#     model_url=model_url,
#     # optionally, you can set the path to a pre-downloaded model instead of model_url
#     model_path=None,
#     temperature=0.1,
#     max_new_tokens=256,
#     # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
#     context_window=3900,
#     # kwargs to pass to __call__()
#     generate_kwargs={},
#     # kwargs to pass to __init__()
#     # set to at least 1 to use GPU
#     model_kwargs={"n_gpu_layers": 5},
#     verbose=True,
# )

from llama_index.embeddings.ollama import OllamaEmbedding

embed_model = OllamaEmbedding(
    model_name="llama3",
    base_url="http://localhost:11434",
    ollama_additional_kwargs={"mirostat": 0},
)

llm = Ollama(
    model="llama3", 
    request_timeout=120.0,
    system_prompt="You are a senior developer specializing in static analysis tools and secure operating systems. You will follow the input instructions and generate code. Your output will be purely code, without additional information.",
    temperature=0.1
)

## Initialize Postgres

In [43]:
import psycopg2

db_name = "vector_db"
host = "localhost"
password = "123456"
port = "5432"
user = "evan"
# conn = psycopg2.connect(connection_string)
conn = psycopg2.connect(
    dbname="postgres",
    host=host,
    password=password,
    port=port,
    user=user,
)
conn.autocommit = True

with conn.cursor() as c:
    c.execute("""
        SELECT pg_terminate_backend(pg_stat_activity.pid)
        FROM pg_stat_activity
        WHERE pg_stat_activity.datname = 'vector_db'
        AND pid <> pg_backend_pid();
    """)
    c.execute("DROP DATABASE IF EXISTS vector_db;")
conn.close()

conn = psycopg2.connect(
    dbname="postgres",
    host=host,
    password=password,
    port=port,
    user=user,
)
conn.autocommit = True

with conn.cursor() as c:
    c.execute("CREATE DATABASE vector_db;")

In [44]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

vector_store = PGVectorStore.from_params(
    database=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
    table_name="codellama",
    embed_dim=4096,  # llama embedding dimension
)

## Build an Ingestion Pipeline from Scratch

In [45]:
# !mkdir data
# !wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"

In [46]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader

In [82]:
# loader = PyMuPDFReader()
# documents = loader.load(file_path="./data/Soot Command Line Options.pdf")

from llama_index.core import SimpleDirectoryReader
required_exts = [".html", ".md"]
reader = SimpleDirectoryReader(
    input_dir="/home/ubuntu/soot/target/site/apidocs",
    required_exts=required_exts,
    recursive=True,
)
docs = reader.load_data()
print(f"Loaded {len(docs)} docs")

Loaded 6333 docs


## Use a Text Splitter to Split Documents

In [83]:
from llama_index.core.node_parser import SentenceSplitter, HTMLNodeParser

In [92]:
# text_parser = SentenceSplitter(
#     chunk_size=1024,
#     # separator=" ",
# )
from llama_index.core.node_parser import HTMLNodeParser
parser = HTMLNodeParser()
nodes = parser.get_nodes_from_documents(documents=docs) 
print(f"Extracted {len(nodes)} nodes")

Extracted 156127 nodes


In [89]:
# from llama_index.core.node_parser import CodeSplitter
# nodes = CodeSplitter(
#     language="java",
#     chunk_lines=40,  # lines per chunk
#     chunk_lines_overlap=15,  # lines overlap between chunks
#     max_chars=1500,  # max chars per chunk
# ).get_nodes_from_documents(SimpleDirectoryReader(
#     input_dir="/home/ubuntu/soot/src/test/java",
#     recursive=True,
#     required_exts=[".java"],
# ).load_data())
# print(f"Extracted {len(nodes)} nodes")

ModuleNotFoundError: No module named 'tree_sitter'

In [85]:
# text_chunks = []
# # maintain relationship with source doc index, to help inject doc metadata in (3)
# doc_idxs = []
# for doc_idx, doc in enumerate(docs):
#     cur_text_chunks = text_parser.split_text(doc.text)
#     text_chunks.extend(cur_text_chunks)
#     doc_idxs.extend([doc_idx] * len(cur_text_chunks))

## Manually Construct Nodes from Text Chunks

In [51]:
# from llama_index.core.schema import TextNode

# nodes = []
# for idx, text_chunk in enumerate(text_chunks):
#     node = TextNode(
#         text=text_chunk,
#     )
#     src_doc = documents[doc_idxs[idx]]
#     node.metadata = src_doc.metadata
#     nodes.append(node)

## Generate Embeddings for each Node

In [90]:
import concurrent.futures

def get_and_set_embedding(node):
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding
    return node

with concurrent.futures.ThreadPoolExecutor() as executor:
    nodes = list(executor.map(get_and_set_embedding, nodes))

KeyboardInterrupt: 

## Load Nodes into a Vector Store

In [53]:
vector_store.add(nodes)

['be0dfca2-0f2b-4a9a-9645-6e726648012f',
 '1b9bbabc-eec0-4b1f-af2a-e983c1061e87',
 '2b50692f-d3b8-43c9-bf44-9108f395ace7',
 'ce49eac6-594e-4d87-9495-fc61b8f7ca23',
 '19725e32-b465-4abd-bc66-16d8e86a30d0',
 '693b8ce4-2651-42fa-bf0d-f75a4f1978cd',
 '5a591930-d5dd-4542-80e0-a359af056f33',
 'dd33c909-4d4d-419a-96b5-c1322fe0b05f',
 '1f502d9f-fa0a-4c90-bd3b-7236b40afc2b',
 'a2ebe0fa-c43e-4642-8158-cadb3141865d',
 '25988cd5-04ac-442b-8263-ff8602902624',
 'daf7ddaa-ac51-4758-b4e7-05771583a9d4',
 '4e352ca1-2497-45ef-be25-f0c6842b8d0f',
 '2e4ef0fc-07b0-4afd-aaf7-e5cd7e5f3185',
 '76df1545-19c4-42e6-9225-c60eff3a79fa',
 '0466d688-b7b1-496c-9bcf-de703b1f27ff',
 'd6664b2a-ed0c-4d83-813e-77ec713ba9a6',
 '94eb18b9-7a2e-4233-98fd-05180e506f9a',
 '32daded0-3637-4ae8-8b63-99102e66a525',
 '6aa1a391-c052-46ad-816c-75333fb10f4b',
 'b9222d5f-fcfe-4928-b6c5-c9d52c9d5db9',
 '6a8fccc8-a9cc-42a2-9357-1591d3bd72a6',
 '14f7ef50-82ec-42bf-8358-55bcc30415b2',
 'ddad9f22-6459-4e6f-917a-399c23915032',
 'a487df84-232c-

### Put into a Retriever

In [54]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List
from llama_index.core.vector_stores import VectorStoreQuery
from llama_index.core.schema import NodeWithScore
from typing import Optional


class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store."""

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Init params."""
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve."""
        query_embedding = embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = vector_store.query(vector_store_query)

        nodes_with_scores = []
        for index, node in enumerate(query_result.nodes):
            score: Optional[float] = None
            if query_result.similarities is not None:
                score = query_result.similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [55]:
from llama_index.core.vector_stores.types import VectorStoreQueryMode

retriever = VectorDBRetriever(
    vector_store, embed_model, query_mode=VectorStoreQueryMode.DEFAULT, similarity_top_k=2
)

## Plug this into our RetrieverQueryEngine to synthesize a response

In [56]:
from llama_index.core.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

In [70]:
# query_str = "Create an iptables config for a web server 127.1.1.1 that only allows incoming traffic on port 80 and 443."
query_str = "Generate Soot (verification) driver code that prints CFGs for all functions in `/home/ubuntu/ConfigCraft.ai/experimental/soot/java/Solution.java`."
# query_str = "Generate an iptables config for an ecommerce website server that only allows incoming HTTP/HTTPS traffic, DNS lookup and SSH connecton."

# "Enhance this query to maximize the relevance of the search results: " + query_str using ollama llama3
# print(llm.complete("Enhance this query to provide more context for further LLM generation: " + query_str))

response = query_engine.query(query_str)

Here's an enhanced query that provides more context for further LLM generation:

```
soot -v -cfg-all /home/ubuntu/ConfigCraft.ai/experimental/soot/java/Solution.java
```

This query uses the following options:

* `-v`: Verbose mode, which prints detailed information about the Soot analysis.
* `-cfg-all`: Generates CFGs (Control Flow Graphs) for all functions in the specified Java file.

By running this command, you'll get a comprehensive output that includes:
1. Function signatures and their corresponding CFGs.
2. Detailed information about each function's control flow, including loops, conditional statements, and jump targets.
3. A visual representation of the CFGs, which can be useful for understanding the program's logic.

This enhanced query provides more context for further LLM generation by:

1. Providing a clear overview of the Soot analysis process.
2. Highlighting the importance of CFGs in understanding the program's control flow.
3. Offering a visual representation of the CF

In [64]:
print(str(response))

```bash
soot -cp /home/ubuntu/ConfigCraft.ai/experimental/soot/java/Solution.java -f -cfg-all
```


In [65]:
print(response.source_nodes[0].get_content())

-ire
-ignore-resolution-
errors
Does not throw an exception when a program references an undeclared field
or method.
Application Mode Options
-i pkg
-include pkg
Include classes in pkg as application classes
-x pkg
-exclude pkg
Exclude classes in pkg from application classes
-include-all
Set default excluded packages to empty list
-dynamic-class class Note that class may be loaded dynamically
-dynamic-dir dir
Mark all classes in dir as potentially dynamic
-dynamic-package pkg Marks classes in pkg as potentially dynamic
Input Attribute Options
-keep-line-number
Keep line number tables
-keep-bytecode-offset
-keep-offset
Attach bytecode offset to IR
Output Attribute Options
-write-local-annotations Write out debug annotations on local names
Annotation Options
-annot-purity
Emit purity attributes
-annot-nullpointer Emit null pointer attributes
-annot-arraybounds Emit array bounds check attributes
-annot-side-effect Emit side-effect attributes
-annot-fieldrw
Emit field read/write attributes