## LLM Model

In [24]:
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.ollama import Ollama

# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

# llm = LlamaCPP(
#     # You can pass in the URL to a GGML model to download it automatically
#     model_url=model_url,
#     # optionally, you can set the path to a pre-downloaded model instead of model_url
#     model_path=None,
#     temperature=0.1,
#     max_new_tokens=256,
#     # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
#     context_window=3900,
#     # kwargs to pass to __call__()
#     generate_kwargs={},
#     # kwargs to pass to __init__()
#     # set to at least 1 to use GPU
#     model_kwargs={"n_gpu_layers": 5},
#     verbose=True,
# )

from llama_index.embeddings.ollama import OllamaEmbedding

# Embedding client: text is embedded by the "codellama" model served by the
# Ollama instance at the given base URL.
embed_model = OllamaEmbedding(
    base_url="http://localhost:11434",
    model_name="codellama",
    ollama_additional_kwargs=dict(mirostat=0),
)

# Generation client: the same "codellama" model, steered by a system prompt
# that asks for code-only answers.
llm = Ollama(
    model="codellama",
    temperature=0.1,
    request_timeout=120.0,  # presumably seconds -- confirm against the Ollama client
    system_prompt=(
        "You are a senior developer specializing in static analysis tools "
        "and secure operating systems. "
        "You will follow the input instructions and generate code. "
        "Your output will be purely code, without additional information."
    ),
)

## Initialize Postgres

In [25]:
import os

import psycopg2
from psycopg2 import sql

# Connection settings; these names are reused when the vector store is built.
db_name = "vector_db"
host = "localhost"
# Prefer a password supplied through the environment so the secret does not
# have to live in the notebook; the literal fallback keeps the notebook
# runnable against the local development database.
password = os.environ.get("PGPASSWORD", "123456")
port = "5432"
user = "evan"

# Connect to the maintenance database ("postgres") rather than to `db_name`,
# because the database being dropped/recreated cannot be the one in use.
conn = psycopg2.connect(
    dbname="postgres",
    host=host,
    password=password,
    port=port,
    user=user,
)
# DROP DATABASE / CREATE DATABASE must not run inside a transaction block.
conn.autocommit = True

try:
    with conn.cursor() as c:
        # Kick out every other session attached to the target database so the
        # DROP below is not blocked by open connections.
        c.execute(
            """
            SELECT pg_terminate_backend(pg_stat_activity.pid)
            FROM pg_stat_activity
            WHERE pg_stat_activity.datname = %s
            AND pid <> pg_backend_pid();
            """,
            (db_name,),
        )
        # Identifiers cannot be bound as query parameters, so compose them
        # with psycopg2.sql to keep them correctly quoted and tied to db_name.
        c.execute(
            sql.SQL("DROP DATABASE IF EXISTS {};").format(sql.Identifier(db_name))
        )
        c.execute(sql.SQL("CREATE DATABASE {};").format(sql.Identifier(db_name)))
finally:
    # Always release the maintenance connection, even if a statement failed.
    conn.close()

In [26]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Vector store backed by the freshly created database, using the connection
# settings defined in the Postgres initialisation cell.
vector_store = PGVectorStore.from_params(
    host=host,
    port=port,
    user=user,
    password=password,
    database=db_name,
    table_name="codellama",
    # 4096 is the llama embedding dimension; presumably it has to match the
    # length of the vectors produced by embed_model -- confirm if the model changes.
    embed_dim=4096,
)

## Build an Ingestion Pipeline from Scratch

In [27]:
# !mkdir data
# !wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"

In [28]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader

In [29]:
# Read the iptables tutorial PDF into a list of documents.
loader = PyMuPDFReader()
documents = loader.load(
    file_path="./data/Iptables Tutorial 1.2.2.pdf",
)

## Use a Text Splitter to Split Documents

In [30]:
from llama_index.core.node_parser import SentenceSplitter

In [31]:
# Splitter used to cut each document's text into chunks (chunk_size=1024).
text_parser = SentenceSplitter(chunk_size=1024)

In [32]:
# Two parallel lists: text_chunks[i] is a piece of text, and doc_idxs[i] is the
# position in `documents` of the document that piece came from (needed later
# to attach the source document's metadata to each chunk).
text_chunks = []
doc_idxs = []
for position, document in enumerate(documents):
    pieces = text_parser.split_text(document.text)
    text_chunks += pieces
    doc_idxs += [position for _ in pieces]

## Manually Construct Nodes from Text Chunks

In [33]:
from llama_index.core.schema import TextNode

# Wrap every text chunk in a TextNode carrying its source document's metadata.
nodes = []
for text_chunk, src_idx in zip(text_chunks, doc_idxs):
    node = TextNode(
        text=text_chunk,
    )
    # Give each node its own copy of the metadata: assigning the document's
    # dict directly would make every node from that document share one mutable
    # object, so editing one node's metadata would silently change the others.
    node.metadata = dict(documents[src_idx].metadata)
    nodes.append(node)

## Generate Embeddings for each Node

In [35]:
import concurrent.futures

def get_and_set_embedding(node):
    """Embed one node in place and hand the same node back.

    The node's content is fetched with ``metadata_mode="all"``, embedded with
    the notebook-level ``embed_model``, and the resulting vector is stored on
    ``node.embedding``.

    Args:
        node: Object exposing ``get_content(metadata_mode=...)`` and a writable
            ``embedding`` attribute.

    Returns:
        The node that was passed in, now carrying its embedding.
    """
    content = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(content)
    return node

# Embed all nodes through a thread pool; map() yields results in input order,
# so `nodes` keeps its original ordering.
with concurrent.futures.ThreadPoolExecutor() as pool:
    nodes = [embedded for embedded in pool.map(get_and_set_embedding, nodes)]

## Load Nodes into a Vector Store

In [36]:
# Store the embedded nodes in the Postgres vector store. This is the cell's
# last expression, so whatever add() returns is displayed as the cell output.
vector_store.add(nodes)

['16625e72-5c42-41d7-9396-a1a1177421cc',
 '2f9ec981-0efe-4a04-a2f2-616c84267dc8',
 '5f63bb5e-7462-4f9a-8243-b54456310a65',
 'df52300f-31bc-4cac-8dc1-6008eaf70fbf',
 '47ec86d9-9de7-4cee-9d26-0910b7834dbe',
 'ae3a3324-bb69-4db5-be11-977f98b88d56',
 'eb90b44d-2c21-41e4-8c15-ea439019512e',
 '8da3e9fe-5d18-4106-afdc-fdcc876f52b7',
 '2ff513bb-e6e5-47b5-9585-ae10c2db6909',
 '002d9fba-4093-4a7e-b785-409f97dd5422',
 '3ab59209-5d14-4042-892b-cdf75f584361',
 '675f3219-e6b1-481a-82b9-27f393bc2740',
 '55b8d58a-4f8d-483a-9906-ad463e653b2c',
 'c15380fb-36da-44cf-a7ea-309b38b6f651',
 'd4da25a4-84c1-4227-98e4-18033d622f31',
 '47ea2d9b-3072-41d0-a158-e86a4b5ef628',
 '6a9c9f11-173d-4001-82e5-195aa81f8eda',
 'e7cd1b5c-89a5-440e-b56c-5c8c3667fa9d',
 'cf27aa0e-8af1-48e3-9356-360e1e0799c6',
 '7862c89b-8f43-4ce9-9aca-fc2e82d1ae1d',
 '79e1321f-afb0-4fae-85d1-e6bc4d698ff4',
 '64688b9b-7a56-4d86-acea-acfde0091308',
 '41fb871f-5604-48b4-9a8d-7610ce732064',
 '52743593-6f96-4367-81af-cb963b79ed68',
 'fd4ab342-2e4e-

### Put into a Retriever

In [38]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List
from llama_index.core.schema import NodeWithScore
from typing import Optional
from llama_index.core.vector_stores import VectorStoreQuery


class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store.

    Embeds the query string with the supplied embedding model, runs a
    similarity query against the supplied vector store, and returns the
    matching nodes paired with their similarity scores.
    """

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Store the collaborators and query settings.

        Args:
            vector_store: Store that is queried in ``_retrieve``.
            embed_model: Object providing ``get_query_embedding(str)``.
            query_mode: Mode forwarded to ``VectorStoreQuery``.
            similarity_top_k: Number of results requested from the store.
        """
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve the nodes most similar to the query.

        Args:
            query_bundle: Bundle whose ``query_str`` is embedded and searched.

        Returns:
            One ``NodeWithScore`` per node returned by the store; ``score`` is
            ``None`` when the store reports no similarities.
        """
        # Use the instance's own embedder and store, not same-named notebook
        # globals, so the retriever honours what was passed to __init__.
        query_embedding = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = self._vector_store.query(vector_store_query)

        # Treat a missing node list as "no results" instead of failing on None.
        result_nodes = query_result.nodes or []
        similarities = query_result.similarities

        nodes_with_scores = []
        for index, node in enumerate(result_nodes):
            score: Optional[float] = None
            if similarities is not None:
                score = similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [39]:
from llama_index.core.vector_stores.types import VectorStoreQueryMode

# Retriever wired to the Postgres vector store and the Ollama embedder,
# returning the two best matches per query in the default query mode.
retriever = VectorDBRetriever(
    vector_store=vector_store,
    embed_model=embed_model,
    query_mode=VectorStoreQueryMode.DEFAULT,
    similarity_top_k=2,
)

## Plug this into our RetrieverQueryEngine to synthesize a response

In [40]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Query engine that answers with `llm` using the nodes fetched by `retriever`.
query_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    llm=llm,
)

In [41]:
# Alternative prompts kept for reference:
# query_str = "Create an iptables config for a web server 127.1.1.1 that only allows incoming traffic on port 80 and 443."
# query_str = "Generate Soot (verification) command that prints CFGs for all functions in java.util.concurrent.ThreadPoolExecutor class."

# NOTE(review): "connecton" is misspelled in the prompt; it is left untouched
# here because the prompt text is part of what is sent to the model.
query_str = (
    "Generate an iptables config for an ecommerce website server "
    "that only allows incoming HTTP/HTTPS traffic, "
    "DNS lookup and SSH connecton."
)

# Run the prompt through the retrieval-augmented query engine.
response = query_engine.query(query_str)

In [42]:
# Show the generated answer; print() applies str() to its argument itself.
print(response)

```
*nat
:PREROUTING ACCEPT [0:0]
:INPUT ACCEPT [0:0]
:OUTPUT ACCEPT [0:0]
:POSTROUTING ACCEPT [0:0]
-A PREROUTING -p tcp -m tcp --dport 80 -j REDIRECT --to-ports 3128
-A PREROUTING -p tcp -m tcp --dport 443 -j REDIRECT --to-ports 3129
-A INPUT -i lo -j ACCEPT
-A INPUT -p icmp -j ACCEPT
-A INPUT -p udp -m udp --dport 53 -j ACCEPT
-A INPUT -p tcp -m tcp --dport 22 -j ACCEPT
-A INPUT -p tcp -m state --state ESTABLISHED,RELATED -j ACCEPT
COMMIT
```
This iptables config allows incoming HTTP/HTTPS traffic, DNS lookup, and SSH connection. The first two rules in the PREROUTING chain redirect all incoming TCP traffic on ports 80 and 443 to port 3128 and 3129 respectively, which are proxied by a web proxy server. The third rule allows all traffic on the loopback interface (lo). The fourth rule allows all ICMP packets. The fifth rule allows all UDP traffic on port 53, which is used for DNS lookups. The sixth rule allows all TCP traffic on port 22, which is used for SSH connections. The seventh r

In [43]:
# Show the text of the first source node attached to the response.
top_source = response.source_nodes[0]
print(top_source.get_content())

#
# 3.1 Required proc configuration
#
echo "1" > /proc/sys/net/ipv4/ip_forward
#
# 3.2 Non-Required proc configuration
#
#echo "1" > /proc/sys/net/ipv4/conf/all/rp_filter
#echo "1" > /proc/sys/net/ipv4/conf/all/proxy_arp
#echo "1" > /proc/sys/net/ipv4/ip_dynaddr
###########################################################################
#
# 4. rules set up.
#
######
# 4.1 Filter table
#
#
# 4.1.1 Set policies
#
$IPTABLES -P INPUT DROP
$IPTABLES -P OUTPUT DROP
$IPTABLES -P FORWARD DROP
#
# 4.1.2 Create userspecified chains
#
#
# Create chain for bad tcp packets
#
$IPTABLES -N bad_tcp_packets
#
# Create separate chains for ICMP, TCP and UDP to traverse
#
$IPTABLES -N allowed
$IPTABLES -N tcp_packets
$IPTABLES -N udp_packets
$IPTABLES -N icmp_packets
#
# 4.1.3 Create content in userspecified chains
#
#
# bad_tcp_packets chain
#
$IPTABLES -A bad_tcp_packets -p tcp --tcp-flags SYN,ACK SYN,ACK \
-m state --state NEW -j REJECT --reject-with tcp-reset
$IPTABLES -A bad_tcp_packets -p tcp ! --sy