In [2]:
import os
import glob
import tiktoken
import numpy as np
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.manifold import TSNE
import plotly.graph_objects as go

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
MODEL = "llama3.2"
db_name = "vector_db"
load_dotenv(override=True)

False

In [4]:
knowledge_base_path = "books/book.pdf"
files = glob.glob(knowledge_base_path) # glob finds files whose names match a specified pattern.
print(f"found {len(files)} files in the knowledge base")
entire_knowledge_base = ""

from pypdf import PdfReader
# for file_path in files:
#     with open(file_path, 'r', encoding='utf-8')

for file_path in files:
    reader = PdfReader(file_path)
    for page in reader.pages:
        text = page.extract_text()
        if text:
            entire_knowledge_base += text
            entire_knowledge_base += "\n\n"

print(f"Total characters in knowledge base: {len(entire_knowledge_base):,}")

found 1 files in the knowledge base
Total characters in knowledge base: 530,477


In [5]:
import tiktoken
encoding = tiktoken.get_encoding('cl100k_base')
tokens = encoding.encode(entire_knowledge_base)

print(f"Token length {len(tokens)}")

Token length 137902


# Completed turning text into tokens by far. Will start doing the rest now

In [29]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader

loader = DirectoryLoader(
    "knowledge-base",
    glob="**/*.pdf",
    loader_cls=PyPDFLoader
)

documents = loader.load()

print(f"Loaded {len(documents)} documents")

Loaded 350 documents


In [30]:
documents[1]

Document(metadata={'producer': 'calibre (4.11.2) [http://calibre-ebook.com]', 'creator': 'calibre (4.11.2) [http://calibre-ebook.com]', 'creationdate': '2020-04-18T21:13:58+00:00', 'author': 'Peter Yaworski', 'moddate': '2020-04-18T23:13:58+02:00', 'title': 'Real-World Bug Hunting', 'source': 'knowledge-base\\book.pdf', 'total_pages': 350, 'page': 1, 'page_label': '2'}, page_content='REAL-W ORLD BU G HU NTING\nA Field Guide to W eb Hacking\nby Peter Yaworski\nS an Francisco')

In [31]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
chunks = text_splitter.split_documents(documents)

print(f"Divided into {len(chunks)} chunks")
print(f"First chunk is \n\n{chunks[0]}")

Divided into 745 chunks
First chunk is 

page_content='REAL-W ORLD BU G HU NTING
A Field Guide to W eb Hacking
by Peter Yaworski
S an Francisco' metadata={'producer': 'calibre (4.11.2) [http://calibre-ebook.com]', 'creator': 'calibre (4.11.2) [http://calibre-ebook.com]', 'creationdate': '2020-04-18T21:13:58+00:00', 'author': 'Peter Yaworski', 'moddate': '2020-04-18T23:13:58+02:00', 'title': 'Real-World Bug Hunting', 'source': 'knowledge-base\\book.pdf', 'total_pages': 350, 'page': 1, 'page_label': '2'}


## Now we create a vector store

In [32]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection
vectorstore = Chroma.from_documents(documents=chunks, embedding = embeddings, persist_directory=db_name)
print(f"Vectorstore create with {vectorstore._collection.count()} documents")

Vectorstore create with 1490 documents


In [33]:
collection = vectorstore._collection
count = collection.count()

sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 1,490 vectors with 384 dimensions in the vector store


In [41]:
# # Prework

# result = collection.get(include=['embeddings', 'documents', 'metadatas'])
# vectors = np.array(result['embeddings'])
# documents = result['documents']
# metadatas = result['metadatas']
# doc_types = [metadata.get('doc_type', 'unknown') for metadata in metadatas]
# colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]

In [43]:
# Prework

result = collection.get(include=['embeddings', 'documents', 'metadatas'])

vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']

doc_types = [metadata.get('doc_type', 'unknown') for metadata in metadatas]

color_map = {
    'chapters': 'blue',
    'bugs': 'green',
    'takeaways': 'red',
    'summary': 'orange'
}

colors = [color_map.get(t, 'gray') for t in doc_types]

# To visualize

In [45]:
# We humans find it easier to visalize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# 3D huh?

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=10, b=10, l=10, t=40)
)

fig.show()

# And now towards making the retreival mechanism

In [47]:
from dotenv import load_dotenv
from langchain_ollama import ChatOllama

from langchain_chroma import Chroma
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_huggingface import HuggingFaceEmbeddings
import gradio as gr

In [48]:
MODEL = "llama3.2"
DB_NAME = "vector_db"
load_dotenv(override=True)

False

In [49]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory=DB_NAME, embedding_function=embeddings)

In [50]:
retreiver = vectorstore.as_retriever()
llm = ChatOllama(temperature=0, model=MODEL)

In [51]:
retreiver.invoke("What is HTTP?")

[Document(id='fcbdefcd-9375-413d-9606-302733785999', metadata={'total_pages': 350, 'creator': 'calibre (4.11.2) [http://calibre-ebook.com]', 'page_label': '65', 'page': 64, 'creationdate': '2020-04-18T21:13:58+00:00', 'author': 'Peter Yaworski', 'moddate': '2020-04-18T23:13:58+02:00', 'title': 'Real-World Bug Hunting', 'source': 'knowledge-base\\book.pdf', 'producer': 'calibre (4.11.2) [http://calibre-ebook.com]'}, page_content='header that browsers m ight include when sending H T T P requests. T he\nheader tells the recipient how the body of the H T T P request is\nencoded. H ere is an exam ple of a text/plain content-type request:'),
 Document(id='d3a31bf2-5000-4cb0-b770-80a89a6a2839', metadata={'title': 'Real-World Bug Hunting', 'page': 64, 'author': 'Peter Yaworski', 'creationdate': '2020-04-18T21:13:58+00:00', 'source': 'knowledge-base\\book.pdf', 'moddate': '2020-04-18T23:13:58+02:00', 'page_label': '65', 'producer': 'calibre (4.11.2) [http://calibre-ebook.com]', 'creator': 'cali

In [52]:
llm.invoke("What is HTTP?")

AIMessage(content="HTTP (Hypertext Transfer Protocol) is a protocol used for transferring data over the internet. It is the foundation of how web pages are loaded and interacted with on the web.\n\nHTTP defines the rules for how data is sent between a client (such as a web browser) and a server, which hosts the website or application being accessed. The protocol consists of two main components:\n\n1. **Request**: When a user requests a webpage by typing its URL into their browser, the browser sends an HTTP request to the server hosting that webpage.\n2. **Response**: The server processes the request and returns an HTTP response back to the client (browser), which then displays the requested data on the screen.\n\nHTTP is typically used for the following purposes:\n\n* Web browsing: When you enter a URL into your browser, it sends an HTTP request to the server hosting that webpage.\n* File transfers: You can use HTTP to download files from a website or upload files to a server.\n* APIs 

## Putting it together

In [53]:
SYSTEM_PROMPT_TEMPLATE = """
You are a knowledgeable, friendly assistant that helps the user with bug hunting in cyber security.
You are chatting with a user about Hacking.
If relevant, use the given context to answer any question.
If you don't know the answer, say so.
Context:
{context}
"""

In [55]:
def answer_question(question: str, history):
    docs = retreiver.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)
    system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context = context)
    response = llm.invoke([SystemMessage(content = system_prompt), HumanMessage(content=question)])
    return response.content

In [56]:
answer_question("What is HTTP PARAMETER POLLUTION?", [])

'HTTP Parameter Pollution (HPP) is a type of vulnerability that occurs when an attacker injects extra parameters into an HTTP request, which are then trusted by the target website. This can lead to unexpected behavior, such as unauthorized access, data tampering, or even complete takeover of the website.\n\nIn HPP, an attacker manipulates the HTTP request headers or query string to inject malicious parameters that are not intended by the legitimate user. The vulnerable website trusts these injected parameters and uses them to perform actions, which can have unintended consequences.\n\nFor example, if a website has a login form with a username and password field, an attacker might inject additional fields like "username=eviladmin&password=secret" into the request. If the website doesn\'t properly validate or sanitize the input, it may use these injected parameters to authenticate the user, allowing the attacker to gain unauthorized access.\n\nHPP can occur on both client-side (e.g., in 

In [57]:
gr.ChatInterface(answer_question).launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


