In [14]:
%pip install faiss-cpu sentence-transformers python-dotenv requests

Note: you may need to restart the kernel to use updated packages.


In [15]:
%pip install PyMuPDF

1889.36s - thread._ident is None in _get_related_thread!


Note: you may need to restart the kernel to use updated packages.


In [16]:
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import fitz  # PyMuPDF for reading PDFs
import numpy as np
import requests
import faiss
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# Load .env variables (if you created one)
load_dotenv()

# Get your Groq API key
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"

# Print to confirm
print("✅ Groq key loaded:", "✔️" if GROQ_API_KEY.startswith("gsk_") else "❌ (Replace your key manually)")

✅ Groq key loaded: ✔️


In [18]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    full_text = ""
    for page in doc:
        full_text += page.get_text()
    return full_text

def chunk_text(text, max_chars=500):
    return [text[i:i + max_chars] for i in range(0, len(text), max_chars)]

# Replace this with your actual PDF file name
pdf_path = "sample.pdf"
raw_text = extract_text_from_pdf(pdf_path)
chunks = chunk_text(raw_text)

print(f"✅ Extracted {len(chunks)} chunks from the PDF.")
print("\n📌 First chunk:\n", chunks[0][:300], "...")

✅ Extracted 22 chunks from the PDF.

📌 First chunk:
 Switch
Example
Description
-sL
nmap 192.168.1.1-3 -sL
No Scan. List targets only
-sn
nmap 192.168.1.1/24 -sn
Disable port scanning
-Pn
nmap 192.168.1.1-5 -Pn
Disable host discovery. Port scan only
-PS
nmap 192.168.1.1-5 -PS22-25,80
TCP SYN discovery on port x. Port 80 by default
-PA
nmap 192.168.1.1 ...


In [19]:
# Load the embedding model
embedder = SentenceTransformer("all-mpnet-base-v2")

# Encode the chunks
embeddings = embedder.encode(chunks, convert_to_numpy=True)

# Show shape of embedding matrix
print("✅ Embeddings shape:", embeddings.shape)
print("📌 Example vector (first chunk):\n", embeddings[0][:5], "...")


✅ Embeddings shape: (22, 768)
📌 Example vector (first chunk):
 [-0.01699914 -0.06065932  0.00682455  0.01032754 -0.02969524] ...


In [20]:
# Create a FAISS index
dimension = embeddings.shape[1]  # e.g., 768
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
index.add(embeddings)

print("✅ FAISS index created and populated.")
print("📌 Total vectors indexed:", index.ntotal)


✅ FAISS index created and populated.
📌 Total vectors indexed: 22


In [21]:
def retrieve(query, top_k=3):
    # Embed the query
    query_embedding = embedder.encode([query], convert_to_numpy=True)

    # Search in FAISS index
    D, I = index.search(query_embedding, top_k)

    # Return the matching chunks
    return [chunks[i] for i in I[0]]

# 🧪 Try a sample query
sample_query = "What is the main topic of the document?"
retrieved_chunks = retrieve(sample_query)

print("✅ Retrieved Chunks:")
for i, chunk in enumerate(retrieved_chunks, 1):
    print(f"\n📎 Chunk {i}:\n", chunk[:300], "...")

✅ Retrieved Chunks:

📎 Chunk 1:
 1
Timing and Performance
Switch
Example
Description
-T0 
nmap 192.168.1.1 -T0
Paranoid (0) Intrusion Detection System evasion
-T1 
nmap 192.168.1.1 -T1
Sneaky (1) Intrusion Detection System evasion
-T2 
nmap 192.168.1.1 -T2
Polite (2) slows down the scan to use less bandwidth and use less target mac ...

📎 Chunk 2:
 168.1.1 --reason 
Display the reason a port is in a particular state, same output as -vv   
--open 
nmap 192.168.1.1 --open 
Only show open (or possibly open) ports   
--packet-trace
nmap 192.168.1.1 -T4 --packet-trace 
Show all packets sent and received   
--iflist
nmap --iflist
Shows the host inte ...

📎 Chunk 3:
 P and UDP ports
-p-
nmap 192.168.1.1 -p- 
Port scan all ports
-p
nmap 192.168.1.1 -p http,https
Port scan from service name
-F
nmap 192.168.1.1 -F
Fast port scan (100 ports)
--top-ports 
nmap 192.168.1.1 --top-ports 2000
Port scan the top x ports
-p-65535 
nmap 192.168.1.1 -p-65535 
Leaving oﬀ initi ...


In [22]:
def generate_answer(query, context_chunks):
    context = "\n".join([f"- {chunk}" for chunk in context_chunks])

    prompt = f"""You are a helpful assistant. Use only the given context to answer the question truthfully.

Context:
{context}

Question: {query}
Answer:"""

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.2,
        "max_tokens": 300
    }

    response = requests.post(GROQ_API_URL, headers=headers, json=payload)
    return response.json()['choices'][0]['message']['content']

In [28]:
query = "Summarize the document, and give the roadmap for the next steps. And give a example From the document for Each type of Scan"
retrieved = retrieve(query)
answer = generate_answer(query, retrieved)

print("🧠 LLaMA 3 Answer:\n", answer)


🧠 LLaMA 3 Answer:
 Summary:

The document provides an overview of the Nmap command-line tool and its various options for timing and performance, as well as its script scanning capabilities. The timing and performance options allow users to adjust the speed and aggressiveness of the scan, while the script scanning options enable users to run specific scripts or categories of scripts to gather more information about the target.

Roadmap for next steps:

1. Understand the different timing and performance options available in Nmap, including the Paranoid, Sneaky, Polite, Normal, Aggressive, and Insane modes.
2. Learn how to use the script scanning options to run specific scripts or categories of scripts to gather more information about the target.
3. Practice using Nmap with different timing and performance options and script scanning options to gain a better understanding of how to use the tool effectively.

Examples:

1. Paranoid (0) Intrusion Detection System evasion:

nmap 192.168.1.1 

In [30]:
%pip install chromadb sentence-transformers pypdf groq

Collecting chromadb
  Using cached chromadb-1.0.15-cp39-abi3-win_amd64.whl.metadata (7.1 kB)
Collecting pypdf
  Using cached pypdf-5.9.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groq
  Using cached groq-0.30.0-py3-none-any.whl.metadata (16 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pydantic>=1.9 (from chromadb)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Using cached pybase64-1.4.2-cp312-cp312-win_amd64.whl.metadata (9.0 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.35.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Using cached posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Using cached onnxruntime-1.22.1-cp312-cp312-win_amd64.whl.metadata (5.1 kB)
Collecting opentelemetry-api>=1.2.0 (from chrom

In [32]:
%pip install pypdf

Note: you may need to restart the kernel to use updated packages.


In [34]:
# If you see "Import 'PyPDF2' could not be resolved", it means the PyPDF2 package is not installed in your environment.
# You can install it using the following command in a Jupyter cell:
# %pip install PyPDF2
%pip install PyPDF2
from PyPDF2 import PdfReader

def load_pdf_text(pdf_path):
    reader = PdfReader(pdf_path)
    return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])

pdf_path = "sample.pdf"
text = load_pdf_text(pdf_path)


Note: you may need to restart the kernel to use updated packages.


In [37]:
from sklearn.feature_extraction.text import CountVectorizer

def split_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

text_chunks = split_text(text)


In [38]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(text_chunks)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)


In [43]:
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions

# Load SentenceTransformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Compute embeddings for chunks
embeddings = model.encode(text_chunks)

# Initialize ChromaDB persistent client
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Create or get collection
collection = chroma_client.get_or_create_collection(name="my_documents")

# Add documents and embeddings
collection.add(
    documents=text_chunks,
    embeddings=embeddings.tolist(),
    ids=[str(i) for i in range(len(text_chunks))]
)

print("✅ ChromaDB collection created and embeddings stored.")


  return forward_call(*args, **kwargs)


✅ ChromaDB collection created and embeddings stored.


In [45]:
# Define the user query
user_query = "What is the main theme of the PDF?"

# Encode the query
query_embedding = model.encode(user_query).tolist()

# Query ChromaDB
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

# Extract matched text chunks
retrieved_chunks = results['documents'][0]
print("🔍 Retrieved Chunks:\n", "\n---\n".join(retrieved_chunks))


🔍 Retrieved Chunks:
 ( -sN) TCP PING SCAN ( -sP) VERSION DETECTION SCAN ( -sV) UDP SCAN ( -sU) IP PROTOCOL SCAN ( -sO) TCP ACK SCAN ( -sA) TCP WINDOW SCAN ( -sW)Version scan identifies open pots with a TCP SYN scan… …and then queries the port with a customized signature. 5 IDLESCAN ( -sI<zombie host: [ probeport ]>) Step 1: Nmap sends a SYN/ACK to the zombie workstation to induce a RST in return. This RST frame contains the initial IPID that nmap will remember for later.Step 2: Nmap sends a SYN frame to the destination address, but nmap spoofs the IP address to make it seem as if the SYN frame was sent from the zombie workstation.Step 3: Nmap repeats the original SYN/ACK probe of the zombie station. If the IPID has incremented, then the port that was spoofed in the original SYN frame is open on the destination device. FTP BOUNCE ATTACK ( -b <ftp_relay_host >) www.stationx.net/nmap -cheat -sheet/ A closed port will result with the FTP server informing the source station that the FTP ser

In [46]:
import os
from dotenv import load_dotenv
import requests

# Load .env file
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Define the prompt for LLaMA 3
context = "\n".join(retrieved_chunks)
final_prompt = f"""You are an AI assistant. Based on the following context extracted from a PDF, answer the question below.
    
Context:
{context}

Question: {user_query}
Answer:"""

# Call Groq API with LLaMA 3
headers = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json"
}

payload = {
    "model": "llama3-70b-8192",
    "messages": [
        {"role": "user", "content": final_prompt}
    ]
}

response = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=headers, json=payload)

# Print response
answer = response.json()["choices"][0]["message"]["content"]
print("🤖 Groq LLaMA 3 Answer:\n", answer)


🤖 Groq LLaMA 3 Answer:
 The main theme of the PDF is Nmap, a network exploration and security auditing tool, and its various commands, options, and techniques for scanning, detecting, and analyzing network hosts, services, and systems.


In [47]:
import os
import PyPDF2
import requests
import chromadb
from dotenv import load_dotenv
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load .env variables
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Load SentenceTransformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text()
    return text

# Step 2: Split text into chunks
def split_text(text, chunk_size=500):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# Step 3: Embed and store in ChromaDB
def store_chunks_in_chroma(text_chunks, persist_dir="./chroma_db"):
    embeddings = model.encode(text_chunks)

    client = chromadb.Client(Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_dir
    ))

    collection = client.get_or_create_collection(name="my_documents")
    
    collection.add(
        documents=text_chunks,
        embeddings=embeddings.tolist(),
        ids=[str(i) for i in range(len(text_chunks))]
    )

    return collection, text_chunks, embeddings

# Step 4: Retrieve top-k relevant chunks
def retrieve_relevant_chunks(query, text_chunks, embeddings, top_k=3):
    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    
    top_indices = similarities.argsort()[-top_k:][::-1]
    return [text_chunks[i] for i in top_indices]

# Step 5: Query Groq LLaMA 3 with retrieved context
def query_groq_llama3(context_chunks, question):
    context = "\n".join(context_chunks)
    prompt = f"""You are an AI assistant. Based on the following context extracted from a PDF, answer the question below.
    
Context:
{context}

Question: {question}
Answer:"""

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "llama3-70b-8192",
        "messages": [{"role": "user", "content": prompt}]
    }

    response = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=headers, json=payload)
    return response.json()["choices"][0]["message"]["content"]

# Wrapper function to run the full pipeline
def run_rag_pipeline(pdf_path, question):
    text = extract_text_from_pdf(pdf_path)
    chunks = split_text(text)
    collection, text_chunks, embeddings = store_chunks_in_chroma(chunks)
    top_chunks = retrieve_relevant_chunks(question, text_chunks, embeddings)
    answer = query_groq_llama3(top_chunks, question)
    return answer

# === Execute ===
if __name__ == "__main__":
    pdf_path = "your_file.pdf"  # replace with your actual PDF path
    user_question = "What is the main idea of this document?"  # replace with your question

    print("\n🤖 Answer from Groq LLaMA 3:\n")
    print(run_rag_pipeline(pdf_path, user_question))



🤖 Answer from Groq LLaMA 3:



FileNotFoundError: [Errno 2] No such file or directory: 'your_file.pdf'

In [49]:
pip install langchain-groq

Collecting langchain-groq
  Downloading langchain_groq-0.3.6-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-core<1.0.0,>=0.3.68 (from langchain-groq)
  Downloading langchain_core-0.3.72-py3-none-any.whl.metadata (5.8 kB)
Collecting langsmith>=0.3.45 (from langchain-core<1.0.0,>=0.3.68->langchain-groq)
  Downloading langsmith-0.4.8-py3-none-any.whl.metadata (15 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<1.0.0,>=0.3.68->langchain-groq)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting requests-toolbelt<2.0.0,>=1.0.0 (from langsmith>=0.3.45->langchain-core<1.0.0,>=0.3.68->langchain-groq)
  Using cached requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting zstandard<0.24.0,>=0.23.0 (from langsmith>=0.3.45->langchain-core<1.0.0,>=0.3.68->langchain-groq)
  Using cached zstandard-0.23.0-cp312-cp312-win_amd64.whl.metadata (3.0 kB)
Downloading langchain_groq-0.3.6-py3-none-any.whl (16 kB)
Downloading langchain_core-0.3.72-p

In [51]:
from typing import TypedDict, List, Annotated

class GroqChatResponse(TypedDict):
    capital : Annotated[str, "The capital city of the country."]

In [52]:
from langchain_groq import ChatGroq
llm=ChatGroq(
    model="llama3-70b-8192",
    api_key=GROQ_API_KEY,
    temperature=0.2,
    max_tokens=300
)
llm.with_structured_output(GroqChatResponse).invoke("What is the capital of India")  # Example usage, replace with your query

{'capital': 'New Delhi'}