### Loading documents (PDF and Word files)

In [10]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document
import os

def load_documents(folder_path: str) -> "List[Document]":
    """Load every supported document found directly inside ``folder_path``.

    Files whose name ends in ``.pdf`` are read with ``PyPDFLoader`` and files
    ending in ``.docx`` with ``Docx2txtLoader``. The extension check ignores
    case, so ``REPORT.PDF`` is loaded too. Every other directory entry is
    reported on stdout and skipped. Entries are visited in sorted name order
    so the returned list is identical on every run (``os.listdir`` makes no
    ordering promise).

    The return annotation is quoted so it is not evaluated when the function
    is defined.

    Args:
        folder_path: Directory to scan; sub-directories are not descended into.

    Returns:
        The concatenation of what each loader's ``load()`` returns, in
        sorted-filename order.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example
            ``FileNotFoundError`` when ``folder_path`` does not exist).
    """
    documents = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Compare against a lower-cased copy so ".PDF" / ".Docx" are accepted.
        lowered_name = filename.lower()
        if lowered_name.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered_name.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        else:
            print(f"Unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
    return documents

# Folder holding the source files; the path is relative, so it is resolved
# against the notebook's working directory.
folder_path = "content/docs"
documents = load_documents(folder_path)
# len(documents) counts the Document objects the loaders returned. The PDF
# metadata printed further down carries a 'page' field, so this is presumably
# a per-page count rather than a file count.
print(f"Loaded {len(documents)} documents from the folder.")


Loaded 7 documents from the folder.


### Splitting documents

In [11]:
# Chunk sizes are measured with len(), i.e. in characters of text: chunks of up
# to 1000 characters, with up to 200 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

# Split every loaded Document; `splits` is what gets embedded and stored below.
splits = text_splitter.split_documents(documents)
print(f"Split the documents into {len(splits)} chunks.")


Split the documents into 14 chunks.


In [12]:
# Sanity check: show the first loaded Document before splitting.
print(documents[0])

page_content='Question: 
Explain why it is impossible to design a perfectly secure Network & Information 
System. 
Answer: 
It is impossible to design a perfectly secure Network & Information System due to the 
following reasons: 
1. Evolving Threats: Cybersecurity threats are constantly changing. Attackers develop 
new techniques and exploit previously unknown vulnerabilities, making it 
impossible to anticipate and counter all potential attacks. 
2. Human Error: Many security breaches result from mistakes made by users or 
administrators, such as weak passwords, improper configurations, or falling victim 
to social engineering attacks. Human behavior is inherently unpredictable and 
cannot be fully secured. 
3. Complexity of Systems: Modern systems are highly complex, with multiple 
interconnected components. This complexity increases the likelihood of 
vulnerabilities that attackers can exploit. Ensuring every component is secure is 
practically unachievable. 
4. Resource Limitation

In [13]:
# Sanity check: show the second chunk; its opening lines repeat text from the
# first document because of the configured chunk overlap.
print(splits[1])

page_content='interconnected components. This complexity increases the likelihood of 
vulnerabilities that attackers can exploit. Ensuring every component is secure is 
practically unachievable. 
4. Resource Limitations: Implementing security measures involves costs and trade-
offs, such as reduced system performance or higher maintenance requirements. 
Organizations often cannot afford the resources needed for comprehensive 
security. 
5. Conflict Between Usability and Security: Strong security measures often make 
systems harder to use, leading to resistance from users. Balancing usability with 
security inevitably creates gaps that attackers can exploit. 
These challenges ensure that absolute security remains unattainable; instead, the goal is 
to mitigate risks to an acceptable level through continuous monitoring and updating of 
security measures. 
Question: 
(b) DETERMINE the following Denial of Service Attacks with the help of example [CLO-
2] [6 Marks]' metadata={'source': 'con

In [14]:
# Each chunk keeps the metadata of the document it came from (source, page).
print(splits[0].metadata)

{'source': 'content/docs\\nis past papers.pdf', 'page': 0}


### Creating embeddings using Cohere

Get (free trial) API Key here: https://dashboard.cohere.com/api-keys

In [15]:
import getpass
import os

# Ask for the Cohere key only when it is not already in the environment.
# os.environ accepts str values only, so copying os.getenv(...) (None when the
# variable is missing) back into it would raise TypeError. getpass keeps the
# typed key out of the notebook output.
if not os.getenv("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass.getpass("Enter your Cohere API key: ")

In [16]:
from langchain_cohere import CohereEmbeddings

# Embedding client for Cohere's "embed-english-light-v3.0" model. No key is
# passed here, so it presumably reads COHERE_API_KEY from the environment —
# TODO confirm against the langchain_cohere docs.
embeddings = CohereEmbeddings(
    model="embed-english-light-v3.0",
)

In [17]:
# Embed the text of every chunk in one call to check that the client works.
# NOTE(review): document_embeddings is not referenced again in the visible
# cells; the vector store cell is handed `embeddings` and the chunks directly.
document_embeddings = embeddings.embed_documents([split.page_content for split in splits])
print(f"Created embeddings for {len(document_embeddings)} document chunks.")

Created embeddings for 14 document chunks.


### Set up the vector store for storing the embeddings (using Chroma here)

In [18]:
from langchain_chroma import Chroma

import hashlib


def _stable_chunk_ids(chunks):
    """Return one deterministic id per chunk.

    The id is a SHA-256 digest of the chunk's source, page and text, followed
    by an occurrence counter so that two identical chunks in the same batch
    still receive distinct ids.

    Args:
        chunks: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (a str).

    Returns:
        A list of id strings, in the same order as ``chunks``.
    """
    occurrences = {}
    ids = []
    for chunk in chunks:
        fingerprint = hashlib.sha256(
            "\x1f".join(
                [
                    str(chunk.metadata.get("source", "")),
                    str(chunk.metadata.get("page", "")),
                    chunk.page_content,
                ]
            ).encode("utf-8")
        ).hexdigest()
        seen_before = occurrences.get(fingerprint, 0)
        occurrences[fingerprint] = seen_before + 1
        ids.append(f"{fingerprint}-{seen_before}")
    return ids


collection_name = "my_collection"
# The collection is persisted on disk, so without explicit ids every re-run of
# this cell would add the same chunks again under fresh random ids and
# searches would start returning duplicates. Deterministic ids make the cell
# safe to re-run: existing entries are overwritten instead of duplicated.
# NOTE(review): entries already written by earlier runs without ids are not
# removed; delete ./chroma_db once if the collection already has duplicates.
vectorstore = Chroma.from_documents(
    collection_name=collection_name,
    documents=splits,
    embedding=embeddings,
    ids=_stable_chunk_ids(splits),
    persist_directory="./chroma_db"
)
print("Vector store created and persisted to './chroma_db'")


Vector store created and persisted to './chroma_db'


### Performing vector search

In [20]:
# Query the vector store directly for the two chunks closest to the question.
query = "Why is it difficult to implement security in a system?"
search_results = vectorstore.similarity_search(query, k=2)
print(f"\nTop 2 most relevant chunks for the query: '{query}'\n")
for rank, hit in enumerate(search_results, start=1):
    # One print call per hit; the trailing empty string produces the blank
    # separator line between results.
    print(
        f"Result {rank}:",
        f"Source: {hit.metadata.get('source', 'Unknown')}",
        f"Content: {hit.page_content}",
        "",
        sep="\n",
    )



Top 2 most relevant chunks for the query: 'Why is it difficult to implement security in a system?'

Result 1:
Source: content/docs\nis past papers.pdf
Content: interconnected components. This complexity increases the likelihood of 
vulnerabilities that attackers can exploit. Ensuring every component is secure is 
practically unachievable. 
4. Resource Limitations: Implementing security measures involves costs and trade-
offs, such as reduced system performance or higher maintenance requirements. 
Organizations often cannot afford the resources needed for comprehensive 
security. 
5. Conflict Between Usability and Security: Strong security measures often make 
systems harder to use, leading to resistance from users. Balancing usability with 
security inevitably creates gaps that attackers can exploit. 
These challenges ensure that absolute security remains unattainable; instead, the goal is 
to mitigate risks to an acceptable level through continuous monitoring and updating of 
securit

### Creating a retriever for the RAG chain

In [21]:
# Wrap the vector store as a retriever that returns the top 2 chunks per query;
# this is the object the chains below pipe their input through.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
# Same question as the direct similarity search above, for comparison.
retriever_results = retriever.invoke("Why is it difficult to implement security in a system?")
print(retriever_results)

[Document(metadata={'page': 0, 'source': 'content/docs\\nis past papers.pdf'}, page_content='interconnected components. This complexity increases the likelihood of \nvulnerabilities that attackers can exploit. Ensuring every component is secure is \npractically unachievable. \n4. Resource Limitations: Implementing security measures involves costs and trade-\noffs, such as reduced system performance or higher maintenance requirements. \nOrganizations often cannot afford the resources needed for comprehensive \nsecurity. \n5. Conflict Between Usability and Security: Strong security measures often make \nsystems harder to use, leading to resistance from users. Balancing usability with \nsecurity inevitably creates gaps that attackers can exploit. \nThese challenges ensure that absolute security remains unattainable; instead, the goal is \nto mitigate risks to an acceptable level through continuous monitoring and updating of \nsecurity measures. \nQuestion: \n(b) DETERMINE the following De

### Building the RAG chain

We'll use Llama 3.3 as our LLM of choice, through Groq. Get Groq API Key here: https://console.groq.com/keys

In [26]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the environment.
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    # os.environ accepts str values only; without this check a missing key
    # surfaces as an opaque TypeError on the assignment below.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "before running this cell."
    )
os.environ["GROQ_API_KEY"] = groq_api_key

In [27]:
from langchain_groq import ChatGroq

# Chat model shared by every chain below: Llama 3.3 70B served through Groq,
# created with temperature=0.
llm = ChatGroq(temperature=0, model_name="llama-3.3-70b-versatile")

In [28]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Question-answering prompt with two placeholders: {context} for the retrieved
# text and {question} for the user's question. Both are supplied by rag_chain.
template = """Answer the question based only on the following context:
{context}
Question: {question}
Answer: """

prompt = ChatPromptTemplate.from_template(template)

def docs2str(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# RAG pipeline. The input question is used twice: piped through the retriever
# and docs2str to build {context}, and passed through unchanged as {question}.
# The filled prompt goes to the LLM and the reply is reduced to a plain string.
rag_chain = (
    {"context": retriever | docs2str, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


### Let's test our RAG chain now

In [30]:
# End-to-end check: the answer should draw only on the retrieved chunks.
question = "What are different kinds of flooding attacks?"
response = rag_chain.invoke(question)
print(f"Question: {question}")
print(f"Answer: {response}")


Question: What are different kinds of flooding attacks?
Answer: Based on the provided context, there are at least two kinds of flooding attacks:

1. TCP SYN Flooding Attacks: This type of attack exploits the TCP three-way handshake mechanism by sending a large number of SYN packets with spoofed source addresses to the server, consuming server resources.

2. ICMP Flooding Attacks: This type of attack overwhelms a server by flooding it with ICMP packets, such as echo requests (ping), consuming the target's bandwidth and processing power.

Additionally, there is a mention of "Reflection Attacks" in the context, but it is not described in detail. It may be a third type of flooding attack, but without more information, it is unclear how it relates to the other two types.


### Create functions to enable the LLM to output in a structured manner for creating quizzes based on a particular topic

We'll use Pydantic so that the LLM generates structured JSON output that can be parsed reliably later.

In [55]:
from pydantic import BaseModel, Field
from typing import List

# Output schema for quiz generation (bound to the LLM as quiz_llm below).
# `questions` and `answers` are parallel lists: answers[i] is presumably the
# answer to questions[i]. Nothing here enforces that the two lengths match.
# NOTE(review): the docstring and Field descriptions are part of the schema and
# are presumably sent to the model, so treat them as prompt text.
class Quiz(BaseModel):
    """Model representing a quiz with questions and answers."""
    questions: List[str] = Field(description="List of quiz questions.")
    answers: List[str] = Field(description="List of corresponding answers to the quiz questions.")


In [60]:
# Bind the Quiz schema to the model: quiz_llm returns Quiz instances (with
# .questions and .answers) instead of free-form text.
quiz_llm = llm.with_structured_output(Quiz)


In [63]:
from langchain_core.prompts import ChatPromptTemplate

# Quiz prompt. {context} receives the retrieved text and {question} receives
# the topic the user asked for. The quiz chain already supplies the topic
# under the "question" key; referencing it here tells the model which topic
# the quiz is about instead of using it for retrieval only.
quiz_template = """
Based on the following content, generate a quiz with questions and corresponding answers.
Focus the quiz on this topic: {question}

Content:
{context}

The output should be in JSON format with two keys: 'questions' and 'answers'.
"""

quiz_prompt = ChatPromptTemplate.from_template(quiz_template)


In [64]:
from langchain.schema.runnable import RunnablePassthrough

def docs_to_string(docs):
    """Return the documents' ``page_content`` values joined by blank lines."""
    pieces = []
    for doc in docs:
        pieces.append(doc.page_content)
    return "\n\n".join(pieces)

# The incoming topic is piped through the retriever and docs_to_string to build
# "context", and is also passed through unchanged under the "question" key.
# quiz_llm then returns a Quiz object.
quiz_chain = (
    {"context": retriever | docs_to_string, "question": RunnablePassthrough()}
    | quiz_prompt
    | quiz_llm
)

# User provides a topic
topic = "Network Security"

# Generate the quiz (a Quiz instance)
quiz = quiz_chain.invoke(topic)


In [65]:
# Show the generated questions, then the answers in the same order.
print(quiz.questions)
print(quiz.answers)


['Why is it impossible to design a perfectly secure Network & Information System?', 'What is the role of an External Firewall in a network?', 'What is the purpose of a Service Network or DMZ?', 'What is the function of an Internal Firewall?']
['Evolving Threats, Human Error, Complexity of Systems', 'Acts as the first layer of defense between the Internet and the internal network', 'Provides limited access to external users and includes public-facing services', 'Provides an additional layer of security by protecting the internal networks from the DMZ']


### Create functions to enable LLM to create MCQs

In [79]:
from pydantic import BaseModel, Field
from typing import List

# Schema for a single multiple-choice question.
# NOTE(review): the "4 choices" rule lives only in the description text;
# nothing here validates the list length or that correct_answer appears in
# choices.
class MCQ(BaseModel):
    """Model representing a multiple-choice question."""
    question: str = Field(description="The question text.")
    choices: List[str] = Field(description="List of 4 possible answers for the question.")
    correct_answer: str = Field(description="The correct answer for the question.")

# Top-level output schema for MCQ generation: a wrapper around a list of MCQ
# items (bound to the LLM as mcq_llm below).
class MCQQuiz(BaseModel):
    """Model representing a quiz with multiple-choice questions."""
    mcqs: List[MCQ] = Field(description="List of multiple-choice questions.")


In [80]:
# Bind the MCQQuiz schema to the model: mcq_llm returns MCQQuiz instances
# (with a .mcqs list) instead of free-form text.
mcq_llm = llm.with_structured_output(MCQQuiz)


In [81]:
from langchain_core.prompts import ChatPromptTemplate

# MCQ prompt. {context} receives the retrieved text and {question} receives the
# topic the user asked for. The MCQ chain already supplies the topic under the
# "question" key; referencing it here tells the model which topic to cover.
# Doubled braces ({{ }}) are literal braces in the JSON example, not
# placeholders.
mcq_template = """
Based on the following content, generate a set of multiple-choice questions (MCQs). Each question should include:
1. A question text.
2. A list of 4 choices, including the correct answer and 3 distractors.
3. The correct answer.

Focus the questions on this topic: {question}

The output should be in JSON format with the following structure:
{{
    "mcqs": [
        {{
            "question": "Question text here",
            "choices": ["Choice 1", "Choice 2", "Choice 3", "Choice 4"],
            "correct_answer": "The correct choice here"
        }},
        ...
    ]
}}

Content:
{context}
"""


mcq_prompt = ChatPromptTemplate.from_template(mcq_template)


In [82]:
from langchain.schema.runnable import RunnablePassthrough

def docs_to_string(docs):
    """Concatenate each document's ``page_content`` with a blank line between."""
    blank_line = "\n\n"
    return blank_line.join([item.page_content for item in docs])

# The incoming topic is piped through the retriever and docs_to_string to build
# "context", and is also passed through unchanged under the "question" key.
# mcq_llm then returns an MCQQuiz object.
mcq_chain = (
    {"context": retriever | docs_to_string, "question": RunnablePassthrough()}
    | mcq_prompt
    | mcq_llm
)

# User provides a topic
topic = "Network Security"

# Generate the multiple-choice quiz (an MCQQuiz instance)
mcqs = mcq_chain.invoke(topic)


In [83]:
# Show the whole MCQQuiz object; the individual items are in mcqs.mcqs.
print(mcqs)

mcqs=[MCQ(question='Why is it impossible to design a perfectly secure Network & Information System?', choices=['Evolving threats and human error', 'Complexity of systems and lack of funding', 'Only evolving threats', 'Only human error'], correct_answer='Evolving threats and human error'), MCQ(question='What is the primary function of an External Firewall?', choices=['To protect the internal network from the DMZ', 'To filter incoming and outgoing traffic based on security policies', 'To provide public-facing services like web servers', 'To ensure traffic between the DMZ and internal network adheres to security policies'], correct_answer='To filter incoming and outgoing traffic based on security policies'), MCQ(question='What is the purpose of a DMZ (Demilitarized Zone) network?', choices=['To provide direct access to the internal network', 'To include private-facing services like database servers', 'To provide limited access to external users and include public-facing services', 'To pro

### Create functions to generate flashcards for a topic

Pydantic structuring isn't strictly necessary here, but it still gives reliable JSON output, so there won't be any parsing issues.

In [92]:
from pydantic import BaseModel, Field
from typing import List

# Output schema for flashcard generation (bound to the LLM as cards_llm below).
# Despite the singular field name, `flashcard` holds a list with one string per
# card.
class Flashcards(BaseModel):
    """Model representing flashcard content"""
    flashcard: List[str] = Field(description="List of concise, memorable flashcard content.")


In [93]:
# Bind the Flashcards schema to the model: cards_llm returns Flashcards
# instances (with a .flashcard list) instead of free-form text.
cards_llm = llm.with_structured_output(Flashcards)


In [94]:
from langchain_core.prompts import ChatPromptTemplate

# Flashcard prompt. {context} receives the retrieved text and {question}
# receives the topic the user asked for. The flashcard chain already supplies
# the topic under the "question" key; referencing it here tells the model
# which topic the cards should cover.
cards_template = """
Based on the following content, generate some concise and memorable flashcards.
Focus the flashcards on this topic: {question}

Content:
{context}

The output should be in JSON format with the key: 'flashcard'.
"""


cards_prompt = ChatPromptTemplate.from_template(cards_template)


In [95]:
from langchain.schema.runnable import RunnablePassthrough

def docs_to_string(docs):
    """Merge the ``page_content`` of all documents, one blank line apart."""
    texts = map(lambda entry: entry.page_content, docs)
    return "\n\n".join(texts)

# The incoming topic is piped through the retriever and docs_to_string to build
# "context", and is also passed through unchanged under the "question" key.
# cards_llm then returns a Flashcards object.
cards_chain = (
    {"context": retriever | docs_to_string, "question": RunnablePassthrough()}
    | cards_prompt
    | cards_llm
)

# User provides a topic
topic = "Network Security"

# Generate the flashcards (a Flashcards instance)
cards = cards_chain.invoke(topic)


In [96]:
# Show the whole Flashcards object (field name plus its list of cards).
print(cards)

flashcard=['Evolving threats make it impossible to anticipate all attacks', 'Human error is a significant factor in security breaches', 'Complex systems increase the likelihood of vulnerabilities', 'External Firewall: first layer of defense', 'DMZ: provides limited access to external users', 'Internal Firewall: protects internal networks from the DMZ']


In [97]:
# Show only the list of card strings.
print(cards.flashcard)

['Evolving threats make it impossible to anticipate all attacks', 'Human error is a significant factor in security breaches', 'Complex systems increase the likelihood of vulnerabilities', 'External Firewall: first layer of defense', 'DMZ: provides limited access to external users', 'Internal Firewall: protects internal networks from the DMZ']
