# LLM Model: Llama

In [None]:
!pip install -r /content/drive/MyDrive/NLP_Project/requirements.txt > /dev/null 2>&1

### Data Loading

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from huggingface_hub import login

In [None]:
login(token="***********************") #Please enter your hugging_face token here I have removed it once I ran the model.

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Load the first document
loader1 = PyPDFLoader("/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf")
data1 = loader1.load()

loader2 = PyPDFLoader("/content/drive/MyDrive/NLP_Project/malware.pdf")
data2 = loader2.load()

loader3 = PyPDFLoader("/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf")
data3 = loader3.load()

loader4 = PyPDFLoader("/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf")
data4 = loader4.load()

# Combine all the data
data = data1 + data2 + data3 + data4

In [None]:
len(data)

146

### Cleaning the Data

In [None]:
import re

def clean_and_normalize(text):
  return re.sub(r'\s+', ' ', text).strip()

In [None]:
for doc in data:
    doc.page_content = clean_and_normalize(doc.page_content)

### Text Splitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
docs = text_splitter.split_documents(data)


print("Total number of documents: ",len(docs))

Total number of documents:  337


In [None]:
docs[86]

Document(metadata={'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf', 'page': 39}, page_content='of users and processes to data files, URLs, services and other resources of your application. ► 9.10 Verify that every connection of your web servers (with user browsers, other web service calls, databases, cloud, etc.) is encrypted using the latest version of the TLS protocol (encryption in transit).')

### Embeddings

In [None]:
#from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

vector = embeddings.embed_query("hello, world!")
vector[:5]
#vector

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[0.03492268547415733,
 0.0188300758600235,
 -0.017854738980531693,
 0.0001388332893839106,
 0.07407363504171371]

### Chroma DB

In [None]:
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(documents=docs, embedding=HuggingFaceEmbeddings())

### Retriever

In [None]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

retrieved_docs = retriever.invoke("Email phishing")

In [None]:
len(retrieved_docs)

3

In [None]:
print(retrieved_docs[1].page_content)

CYBERSECURITY FOR SMALL BUSINESS WHAT TO DO IF YOUR EMAIL IS SPOOFED Email authentication helps keep your business’s email from being used in phishing schemes because it notifies you if someone spoofs your company’s email. If you get that notification, take these actions: Report it Report the scam to local law enforcement, the FBI’s Internet Crime Complaint Center at IC3.gov, and the FTC at FTC.gov/Complaint. Y ou also can forward phishing emails to spam@uce.gov (an address used by the FTC) and to reportphishing@apwg.org (an address used by the Anti-Phishing Working Group, which includes ISPs, security vendors, financial institutions, and law enforcement agencies). Notify your customers If you find out scammers are impersonating your business, tell your customers as soon as possible — by mail, email, or social media. If you email your customers, send an email without hyperlinks: you don’t want your notification email to look like a phishing scam. Remind customers not to share any


### LangChain pipeline using a HuggingFace LLM

In [None]:
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
import torch

# Initialize the model
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Create the text generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    max_new_tokens=400,
    device=0  # Use GPU if available, otherwise set to -1 for CPU
)

# Create the LangChain HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Define the prompt template
prompt_template = """
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>
"""

# Create the PromptTemplate
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create the LangChain
llm_chain = prompt | llm | StrOutputParser()

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
from langchain_core.runnables import RunnablePassthrough

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

### Evaluating RAG Model Responses on Accuracy, Groundedness, and Speed

In [None]:
import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Global lists to store data (accuracy, groundedness, speed)
accuracy_list = []
groundedness_list = []
speed_list = []

# Function to calculate cosine similarity (groundedness)
def calculate_similarity(response, retrieved_docs):
    context = [doc.page_content for doc in retrieved_docs]
    context.append(response)  # Add the response to the context for comparison

    # Vectorize the context and response using TF-IDF
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(context)

    # Calculate cosine similarity between the response and the context
    similarity_score = cosine_similarity(vectors[-1], vectors[:-1])  # Compare the response with the context
    return similarity_score.flatten()

# Function to ask a question, generate a response, and record data
def ask_question(question):
    global accuracy_list, groundedness_list, speed_list

    # Start timer for response generation
    start_time = time.time()

    # Generate the response using the RAG chain (you should replace this with actual invocation)
    response = rag_chain.invoke(question)

    # Stop timer after response is generated
    end_time = time.time()
    response_time = end_time - start_time
    speed_list.append(response_time)  # Store the speed

    # Calculate cosine similarity for groundedness
    retrieved_docs = retriever.invoke(question)  # Get retrieved documents (this may be adjusted based on your setup)
    similarity_scores = calculate_similarity(response, retrieved_docs)
    avg_similarity = similarity_scores.mean() if similarity_scores.size > 0 else 0  # Handle array correctly
    groundedness_list.append(avg_similarity)  # Store groundedness score

    # Display the response and similarity with nice formatting
    print(f"\nResponse: {response}")
    print(f"\nCosine Similarity (Groundedness): {avg_similarity:.4f}")
    print(f"\nSpeed: {response_time:.4f} seconds")

    # Ask for accuracy rating
    accuracy = int(input("\nPlease rate the accuracy of the response (1 to 5): "))
    accuracy_list.append(accuracy)

    # Add separator for next question
    print("\n" + "-"*50)

# Function to calculate average accuracy, groundedness, and speed
def calculate_metrics():
    avg_accuracy = sum(accuracy_list) / len(accuracy_list) if accuracy_list else 0
    avg_groundedness = sum(groundedness_list) / len(groundedness_list) if groundedness_list else 0
    avg_speed = sum(speed_list) / len(speed_list) if speed_list else 0

    # Print the results in a table format
    print("\nFinal Evaluation")
    print(f"{'Average Accuracy (%)':<20}{'Average Groundedness':<20}{'Average Speed (seconds)'}")
    print(f"{avg_accuracy*20:<20.2f}{avg_groundedness*100:<20.2f}{avg_speed:.2f}")

# Ask questions in a loop
def start_session():
    print("Welcome to the Question-Answer Evaluation Session!")
    print("Please ask a question, and rate the response based on accuracy. Type 'end' to finish.")

    while True:
        question = input("\nAsk a question (or type 'end' to finish): ")
        if question.lower() == 'end':
            break
        ask_question(question)

    # After all questions, print the summary
    calculate_metrics()

# Start the session
start_session()


Welcome to the Question-Answer Evaluation Session!
Please ask a question, and rate the response based on accuracy. Type 'end' to finish.

Ask a question (or type 'end' to finish): What is phishing, and how does it work?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 8, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='CYBERSECURITY FOR SMALL BUSINESS PHISHING LEARN MORE AT: FTC.gov/SmallBusiness You get an email that looks like it’s from someone you know. It seems to be from one of your company’s vendors and asks that you click on a link to update your business account. Should you click? Maybe it looks like it’s from your boss and asks for your network password. Should you reply? In either case, probably not. These may be phishing attempts. HOW WHAT YOU CAN DOPHISHING WORKS You get an email or text It seems to be from someone you know, and it asks you to click a link, or give your password, business bank account,

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 35, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='via phishing emails. Figure 7 Example of malware infection and propagation Source: https://www.spambrella.com/what-is-emotet-malware-and-how-is-it-delivered/ 12 See https://www.europol.europa.eu/ newsroom/news/world%E2%80%99s-most- dangerous-malware-emotet-disrupted- through-global-action'), Document(metadata={'page': 70, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='security-part-3 19. https://www.csoonline.com/article/3391588/why-unauthenticated- sms-is-a-security-risk.html 20. https://www.cloudwards.net/best-2fa-apps/ 21. https:/

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 2, 'source': '/content/drive/MyDrive/NLP_Project/malware.pdf'}, page_content='The traditional classiﬁcation was introduced by Peter Denning in the late 1980s [4, 5]. We will use the following deﬁnitions: Virus: Malware which spreads from one computer to another by embedding copies of itself into ﬁles, which by some means or another are transported to the target. The medium of transport is often known as the vector of the virus. The transport may be initiated by the virus itself (for example, it may send the infected ﬁle as an e-mail attachment) or rely on an unsuspecting human user (who for example transports a CD-ROM containing the infected ﬁle). Worm: Malware which spreads from one computer to ano

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 17, 'source': '/content/drive/MyDrive/NLP_Project/malware.pdf'}, page_content='On an computer with an Athlon 1GHz CPU and 1GB of RAM, analysis of all variants of the Hare virus to build up the CFGs and to annotate them to indicate “empty code” took 10 seconds of CPU time. To build up the annotated CFG for a fairly large non-malicious executable (QuickTimePlayer.exe, size approx. 1MB) took about 800 seconds of CPU time. However, the method was extremely eﬀective at recognising viral code, even when it appeared in quite obscure variants. False positive and false negative rates of 0% were reported for the examples tested. It must be expected that improvements in the technique will make it suitable for 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 12, 'source': '/content/drive/MyDrive/NLP_Project/malware.pdf'}, page_content='control a large number of computers, which is done by installing abackdoor in each of them. The individual computers in the botnet then technically speaking become zombies since they are under remote control, but ar e in this context usually referred to simply as bots. The bots can be given orders by a controller, often known as the botmas- ter, to perform various tasks, such as sending spam mail, adware, or spyware, performing DDoS attacks or just searching for further potential targets to be enrolled in the botnet. In many cases, the botmaster oﬀers such facilities as a service to anyone who is willing to pay for it. Bo

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 19, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='for vulnerable versions of software. Then, by executing the appropriate commands, they exploit the vulnerabilities and can gain access to the system. In many cases the breach occurs due to vulnerable applications that were installed without the business need to do so. • In recent years, due to the improvement of mitigation measures for malware detection and blocking, attackers use a new and more sophisticated technique, the so-called fileless attacks . The term derives from the fact that in this case the malicious code runs directly in memory and does not create an executable file saved in t

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 23, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='CYBERSECURITY FOR SMALL BUSINESS HOW TO CONNECT TO THE NETWORK REMOTEL Y Require employees and vendors to use secure connections when connecting remotely to your network. They should: Use a router with WPA2 or WPA3 encryption when connecting from their homes. Encryption protects information sent over a network so that outsiders can’t read it. WPA2 and WPA3 are the only encryption standards that will protect information sent over a wireless network. Only use public Wi-Fi when also using a virtual private network (VPN) to encrypt traffic between their computers and the internet. Public Wi-Fi does not

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 25, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='e.g., by submitting a fake online form to which the user is redirected from a web link that was sent in a phishing email. • Use of malware that retrieves passwords from computer memory or by sniffing the network. SUB-CONTROLS ► 5.1 Develop and document: • a user authentication policy that addresses purpose, scope, roles and responsibilities, • procedures for implementing the policy and the relevant protection measures. ► 5.2 Implement authentication mechanisms that enforce the creation of strong passwords for your network and information systems. Strong passwords are those that are at least 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 25, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content="26 CYBERSECURITY HANDBOOK 5. USER AUTHENTICATION Implement measures and procedures to verify the identity of any user wishing to access your corporate network. Authentication systems are a primary target for any attacker, for their compromise results in identity theft and unauthorized access to an entity's valuable resources. There are several ways to steal user credentials, such as: • Weak passwords. Most users use easy-to-remember passwords, which can be easily retrieved with a dictionary attack. • Employees storing credentials in plain sight. • Implementation of weak cryptographic techniq

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 36, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='to files and server processes, • unsuccessful file execution attempts, • special privileges usage and usage attempts, • system files usage, • changes in user accounts and in the security policy, • HTTP(S) and DNS requests, • data transfer to and from portable storage media. WHAT ARE THE RISKS?'), Document(metadata={'page': 17, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='configuration and change management tools to effectively track all configuration changes and updates in your corporate network in an automated manner. ► 2.13 Ensur

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 4, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='flash drives, laptops, point-of-sale devices, and other equipment. Store securely When paper files or electronic devices contain sensitive information, store them in a locked cabinet or room. Limit physical access When records or devices contain sensitive data, allow access only to those who need it. Send reminders Remind employees to put paper files in locked file cabinets, log out of your network and applications, and never leave files or devices with sensitive data unattended. Keep stock Keep track of and secure any devices that collect sensitive customer information. Only keep files and data you

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 22, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='encryption for laptops and other mobile devices that connect remotely to your network. Check your operating system for this option, which will protect any data stored on the device if it’s lost or stolen. This is especially important if the device stores any sensitive personal information. Change smartphone settings to stop automatic connections to public Wi-Fi. Keep up-to-date antivirus software on devices that connect to your network, including mobile devices. LEARN MORE AT:'), Document(metadata={'page': 22, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 23, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='employees to use. WHAT TO DO TO MAINTAIN SECURITY Include information on secure remote access in regular trainings and new staff orientations. Have policies covering basic cybersecurity, give copies to your employees, and explain the importance of following them. Before letting any device — whether at an employee’s home or on a vendor’s network — connect to your network, make sure it meets your network’s security requirements. Tell your staff about the risks of public Wi-Fi. Train your staff: Give your staff tools that will help maintain security: • Require employees to use unique, complex network 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 0, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='laptops, tablets, smartphones, removable drives, backup tapes, and cloud storage solutions. Use multi-factor authentication Require multi-factor authentication to access areas of your network with sensitive information. This requires additional steps beyond logging in with a password — like a temporary code on a smartphone or a key that’s inserted into a computer. LEARN MORE AT: FTC.gov/SmallBusiness'), Document(metadata={'page': 0, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='laptops, tablets, smartphones, removable drives, backup tapes, and clo

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 1, 'source': '/content/drive/MyDrive/NLP_Project/malware.pdf'}, page_content='An Introduction to Malware Robin Sharp Spring 2007 Abstract These notes, intended for use in DTU course 02233 on Network Security, give a short introduction to the topic of malware. The most important types of malware are described, together with their basic principles of operation and dissemination, and defenses against malware are discussed. 1 Some Deﬁnitions Malware is a general term for all types of malicious software, which in the context of computer security means: Software which is used with the aim of attempting to breach a computer system’s security policy with respect to Conﬁdentiality, Integrity or Availability.

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 7, 'source': '/content/drive/MyDrive/NLP_Project/malware.pdf'}, page_content='technique is used, for example, by the COK variant (2005) of the BackDoor virus (actually a Trojan horse) which deposits a DLL called spool.dll and then injects code into all processes running on the computer, so that they link to it. 3.3 Disguising the Virus Since signature-based antivirus systems attempt to ﬁnd viral code by looking for charac- teristic byte sequences in the executable, virus designers have adopted various techniques for disguising such sequences. The two dominant techniques are encryption of the viral code and polymorphism. 3.3.1 Encryption Encryption of the viral code with diﬀerent encryption keys will

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 7, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='CYBERSECURITY FOR SMALL BUSINESS HOW TO PROTECT YOUR BUSINESS Have a plan How would your business stay up and running after a ransomware attack? Put this plan in writing and share it with everyone who needs to know. Back up your data Regularly save important files to a drive or server that’s not connected to your network. Make data backup part of your routine business operations. Keep your security up to date Always install the latest patches and updates. Look for additional means of protection, like email authentication, and intrusion prevention software, and set them to update automatically on you

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 10, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='CYBERSECURITY FOR SMALL BUSINESS BUSINESS EMAIL IMPOSTERS LEARN MORE AT: FTC.gov/SmallBusiness A scammer sets up an email address that looks like it’s from your company. Then the scammer sends out messages using that email address. This practice is called spoofing, and the scammer is what we call a business email imposter. Scammers do this to get passwords and bank account numbers or to get someone to send them money. When this happens, your company has a lot to lose. Customers and partners might lose trust and take their business elsewhere — and your business could then lose money. HOW TO PROTECT 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 51, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content="52 CYBERSECURITY HANDBOOK ► 12.3 Periodically conduct a cybersecurity awareness training program addressed to distinct roles and targeting different employee categories based on business activities and the level of technical expertise. ► 12.4 Perform a knowledge gap analysis of your employees to develop a plan of sequential trainings. ► 12.5 Periodically conduct exercises that simulate cyber security incidents and their impact. Examples include opening a malicious email attachment or visiting a malicious website. Figure 10 below shows specific signs for detecting a phishing email. For furthe

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 9, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='untrustworthy. Each of the above must be authenticated and then explicitly authorized with the minimum required privileges. • “Assume breach” : It is considered that the corporate devices and network may have already been compromised by a malicious actor. The "deny-by-default" principle applies to every user, device, application, and data access request. Access is granted after multiple parameters are thoroughly examined (e.g., username, device name and location, time, previously recorded user behavior, etc.). The zero trust approach incorporates highly detailed monitoring procedures. All acc

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 13, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='14 CYBERSECURITY HANDBOOK PART B BEST PRACTICES 1. INVENTORY OF HARDWARE AND SOFTWARE ASSETS Create an inventory of all IT assets (devices and software) hosted in the physical infrastructure of your organization as well as in cloud environments to form a complete understanding of your asset range and the required controls for their protection and maintenance. The larger an organization is, the more demanding its overall asset management (hardware and software). The assets may reside in more than one location as well as in the cloud, in a modern environment that is constantly changing and cur

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 16, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='CYBERSECURITY FOR SMALL BUSINESS EMAIL AUTHENTICATION Email authentication technology makes it a lot harder for a scammer to send phishing emails that look like they’re from your company. Using email authentication technology makes it a lot harder for scammers to send phishing emails. This technology allows a receiving server to verify an email from your company and block emails from an imposter — or send them to a quarantine folder and then notify you about them. WHAT TO KNOW Some web host providers let you set up your company’s business email using your domain name (which you may think of as your

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 23, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='employees to use. WHAT TO DO TO MAINTAIN SECURITY Include information on secure remote access in regular trainings and new staff orientations. Have policies covering basic cybersecurity, give copies to your employees, and explain the importance of following them. Before letting any device — whether at an employee’s home or on a vendor’s network — connect to your network, make sure it meets your network’s security requirements. Tell your staff about the risks of public Wi-Fi. Train your staff: Give your staff tools that will help maintain security: • Require employees to use unique, complex network 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 36, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='to files and server processes, • unsuccessful file execution attempts, • special privileges usage and usage attempts, • system files usage, • changes in user accounts and in the security policy, • HTTP(S) and DNS requests, • data transfer to and from portable storage media. WHAT ARE THE RISKS?'), Document(metadata={'page': 4, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='flash drives, laptops, point-of-sale devices, and other equipment. Store securely When paper files or electronic devices contain sensitive information, store them in a loc

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 14, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='operations. ► 1.5 Develop and document a policy and procedures for the use of removable media (USB, external hard drives, CDs, DVDs). This policy must be consistent with the security risks that your systems and data face and must cover: • the acceptable uses and types of portable media, • the requirements for the protection of portable media and their content, • the requirements for reporting a lost or stolen portable device, • the requirements for the removal, destruction or disposal of portable media. ► 1.6 Ensure that the proprietary mobile devices (laptops, tablets, smartphones) that emp

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 6, 'source': '/content/drive/MyDrive/NLP_Project/malware.pdf'}, page_content='tryPoint ﬁeld in the Optional Header, so that it points to the start of the virus code. With this approach, it is usual for the virus code to be constructed so that the “original” code is executed after the virus code, as indicated in the examples of Figure 5. In this way, the executable appears to have the usual eﬀect and the user does not get suspicious. Directly changing the AddressOfEntryPoint ﬁeld is such an obvious idea that most an- tivirus systems check whether the beginning of the code is in a section which should not contain executable code or contains known patterns from a database of viral code. In an Entry Poi

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 10, 'source': '/content/drive/MyDrive/NLP_Project/malware.pdf'}, page_content='are: 1. To exploit the targets in order to cause a Distributed DoS attack on a chosen system. Example: Apache/mod ssl (2002) 2. Website defacement on the targets, which are chosen to be web servers. Example: Perl.Santy (2004), which overwrote all ﬁl es with extensions .asp, .htm, .jsp, .php, .phtm and .shtm on the server, so they all produced the text “This site is defaced!!! NeverEverNoSanity WebWorm generation xx”. 3. Installation of a keylogger to track the user’s input, typically in order to pick up passwords, PIN codes, credit card numbers or other conﬁdential information, and to 10'), Document(metadata={'page': 1, '

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 50, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='51 CYBERSECURITY HANDBOOK 12. CYBERSECURITY SKILLS AND AWARENESS TRAINING Implement training programs on a regular basis, to improve the skills and awareness of your employees on cybersecurity issues. Employees play a critical role in the security of network and information systems. The lack of training and corresponding responsibility for this issue poses various types of threats to the organizations: • Social engineering attacks : due to the improvement of protection technologies in recent years, attackers are now targeting the greatest vulnerability, which is the human factor. Today, most

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 6, 'source': '/content/drive/MyDrive/NLP_Project/malware.pdf'}, page_content='enough waste space, the malicious code can be divided among several sections, as illustrated in Figure 5(b). A common arrangement is for the largest area of waste space to be used to contain a small loader which can load the remaining pieces of the virus code as required. One of the tests used for selecting the set of victim ﬁles would then typically be that they must contain a contiguous area of waste space which is large enough to hold the virus loader. Dividing the virus code up into small pieces also helps the virus designer to avoid his virus being detected, as the antivirus system will ﬁnd it diﬃcult to recognise a s

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 1, 'source': '/content/drive/MyDrive/NLP_Project/malware.pdf'}, page_content='An Introduction to Malware Robin Sharp Spring 2007 Abstract These notes, intended for use in DTU course 02233 on Network Security, give a short introduction to the topic of malware. The most important types of malware are described, together with their basic principles of operation and dissemination, and defenses against malware are discussed. 1 Some Deﬁnitions Malware is a general term for all types of malicious software, which in the context of computer security means: Software which is used with the aim of attempting to breach a computer system’s security policy with respect to Conﬁdentiality, Integrity or Availability.

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 12, 'source': '/content/drive/MyDrive/NLP_Project/malware.pdf'}, page_content='control a large number of computers, which is done by installing abackdoor in each of them. The individual computers in the botnet then technically speaking become zombies since they are under remote control, but ar e in this context usually referred to simply as bots. The bots can be given orders by a controller, often known as the botmas- ter, to perform various tasks, such as sending spam mail, adware, or spyware, performing DDoS attacks or just searching for further potential targets to be enrolled in the botnet. In many cases, the botmaster oﬀers such facilities as a service to anyone who is willing to pay for it. Bo

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Response: 
<|system|>
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured

[Document(metadata={'page': 22, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content="at a minimum, contains the person's name, username, start/stop date, privileges and the employee's department. ► 4.4 Ensure that users who perform exclusively non- administrative tasks of daily routine (e.g. use of Word, Excel, Adobe Reader, reading and sending e-mails, web browsing, etc.) are granted only a non-privileged account. WHAT ARE THE RISKS?"), Document(metadata={'page': 36, 'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf'}, page_content='to files and server processes, • unsuccessful file execution attempts, • special privileges usage and us