# LLM Model: Qwen

In [None]:
!pip install -r /content/drive/MyDrive/NLP_Project/requirements.txt > /dev/null 2>&1

### Data Loading

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Source PDFs for the knowledge base. Every file is listed exactly once so that
# no page ends up in the vector store twice.
# NOTE(review): if a fourth, different PDF was intended, add its path here.
pdf_paths = [
    "/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf",
    "/content/drive/MyDrive/NLP_Project/malware.pdf",
    "/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf",
]

# Load each PDF and combine everything into a single list of documents.
data = []
for pdf_path in pdf_paths:
    data.extend(PyPDFLoader(pdf_path).load())

In [None]:
# Number of documents produced by the PDF loaders across all source files.
len(data)

146

### Cleaning the Data

In [None]:
import re

def clean_and_normalize(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed from the result.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

In [None]:
# Normalize the whitespace of every loaded document in place.
for doc in data:
    cleaned_text = clean_and_normalize(doc.page_content)
    doc.page_content = cleaned_text

### Text Splitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the cleaned documents into chunks (chunk_size=1000) for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
)
docs = text_splitter.split_documents(data)

# Report how many chunks the splitter produced.
chunk_count = len(docs)
print("Total number of documents: ", chunk_count)

Total number of documents:  337


In [None]:
# Spot-check one chunk (metadata and text) produced by the splitter.
docs[86]

Document(metadata={'source': '/content/drive/MyDrive/NLP_Project/Cybersecurity-Handbook-English-version.pdf', 'page': 39}, page_content='of users and processes to data files, URLs, services and other resources of your application. ► 9.10 Verify that every connection of your web servers (with user browsers, other web service calls, databases, cloud, etc.) is encrypted using the latest version of the TLS protocol (encryption in transit).')

### Embeddings

In [None]:
#from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model with the library's default settings (no model name is given).
embeddings = HuggingFaceEmbeddings()

# Sanity check: embed a short string and show the first five components.
vector = embeddings.embed_query("hello, world!")
vector[:5]

[0.03492268547415733,
 0.0188300758600235,
 -0.017854738980531693,
 0.0001388332893839106,
 0.07407363504171371]

### Chroma DB

In [None]:
from langchain_chroma import Chroma
# Index the chunks in Chroma, reusing the `embeddings` model created earlier
# instead of loading a second copy of it.
# NOTE(review): re-running this cell in the same kernel may append to the
# existing in-memory collection rather than replace it — confirm, and restart
# the kernel before rebuilding the index.
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)

### Retriever

In [None]:
# Similarity-search retriever over the Chroma store, configured with k=3.
retriever_settings = {"search_type": "similarity", "search_kwargs": {"k": 3}}
retriever = vectorstore.as_retriever(**retriever_settings)

# Smoke-test the retriever with a sample query.
retrieved_docs = retriever.invoke("Email phishing")

In [None]:
# Number of documents returned for the sample query (the retriever uses k=3).
len(retrieved_docs)

3

In [None]:
# Show the text of the second retrieved document.
print(retrieved_docs[1].page_content)

CYBERSECURITY FOR SMALL BUSINESS WHAT TO DO IF YOUR EMAIL IS SPOOFED Email authentication helps keep your business’s email from being used in phishing schemes because it notifies you if someone spoofs your company’s email. If you get that notification, take these actions: Report it Report the scam to local law enforcement, the FBI’s Internet Crime Complaint Center at IC3.gov, and the FTC at FTC.gov/Complaint. Y ou also can forward phishing emails to spam@uce.gov (an address used by the FTC) and to reportphishing@apwg.org (an address used by the Anti-Phishing Working Group, which includes ISPs, security vendors, financial institutions, and law enforcement agencies). Notify your customers If you find out scammers are impersonating your business, tell your customers as soon as possible — by mail, email, or social media. If you email your customers, send an email without hyperlinks: you don’t want your notification email to look like a phishing scam. Remind customers not to share any


### LangChain pipeline using a HuggingFace LLM


In [None]:
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


# Instruction-tuned Qwen model used as the generator.
model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Use the first GPU when one is available; otherwise fall back to CPU
# (device=-1) instead of failing on a GPU-less runtime.
pipeline_device = 0 if torch.cuda.is_available() else -1

text_generation_pipeline = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    max_new_tokens=400,
    device=pipeline_device,
    # Return only the newly generated text. With the default (full text), the
    # prompt, including all retrieved context, is echoed back in every response.
    return_full_text=False,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Qwen2.5 instruct models use the ChatML chat format (<|im_start|> / <|im_end|>),
# so the prompt is written with those markers. The system message holds the
# instructions followed by the retrieved context.
prompt_template = """<|im_start|>system
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Do not provide links in responses and make responses structured.

{context}<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Prompt -> LLM -> plain string.
llm_chain = prompt | llm | StrOutputParser()

In [None]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Only ``page_content`` is used, so the prompt receives plain text rather
    than the repr of a list of Document objects (metadata included).
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# RAG chain: the incoming question is sent to the retriever (whose results are
# formatted as plain text) and is also passed through unchanged as "question".
rag_chain = {"context": retriever | format_docs, "question": RunnablePassthrough()} | llm_chain

### Evaluating RAG Model Responses on Accuracy, Groundedness, and Speed


In [None]:
import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Module-level accumulators filled by ask_question and summarized by
# calculate_metrics. Re-running this cell resets them to empty lists.
accuracy_list = []  # user-entered accuracy ratings (the prompt asks for 1 to 5)
groundedness_list = []  # mean TF-IDF cosine similarity between response and retrieved docs
speed_list = []  # seconds taken by rag_chain.invoke for each question

def calculate_similarity(response, retrieved_docs):
    """Score how closely a response matches each retrieved document.

    The retrieved passages and the response are vectorized together with
    TF-IDF, and the response vector is compared to every passage vector using
    cosine similarity.

    Args:
        response: Generated answer text.
        retrieved_docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A 1-D numpy array with one similarity score per retrieved document.
        The array is empty when no documents were retrieved.
    """
    import numpy as np  # local import keeps this function self-contained

    passages = [doc.page_content for doc in retrieved_docs]
    if not passages:
        # Nothing to compare against; cosine_similarity would raise on an
        # empty matrix, and the caller already handles an empty result.
        return np.empty(0)

    # Fit TF-IDF on passages + response so they share one vocabulary; the
    # response is the last row.
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(passages + [response])

    # Compare the response (last row) with every passage (all other rows).
    return cosine_similarity(vectors[-1], vectors[:-1]).flatten()

def _prompt_for_rating():
    """Ask until the user enters a whole number from 1 to 5 and return it as int."""
    valid_ratings = {"1", "2", "3", "4", "5"}
    while True:
        raw_rating = input("\nPlease rate the accuracy of the response (1 to 5): ").strip()
        if raw_rating in valid_ratings:
            return int(raw_rating)
        print("Invalid rating. Please enter a whole number from 1 to 5.")


def ask_question(question):
    """Answer one question with the RAG chain and record its evaluation metrics.

    The function times ``rag_chain.invoke``, scores groundedness as the mean
    TF-IDF cosine similarity between the response and the retrieved documents,
    prints the results, and asks the user for a 1-5 accuracy rating. One entry
    is then appended to each of ``speed_list``, ``groundedness_list`` and
    ``accuracy_list``.

    Args:
        question: The question text to send to the RAG chain.
    """
    # Time only the RAG chain call (retrieval + generation).
    start_time = time.perf_counter()
    response = rag_chain.invoke(question)
    response_time = time.perf_counter() - start_time

    # Retrieve the documents again, outside the timed section, to score groundedness.
    # NOTE(review): if the pipeline echoes the prompt in its output, the response
    # text itself contains the retrieved context, which inflates this score.
    retrieved_docs = retriever.invoke(question)
    similarity_scores = calculate_similarity(response, retrieved_docs)
    avg_similarity = float(similarity_scores.mean()) if similarity_scores.size > 0 else 0

    # Display the response and its automatic metrics.
    print(f"\nResponse: {response}")
    print(f"\nCosine Similarity (Groundedness): {avg_similarity:.4f}")
    print(f"\nSpeed: {response_time:.4f} seconds")

    # Invalid input is re-prompted instead of raising.
    accuracy = _prompt_for_rating()

    # Record all three metrics together, only after a valid rating exists, so
    # the lists always stay the same length.
    speed_list.append(response_time)
    groundedness_list.append(avg_similarity)
    accuracy_list.append(accuracy)

    # Add separator for next question
    print("\n" + "-"*50)

def calculate_metrics(accuracies=None, groundedness=None, speeds=None):
    """Print the average accuracy, groundedness and speed as a small table.

    Args:
        accuracies: Ratings on a 1-5 scale; defaults to the module-level
            ``accuracy_list``.
        groundedness: Similarity scores; defaults to ``groundedness_list``.
        speeds: Response times in seconds; defaults to ``speed_list``.

    Accuracy is multiplied by 20 and groundedness by 100 so that both are shown
    as percentages. An empty list is reported as 0.
    """
    accuracies = accuracy_list if accuracies is None else accuracies
    groundedness = groundedness_list if groundedness is None else groundedness
    speeds = speed_list if speeds is None else speeds

    def _mean(values):
        return sum(values) / len(values) if len(values) else 0

    avg_accuracy = _mean(accuracies)
    avg_groundedness = _mean(groundedness)
    avg_speed = _mean(speeds)

    # Size the columns from the widest label so neighbouring headers never
    # touch (a fixed width equal to the label length left no gap between them).
    labels = ("Average Accuracy (%)", "Average Groundedness (%)", "Average Speed (seconds)")
    width = max(len(label) for label in labels) + 2

    print("\nFinal Evaluation")
    print(f"{labels[0]:<{width}}{labels[1]:<{width}}{labels[2]}")
    print(f"{avg_accuracy * 20:<{width}.2f}{avg_groundedness * 100:<{width}.2f}{avg_speed:.2f}")

def start_session():
    """Run the interactive question/rating loop, then print the summary.

    Each entered question is passed to ``ask_question``. Typing ``end`` (any
    letter case, surrounding whitespace ignored) stops the loop, and blank
    input is skipped. ``calculate_metrics`` is called once the loop ends.
    """
    print("Welcome to the Question-Answer Evaluation Session!")
    print("Please ask a question, and rate the response based on accuracy. Type 'end' to finish.")

    while True:
        # Strip whitespace so inputs such as "end " still end the session.
        question = input("\nAsk a question (or type 'end' to finish): ").strip()
        if question.lower() == 'end':
            break
        if not question:
            # Skip blank input rather than running the whole chain on it.
            continue
        ask_question(question)

    # After all questions, print the summary
    calculate_metrics()

# Start the interactive session; it waits on input() until the user types 'end'.
start_session()


Welcome to the Question-Answer Evaluation Session!
Please ask a question, and rate the response based on accuracy. Type 'end' to finish.

Ask a question (or type 'end' to finish): What is phishing, and how does it work?

Response: 
<|system|>
You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Do not provide links in responses and make responses structured"

[Document(metadata={'page': 8, 'source': '/content/drive/MyDrive/NLP_Project/cybersecuirty_sb_factsheets_all.pdf'}, page_content='CYBERSECURITY FOR\nSMALL BUSINESS\nPHISHING\nLEARN MORE AT:\nFTC.gov/SmallBusiness\nYou get an email that looks like it’s from someone you know.\nIt seems to be from one of your company’s vendors and asks that you click on a link to update your \nbusiness account. Should you click? Maybe it looks like it’s from your boss and asks for your network \npassword. Sho