In [None]:
# Cybersecurity RAG (Retrieval Augmented Generation) Lab
# Building a Question-Answering System with CSV Knowledge Base

# This notebook demonstrates how to create a simple RAG system for
# cybersecurity questions using a CSV knowledge base.

import pandas as pd
import numpy as np
import re
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from google.colab import files
import requests
from IPython.display import Markdown, display, HTML

# Check if we're running in Colab by probing for the google.colab package.
# NOTE(review): the unconditional `from google.colab import files` earlier in
# this cell raises ImportError outside Colab before this check is reached.
try:
    import google.colab  # imported only to test availability
    IN_COLAB = True
    print("Running in Google Colab")
except ImportError:
    # Only a missing package means "not Colab"; a bare `except:` would also
    # swallow KeyboardInterrupt and unrelated failures.
    IN_COLAB = False
    print("Not running in Google Colab")

# Install required packages
!pip install transformers langchain sentence_transformers torch -q

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer


Running in Google Colab
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m103.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━

In [None]:

# PART 1: SETUP AND DATA PREPARATION
# =============================================

# First, let's create a cybersecurity knowledge base as a CSV file
def create_cybersecurity_csv(output_path='cybersecurity_knowledge.csv'):
    """Create a CSV file with cybersecurity concepts and explanations.

    Args:
        output_path: Where the CSV is written. Defaults to
            'cybersecurity_knowledge.csv' in the current working directory,
            which is the location the notebook originally hard-coded.

    Returns:
        DataFrame with one row per concept and the columns
        'Concept' and 'Description'.
    """

    # Each entry is [concept name, explanatory description]
    cybersecurity_data = [
        ["Phishing", "A social engineering attack where attackers send fraudulent messages to trick individuals into revealing sensitive information or installing malware. Common indicators include urgent language, suspicious links, and requests for personal information."],
        ["Ransomware", "A type of malicious software that encrypts a victim's files and demands payment for the decryption key. Ransomware often spreads through phishing emails, malicious downloads, or exploiting system vulnerabilities."],
        ["Two-Factor Authentication (2FA)", "A security method that requires users to provide two different authentication factors: something they know (password) and something they have (mobile device) or something they are (biometric). This significantly increases account security."],
        ["SQL Injection", "A code injection technique that exploits vulnerabilities in database-driven websites. Attackers insert malicious SQL statements into entry fields, allowing them to access, modify, or delete data from the database."],
        ["Zero-Day Vulnerability", "A software security flaw unknown to the vendor that hackers can exploit before a patch is created. These vulnerabilities are particularly dangerous as no defense exists at the time of discovery."],
        ["DDoS Attack", "Distributed Denial of Service attack overwhelms a target system with traffic from multiple compromised computers. This renders the target system unavailable to legitimate users and can cause significant service disruption."],
        ["Man-in-the-Middle Attack", "An attack where the attacker secretly intercepts and possibly alters communications between two parties who believe they're directly communicating with each other. It can be used to steal login credentials or personal information."],
        ["VPN", "Virtual Private Network creates an encrypted connection over a less secure network. VPNs provide privacy, anonymity, and security by creating a private network from a public internet connection."],
        ["Social Engineering", "Psychological manipulation techniques that trick people into making security mistakes or giving away sensitive information. Types include phishing, pretexting, baiting, and tailgating."],
        ["Firewall", "A network security device that monitors and filters incoming and outgoing network traffic based on an organization's security policies. Firewalls establish a barrier between trusted internal networks and untrusted external networks."],
        ["Encryption", "The process of encoding information so that only authorized parties can access it. Encryption uses mathematical algorithms to convert data into a coded format that appears random without the decryption key."],
        ["Malware", "Short for malicious software, it refers to any software designed to harm or exploit devices, services, or networks. Types include viruses, trojans, worms, ransomware, spyware, and adware."],
        ["Brute Force Attack", "A method of trial and error used to decode encrypted data such as passwords by systematically checking all possible combinations until the correct one is found. Protection includes complex passwords and account lockouts."],
        ["Penetration Testing", "An authorized simulated attack on a computer system to evaluate security. Penetration testers use the same tools and techniques as attackers to find and demonstrate business impacts of vulnerabilities."],
        ["Cross-Site Scripting (XSS)", "A web security vulnerability that allows attackers to inject client-side scripts into web pages viewed by other users. This can be used to bypass access controls and impersonate users."],
        ["Spyware", "Software that secretly gathers user information through their internet connection without their knowledge, usually for advertising purposes. It can track internet activity, harvest data, and monitor keystrokes."],
        ["Hashing", "The process of converting data of any size into a fixed-size string. Unlike encryption, hashing is one-way and cannot be reversed. It's commonly used to verify data integrity and store passwords securely."],
        ["Botnet", "A network of infected computers controlled remotely by attackers, often used for DDoS attacks or spam distribution. Users are typically unaware their computer is part of a botnet."],
        ["Cyber Threat Intelligence", "Evidence-based knowledge about existing or emerging threats that helps organizations make informed security decisions. It includes context, mechanisms, indicators, implications, and action-oriented advice."],
        ["CSRF Attack", "Cross-Site Request Forgery tricks users into submitting unwanted requests to websites where they're authenticated. This can force users to execute actions without their consent or knowledge."],
        ["Zero Trust Security", "A security model that requires strict identity verification for every person and device trying to access resources, regardless of whether they're inside or outside the network perimeter."],
        ["APT", "Advanced Persistent Threat is a prolonged, targeted cyber attack where an attacker establishes an undetected presence in a network to steal sensitive data. APTs are typically conducted by nation-states or state-sponsored groups."],
        ["Security Misconfigurations", "Improperly configured security settings that leave systems vulnerable. Common examples include default credentials, error messages revealing too much information, and unnecessary features enabled."],
        ["Privilege Escalation", "A type of attack that exploits bugs, design flaws, or configuration oversights to gain elevated access to resources that are normally protected. It allows attackers to gain higher-level permissions than intended."],
        ["DNS Spoofing", "A type of cyber attack where corrupted DNS data is introduced into a DNS resolver's cache, causing the resolver to return an incorrect IP address. This diverts traffic to the attacker's computer."],
        ["Supply Chain Attack", "A cyber attack that targets less-secure elements in the supply chain, such as third-party vendors or software. The SolarWinds attack of 2020 is a notable example that affected thousands of organizations."],
        ["Defense in Depth", "A cybersecurity approach that uses multiple layers of security controls throughout a system. If one defense fails, others still provide protection, making it harder for attackers to reach valuable assets."],
        ["Digital Forensics", "The process of uncovering and interpreting electronic data to preserve evidence in a way that is suitable for presentation in a court of law. Used to investigate cyber crimes and security incidents."],
        ["Fileless Malware", "A type of malicious software that exists exclusively in a computer's RAM, making it difficult to detect using traditional security tools that scan for files on disk. It often leverages legitimate system tools."],
        ["SIEM", "Security Information and Event Management systems combine security information management and security event management. They provide real-time analysis of security alerts generated by applications and network hardware."]
    ]

    # Create DataFrame and save to CSV (index=False keeps the row index out of the file)
    df = pd.DataFrame(cybersecurity_data, columns=['Concept', 'Description'])
    df.to_csv(output_path, index=False)
    print(f"Created CSV with {len(df)} cybersecurity concepts")
    return df

# Create and load our knowledge base.
# `df` is a module-level DataFrame: create_embeddings() adds a 'Content'
# column to it, and the interactive loops pass it to the retriever.
df = create_cybersecurity_csv()

# Display the first few rows of our knowledge base
display(df.head())


Created CSV with 30 cybersecurity concepts


Unnamed: 0,Concept,Description
0,Phishing,A social engineering attack where attackers se...
1,Ransomware,A type of malicious software that encrypts a v...
2,Two-Factor Authentication (2FA),A security method that requires users to provi...
3,SQL Injection,A code injection technique that exploits vulne...
4,Zero-Day Vulnerability,A software security flaw unknown to the vendor...


In [None]:

# PART 2: CREATING THE VECTOR DATABASE
# =============================================

# Initialize the sentence transformer model for better embeddings.
# `model` is a module-level global: create_embeddings() and
# retrieve_relevant_context() both call model.encode() on it.
print("Loading sentence transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for all descriptions in our knowledge base
def create_embeddings(df, encoder=None):
    """Create embeddings for all descriptions in the DataFrame.

    Side effect: adds (or overwrites) a 'Content' column on ``df`` holding
    "<Concept>: <Description>"; the retriever later reads that column.

    Args:
        df: Knowledge base with 'Concept' and 'Description' columns.
        encoder: Object exposing ``encode(list_of_str)``. When omitted, the
            notebook-level ``model`` is used, as before.

    Returns:
        Whatever ``encoder.encode`` returns for the list of 'Content' strings.
    """
    print("Creating embeddings for knowledge base...")

    # Resolve the encoder lazily so the global is only needed for the default path
    if encoder is None:
        encoder = model

    # Combine concept and description for better context
    df['Content'] = df['Concept'] + ": " + df['Description']

    # Generate embeddings
    embeddings = encoder.encode(df['Content'].tolist())

    # Guard the dimension lookup: indexing embeddings[0] fails on an empty knowledge base
    dimension = embeddings[0].shape[0] if len(embeddings) else 0
    print(f"Created {len(embeddings)} embeddings of dimension {dimension}")
    return embeddings

# Generate embeddings once for the whole knowledge base.
# `embeddings` is a module-level global that the interactive loops pass to
# retrieve_relevant_context(), so this cell must run before them.
embeddings = create_embeddings(df)



Loading sentence transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating embeddings for knowledge base...
Created 30 embeddings of dimension 384


In [None]:
# PART 3: IMPLEMENTING THE RETRIEVER
# =============================================

def retrieve_relevant_context(query, df, embeddings, top_k=3):
    """
    Retrieve the most relevant context passages for a given query

    Args:
        query: The user's question
        df: DataFrame containing our knowledge base (must have a 'Content' column)
        embeddings: Pre-computed embeddings for each row in df
        top_k: Number of most relevant passages to retrieve; values below 1
            yield no passages and values above the number of rows return every row

    Returns:
        Tuple ``(relevant_passages, relevance_scores)``: the 'Content' strings
        of the best-matching rows, most similar first, and their cosine
        similarities in the same order
    """
    # Generate embedding for the query
    query_embedding = model.encode([query])[0]

    # Calculate similarity between query and all descriptions
    similarities = cosine_similarity([query_embedding], embeddings)[0]

    # Clamp top_k: slicing with [-0:] would select *every* row, and a negative
    # top_k would silently drop an arbitrary prefix of the ranking.
    k = max(0, min(top_k, len(similarities)))

    # Rank all rows from most to least similar, then keep the first k
    top_indices = np.argsort(similarities)[::-1][:k]

    # Get the relevant passages
    relevant_passages = df.iloc[top_indices]['Content'].tolist()
    relevance_scores = similarities[top_indices]

    # Print retrieval results for demonstration
    print("\n=== Retrieval Results ===")
    for i, (passage, score) in enumerate(zip(relevant_passages, relevance_scores)):
        print(f"Result {i+1} [Similarity: {score:.4f}]:")
        print(f"{passage[:150]}...\n")

    return relevant_passages, relevance_scores



In [None]:
# PART 4: IMPLEMENTING THE GENERATOR (LLM INTEGRATION)
# =============================================

# Function to set up Hugging Face model
def setup_huggingface_model(model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """Setup Hugging Face model for text generation.

    Builds a "text-generation" pipeline and stores it in the module-level
    ``generator`` global, which generate_answer() calls.

    Args:
        model_name: Hugging Face model id to load. Defaults to the small
            TinyLlama chat model suitable for Google Colab.
            NOTE(review): generate_answer() builds its prompt with
            <|system|>/<|user|>/<|assistant|> markers, so other models may
            need a different prompt format; confirm before switching.

    Returns:
        True when the pipeline was created, False when loading raised.
    """
    # Declared up front so the assignment below rebinds the module-level name
    global generator

    print("\n=== Hugging Face Model Setup ===")
    print("Loading text generation model. This may take a few minutes...")

    try:
        print(f"Loading model: {model_name}")

        # Create a text generation pipeline
        generator = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype="auto",
            device_map="auto"
        )
        print("Model loaded successfully!")
        return True
    except Exception as e:
        # Report the failure instead of raising so the caller can abort cleanly
        print(f"Error loading Hugging Face model: {e}")
        return False

# Generate answer using retrieved context and Hugging Face model
def generate_answer(query, relevant_passages, relevance_scores):
    """
    Answer a question with the module-level ``generator`` pipeline, grounding
    the prompt in the retrieved passages

    Args:
        query: The user's question
        relevant_passages: Retrieved relevant passages
        relevance_scores: Similarity scores, aligned with relevant_passages

    Returns:
        The assistant's reply as a string, or an "Error generating answer: ..."
        string if the generation step raised
    """
    # Label every passage with its 1-based position and similarity score
    labelled_passages = []
    for position, (passage, score) in enumerate(zip(relevant_passages, relevance_scores), start=1):
        labelled_passages.append(f"Passage {position} [Relevance: {score:.2f}]: {passage}")
    context = "\n\n".join(labelled_passages)

    # Prompt in the <|system|>/<|user|>/<|assistant|> layout used for TinyLlama
    prompt = f"""<|system|>
You are a cybersecurity expert assistant. Your task is to answer user questions about cybersecurity concepts accurately.
Use ONLY the provided context to formulate your answer.
If the context doesn't contain relevant information, admit that you don't have enough information to answer accurately.
Keep your answers concise, informative, and focused on cybersecurity.

Context:
{context}
<|user|>
{query}
<|assistant|>"""

    try:
        outputs = generator(
            prompt,
            max_new_tokens=250,
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
            num_return_sequences=1
        )
        full_text = outputs[0]['generated_text']

        # Keep only what follows the last assistant marker ...
        reply = full_text.rpartition("<|assistant|>")[2].strip()

        # ... and cut it at the first special-token opener, if any
        return reply.split("<|")[0].strip()
    except Exception as e:
        return f"Error generating answer: {e}"



In [None]:
# PART 5: PUTTING IT ALL TOGETHER - THE RAG SYSTEM
# =============================================

def cybersecurity_rag_system(query, df, embeddings, top_k=3):
    """
    Run one retrieve-then-generate round trip for a single question

    Args:
        query: User's question
        df: Knowledge base DataFrame
        embeddings: Pre-computed embeddings
        top_k: Number of passages to retrieve

    Returns:
        Generated answer (also rendered as Markdown in the cell output)
    """
    print(f"\nQuestion: {query}")

    # Step 1: retrieval (prints its own ranked results)
    retrieval = retrieve_relevant_context(query, df, embeddings, top_k)
    passages, scores = retrieval

    # Step 2: generation grounded in the retrieved passages
    answer = generate_answer(query, passages, scores)

    print("\n=== Generated Answer ===")
    display(Markdown(answer))
    return answer



In [None]:
# PART 6: EVALUATION METRICS
# =============================================

def evaluate_retrieval(query, retrieved_passages, relevance_scores, expected_concept):
    """
    Evaluate the retrieval performance

    A passage counts as relevant when it contains ``expected_concept``
    (case-insensitive substring match).

    Args:
        query: The user's question (not used in the computation)
        retrieved_passages: Retrieved passages, best match first
        relevance_scores: Similarity scores; a list or a NumPy array
        expected_concept: The expected concept that should be retrieved

    Returns:
        Dict with:
            found: whether any passage is relevant
            rank: 1-based position of the first relevant passage, or None
            mrr: reciprocal of ``rank`` (0 when nothing relevant was retrieved)
            precision_at_k: share of retrieved passages that are relevant
                (0.0 when no passages were retrieved)
            top_relevance_score: highest similarity score (0 when there are none)
    """
    needle = expected_concept.lower()

    # One relevance flag per passage, computed once and reused by every metric
    hits = [needle in passage.lower() for passage in retrieved_passages]

    # Reciprocal rank is driven by the first relevant passage
    rank = hits.index(True) + 1 if True in hits else None
    found = rank is not None
    mrr = 1 / rank if rank else 0

    # Avoid ZeroDivisionError when nothing was retrieved
    precision_at_k = sum(hits) / len(hits) if hits else 0.0

    # Use len() rather than truthiness: `if array` raises ValueError for NumPy
    # arrays with more than one element, which is what the retriever returns.
    has_scores = relevance_scores is not None and len(relevance_scores) > 0

    return {
        "found": found,
        "rank": rank,
        "mrr": mrr,
        "precision_at_k": precision_at_k,
        "top_relevance_score": max(relevance_scores) if has_scores else 0
    }



In [None]:
# PART 7: ALTERNATIVE MODELS (OPTIONAL)
# =============================================

def try_alternative_model():
    """
    Interactively swap the module-level ``generator`` for another Hugging Face model

    Optional exercise for students: prompts for a menu choice, then tries to
    load the chosen model into ``generator``. Every path returns True; a failed
    load leaves ``generator`` untouched.

    NOTE(review): google/flan-t5-small is an encoder-decoder model, so it may
    not load under the "text-generation" task used here - confirm.
    """
    global generator

    # Menu shown to the user
    for menu_line in (
        "\n=== Try Alternative Model ===",
        "You can experiment with other models from Hugging Face.",
        "Here are some options to try:",
        "1. google/flan-t5-small (smaller, faster)",
        "2. facebook/opt-350m (medium size)",
        "3. databricks/dolly-v2-3b (larger, may need more memory)",
    ):
        print(menu_line)

    choice = input("Enter a number to select a model, or 'c' to continue with current model: ")

    # 'c' keeps whatever model is already loaded
    if choice.lower() == 'c':
        return True

    model_options = {
        "1": "google/flan-t5-small",
        "2": "facebook/opt-350m",
        "3": "databricks/dolly-v2-3b"
    }
    model_name = model_options.get(choice)

    if model_name is None:
        print("Invalid choice. Continuing with current model.")
        return True

    try:
        print(f"Loading {model_name}... (this may take a few minutes)")
        generator = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype="auto",
            device_map="auto"
        )
        print(f"Successfully loaded {model_name}!")
    except Exception as e:
        print(f"Error loading alternative model: {e}")
        print("Continuing with the default model.")

    return True



In [None]:
# PART 8: USER INTERFACE
# =============================================

def run_cybersecurity_rag_lab():
    """Interactive lab session: load the model, list sample questions, then
    answer questions in a loop until the user enters 'q'.

    Reads the module-level ``df`` and ``embeddings`` globals.
    """
    # Banner
    display(HTML("""
    <div style="background-color:#4CAF50; color:white; padding:10px; border-radius:5px; margin-bottom:20px;">
        <h1 style="text-align:center;">Cybersecurity RAG Lab Session</h1>
        <h3 style="text-align:center;">Retrieval Augmented Generation for Question Answering</h3>
    </div>
    """))

    # Nothing below works without a generator, so bail out if loading failed
    model_ready = setup_huggingface_model()
    if not model_ready:
        print("Failed to setup Hugging Face model. Please restart the notebook and try again.")
        return

    # Sample questions for the lab
    sample_questions = [
        "What is phishing and how can I identify it?",
        "Can you explain what a DDoS attack is?",
        "How does two-factor authentication improve security?",
        "What is the difference between encryption and hashing?",
        "How do zero-day vulnerabilities work?",
        "What is a supply chain attack and can you give an example?"
    ]

    print("\n=== Sample Questions ===")
    for number, question in enumerate(sample_questions, start=1):
        print(f"{number}. {question}")

    # Interactive Q&A session
    while True:
        print("\n" + "="*50)
        choice = input("Enter a number to select a sample question, or type your own question, or 'q' to quit: ")

        if choice.lower() == 'q':
            break

        try:
            # A number within the menu range picks a sample; anything else is
            # treated as a free-form question
            picked_sample = choice.isdigit() and 1 <= int(choice) <= len(sample_questions)
            query = sample_questions[int(choice) - 1] if picked_sample else choice

            # Time the full retrieve + generate round trip
            started = time.time()
            cybersecurity_rag_system(query, df, embeddings, top_k=3)
            elapsed = time.time() - started

            print(f"\nProcessing time: {elapsed:.2f} seconds")

            # Feedback is acknowledged but not stored anywhere
            feedback = input("\nWas this answer helpful? (y/n): ")
            if feedback.lower() == 'n':
                print("Thank you for your feedback. Let's try another question.")

        except Exception as e:
            print(f"Error processing question: {e}")



# Run the lab with Hugging Face model.
# The IN_COLAB condition means the interactive loop only starts inside Colab.
if __name__ == "__main__" and IN_COLAB:
    # Initialize global variable for generator; setup_huggingface_model()
    # rebinds it to the loaded text-generation pipeline
    generator = None
    run_cybersecurity_rag_lab()




=== Hugging Face Model Setup ===
Loading text generation model. This may take a few minutes...
Loading model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded successfully!

=== Sample Questions ===
1. What is phishing and how can I identify it?
2. Can you explain what a DDoS attack is?
3. How does two-factor authentication improve security?
4. What is the difference between encryption and hashing?
5. How do zero-day vulnerabilities work?
6. What is a supply chain attack and can you give an example?

Enter a number to select a sample question, or type your own question, or 'q' to quit: 2

Question: Can you explain what a DDoS attack is?

=== Retrieval Results ===
Result 1 [Similarity: 0.7973]:
DDoS Attack: Distributed Denial of Service attack overwhelms a target system with traffic from multiple compromised computers. This renders the target...

Result 2 [Similarity: 0.5223]:
DNS Spoofing: A type of cyber attack where corrupted DNS data is introduced into a DNS resolver's cache, causing the resolver to return an incorrect I...

Result 3 [Similarity: 0.5025]:
Botnet: A network of infected computers controlled remotely by attacker

Sure! A DDoS attack is a type of cyber attack that involves flooding a target system with traffic from multiple sources, overwhelming it with a massive amount of data. This can cause the target system to become unavailable to legitimate users, leading to service disruption. DDoS attacks are often used for various purposes, such as disrupting a website or service, causing a denial of service (DoS) attack, or stealing sensitive data.


Processing time: 5.26 seconds

Was this answer helpful? (y/n): y

Enter a number to select a sample question, or type your own question, or 'q' to quit: q


In [None]:
# PART 9: CHATBOT
# =============================================

def run_cybersecurity_rag_lab():
    """Chat-style session: route each message either through the RAG pipeline
    or to a canned reply, until the user enters 'q', 'quit' or 'exit'.

    This definition reuses the name of the earlier lab runner and therefore
    replaces it once this cell has run. Reads the module-level ``df`` and
    ``embeddings`` globals.
    """
    # Banner
    display(HTML("""
    <div style="background-color:#4CAF50; color:white; padding:10px; border-radius:5px; margin-bottom:20px;">
        <h1 style="text-align:center;">Cybersecurity : chatbot</h1>
        <h3 style="text-align:center;">Retrieval Augmented Generation for Question Answering</h3>
    </div>
    """))

    # Nothing below works without a generator, so bail out if loading failed
    model_ready = setup_huggingface_model()
    if not model_ready:
        print("Failed to setup Hugging Face model. Please restart the notebook and try again.")
        return

    # Chat header
    heavy_rule = "="*60
    print("\n" + heavy_rule)
    print("=== Cybersecurity Assistant Chat ===".center(60))
    print("Type your questions below. Enter 'q' to quit.".center(60))
    print(heavy_rule)

    # Welcome message
    print("\nCybersecurity Assistant: Hello! I'm your cybersecurity assistant. I can answer questions about cybersecurity concepts, threats, and best practices. How can I help you today?")

    # Number of passages retrieved per question
    top_k = 3

    while True:
        print("\n" + "-"*60)
        query = input("You: ")

        if query.lower() in ['q', 'quit', 'exit']:
            print("\nCybersecurity Assistant: Thank you for chatting. Goodbye!")
            break

        # Ignore blank input and prompt again
        if not query.strip():
            continue

        try:
            started = time.time()

            if check_if_cybersecurity_related(query):
                # On-topic: retrieve context, then generate a grounded answer
                passages, scores = retrieve_relevant_context(query, df, embeddings, top_k)
                answer = generate_answer(query, passages, scores)
                system_message = f"[Used RAG with {top_k} passages]"
            else:
                # Off-topic: canned reply, no model call
                answer = handle_non_cybersecurity_query(query)
                system_message = "[Non-cybersecurity question detected]"

            elapsed = time.time() - started

            print(f"\nCybersecurity Assistant: {answer}")
            print(f"\n{system_message} - Processing time: {elapsed:.2f} seconds")

        except Exception as e:
            print("\nCybersecurity Assistant: I apologize, but I encountered an error processing your question. Could you try rephrasing it or asking something else?")
            print(f"\n[System error: {e}]")

def check_if_cybersecurity_related(query):
    """Check if the query is related to cybersecurity.

    Matching is case-insensitive. Most keywords match as substrings, so
    "hacker" or "antivirus" still count; short acronyms are matched as whole
    words only, so "laptop" does not trigger "apt".

    Args:
        query: The user's message

    Returns:
        True when any keyword matches, otherwise False
    """
    cybersecurity_keywords = [
        'security', 'cyber', 'hack', 'attack', 'phishing', 'malware', 'virus',
        'ransomware', 'authentication', 'encryption', 'password', 'threat',
        'vulnerability', 'firewall', 'breach', 'defense', 'protection', 'data',
        'network', 'system', 'access', 'identity', 'secure', 'exploit', 'risk',
        # Knowledge-base topics the generic list missed; without them a question
        # such as "What is SQL injection?" was rejected as off-topic.
        'ddos', 'sql', 'xss', 'csrf', 'vpn', 'dns', 'spoof', 'botnet',
        'spyware', 'trojan', 'hash', 'siem', 'forensic', 'zero-day', 'zero day',
        'zero trust', 'brute force', 'penetration', 'pentest', 'privilege',
        'social engineering', 'supply chain', 'man-in-the-middle', '2fa',
        'misconfigur'
    ]

    # Acronyms short enough to appear inside unrelated words
    whole_word_keywords = ['apt']

    # Convert query to lowercase for case-insensitive matching
    query_lower = query.lower()

    # Check if any cybersecurity keyword is in the query
    if any(keyword in query_lower for keyword in cybersecurity_keywords):
        return True

    words = set(re.findall(r"[a-z0-9]+", query_lower))
    return any(keyword in words for keyword in whole_word_keywords)

def handle_non_cybersecurity_query(query):
    """Handle non-cybersecurity related questions with a canned reply.

    Args:
        query: The user's message

    Returns:
        A greeting reply, a "no current-events knowledge" reply, or a generic
        redirect to cybersecurity topics, depending on what the query contains
    """
    query_lower = query.lower()

    # For general greetings. Greetings are matched as whole words: plain
    # substring checks fired on "hi" inside "which"/"this" and "hey" inside "they".
    if re.search(r"\b(?:hello|hi|hey|how are you)\b", query_lower):
        return "Hello! I'm a cybersecurity assistant. I'm here to help answer your cybersecurity-related questions. If you have any questions about cybersecurity concepts, threats, or best practices, feel free to ask!"

    # For questions about people, places, current events
    if any(keyword in query_lower for keyword in ['who is', 'where is', 'what is the capital', 'president', 'prime minister', 'pm of']):
        return "I'm specifically designed to answer cybersecurity-related questions using my knowledge base. I don't have up-to-date information about current events, people, or places. Could you ask me something about cybersecurity instead?"

    # Default response for other non-cybersecurity questions
    return "I'm a cybersecurity assistant specialized in answering questions about cybersecurity topics like malware, phishing, encryption, and network security. For other topics, I might not have the relevant information. Could you try asking a cybersecurity question instead?"

# Run the lab with Hugging Face model.
# At this point run_cybersecurity_rag_lab refers to the chat version defined
# in this cell; the IN_COLAB condition keeps it from starting outside Colab.
if __name__ == "__main__" and IN_COLAB:
    # Initialize global variable for generator; setup_huggingface_model()
    # rebinds it to the loaded text-generation pipeline
    generator = None
    run_cybersecurity_rag_lab()


=== Hugging Face Model Setup ===
Loading text generation model. This may take a few minutes...
Loading model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


Device set to use cuda:0


Model loaded successfully!

            === Cybersecurity Assistant Chat ===            
       Type your questions below. Enter 'q' to quit.        

Cybersecurity Assistant: Hello! I'm your cybersecurity assistant. I can answer questions about cybersecurity concepts, threats, and best practices. How can I help you today?

------------------------------------------------------------
You: who is pm of nepal

Cybersecurity Assistant: I'm specifically designed to answer cybersecurity-related questions using my knowledge base. I don't have up-to-date information about current events, people, or places. Could you ask me something about cybersecurity instead?

[Non-cybersecurity question detected] - Processing time: 0.00 seconds

------------------------------------------------------------
You: what is identity theft

=== Retrieval Results ===
Result 1 [Similarity: 0.4302]:
Phishing: A social engineering attack where attackers send fraudulent messages to trick individuals into revealing sen

## Hackathon Chatbot

In [None]:
# PART 10: HACKATHON CHATBOT
# =============================================
# This section implements a simple chatbot that can answer questions both
# with and without RAG, using information about Nepali tech organizations and people

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from IPython.display import HTML, display, Markdown
import time
import threading
import itertools
import sys

# Small in-memory knowledge base for the hackathon chatbot.
# Each entry is a two-element list: [entity name, free-text description].
# setup_hackathon_chatbot() loads these rows into a DataFrame with the columns
# 'Entity' and 'Description', so the element order inside each entry matters.
organizations_data = [
    [
        "NAAMII Nepal",
        "Nepal Applied Mathematics and Informatics Institute (NAAMII) is a private, not-for-profit research organization established by Nepali researchers. Its mission is to build a strong foundation for scientific research in Nepal, focusing on informatics, applied mathematics, and artificial intelligence (AI). NAAMII aims to contribute to the democratization and decentralization of AI globally. The institute organizes the Annual Nepal AI School (ANAIS), attracting participants and experts worldwide to promote AI education and research in Nepal."
    ],
    [
        "SecurityPal Nepal",
        "SecurityPal is a cybersecurity startup founded by Pukar C. Hamal, with headquarters in San Francisco and a significant presence in Kathmandu, Nepal. The company specializes in expediting security questionnaires and compliance processes for major tech firms, including OpenAI and Grammarly. In 2023, SecurityPal launched a 24/7 Security Operations Command Center (SOCC) in Kathmandu, leveraging local tech talent to provide continuous support to its global clientele. As of early 2025, SecurityPal reached a valuation of $105 million, marking a significant milestone in Nepal's startup ecosystem."
    ],
    [
        "Anmol Guragain",
        "Anmol Guragain is a computer science graduate from Vellore Institute of Technology. He has interned at the Aural and Language Intelligence Lab at A*STAR in Singapore and currently serves as a Research Assistant at NAAMII. Anmol focuses on language and speech research, particularly Natural Language Processing (NLP) for the Nepali language. His research areas include Automatic Speaker Verification (ASV), Speech Emotion Recognition (SER), and deepfake detection in audio. Anmol is dedicated to advancing AI in low-resource languages and explores parameter-efficient fine-tuning approaches to minimize computational resource usage."
    ]
]

def setup_hackathon_chatbot():
    """Build everything the hackathon chatbot needs before it can answer.

    Loads the knowledge base into a DataFrame, embeds each entry with a
    sentence-transformer model, and creates a TinyLlama text-generation
    pipeline. Progress is reported on stdout.

    Returns:
        A ``(df, embedding_model, embeddings, generator)`` tuple: the
        knowledge-base DataFrame (with an added 'Content' column), the
        embedding model, the embeddings of the 'Content' column, and the
        text-generation pipeline.
    """
    print("Setting up hackathon chatbot...")

    # Knowledge base: one row per entity
    knowledge_base = pd.DataFrame(organizations_data, columns=['Entity', 'Description'])

    # Sentence encoder used for both the passages and, later, the queries
    print("Loading embedding model...")
    encoder = SentenceTransformer('all-MiniLM-L6-v2')

    # The text that gets embedded (and later shown as context) is "Entity: Description"
    print("Creating embeddings for knowledge base...")
    knowledge_base['Content'] = knowledge_base['Entity'] + ": " + knowledge_base['Description']
    passage_vectors = encoder.encode(knowledge_base['Content'].tolist())

    # Text-generation pipeline
    print("Loading language model (TinyLlama)...")
    pipeline_options = {
        "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "torch_dtype": "auto",
        "device_map": "auto",
    }
    text_generator = pipeline("text-generation", **pipeline_options)

    print("Setup complete!")
    return knowledge_base, encoder, passage_vectors, text_generator

def retrieve_context(query, df, embedding_model, embeddings, top_k=1):
    """Retrieve the passages most similar to ``query`` by cosine similarity.

    Args:
        query: The user's question as a string.
        df: Knowledge-base DataFrame. Its 'Content' column holds the passage
            text; rows are looked up positionally, so they must be in the same
            order as the rows of ``embeddings``.
        embedding_model: Object whose ``encode(list_of_str)`` returns one
            vector per input string.
        embeddings: 2-D array-like of passage vectors, one row per passage.
        top_k: Maximum number of passages to return.

    Returns:
        A ``(context, best_score)`` tuple. ``context`` is the selected
        passages, best first, each prefixed with ``[Relevance: x.xx]`` and
        separated by blank lines. ``best_score`` is the similarity of the best
        passage. When nothing can be retrieved (``top_k <= 0`` or an empty
        knowledge base) the result is ``("", 0)``.
    """
    embeddings = np.asarray(embeddings)

    # Guard first: `[-0:]` slices the *whole* array, so without this check a
    # top_k of 0 would return every passage instead of none.
    if top_k <= 0 or embeddings.size == 0:
        return "", 0

    # Generate embedding for the query
    query_embedding = np.asarray(embedding_model.encode([query])[0])

    # Cosine similarity between the query and every passage. A zero-length
    # vector would give 0/0 = NaN, and argsort ranks NaN as the largest value,
    # so such rows are given a denominator of 1 and score 0 instead.
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_embedding)
    norms[norms == 0] = 1
    similarities = np.dot(embeddings, query_embedding) / norms

    # Indices of the top-k scores, best first
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    # Get the relevant passages and their scores
    relevant_passages = df.iloc[top_indices]['Content'].tolist()
    relevance_scores = similarities[top_indices]

    # Format the context
    context = "\n\n".join(
        f"[Relevance: {score:.2f}] {passage}"
        for passage, score in zip(relevant_passages, relevance_scores)
    )

    return context, relevance_scores[0] if len(relevance_scores) > 0 else 0

def generate_answer_with_rag(query, context, generator):
    """Generate an answer that is grounded in the retrieved context.

    Args:
        query: The user's question.
        context: Retrieved passage text to place in the system prompt.
        generator: Callable text-generation pipeline. It is invoked with the
            prompt plus sampling keyword arguments and must return a list
            whose first item has a 'generated_text' entry.

    Returns:
        The assistant's reply as a stripped string, cut off at the first chat
        marker (``<|``) that follows it.
    """
    # Construct the prompt
    prompt = f"""<|system|>
You are a helpful assistant with knowledge about organizations and people in Nepal, especially in the tech sector.
Answer questions based ONLY on the following context:

{context}

If the context doesn't contain relevant information to answer the question, admit that you don't know.
<|user|>
{query}
<|assistant|>"""

    # Generate text using the language model
    result = generator(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        num_return_sequences=1
    )

    # Extract the generated text
    generated_text = result[0]['generated_text']

    if generated_text.startswith(prompt):
        # The output echoes the prompt, so remove it by position. Searching
        # for the *last* "<|assistant|>" instead would pick up a later,
        # model-invented turn if the model keeps writing the conversation.
        assistant_part = generated_text[len(prompt):]
    else:
        # Prompt not echoed verbatim: fall back to splitting on the marker.
        assistant_part = generated_text.split("<|assistant|>")[-1]

    # Keep only the first reply: drop everything from the next chat marker on
    return assistant_part.split("<|")[0].strip()

def generate_answer_without_rag(query, generator):
    """Generate an answer from the model alone, without retrieved context.

    Args:
        query: The user's question.
        generator: Callable text-generation pipeline. It is invoked with the
            prompt plus sampling keyword arguments and must return a list
            whose first item has a 'generated_text' entry.

    Returns:
        The assistant's reply as a stripped string, cut off at the first chat
        marker (``<|``) that follows it.
    """
    # Construct the prompt
    prompt = f"""<|system|>
You are a helpful assistant who can answer questions based on general knowledge.
<|user|>
{query}
<|assistant|>"""

    # Generate text using the language model
    result = generator(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        num_return_sequences=1
    )

    # Extract the generated text
    generated_text = result[0]['generated_text']

    if generated_text.startswith(prompt):
        # The output echoes the prompt, so remove it by position. Searching
        # for the *last* "<|assistant|>" instead would pick up a later,
        # model-invented turn if the model keeps writing the conversation.
        assistant_part = generated_text[len(prompt):]
    else:
        # Prompt not echoed verbatim: fall back to splitting on the marker.
        assistant_part = generated_text.split("<|assistant|>")[-1]

    # Keep only the first reply: drop everything from the next chat marker on
    return assistant_part.split("<|")[0].strip()

def animate_thinking():
    """Start a "Thinking" spinner on a background thread.

    The spinner rewrites the current stdout line roughly every 0.1 seconds
    until it is stopped, then blanks the line.

    Returns:
        A zero-argument callable that stops the spinner and waits briefly for
        the thread to finish clearing the line. Calling it more than once is
        harmless.
    """
    # An Event is shared between the caller and the worker thread. A plain
    # local flag cannot be flipped from the returned callable without
    # `nonlocal`, and writing a module global of the same name never reaches
    # the closure variable the thread reads, so the spinner would run forever.
    stop_event = threading.Event()

    def animate():
        for c in itertools.cycle(['|', '/', '-', '\\']):
            if stop_event.is_set():
                break
            sys.stdout.write('\rThinking ' + c)
            sys.stdout.flush()
            # Doubles as the frame delay; returns early once stop is requested
            stop_event.wait(0.1)
        # Blank out the spinner text and return the cursor to column 0
        sys.stdout.write('\r' + ' ' * 20 + '\r')
        sys.stdout.flush()

    # Daemon thread: a spinner must never keep the interpreter alive
    t = threading.Thread(target=animate, daemon=True)
    t.start()

    def stop():
        stop_event.set()
        # Wait so the line is cleared before the caller prints its answer
        t.join(timeout=1.0)

    return stop

def run_hackathon_chatbot():
    """Run the interactive Nepal Tech chatbot in the notebook.

    Shows a header, loads the models and knowledge base, asks which mode to
    use, and then answers questions in a loop until the user enters
    'q', 'quit' or 'exit'.

    In RAG mode the best-matching passage is retrieved for each question; if
    its similarity is above the relevance threshold the answer is generated
    from that passage, otherwise the chatbot falls back to direct generation.
    Any mode entry other than "1" selects direct generation.
    """
    # Display header
    display(HTML("""
    <div style="background-color:#FF5722; color:white; padding:10px; border-radius:5px; margin-bottom:20px;">
        <h1 style="text-align:center;">Nepal Tech Hackathon Chatbot</h1>
        <h3 style="text-align:center;">Ask me about Nepali tech organizations and people!</h3>
    </div>
    """))

    # Setup the models and knowledge base
    df, embedding_model, embeddings, generator = setup_hackathon_chatbot()

    # Choose mode
    print("\n=== Chatbot Modes ===")
    print("1. With RAG (retrieval-augmented generation)")
    print("2. Without RAG (direct generation)")

    # Strip so that an entry such as " 1" or "1 " still selects RAG mode
    mode = input("\nSelect mode (1 or 2): ").strip()
    use_rag = mode == "1"

    # Set RAG parameters
    top_k = 1
    relevance_threshold = 0.3

    if use_rag:
        print(f"\nUsing RAG mode with top_k={top_k} and relevance_threshold={relevance_threshold}")
    else:
        print("\nUsing direct generation mode (no RAG)")

    # Display chat interface header
    print("\n" + "="*60)
    print("=== Nepal Tech Chatbot ===".center(60))
    print("Type your questions below. Enter 'q' to quit.".center(60))
    print("="*60)

    # Welcome message
    print("\nChatbot: Hello! I'm your Nepal Tech chatbot. Ask me about NAAMII, SecurityPal, or Anmol Guragain!")

    # Interactive Q&A session
    while True:
        print("\n" + "-"*60)
        # Strip so that "q " or " quit" is still recognised as a quit command
        query = input("You: ").strip()

        if query.lower() in ('q', 'quit', 'exit'):
            print("\nChatbot: Thank you for chatting. Goodbye!")
            break

        if not query:
            continue

        # Start the thinking animation outside the try block so the stop
        # callable is always bound by the time the finally clause runs
        stop_animation = animate_thinking()
        error = None
        try:
            # Process the query based on mode
            if use_rag:
                # Retrieve context
                context, relevance = retrieve_context(query, df, embedding_model, embeddings, top_k)

                # Check if relevance is high enough
                if relevance > relevance_threshold:
                    answer = generate_answer_with_rag(query, context, generator)
                    method = f"RAG (relevance: {relevance:.2f})"
                else:
                    answer = generate_answer_without_rag(query, generator)
                    method = f"Direct (low relevance: {relevance:.2f})"
            else:
                # Direct generation without RAG
                answer = generate_answer_without_rag(query, generator)
                method = "Direct (no RAG)"
        except Exception as e:
            error = e
        finally:
            # Runs on success, on errors, and on KeyboardInterrupt alike, so
            # an interrupted generation does not leave the animation running
            stop_animation()

        if error is not None:
            print(f"\nChatbot: I apologize, but I encountered an error processing your question.")
            print(f"\n[System error: {error}]")
        else:
            # Print answer with proper formatting
            print(f"\nChatbot: {answer}")
            print(f"\n[Method: {method}]")

# Launch the chatbot only when this cell runs as the main module inside Colab;
# an IN_COLAB flag that was never defined counts the same as False.
if __name__ == "__main__" and globals().get('IN_COLAB'):
    run_hackathon_chatbot()

Setting up hackathon chatbot...
Loading embedding model...
Thinking |Creating embeddings for knowledge base...
Loading language model (TinyLlama)...
Thinking -

Device set to use cuda:0


Setup complete!

=== Chatbot Modes ===
1. With RAG (retrieval-augmented generation)
2. Without RAG (direct generation)
Thinking |
Select mode (1 or 2): 1

Using RAG mode with top_k=1 and relevance_threshold=0.3

                 === Nepal Tech Chatbot ===                 
       Type your questions below. Enter 'q' to quit.        

Chatbot: Hello! I'm your Nepal Tech chatbot. Ask me about NAAMII, SecurityPal, or Anmol Guragain!

------------------------------------------------------------
Thinking |You: who is anmol guragain
Thinking \
Chatbot: Anmol Guragain is a computer science graduate from Vellore Institute of Technology.

[Method: RAG (relevance: 0.51)]

------------------------------------------------------------
Thinking |You: what are research area of anmol guragain
Thinking /
Chatbot: Research area of Anmol Guragain is language and speech research, particularly Natural Language Processing (NLP) for the Nepali language. His focus areas include Automatic Speaker Verification (