In [1]:
import os
import PyPDF2
import fitz  # PyMuPDF
import pdfplumber
from langchain_openai import OpenAIEmbeddings
import time
from pinecone import ServerlessSpec
from pinecone import Pinecone
import numpy as np
import sys
from abc import ABC, abstractmethod
from langchain.prompts import PromptTemplate
from pydantic import BaseModel
from sentence_transformers.SentenceTransformer import SentenceTransformer
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login




In [2]:
# Directory that is scanned for the source PDF files.
file_path = "data"
# Where the merged PDF is written.
# NOTE(review): this path is inside `file_path`, so once the merged file exists,
# directory scans over `file_path` pick it up as well.
output_path = "data/merged_document.pdf"

In [2]:
def load_pdfs_from_directory(directory_path):
    """
    Opens every PDF file found directly inside a directory.

    Each file whose name ends in ".pdf" (case-insensitive) is opened in binary
    mode and wrapped in a ``PyPDF2.PdfReader``. The file handles of successfully
    loaded PDFs are left open and returned together with their readers; pass the
    returned list to ``close_pdf_files`` when the readers are no longer needed.

    A file that cannot be opened or parsed is reported and skipped, and its
    file handle is closed so a failed load does not leak it.

    Args:
        directory_path (str): The path of the directory to search for PDF files.

    Returns:
        list: A list of ``(file_handle, PyPDF2.PdfReader)`` tuples, in the order
        ``os.listdir`` returned the files.
    """
    pdf_files = []

    # Iterate through the files in the directory
    for file_name in os.listdir(directory_path):
        # Only PDF files are of interest
        if not file_name.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(directory_path, file_name)
        pdf_file = None
        try:
            # Open the PDF file in binary mode and build a reader on top of it
            pdf_file = open(file_path, "rb")
            reader = PyPDF2.PdfReader(pdf_file)
        except Exception as e:
            # The reader was never created, so nobody else will close the handle
            if pdf_file is not None:
                pdf_file.close()
            print(f"Error loading {file_name}: {e}")
            continue

        pdf_files.append((pdf_file, reader))  # Store both the file and reader
        print(f"Loaded PDF: {file_name}")
    return pdf_files


def print_pdf_text(pdf_reader, page_number=0):
    """
    Prints the text of one page of a PDF, or a notice when there is none.

    Args:
        pdf_reader (PyPDF2.PdfReader): Reader whose ``pages`` are indexed.
        page_number (int): Zero-based index of the page to show (default 0).
            Messages report the page as ``page_number + 1``.
    """
    try:
        # Look the page up and pull its text in one go
        content = pdf_reader.pages[page_number].extract_text()
        shown_number = page_number + 1

        message = (
            f"Text from page {shown_number}:\n{content}"
            if content
            else f"No text found on {shown_number}."
        )
        print(message)
    except IndexError:
        # The requested index is outside the document
        print(f"Page {page_number + 1} does not exist in this PDF.")


def close_pdf_files(pdf_files):
    """
    Closes the file handle of every loaded PDF.

    Args:
        pdf_files (list): ``(file_handle, reader)`` tuples; only the handle
            of each pair is closed, the reader is ignored.
    """
    for entry in pdf_files:
        handle, _reader = entry
        handle.close()


def merge_pdfs(file_path, output_path):
    """
    Merges all PDF files from the specified directory into a single PDF document.

    Files are appended in the order ``os.listdir`` returns them. A file that
    fails to open or append is reported and skipped. If ``output_path`` itself
    lives inside ``file_path`` (e.g. left over from an earlier run), it is
    skipped so the previous merge result is not merged into the new one.
    Every source document is closed after it has been appended, and the merged
    document is closed even when saving fails.

    Args:
        file_path (str): The path to the directory containing PDF files.
        output_path (str): The path to save the merged PDF file.

    Returns:
        None
    """
    # Normalised form of the output path, used to recognise it among the inputs
    output_abs = os.path.normcase(os.path.abspath(output_path))

    # Create a new, empty PDF document
    merged_pdf = fitz.open()
    try:
        # Iterate over all files in the directory
        for file_name in os.listdir(file_path):
            # Check if the file ends with .pdf
            if not file_name.lower().endswith(".pdf"):
                continue

            current_file_path = os.path.join(file_path, file_name)
            if os.path.normcase(os.path.abspath(current_file_path)) == output_abs:
                print(f"Skipping previous merge output: {current_file_path}")
                continue

            print(f"Processing file: {current_file_path}")
            pdf_document = None
            try:
                # Open the PDF file
                pdf_document = fitz.open(current_file_path)
                # Append the PDF pages to the merged PDF
                merged_pdf.insert_pdf(pdf_document)
                print(f"Successfully appended: {current_file_path}")
            except Exception as e:
                print(f"Error processing {current_file_path}: {e}")
            finally:
                # Release the source document whether or not the append worked
                if pdf_document is not None:
                    pdf_document.close()

        # Write the merged PDF to the output file
        merged_pdf.save(output_path)
    finally:
        merged_pdf.close()
    print(f"All PDFs have been merged into {output_path}")

def split_pdf_and_extract_text(input_pdf_path, output_dir):
    """
    Extracts the text of every page of a PDF, keyed by page number.

    The output directory is created when missing, but this function does not
    write anything into it: the extracted texts are only returned.

    Args:
        input_pdf_path (str): The path to the input PDF file.
        output_dir (str): Directory to create if it does not exist yet.

    Returns:
        dict: Maps 1-based page numbers to the text extracted from that page.
    """
    # Make sure the directory is there
    os.makedirs(output_dir, exist_ok=True)

    with open(input_pdf_path, 'rb') as source:
        pdf = PyPDF2.PdfReader(source)

        # Page indices are zero-based; the returned keys start at 1
        page_texts = {
            index + 1: pdf.pages[index].extract_text()
            for index in range(len(pdf.pages))
        }

    return page_texts

In [4]:
pdf_readers = load_pdfs_from_directory(file_path)

Loaded PDF: Advanced_Cyber_Security_Techniques.pdf
Loaded PDF: CANVAS-Book-Chapter-Herrmann-Pridhl.pdf
Loaded PDF: cyber_history.pdf
Loaded PDF: CyBOK-version-1.0.pdf
Loaded PDF: Edureka-Cybersecurity-Ebook.pdf
Loaded PDF: merged_document.pdf


In [5]:
# Preview the second page of the fifth loaded PDF.
# The guard checks that index 4 really exists; a merely non-empty list with
# fewer than five PDFs would otherwise raise IndexError.
if len(pdf_readers) > 4:
    print_pdf_text(pdf_readers[4][1], page_number=1)

Text from page 2:
MASTERING CYBERSECURITY WITH EDUREKA
1. INTRODUCTION T O CYBERSECURITY
2. CIA TRIAD
Con dentiality
Integrity
Availability
4. ETHIC AL HA CKING
Types of Ethical Hackers
Types of Hacking
5. PHA SES OF ETHIC AL HA CKING3
5
10
12
TABLE OF CONTENTS
sales@edur eka.co  IN: 9606058406  
US: 1833855577 5 What is Cybersecurity?
Need for Cybersecurity
3. INTRODUCTION T O CR YPTOGRAPHY 7
Reconnaissance
Scanning
Gaining Access
Maintaining Access
Clearing Tracks
Reporting


In [6]:
# merge_pdfs(file_path, output_path)

In [36]:
document_path = "data/merged_document.pdf"
output_dir = 'data/extracted_texts'

In [4]:
with pdfplumber.open(document_path) as pdf:
    # Extract the text of every page in turn. Only the values from the final
    # iteration survive the loop: `page_number`, `page` and `text` end up
    # describing the last page, and the next cell inspects `page`.
    for page_number, page in enumerate(pdf.pages):
        text = page.extract_text()

In [5]:
page.find_tables()

[]

In [37]:
extracted_texts = split_pdf_and_extract_text(document_path, output_dir)

In [53]:
print(type(extracted_texts))

<class 'dict'>


In [54]:
print(extracted_texts.keys())

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 22

In [40]:
len(extracted_texts)

1302

In [4]:
# Read the OpenAI key from the environment so it is never stored in the notebook.
# Falls back to an empty string (the previous placeholder value) when unset.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

In [57]:
# Create embeddings and vector index
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [46]:
# Convert every extracted page text to a string.
# Iterating the dict directly would yield its keys (the page numbers), so the
# values are iterated explicitly.
converted_texts = [str(text) for text in extracted_texts.values()]

In [55]:
for key, value in list(extracted_texts.items())[:5]:  # Adjust the number as needed
    print(f"Key: {key}, Value: {value[:500]}")

Key: 1, Value:  
  
 
 
 
 
 
 
 
 
Post-Graduate Diploma in  Cyber Security  
Advanced Cyber Security Techniques  
(PGDCS -08) 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
   

Key: 2, Value:  
  
 
 
 Title  Advanced Cyber Security Techniques  
Advisors  Mr. R. Thyagarajan , Head, Admn. & Finance 
and Acting Director , CEMCA  
Dr. Manas Ranjan Panigrahi , Program 
Officer  (Education), CEMCA  
Prof. Durgesh Pant , Director -SCS&IT, UOU  
Editor  Mr. Manish Koranga, Senior  Consultant, 
Wipro Technologies, Bangalore  
Authors  
Block I> Unit I,  Unit II, Unit III & Unit 
IV Mr. Ashutosh Bahuguna , Scientist - Indian 
Computer Emergency Response Team 
(CERT -In), Department of Electro
Key: 3, Value:  
  
 
 
 
 
 
 
  Expert Panel  
S. No.  Name  
1 Dr. Jeetendra Pande, School of Computer Science & IT, Uttarakhand Open 
University, Haldwani  
2 Prof. Ashok Panjwani, Professor, MDI, Gurgoan  
3 Group Captain Ashok Katariya, Ministry of Defense, New Delhi  
4 Mr. Ashutosh Bahuguna, Scientist -CERT -In

In [60]:
# Generate embeddings for each of the extracted texts.
# embed_documents takes the whole list in one call instead of issuing one
# embed_query request per page; the result is still one vector per page, in the
# same order as extracted_texts.values().
embeddings = embedding.embed_documents(list(extracted_texts.values()))

In [72]:
len(embeddings)

1302

In [3]:
# Read the Pinecone key from the environment so it is never stored in the notebook.
# Falls back to an empty string (the previous placeholder value) when unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

In [34]:
# Name of the Pinecone index used throughout the notebook. Defined here as well
# (same value as in the later cells) so this cell works on a fresh kernel.
PINECONE_INDEX = "cyberdomain"


def delete_index():
    """
    Deletes the Pinecone index named by ``PINECONE_INDEX`` if it exists.

    The existing index names are listed first; when the index is absent a
    notice is printed and nothing is deleted, so the call is safe to repeat.

    Returns:
        None
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    existing_names = [index_info["name"] for index_info in pc.list_indexes()]
    if PINECONE_INDEX not in existing_names:
        print(f"Index not found, nothing to delete: {PINECONE_INDEX}")
        return

    pc.delete_index(PINECONE_INDEX)
    print(f"Deleted index: {PINECONE_INDEX}")

# Call this function to delete the index if it exists
delete_index()

Deleted index: cyberdomain


In [41]:
PINECONE_INDEX = "cyberdomain"

def configure_pinecone_index():
    """
    Makes sure the Pinecone index named by ``PINECONE_INDEX`` exists.

    All existing index names are checked (not just the first one, and an
    account with no indexes is handled). When the index is missing, a
    serverless index is created on AWS us-east-1 and the function polls once
    per second until Pinecone reports it as ready.

    Returns:
        None
    """
    # configure client
    pc = Pinecone(api_key=PINECONE_API_KEY)

    existing_names = [index_info["name"] for index_info in pc.list_indexes()]
    if PINECONE_INDEX in existing_names:
        print("VectorDatabase :: Ready!")
        return

    spec = ServerlessSpec(cloud='aws', region='us-east-1')
    # create a new index
    pc.create_index(
        PINECONE_INDEX,
        # Must equal the length of the vectors that are upserted; the OpenAI
        # embeddings stored by this notebook have 1536 dimensions.
        dimension=1536,
        metric='dotproduct',
        spec=spec
    )

    # wait for index to be initialized
    while not pc.describe_index(PINECONE_INDEX).status['ready']:
        time.sleep(1)

    print("VectorDatabase :: Ready!")

In [73]:
def estimate_size(obj):
    """Return the number of bytes in the UTF-8 encoding of ``str(obj)``."""
    rendered = str(obj)
    encoded = rendered.encode('utf-8')
    return len(encoded)  # Size in bytes

def _upsert_batch(index, vectors):
    """Send one batch of prepared vector records to Pinecone and report its size."""
    index.upsert(vectors=vectors)
    print(f"Upserted {len(vectors)} embeddings to Pinecone.")


def upsert_embeddings_to_pinecone(embeddings, extracted_texts, max_size=4194304):
    """
    Uploads page embeddings to the Pinecone index in size-limited batches.

    The i-th embedding is paired with the i-th entry of ``extracted_texts``.
    Each record gets the id ``doc_<key>`` (the key of its text entry), the
    embedding as values and the page text as ``metadata["text"]``. Because the
    id depends only on the key, ids are unique within a run and re-running
    overwrites the same records instead of colliding with neighbouring batches.
    Records stored earlier under other ids are not removed by this function.

    Args:
        embeddings (list): One embedding vector per text, in the same order
            as ``extracted_texts``.
        extracted_texts (dict): Maps a key (page number) to the page text.
        max_size (int): Upper bound, in estimated bytes, for one upsert
            request. Values above 2 MB are capped at 2 MB, the limit this
            function has always batched against.

    Raises:
        ValueError: If there are more embeddings than texts.

    Returns:
        None
    """
    if len(embeddings) > len(extracted_texts):
        raise ValueError(
            f"Got {len(embeddings)} embeddings but only {len(extracted_texts)} texts."
        )

    # Never exceed the 2 MB request size, but honour a smaller caller limit
    request_limit = min(max_size, 2 * 1024 * 1024)

    # Connect to Pinecone index
    pc = Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(PINECONE_INDEX)

    keys = list(extracted_texts.keys())  # Position i of embeddings belongs to keys[i]
    batch = []
    current_size = 0

    for key, vector in zip(keys, embeddings):
        text = extracted_texts[key]

        # Estimated contribution of this record: vector plus its metadata
        record_size = estimate_size(vector) + estimate_size({"text": text})

        # Flush first if adding this record would push the request over the limit.
        # A single record larger than the limit is still sent on its own.
        if batch and current_size + record_size > request_limit:
            _upsert_batch(index, batch)
            batch = []
            current_size = 0

        batch.append(
            {
                "id": f"doc_{key}",  # Stable, unique ID derived from the text key
                "values": vector,
                "metadata": {"text": text},
            }
        )
        current_size += record_size

    # Upsert any remaining embeddings
    if batch:
        _upsert_batch(index, batch)


In [43]:
configure_pinecone_index()  # This prepares the Pinecone index

VectorDatabase :: Ready!


In [74]:
upsert_embeddings_to_pinecone(embeddings, extracted_texts)  # This uploads the embeddings

Upserted 56 embeddings to Pinecone.
Upserted 56 embeddings to Pinecone.
Upserted 57 embeddings to Pinecone.
Upserted 57 embeddings to Pinecone.
Upserted 58 embeddings to Pinecone.
Upserted 56 embeddings to Pinecone.
Upserted 56 embeddings to Pinecone.
Upserted 56 embeddings to Pinecone.
Upserted 54 embeddings to Pinecone.
Upserted 55 embeddings to Pinecone.
Upserted 54 embeddings to Pinecone.
Upserted 55 embeddings to Pinecone.
Upserted 55 embeddings to Pinecone.
Upserted 55 embeddings to Pinecone.
Upserted 56 embeddings to Pinecone.
Upserted 54 embeddings to Pinecone.
Upserted 55 embeddings to Pinecone.
Upserted 55 embeddings to Pinecone.
Upserted 55 embeddings to Pinecone.
Upserted 54 embeddings to Pinecone.
Upserted 54 embeddings to Pinecone.
Upserted 53 embeddings to Pinecone.
Upserted 53 embeddings to Pinecone.
Upserted 33 embeddings to Pinecone.


In [5]:
PINECONE_INDEX = "cyberdomain"
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX)

In [6]:
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1540}},
 'total_vector_count': 1540}


In [34]:
class BasePromptTemplate(ABC, BaseModel):
    """Common interface for the prompt holders below: each one builds a PromptTemplate."""

    @abstractmethod
    def create_template(self, *args) -> PromptTemplate:
        """Build and return the PromptTemplate; implemented by every subclass."""


class QueryExpansionTemplate(BasePromptTemplate):
    """Prompt that asks the model to rephrase a question into several variants."""

    # Placeholders: {to_expand_to_n} and {separator} are pre-filled by
    # create_template, {question} is supplied when the chain is invoked.
    prompt: str = """You are an AI language model assistant. Your task is to generate {to_expand_to_n}
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by '{separator}'.
    Original question: {question}"""

    @property
    def separator(self) -> str:
        """Marker the model is told to place between the generated questions."""
        return "#next-question#"

    def create_template(self, to_expand_to_n: int) -> PromptTemplate:
        """Return the prompt with the separator and the number of variants bound."""
        bound_values = {
            "separator": self.separator,
            "to_expand_to_n": to_expand_to_n,
        }
        return PromptTemplate(
            template=self.prompt,
            input_variables=["question"],
            partial_variables=bound_values,
        )


class SelfQueryTemplate(BasePromptTemplate):
    """Prompt that asks the model to pull a user or author id out of a question."""

    # The only placeholder is {question}, supplied when the chain is invoked.
    prompt: str = """You are an AI language model assistant. Your task is to extract information from a user question.
    The required information that needs to be extracted is the user or author id. 
    Your response should consist of only the extracted id (e.g. 1345256), nothing else.
    User question: {question}"""

    def create_template(self) -> PromptTemplate:
        """Return the prompt as a PromptTemplate expecting a ``question`` input."""
        expected_inputs = ["question"]
        return PromptTemplate(input_variables=expected_inputs, template=self.prompt)


class RerankingTemplate(BasePromptTemplate):
    """
    Prompt that asks the model for a summary of the passage most relevant to a query.

    NOTE(review): create_template binds ``keep_top_k`` and ``separator`` as partial
    variables, but the prompt text only contains the {question} and {passages}
    placeholders, so neither value appears in the prompt sent to the model.
    """

    prompt: str = """You are an AI language model assistant. Your task is to rerank passages related to a query
    based on their relevance. 
    You should only return the summary of the most relevant passage.
    
    The following are passages related to this query: {question}.
    
    Passages: 
    {passages}
    
    Please provide only the summary of the most relevant passage.
    """

    @property
    def separator(self) -> str:
        """Marker used between passages when they are joined or split."""
        return "\n#next-document#\n"

    def create_template(self, keep_top_k: int) -> PromptTemplate:
        """Return the prompt as a PromptTemplate expecting ``question`` and ``passages``."""
        bound_values = {"keep_top_k": keep_top_k, "separator": self.separator}
        return PromptTemplate(
            template=self.prompt,
            input_variables=["question", "passages"],
            partial_variables=bound_values,
        )

In [8]:
class GeneralChain:
    """Small factory that wires a model and a prompt into an LLMChain."""

    @staticmethod
    def get_chain(llm, template: PromptTemplate, output_key: str, verbose=True):
        """Return an LLMChain for ``llm`` and ``template`` that stores its reply under ``output_key``."""
        chain_settings = {
            "llm": llm,
            "prompt": template,
            "output_key": output_key,
            "verbose": verbose,
        }
        return LLMChain(**chain_settings)


In [30]:
class QueryExpansion:
    """Generates alternative phrasings of a user query with an LLM."""

    @staticmethod
    def generate_response(query: str, to_expand_to_n: int) -> list[str]:
        """
        Ask the model for ``to_expand_to_n`` rewordings of ``query``.

        The model reply is split on the template's separator; surrounding
        whitespace is stripped and empty pieces are dropped.

        Returns:
            list[str]: The non-empty generated queries, in reply order.
        """
        template_holder = QueryExpansionTemplate()
        expansion_prompt = template_holder.create_template(to_expand_to_n)

        llm = ChatOpenAI(
            model="gpt-4-1106-preview",
            api_key=openai_api_key,
            temperature=0,
        )
        expansion_chain = GeneralChain.get_chain(
            llm=llm, template=expansion_prompt, output_key="expanded_queries"
        )

        raw_reply = expansion_chain.invoke({"question": query})["expanded_queries"]

        pieces = raw_reply.strip().split(template_holder.separator)
        cleaned = (piece.strip() for piece in pieces)
        return [piece for piece in cleaned if piece]


class SelfQuery:
    """Extracts a metadata filter value (a user or author id) from a query with an LLM."""

    @staticmethod
    def generate_response(query: str) -> str:
        """
        Run the self-query prompt on ``query``.

        Returns:
            str: The model reply stored under ``metadata_filter_value``, unmodified.
        """
        extraction_prompt = SelfQueryTemplate().create_template()

        llm = ChatOpenAI(
            model="gpt-4-1106-preview",
            api_key=openai_api_key,
            temperature=0,
        )
        extraction_chain = GeneralChain.get_chain(
            llm=llm, template=extraction_prompt, output_key="metadata_filter_value"
        )

        reply = extraction_chain.invoke({"question": query})
        return reply["metadata_filter_value"]


class Reranker:
    @staticmethod
    def generate_response(
        query: str, passages: list[str], keep_top_k: int
    ) -> list[str]:
        reranking_template = RerankingTemplate()
        prompt_template = reranking_template.create_template(keep_top_k=keep_top_k)

        model = ChatOpenAI(
            model="gpt-4-1106-preview",
            api_key=openai_api_key,
            temperature=0,
        )
        chain = GeneralChain().get_chain(
            llm=model, output_key="rerank", template=prompt_template
        )

        stripped_passages = [
            stripped_item for item in passages if (stripped_item := item.strip())
        ]
        passages = reranking_template.separator.join(stripped_passages)
        response = chain.invoke({"question": query, "passages": passages})

        result = response["rerank"]
        reranked_passages = result.strip().split(reranking_template.separator)
        stripped_passages = [
            stripped_item
            for item in reranked_passages
            if (stripped_item := item.strip())
        ]

        return stripped_passages

In [10]:
import concurrent.futures
# import core.logger_utils as logger_utils
import structlog


def get_logger(cls: str):
    """Return a structlog logger with ``cls`` bound to it as a key-value pair."""
    base_logger = structlog.get_logger()
    return base_logger.bind(cls=cls)

logger = get_logger(__name__)

In [35]:
def flatten(nested_list: list) -> list:
    """Concatenate the items of every sublist into one new list, one level deep."""
    flat = []
    for sublist in nested_list:
        flat.extend(sublist)
    return flat
    

class VectorRetriever:
    """
    Retrieves passages from a vector index for a RAG system using query expansion,
    then asks an LLM to condense the retrieved passages into one best answer.
    """

    def __init__(self, index, query: str) -> None:
        """
        Args:
            index: Vector index client; must offer ``query(vector=..., top_k=...,
                include_metadata=...)`` returning a mapping with a ``matches`` entry.
            query (str): The user question to answer.
        """
        self._client = index
        self.query = query
        self._embedder = OpenAIEmbeddings(openai_api_key=openai_api_key)
        self._query_expander = QueryExpansion()
        self._metadata_extractor = SelfQuery()
        self._reranker = Reranker()

    def _search_single_query(
        self, generated_query: str, k: int, include_metadata=True
        ):
        """
        Embed one query and return its matches from the index.

        NOTE(review): each query only asks for ``k // 3`` matches, so the total
        returned by ``retrieve_top_k`` depends on the number of expanded queries
        rather than being exactly ``k``.

        Args:
            generated_query (str): Query text to embed and search with.
            k (int): Retrieval budget; must be greater than 3.
            include_metadata (bool): Whether matches should carry their metadata
                (``rerank`` needs it to read the passage text).

        Returns:
            The ``matches`` entry of the index response.
        """
        assert k > 3, "k should be greater than 3"

        query_vector = self._embedder.embed_query(generated_query)

        # Query the index using the embedded vector
        query_results = self._client.query(
            vector=query_vector,
            top_k=k // 3,
            # Honour the caller's choice instead of always forcing True
            include_metadata=include_metadata,
        )

        return query_results['matches']

    def retrieve_top_k(self, k: int, to_expand_to_n_queries: int) -> list:
        """
        Expand the query, search the index once per variant and merge the matches.

        Searches run in a thread pool. Results are merged in the order of the
        generated queries, and a match that several variants returned is kept
        only once (first occurrence, identified by its ``id``).

        Args:
            k (int): Retrieval budget passed to each single-query search.
            to_expand_to_n_queries (int): Number of query variants to generate.

        Returns:
            list: The merged, de-duplicated matches.
        """
        generated_queries = self._query_expander.generate_response(
            self.query, to_expand_to_n=to_expand_to_n_queries
        )

        logger.info("Successfully generated queries for search.", num_queries=len(generated_queries))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            search_tasks = [
                executor.submit(self._search_single_query, query, k, include_metadata=True)  # Metadata carries the passage text
                for query in generated_queries
            ]

            # Collect in submission order so the merged list is reproducible
            per_query_matches = [task.result() for task in search_tasks]

        hits = []
        seen_ids = set()
        for matches in per_query_matches:
            for match in matches:
                match_id = match.get('id') if match else None
                if match_id is not None:
                    if match_id in seen_ids:
                        continue  # Already returned by an earlier query variant
                    seen_ids.add(match_id)
                hits.append(match)

        logger.info("All documents retrieved successfully.", num_documents=len(hits))

        return hits

    def rerank(self, hits: list, keep_top_k: int) -> str:
        """
        Ask the reranker for the best answer among the retrieved passages.

        Args:
            hits (list): Matches as returned by ``retrieve_top_k``; entries
                without ``metadata['text']`` are ignored.
            keep_top_k (int): Forwarded to the reranker.

        Returns:
            str: The first item of the reranker's reply, or ``""`` when there
            is no passage text to rank or the reranker returns nothing.
        """
        # Extract the 'text' field from 'metadata' for each hit
        content_list = [hit['metadata']['text'] for hit in hits if hit and hit.get('metadata') and hit['metadata'].get('text')]

        # Nothing to rank: skip the LLM call instead of prompting with no passages
        if not content_list:
            return ""

        rerank_hits = self._reranker.generate_response(
            query=self.query, passages=content_list, keep_top_k=keep_top_k
        )

        # Return the first reranked hit as the best answer
        if rerank_hits:
            best_answer = rerank_hits[0]
            logger.info(f"Best answer selected: {best_answer}")
            return best_answer
        else:
            return ""

    def set_query(self, query: str):
        """Replace the question used by later ``retrieve_top_k`` / ``rerank`` calls."""
        self.query = query


In [32]:
query = "What is malware?"

# Initialize the VectorRetriever class with the index and query
retriever = VectorRetriever(index=index, query=query)

# Define parameters for retrieval
k = 10  # Number of results to retrieve
to_expand_to_n_queries = 5  # Number of expanded queries to generate

# Perform the vector search to retrieve the top-k documents
top_k_hits = retriever.retrieve_top_k(k=k, to_expand_to_n_queries=to_expand_to_n_queries)

# Optional: Rerank the retrieved hits and get the best answer (the top result)
best_answer = retriever.rerank(hits=top_k_hits, keep_top_k=1)  # keep only the top result

# Print the best answer
print(best_answer)



> Entering new LLMChain chain...
Prompt after formatting:
You are an AI language model assistant. Your task is to generate 5
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by '#next-question#'.
    Original question: What is malware?

> Finished chain.
2024-10-19 13:37:30 [info     ] Successfully generated queries for search. cls=__main__ num_queries=5
2024-10-19 13:37:34 [info     ] All documents retrieved successfully. cls=__main__ num_documents=15


> Entering new LLMChain chain...
Prompt after formatting:
You are an AI language model assistant. Your task is to rerank passages related to a query
    based on their relevance. 
    You should only return the summary of the most relevant passage.
    
    The f

In [33]:
print(best_answer)

Malware, short for 'malicious software', is any program designed to perform malicious activities. It encompasses a variety of forms such as viruses, Trojans, worms, spyware, botnet malware, and ransomware. Malware is responsible for a range of cyberattacks, including nation-state cyberwarfare, cybercrime, fraud, and scams. For instance, Trojans can provide backdoor access to networks for data theft, ransomware can encrypt user data and demand payment for decryption, and botnet malware can facilitate DDoS attacks and phishing campaigns. Understanding malware development and deployment is crucial for creating effective cyber defense mechanisms and countermeasures. As the stakes in cybersecurity rise, both malware and defense technologies are becoming more sophisticated, with attackers using techniques like obfuscation to evade detection and setting up adaptive infrastructures for malware support.
