In [None]:
import fitz
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
import pandas as pd
from langchain import hub
from langchain_chroma import Chroma
from langchain import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import LlamaCppEmbeddings
from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings)
import re
from unidecode import unidecode
import tiktoken

In [117]:
# Prompt template used for the per-chapter summarization chain. The
# `{content_of_pdf}` placeholder is filled with one text chunk of the formatted
# string returned by extract_pdf_content (site title, headers, important texts,
# picture descriptions and page content).
PROMPT = """You are an assistant tasked with summarizing text. 
The following text is containing information of the page with heading, important points, the complete text and a picture description if any. 
{content_of_pdf}"""

In [None]:
# Input PDF to process and the directory into which extracted page images are written.
# NOTE(review): these are absolute, user-specific paths; the notebook will not run
# elsewhere without editing them.
BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/industrial-cybersecurity-efficiently-monitor-the-cybersecurity-posture-of-your-ics-environment_compress.pdf'
PATH = '/home/iai/sb7059/git/llm_test/data/Book/Images'
#BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/fdgth-06-1321485.pdf'
#BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/smeggitt.pdf'
# Root directory that holds the local GGUF model files.
WORKSPACE_DIC = "/hkfs/work/workspace_haic/scratch/sb7059-llm_models_jeremy"

# Display name -> GGUF file path. Not referenced by any other cell visible in this
# notebook (the LLM cells build their model path directly from WORKSPACE_DIC).
MODEL_PATH = { #"Mixtral-8x-7b": WORKSPACE_DIC + "/Mixtral/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf",
               #"Phi-2": WORKSPACE_DIC + "/Phi/Phi2/phi-2.Q4_K_M.gguf",
               "Llama2-70b": WORKSPACE_DIC + "/Llama/Llama2/llama-2-70b.Q5_K_M.gguf",
               # NOTE(review): the key says "Phi-3-medium-128k" but the file name is
               # Phi-3-mini-4k-instruct -- confirm which model is intended.
                "Phi-3-medium-128k": WORKSPACE_DIC + "/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf",
               #"LLama-3-70b": WORKSPACE_DIC + "/Llama/LLama3/Meta-Llama-3-70B-Instruct-v2.Q4_K_M.gguf",
               #"Mixtral-8x22b": WORKSPACE_DIC + "/Mixtral/Mixtral-8x22b-Instruct",
               #"Mixtral-8x-22b": WORKSPACE_DIC + "/Mixtral/Mixtral-8x22B-Instruct-v0.1.Q4_K_M-00001-of-00002.gguf",
              }

In [None]:
def extract_tables_to_string(page):
    """
    Render every table detected on a page as text and concatenate the results.

    Parameters:
    - page: The page object whose tables are detected via ``page.find_tables()``.

    Returns:
    - One string made of each table's ``DataFrame.to_string()`` output, joined in
      detection order without any separator; empty when no table is found.
    """
    rendered_tables = [table.to_pandas().to_string() for table in page.find_tables()]
    return "".join(rendered_tables)

def extract_and_save_images(page, doc, PATH):
    """
    Extracts all images from a given page and saves them as PNG files.

    Each image is written to ``{PATH}/image_{xref}.png``; an image referenced
    on several pages is therefore overwritten with identical content.

    Parameters:
    - page: The page object from which to extract images.
    - doc: The document object containing the page.
    - PATH: The directory where images will be saved (must already exist).
    """
    for image_info in page.get_images(full=True):
        xref = image_info[0]  # first entry of the image tuple is its xref
        pixmap = fitz.Pixmap(doc, xref)
        # PNG can only hold grayscale or RGB samples. Pixmaps with more than
        # three colour components (e.g. CMYK) must be converted to RGB first,
        # otherwise the PNG encoding fails.
        if pixmap.n - pixmap.alpha > 3:
            pixmap = fitz.Pixmap(fitz.csRGB, pixmap)
        with open(f'{PATH}/image_{xref}.png', 'wb') as f:
            f.write(pixmap.tobytes())

def extract_spans_from_blocks(block_dict):
    """
    Flatten per-page text blocks into one DataFrame row per text span.

    Parameters:
    - block_dict: Mapping of page number -> list of block dictionaries (the
      'blocks' entry of ``page.get_text('dict')``).

    Returns:
    - A DataFrame with the columns xmin, ymin, xmax, ymax, text, is_bold,
      span_font, font_size and page_num. Only blocks of type 0 are read, the
      span text is transliterated with ``unidecode``, and spans whose text is
      empty or consists only of spaces are skipped.
    """
    column_names = ['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_bold', 'span_font', 'font_size', 'page_num']
    records = []

    for page_num, page_blocks in block_dict.items():
        for blk in page_blocks:
            # Anything other than type 0 is not a text block
            if blk['type'] != 0:
                continue
            for ln in blk['lines']:
                for sp in ln['spans']:
                    x0, y0, x1, y1 = list(sp['bbox'])
                    size = sp['size']
                    ascii_text = unidecode(sp['text'])
                    font_name = sp['font']
                    bold = "bold" in font_name.lower()

                    # Skip spans that contain nothing but spaces
                    if not ascii_text.replace(" ", ""):
                        continue
                    records.append((x0, y0, x1, y1, ascii_text, bold, font_name, size, page_num))

    return pd.DataFrame(records, columns=column_names)

def get_title(span_df):
    """
    Build the document title from the largest text on the title page.

    The title page is taken to be the rows with ``page_num == 1``.
    NOTE(review): the page numbers stored by the extraction cells in this
    notebook are 0-based page indices, so this is the second page of the
    PDF -- confirm that is the intended title page.

    Parameters:
    - span_df: DataFrame with at least the columns 'page_num', 'font_size'
      and 'text'.

    Returns:
    - The texts of all spans that use the largest font size on that page,
      concatenated in row order without a separator; "" when the page has
      no spans. Boldness is not taken into account.
    """
    title_page = span_df[span_df['page_num'] == 1]
    if title_page.empty:
        return ""

    # Compute the maximum once instead of once per row
    largest_font_size = title_page['font_size'].max()
    title_spans = title_page.loc[title_page['font_size'] == largest_font_size, 'text']
    return "".join(title_spans)

def extract_toc_as_df(pdf_path):
    """
    Read the table of contents (outline) of a PDF into a DataFrame.

    Parameters:
    - pdf_path: Path of the PDF file to open.

    Returns:
    - A DataFrame with the columns 'Level', 'Title' and 'Page', one row per
      entry returned by ``doc.get_toc()``; empty when there is no outline.
    """
    # The context manager closes the document even if get_toc() raises
    with fitz.open(pdf_path) as doc:
        toc = doc.get_toc()

    return pd.DataFrame(toc, columns=['Level', 'Title', 'Page'])

def extract_pdf_content(pdf_path, start_page, end_page):
    """
    Extract and format the content of a page range of a PDF.

    For every page in the range the text spans are collected, the page's
    images are saved to the module-level ``PATH`` directory and the page's
    tables are rendered as text.

    Parameters:
    - pdf_path: Path of the PDF file to open.
    - start_page: First page to process, 1-based.
    - end_page: Last page to process, 1-based and inclusive.

    Returns:
    - The formatted string produced by ``analyse_content`` for all spans in
      the range. When at least one table was found, a trailing "Tables:"
      section with the rendered tables is appended. An empty page range
      yields "".
    """
    block_dict = {}
    table_strings = []
    page = None

    # The context manager guarantees the document is closed again
    with fitz.open(pdf_path) as pdf_document:
        for page_number in range(start_page - 1, end_page):
            page = pdf_document[page_number]
            # Keep the raw block information, keyed by 0-based page index
            block_dict[page_number] = page.get_text('dict')['blocks']

            # Extract images from the page
            extract_and_save_images(page, pdf_document, PATH)

            # Extract tables from the page; previously these were collected
            # and then discarded before returning
            table_text = extract_tables_to_string(page)
            if table_text:
                table_strings.append(table_text)

        if page is None:
            # Empty range: there is no page to take the layout from
            return ""

        # analyse_content reads page.rect, so call it while the document is
        # still open; the last page of the range supplies the page height
        pdf_content = analyse_content(extract_spans_from_blocks(block_dict), page)

    if table_strings:
        pdf_content += "\nTables:\n" + "\n\n".join(table_strings) + "\n"

    return pdf_content


def _format_section(title, items):
    """Format one bulleted section of the analysis output."""
    return f"{title}:\n  - " + "\n  - ".join(items) + "\n\n"


def analyse_content(span_df, page):
    """
    Categorize text spans by layout cues and format them as one string.

    Each span lands in at most one category, checked in this order:
    1. site title: the span starts within the top 7 % of the page height,
    2. header: bold and larger than the most frequent font size,
    3. important text: bold and exactly the most frequent font size,
    4. picture description: the text mentions "figure", "fig." or "image".
    Every span, categorized or not, is also part of the page content.

    Parameters:
    - span_df: DataFrame with at least the columns 'text', 'font_size',
      'is_bold' and 'ymin'.
    - page: Page object; only ``page.rect.height`` is read.

    Returns:
    - A string with the sections "Site Title", "Headers", "Important Texts"
      and "Picture Descriptions" (one bullet per span) followed by
      "Page Content" (all span texts joined by single spaces).
    """
    site_header = []
    headers = []
    important_texts = []
    picture_descriptions = []
    page_content = []

    # The most frequent font size is treated as the body text size. It is
    # computed once; idxmax() raises on an empty Series, hence the guard.
    font_size_counts = span_df['font_size'].value_counts()
    body_font_size = font_size_counts.idxmax() if not font_size_counts.empty else None

    # Spans starting above this y coordinate count as running page header
    site_header_threshold = page.rect.height * 0.07
    figure_markers = ("figure", "fig.", "image")

    columns = (span_df['text'], span_df['font_size'], span_df['is_bold'], span_df['ymin'])
    for text, font_size, is_bold, ymin in zip(*columns):
        if ymin < site_header_threshold:
            site_header.append(text)
        elif is_bold and font_size > body_font_size:
            headers.append(text)
        elif is_bold and font_size == body_font_size:
            important_texts.append(text)
        elif any(marker in text.lower() for marker in figure_markers):
            picture_descriptions.append(text)

        page_content.append(text)

    # The site title is bulleted like the other sections instead of being
    # printed as a Python list repr (which showed up as "[]" in the prompt)
    return (
        _format_section("Site Title", site_header)
        + _format_section("Headers", headers)
        + _format_section("Important Texts", important_texts)
        + _format_section("Picture Descriptions", picture_descriptions)
        + "Page Content:\n"
        + f"  {' '.join(page_content)}\n"
    )

In [None]:
# Smoke test: run the extraction on pages 22 to 24 (1-based, inclusive) of the
# book and print the formatted result.
test = extract_pdf_content(BOOK_PDF, 22, 24)
print(test)

In [None]:
# Load the quantized Phi-3 mini (4k instruct) GGUF model through LlamaCpp.
# n_gpu_layers=-1 is passed through to llama.cpp (presumably "offload all layers
# to the GPU" -- confirm against the installed llama-cpp-python version).
# NOTE(review): max_tokens (4000) is almost the entire context window
# (n_ctx=4096), which leaves very little room for the prompt itself -- confirm
# this is intended. temperature=1 / top_p=1 is also a fairly random sampling
# setup for a summarization task.
model_path = WORKSPACE_DIC + "/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf"
llm = LlamaCpp(
    model_path= model_path,
    n_gpu_layers=-1,
    n_batch=4096,
    n_ctx=4096,
    temperature=1,
    top_p=1,
    max_tokens = 4000,
    #callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

In [118]:
# Only the page count is needed from the document, so read it and close the
# file again straight away.
with fitz.open(BOOK_PDF) as doc:
    page_count = doc.page_count

# Extract the table of contents
toc_df = extract_toc_as_df(BOOK_PDF)

# Generic summarization prompt. Written with implicit string concatenation:
# the previous "\ " (backslash followed by a space) was not a line
# continuation and left a literal backslash in the prompt.
prompt_text = (
    "You are an assistant tasked with summarizing texts. "
    "Give a concise summary of the text. Text chunk: {element} "
)

# Drop the front matter: keep everything from the first entry whose title
# contains "Chapter" or "Section" onwards. Without such an entry the table of
# contents is used as it is.
is_chapter_entry = toc_df['Title'].str.contains("Chapter|Section")
if is_chapter_entry.any():
    toc_df = toc_df.loc[is_chapter_entry.idxmax():]

toc_df = toc_df.sort_values(by='Page').reset_index(drop=True)

# An entry ends on the page where the next entry starts (that page is included
# because a new entry may begin in the middle of a page). The last entry runs
# to the last page of the document; extract_pdf_content treats the end page as
# 1-based and inclusive, so that is page_count, not page_count - 1.
toc_df['End Page'] = toc_df['Page'].shift(-1, fill_value=page_count)

prompt_template = PromptTemplate.from_template(PROMPT)

# Create text splitter to split the text into chunks.
# NOTE(review): chunk_size is measured with the splitter's length function
# (characters by default), not in tokens -- confirm the value against the
# model's context window.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4050,
    chunk_overlap=128,
)

# The chain does not depend on the chapter, so build it once outside the loop
summarize_chain = {"content_of_pdf": lambda x: x} | prompt_template | llm | StrOutputParser()

# Iterate only over the first 10 entries of the table of contents
for _, row in toc_df.head(10).iterrows():
    # Extract the content of the entry's page range
    content_of_pdf = extract_pdf_content(BOOK_PDF, row['Page'], row['End Page'])

    # Split the text into chunks
    texts = text_splitter.split_text(content_of_pdf)
    for text in texts:
        print(text)

    # Summarize the text chunks.
    # NOTE(review): all chunks share the single local `llm` instance; confirm
    # that it tolerates 50 concurrent calls or lower max_concurrency.
    text_summaries = summarize_chain.batch(texts, {"max_concurrency": 50})
    for summary in text_summaries:
        print(summary)

Site Title:
  - []

Headers:
  - Section 1:

  - ICS Cybersecurity 
  - Fundamentals
  - 1
  - Introduction and 
  - Recap of First 
  - Edition

Important Texts:
  - Industrial Control System
  - ICS

Picture Descriptions:
  - 

Page Content:
  Section 1:
 ICS Cybersecurity  Fundamentals In part one, we will briefly recap the first edition of the book to outline what was covered  and to point out the content that is still very relevant and that will be built upon in this  second edition. The remainder of part one will be dedicated to discussions around a  revised IDMZ architecture, resulting from many deployments, experience in the field,  practice, and feedback. Part one will conclude with a deep dive into how to design for  security, architecture that allows all the tools, techniques, and activities discussed in the  rest of the book to be implemented effectively and easily. This section comprises the following chapters: *	  Chapter 1 ,  Introduction and Recap of the First Edition *

Llama.generate: prefix-match hit

llama_print_timings:        load time =    7349.21 ms
llama_print_timings:      sample time =     311.91 ms /  1433 runs   (    0.22 ms per token,  4594.29 tokens per second)
llama_print_timings: prompt eval time =      98.34 ms /   520 tokens (    0.19 ms per token,  5287.94 tokens per second)
llama_print_timings:        eval time =    8794.29 ms /  1432 runs   (    6.14 ms per token,   162.83 tokens per second)
llama_print_timings:       total time =   11070.69 ms /  1952 tokens


  In order for all these topics to be effectively implemented within your own organization, we will  start with a high-level overview and analysis of an Industrial Control System (ICS)  architecture that is designed from the ground up in compliance with security principles as outlined by  ISO/IEC TR15443:2018 , the ISA/ISA99 standards, NIST SP-800-82 Revision 2, and other similar guidelines. Our goal is to provide you a solid foundation upon which all  security activities can be performed more easily in your organization. The main sections of this first chapter are as follows: *	  1. Introduction *	   2. A Modern Look at the Industrial Control System Architecture (ICS) *	   3. The Industrial Demilitarized Zone (IDMZ)*  •   Note that while there is a great deal of variation in how ICSs are deployed, it has been our experience that most architectures exhibit certain common characteristics and design patterns regardless of vendor or system type. A large portion of this section will cover 

Llama.generate: prefix-match hit

llama_print_timings:        load time =    7349.21 ms
llama_print_timings:      sample time =     181.25 ms /   790 runs   (    0.23 ms per token,  4358.72 tokens per second)
llama_print_timings: prompt eval time =     107.28 ms /   734 tokens (    0.15 ms per token,  6842.04 tokens per second)
llama_print_timings:        eval time =    4804.40 ms /   789 runs   (    6.09 ms per token,   164.22 tokens per second)
llama_print_timings:       total time =    5791.70 ms /  1523 tokens


 Note that we won't cover everything here;   rather, it is a summary and overview of some major concepts with an aim to get you up-to-speed again for this second-edition book .
Output=**Introduction and Recap of First Edition - Industrial Cybersecurity Second Edition**

The second edition of "Industrial Cybersecurity" serves as a continuation from the foundational work laid out in its first iteration. This textbook is structured to guide readers through an expansive exploration of industrial cybersecurity, with particular focus on enhancing security monitoring and verification within Industrial Control Systems (ICS) environments. The initial chapter offers a comprehensive recap of the first edition's coverage, setting the stage for deeper dives into emerging topics in this dynamic field.

**Key Topics Overviewed:**
- **What is an ICS?** Introduction to Industrial Control Systems and their significance in industrial environments.
- **IT (Information Technology) and OT (Operational Techn

Llama.generate: prefix-match hit


KeyboardInterrupt: 

In [None]:
# Walk through the whole book once and collect
#   - block_dict: 0-based page index -> raw text blocks of that page
#   - content_of_pdf: page text followed by the page's rendered tables
# Images found on the way are saved to PATH.
block_dict = {}
content_parts = []  # joined once at the end instead of repeated string "+="

with fitz.open(BOOK_PDF) as doc:
    # Iterate over all pages in the document
    for page_index in range(doc.page_count):
        page = doc.load_page(page_index)
        block_dict[page_index] = page.get_text('dict')['blocks']
        # Add the page text to the string used in the prompt
        content_parts.append(page.get_text())

        ### IMAGES ###
        # Save all images of the page (same logic as the helper function)
        extract_and_save_images(page, doc, PATH)

        ## TABLES ##
        # Report every table of the page and add it to the prompt text
        for table_index, tab in enumerate(page.find_tables()):
            print(f"Table {table_index} column names: {tab.header.names}, external: {tab.header.external}")
            content_parts.append(tab.to_pandas().to_string())

content_of_pdf = "".join(content_parts)

In [None]:
# One row per non-blank text span of the whole book (bounding box, text, bold
# flag, font, font size, page number). The helper builds the DataFrame once;
# the previous inline loop rebuilt it for every single span and left span_df
# undefined when the book contained no text spans.
span_df = extract_spans_from_blocks(block_dict)

In [None]:
# Select the spans whose text contains "table of content" (case-insensitive)
toc = span_df[span_df['text'].str.contains("table of content", case=False)]


In [None]:
#About 30 tokens
# Written with implicit string concatenation: the previous "\ " (backslash
# followed by a space) was not a line continuation, so the prompt contained a
# literal backslash and a stray line break.
prompt_text = (
    "You are an assistant tasked with summarizing texts. "
    "Give a concise summary of the text. Text chunk: {element} "
)

In [None]:
# Second LLM setup for the whole-book summarization; identical to the earlier
# LlamaCpp cell except for max_tokens.
# NOTE(review): max_tokens (5000) is larger than the context window
# (n_ctx=4096), so that many tokens can never be generated -- confirm the
# intended limit.
model_path = WORKSPACE_DIC + "/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf"
llm = LlamaCpp(
    model_path= model_path,
    n_gpu_layers=-1,
    n_batch=4096,
    n_ctx=4096,
    temperature=1,
    top_p=1,
    max_tokens = 5000,
    #callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

In [None]:
# Built from PROMPT but not used further down in this cell; the chain below
# uses `prompt` (built from prompt_text) instead.
prompt_template = PromptTemplate.from_template(PROMPT)

#Create text splitter to split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4050,    # NOTE(review): measured with the splitter's length function (characters by default), not tokens -- confirm
    chunk_overlap=128,
)

# Split the text into chunks
texts = text_splitter.split_text(content_of_pdf)

# Prompt with a single `{element}` placeholder that receives one chunk
prompt = PromptTemplate.from_template(prompt_text)

# Each chunk is passed through unchanged as `element`, rendered into the prompt,
# sent to the LLM and the output is parsed into a plain string
summarize_chain = {"element": lambda x: x} | prompt | llm | StrOutputParser()

# Summarize all chunks; up to 50 calls may run concurrently
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 50})

In [None]:
#Save the summaries to a file, one summary per line.
# The file is read back line by line, so line breaks inside a summary are
# collapsed to single spaces; otherwise a multi-paragraph summary would come
# back as several unrelated entries.
with open('summaries.txt', 'w') as f:
    f.writelines(
        " ".join(part.strip() for part in item.splitlines() if part.strip()) + "\n"
        for item in text_summaries
    )

In [None]:
#Load the summaries from the file, one entry per line.
# The trailing newline is removed and blank lines are skipped so that no empty
# or newline-terminated text ends up in the vector store.
with open('summaries.txt', 'r') as f:
    text_summaries = [line.rstrip("\n") for line in f if line.strip()]

In [None]:
# Embed the summaries and index them in an in-memory Chroma vector store
embedding_function = SentenceTransformerEmbeddings(model_name= "sentence-transformers/all-mpnet-base-v2")

db = Chroma.from_texts(text_summaries, embedding_function)

retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 6})

print(retriever)

# NOTE(review): template2 and template are defined but not used by the chain
# below, which takes its prompt from the hub instead.
template2 = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""

template = """Create multiple choice question in one of the following format:
Question: Which two options are the best reasons to use an IPV4 private IP space? (Choose two.)
A. to enable intra-enterprise communication
B. to implement NAT
C. to connect applications
D. to conserve global address space
E. to manage routing overhead
Answer: AD

Question: The corporate security policy requires multiple elements to be matched in an authorization policy. Which elements can be combined to meet the requirement?
A. Device registration status and device activation status
B. Network access device and time condition
C. User credentials and server certificate
D. Built-in profile and custom profile
Answer: B

using the following context:
{context}
"""

# Retrieve and generate using the relevant snippets of the summaries.
# NOTE(review): this replaces the retriever configured above, so the k=6
# setting is not in effect for the chain -- confirm which one is intended.
retriever = db.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    """Join the page content of the retrieved documents, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# The question is passed through unchanged and also sent to the retriever,
# whose documents are formatted into the `context` of the prompt
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Keep the answer: the following cell displays `llm_answer`, which was never
# assigned before because the result of invoke() was discarded.
llm_answer = rag_chain.invoke("""Create multiple choice question in one in the following format out of the provided context""")

In [None]:
# Render the answer in the notebook. `llm_answer` must already be defined
# (presumably the value returned by rag_chain.invoke in the previous cell -- confirm).
display(llm_answer)