In [1]:
import fitz
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
import pandas as pd
from langchain import hub
from langchain_chroma import Chroma
from langchain import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import LlamaCppEmbeddings
from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings)
import re
from unidecode import unidecode


In [None]:
# Prompt used for chunk-wise summarisation; {content_of_pdf} is the placeholder
# that receives the text to be condensed.
PROMPT = """Give me the most important information from the given text:
{content_of_pdf}"""

In [2]:
# Input PDF and the directory that receives the extracted images.
# NOTE(review): these are absolute, user-specific paths and have to be adapted
# before the notebook can run on another machine.
BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/industrial-cybersecurity-efficiently-monitor-the-cybersecurity-posture-of-your-ics-environment_compress.pdf'
PATH = '/home/iai/sb7059/git/llm_test/data/Book/Images'
#BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/fdgth-06-1321485.pdf'
#BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/smeggitt.pdf'
# Root directory below which the GGUF model files are stored.
WORKSPACE_DIC = "/hkfs/work/workspace_haic/scratch/sb7059-llm_models_jeremy"

# Display name -> GGUF file path.
# NOTE(review): the key "Phi-3-medium-128k" points at the Phi-3-mini-4k file -
# confirm which model is meant. The LlamaCpp cells further down build their own
# model_path and do not read this dict.
MODEL_PATH = { #"Mixtral-8x-7b": WORKSPACE_DIC + "/Mixtral/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf",
               #"Phi-2": WORKSPACE_DIC + "/Phi/Phi2/phi-2.Q4_K_M.gguf",
               #"Llama2-70b": WORKSPACE_DIC + "/Llama/Llama2/llama-2-70b.Q5_K_M.gguf",
                "Phi-3-medium-128k": WORKSPACE_DIC + "/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf",
               "LLama-3-70b": WORKSPACE_DIC + "/Llama/LLama3/Meta-Llama-3-70B-Instruct-v2.Q4_K_M.gguf",
               #"Mixtral-8x22b": WORKSPACE_DIC + "/Mixtral/Mixtral-8x22b-Instruct",
               #"Mixtral-8x-22b": WORKSPACE_DIC + "/Mixtral/Mixtral-8x22B-Instruct-v0.1.Q4_K_M-00001-of-00002.gguf",
              }

In [3]:
def extract_tables_to_string(page):
    """
    Render every table detected on a page as text and glue the pieces together.

    Parameters:
    - page: Page object exposing ``find_tables()``; every detected table must
      provide ``to_pandas()``.

    Returns:
    - A single string made of the ``DataFrame.to_string()`` rendering of each
      table, in detection order, with nothing inserted between consecutive
      tables. The string is empty when no table is found.
    """
    rendered_tables = [table.to_pandas().to_string() for table in page.find_tables()]
    return "".join(rendered_tables)

def extract_and_save_images(page, doc, PATH):
    """
    Extracts all images referenced by a page and saves them as PNG files.

    Parameters:
    - page: The page object from which to extract images.
    - doc: The document object containing the page.
    - PATH: Directory in which the images are written. Each file is named
      ``image_<xref>.png``, so an image referenced again under the same xref
      overwrites the file written earlier.
    """
    for image_info in page.get_images(full=True):
        xref = image_info[0]
        pixmap = fitz.Pixmap(doc, xref)
        # PNG can only hold grayscale or RGB data. Pixmaps with four or more
        # colour components (e.g. CMYK) are converted to RGB first, otherwise
        # the PNG encoding below fails.
        if pixmap.n - pixmap.alpha >= 4:
            pixmap = fitz.Pixmap(fitz.csRGB, pixmap)
        with open(f'{PATH}/image_{xref}.png', 'wb') as f:
            f.write(pixmap.tobytes())

def extract_spans_from_blocks(block_dict):
    """
    Flatten the text spans of several pages into one DataFrame.

    Parameters:
    - block_dict: Mapping of page key -> list of block dicts. Only blocks whose
      'type' equals 0 are read; each of them must carry 'lines', every line
      'spans', and every span 'bbox', 'size', 'text' and 'font'.

    Returns:
    - DataFrame with one row per span whose (ASCII-transliterated) text is not
      made of spaces only. Columns: xmin, ymin, xmax, ymax, text, is_bold,
      span_font, font_size, page_num. ``is_bold`` is True when the font name
      contains "bold" (case-insensitive); ``page_num`` is the block_dict key.
    """
    column_names = ['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_bold', 'span_font', 'font_size', 'page_num']
    records = []

    for page_num, blocks in block_dict.items():
        # Lazily walk block -> line -> span, skipping every non-text block
        page_spans = (
            span
            for block in blocks if block['type'] == 0
            for line in block['lines']
            for span in line['spans']
        )
        for span in page_spans:
            xmin, ymin, xmax, ymax = list(span['bbox'])
            font_size = span['size']
            text = unidecode(span['text'])
            span_font = span['font']
            is_bold = "bold" in span_font.lower()

            # Spans that consist of spaces only carry no information
            if not text.replace(" ", ""):
                continue
            records.append((xmin, ymin, xmax, ymax, text, is_bold, span_font, font_size, page_num))

    return pd.DataFrame(records, columns=column_names)

def get_title(span_df):
    """
    Build a title from the largest text on the page whose ``page_num`` is 1.

    Parameters:
    - span_df: DataFrame with the columns 'page_num', 'font_size' and 'text'.

    Returns:
    - The texts of all spans on that page that use the largest font size found
      there, concatenated in row order without a separator. Empty string when
      the frame has no rows for that page.

    NOTE(review): boldness is not taken into account. If page_num is a 0-based
    page index, the value 1 selects the second page - confirm this is intended.
    """
    candidates = span_df.loc[span_df['page_num'] == 1]
    if candidates.empty:
        return ""

    # Determine the largest font size once instead of once per row
    largest_size = max(candidates['font_size'].unique())
    uses_largest_size = candidates['font_size'] == largest_size
    return "".join(candidates.loc[uses_largest_size, 'text'])

def extract_toc_as_df(pdf_path):
    """
    Read the outline (table of contents) of a PDF into a DataFrame.

    Parameters:
    - pdf_path: Path of the PDF file to open.

    Returns:
    - DataFrame with the columns 'Level', 'Title' and 'Page', one row per
      outline entry as returned by ``Document.get_toc()``.
    """
    # The context manager closes the document even when get_toc() raises
    with fitz.open(pdf_path) as doc:
        toc = doc.get_toc()

    return pd.DataFrame(toc, columns=['Level', 'Title', 'Page'])

def extract_pdf_content(pdf_path, start_page, end_page):
    """
    Extract the content of a page range as one formatted string.

    Images found on the pages are written to the module-level ``PATH``
    directory as a side effect.

    Parameters:
    - pdf_path: Path of the PDF file to read.
    - start_page: First page to extract, 1-based.
    - end_page: Last page to extract, 1-based and inclusive.

    Returns:
    - The string produced by ``analyse_content`` for the text spans of all
      pages in the range, followed by the text rendering of any tables detected
      on those pages. An empty string when the range contains no pages.
    """
    block_dict = {}
    table_texts = []
    last_page = None

    # The context manager guarantees that the document is closed again
    with fitz.open(pdf_path) as pdf_document:
        for page_number in range(start_page - 1, end_page):
            page = pdf_document[page_number]
            # Layout blocks of the page, keyed by the 0-based page index
            block_dict[page_number] = page.get_text('dict')['blocks']

            # Extract images from the page
            extract_and_save_images(page, pdf_document, PATH)

            # Extract tables from the page
            table_text = extract_tables_to_string(page)
            if table_text:
                table_texts.append(table_text)

            last_page = page

        if last_page is None:
            # Empty page range: nothing to analyse
            return ""

        # analyse_content reads the page geometry, so it has to run while the
        # document is still open. As before, the last page of the range is used.
        pdf_content = analyse_content(extract_spans_from_blocks(block_dict), last_page)

    # Keep the extracted tables instead of dropping them
    if table_texts:
        pdf_content += "\n" + "\n".join(table_texts)

    return pdf_content


def analyse_content(span_df, page):
    """
    Sort text spans into coarse layout categories and render them as a
    labelled, human-readable string.

    Every span lands in at most one category (checked in this order) and is
    additionally appended to the page content:
    1. site header     - the span starts within the top 7 % of the page height
    2. header          - bold and larger than the most frequent font size
    3. important text  - bold and exactly the most frequent font size
    4. picture descr.  - the text contains "figure", "fig." or "image"

    Parameters:
    - span_df: DataFrame with one row per text span; the columns 'text',
      'font_size', 'is_bold' and 'ymin' are read.
    - page: Page object; only ``page.rect.height`` is used.

    Returns:
    - A string with the sections "Site Title", "Headers", "Important Texts" and
      "Picture Descriptions" (each rendered as a "  - " bullet list) followed by
      "Page Content" (all span texts joined by single spaces).
    """
    figure_keywords = ("figure", "fig.", "image")

    site_header = []
    headers = []
    important_texts = []
    picture_descriptions = []
    page_content = []

    # The most frequent font size is taken as the body-text size. It is
    # determined once, not per row; an empty frame has no such size.
    font_size_counts = span_df['font_size'].value_counts()
    body_font_size = font_size_counts.idxmax() if not font_size_counts.empty else None

    # Everything that starts above this y coordinate counts as running header
    site_header_threshold = page.rect.height * 0.07

    span_columns = zip(span_df['text'], span_df['font_size'], span_df['is_bold'], span_df['ymin'])
    for text, font_size, is_bold, ymin in span_columns:
        if ymin < site_header_threshold:
            site_header.append(text)
        elif is_bold and font_size > body_font_size:
            headers.append(text)
        elif is_bold and font_size == body_font_size:
            important_texts.append(text)
        elif any(keyword in text.lower() for keyword in figure_keywords):
            picture_descriptions.append(text)

        page_content.append(text)

    def as_bullets(items):
        # An empty list still yields a single empty bullet ("  - ")
        return "  - " + "\n  - ".join(items)

    sections = [
        ("Site Title", site_header),  # rendered as bullets like the other lists
        ("Headers", headers),
        ("Important Texts", important_texts),
        ("Picture Descriptions", picture_descriptions),
    ]
    parts = [f"{label}:\n{as_bullets(items)}\n\n" for label, items in sections]
    parts.append(f"Page Content:\n  {' '.join(page_content)}\n")
    return "".join(parts)

In [None]:
def analyse_content(df, page):
    """
    Merge consecutive spans that share a font size on the same page into
    paragraphs.

    Once this cell has been run, this definition replaces any
    ``analyse_content`` defined earlier in the session.

    Parameters:
    - df: Span DataFrame; the columns 'font_size', 'page_num' and 'text' are
      read. The frame is not modified.
    - page: Not used. It is accepted so that existing callers, which pass a
      page as second argument, keep working.

    Returns:
    - A string in which every run of consecutive spans with identical font size
      and page number is joined by single spaces. Each run is preceded by a
      line break and the runs are joined by a further line break.
    """
    # A new run starts whenever the font size or the page differs from the
    # previous row (the first row always starts one).
    starts_new_run = (df['font_size'].ne(df['font_size'].shift()) |
                      df['page_num'].ne(df['page_num'].shift()))
    # Running count of run starts = run id per row. It is kept as a separate
    # array so that no helper column is added to the caller's DataFrame.
    run_ids = starts_new_run.cumsum().to_numpy()

    formatted_text = [
        "\n" + ' '.join(run_texts.tolist())
        for _, run_texts in df['text'].groupby(run_ids)
    ]
    return '\n'.join(formatted_text)

In [73]:
# Smoke test on a single page. start_page and end_page are 1-based and
# inclusive, so (41, 41) extracts exactly the 41st page of the PDF.
test = extract_pdf_content(BOOK_PDF, 41, 41)
print(test)

Site Title:
  - ['20     Introduction and Recap of First Edition']

Headers:
  - 

Important Texts:
  - Level 3 - Site Operations
  - Level 2 - Area Supervisory Control
  - Level 1 - Basic Control
  - Variable-Frequency Drives
  - VFDs
  - proportional-integral-derivative
  - PID

Picture Descriptions:
  - 

Page Content:
  20     Introduction and Recap of First Edition Level 3 - Site Operations Level 3 is where systems reside that support plant-wide control and monitoring functions.  At this level, the operator is interacting with the overall production systems. Think of  centralized control rooms with HMIs and operator terminals that give an overview  of all the systems that run the processes in a plant or facility. The operator uses these  HMI systems to perform tasks such as quality control checks, managing uptime, and  monitoring alarms, events, and trends.  Level 3, Site Operations, is also where the OT systems live that report back to IT systems  in level 4. Systems in lower lev

In [74]:
# Load the Phi-3 mini GGUF file through llama.cpp for the summarisation step.
# Alternative model files (swap in to compare):
#   WORKSPACE_DIC + "/Llama/LLama3/Meta-Llama-3-70B-Instruct-v2.Q4_K_M.gguf"
#   WORKSPACE_DIC + "/Mixtral/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"
model_path = f"{WORKSPACE_DIC}/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf"

# NOTE(review): n_ctx is 8138 although the file name says "4k" - confirm that
# the model is meant to be run with this context size.
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=-1,
    n_ctx=8138,
    n_batch=4096,
    max_tokens=2048,
    temperature=0.7,
    top_p=1,
    verbose=False,  # no callback manager is attached, so no verbose output is needed
)

In [75]:
# Summarisation prompt; {text} receives the extracted page content.
PROMPT_TEMPLATE = """Please give a summarization of the following information: {text}"""

In [76]:
# Pipe the summarisation prompt into the model and run it on the text that was
# extracted from the test page.
prompt_template = PromptTemplate.from_template(PROMPT_TEMPLATE)
chain = prompt_template | llm
llm_answer = chain.invoke({"text" : test})

In [77]:
print(llm_answer)

===
The document provides an overview of different operational levels within a plant's control system hierarchy, focusing on Levels 3, 2, and 1. Each level serves distinct functions in managing and monitoring the production processes.

**Level 3 - Site Operations:** This is the topmost layer where centralized control rooms house Human-Machine Interfaces (HMIs) and operator terminals. These systems provide a comprehensive view of all plant operations, allowing operators to perform tasks such as quality control checks, manage uptime, monitor alarms, events, and trends, and oversee the production processes across the facility. Level 3 is also where Operational Technology (OT) systems reside, which report back to IT systems in Level 4 for further processing or data analysis. Common components found here include database servers, application servers, file servers, Microsoft domain controllers, HMI servers, engineering workstations, and more. It's crucial that the Microsoft domain controller

In [78]:
# Few-shot prompt for question generation: two worked multiple-choice examples
# that fix the output format, followed by the {text} placeholder for the
# material the questions should be derived from.
PROMPT_QUESTION = """
Please give me multiple choice questions in the following format:

Question: Which two options are the best reasons to use an IPV4 private IP space? (Choose two.)
A. to enable intra-enterprise communication
B. to implement NAT
C. to connect applications
D. to conserve global address space
E. to manage routing overhead
Answer: AD

Question: The corporate security policy requires multiple elements to be matched in an authorization policy. Which elements can be combined to meet the requirement?
A. Device registration status and device activation status
B. Network access device and time condition
C. User credentials and server certificate
D. Built-in profile and custom profile
Answer: B

From the following text: {text}"""

In [79]:
# Build a second LlamaCpp instance from the same model_path for the question
# generation step; apart from the temperature (0.9 here) the settings match the
# summarisation model above.
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=-1,
    n_ctx=8138,
    n_batch=4096,
    max_tokens=2048,
    temperature=0.9,
    top_p=1,
    verbose=False,  # no callback manager is attached, so no verbose output is needed
)

In [80]:
# Generate multiple-choice questions. Note that the model is given llm_answer
# (the generated summary), not the text extracted from the PDF.
prompt_template = PromptTemplate.from_template(PROMPT_QUESTION)
chain = prompt_template | llm
llm_question = chain.invoke({"text" : llm_answer})

In [81]:
llm_question

"\n\n**Answer Choices:**\n\nQuestion: What is the primary function of Level 3 (Site Operations) within a plant's control system hierarchy?\nA. Managing specific process areas like line-control PLCs\nB. Providing an overview and centralized oversight for all operations across the facility\nC. Directly controlling critical processes such as valve operation and actuator movement\nD. Controlling VFDs associated with electrical systems\nE. Integrating OT systems from Level 4 for data analysis\nAnswer: B\n\nQuestion: Which of these components is typically found at all three operational levels (3, 2, and 1) within a plant's control system hierarchy?\nA. HMIs and engineering workstations\nB. Variable-Frequency Drives (VFDs) and PLCs\nC. Microsoft domain controllers in standalone industrial domains\nD. Operational Technology systems reporting to IT systems\nE. Network access device and server certificate\nAnswer: A\n\nQuestion: Within the context of a plant's control system hierarchy, what is t

In [82]:
# Prompt that asks the model to judge whether the generated questions belong to
# the given text.
PROMPT_Test= """
Are the following questions related to the given text?
Questions: {questions}
Text: {text}"""

In [59]:
# Ask the model whether the generated questions relate to the summary they were
# generated from (both inputs are model outputs of the cells above).
# NOTE(review): the recorded execution count of this cell (59) is lower than
# that of the cells before it - re-run the notebook top to bottom.
prompt_template = PromptTemplate.from_template(PROMPT_Test)
chain = prompt_template | llm
llm_test = chain.invoke({"questions" : llm_question, "text" : llm_answer})

In [60]:
llm_test

"\n\nGiven the focus on ICS and its components such as EUs (Equipiment Under Controle) mentioned in the text, it's essential to approach questions about implementing private IP spaces within this environment with an understanding of these principles. The specific security aspects that apply here are mainly related to operational continuity rather than traditional IT-focused concerns like confidentiality and integrity (Answer C).\n\nRegarding your first question on the implementation of IPv4 private IP space: Implementing a private IP network within an ICS environment, particularly for conserving global address space while ensuring secure communication, is crucial. The text does not explicitly mention NAT for device discovery and management but highlights efficient use of resources—an indirect reference to managing routing overhead (Answer E). While B (enabling intra-enterprise communication) is directly relevant, the specific choice D (conserving global address space) aligns closely wi

In [None]:
#Create a document object
doc = fitz.open(BOOK_PDF)
#Extract the table of contents
toc_df = extract_toc_as_df(BOOK_PDF)

prompt_text = """You are an assistant tasked with summarizing texts. \ 
Give a concise summary of the text. Text chunk: {element} """

for index, row in toc_df.iterrows():
    # all the pages until the the row containing the word "Chapter" or "Section" in the title
    if "Chapter" in row['Title'] or "Section" in row['Title']:
        #Keep all the rows from the index to the end
        toc_df = toc_df.iloc[index:]
        break

toc_df['End Page'] = None
toc_df = toc_df.sort_values(by='Page').reset_index(drop=True)

prompt_template = PromptTemplate.from_template(PROMPT)

#Create text splitter to split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4050,    # 4050 tokens is the maximum number of tokens that can be processed in a single run
    chunk_overlap=128,
)
for index, row in toc_df.iterrows():
#Iterate only over the first 10 chapters
    if index > 50:
        break
    # Get the start and end page for the current chapter
    start_page = row['Page']
    if index < len(toc_df)-1:
        end_page = toc_df.loc[index+1, 'Page']
    else:
        end_page = doc.page_count - 1
    # # Extract the content of the chapter
    content_of_pdf = extract_pdf_content(BOOK_PDF, start_page, end_page)
    #Split the text into chunks
    texts = text_splitter.split_text(content_of_pdf)
    for text in texts:
        print(text)

    # # Create a prompt from the template
    prompt = prompt_template

    # # Create a summarization chain
    summarize_chain = {"content_of_pdf": lambda x: x} | prompt | llm | StrOutputParser()

    print("------------- Summarizing the text chunks...------------")
    # # Summarize the text chunks
    text_summaries = summarize_chain.batch(texts, {"max_concurrency": 50})
    for summary in text_summaries:
        print(summary)
    print("--------------- Done! ---------------")


    # # Display the summaries
    # for summary in text_summaries:
    #     print(summary)

In [None]:
# Create a document object
doc = fitz.open(BOOK_PDF)

content_of_pdf = ""
block_dict = {}


#Iterate over all pages in the documents
for i in range(doc.page_count):
#for i in range(0,100):
  page = doc.load_page(i)
  file_dict = page.get_text('dict') # Get the page dictionary
  block = file_dict['blocks'] # Get the block information
  block_dict[i] = block
  # read text and print it
  text = page.get_text()
  #Add the text to a string to be used in the prompt
  content_of_pdf = content_of_pdf + text

  ### IMAGES ###
  # Extract all the images on the page and save the images
  for i in page.get_images(full=True):
    xref = i[0]
    base_image = doc.extract_image(xref)
    image_bytes = base_image["image"]
    image = fitz.Pixmap(doc, xref)
    with open(f'{PATH}/image_{xref}.png', 'wb') as f:
      f.write(image.tobytes())

  ## TABLES ##
  # Extract all the tables on the page and save the tables
  tabs = page.find_tables()  # detect the tables
  for i,tab in enumerate(tabs):  # iterate over all tables
      print(f"Table {i} column names: {tab.header.names}, external: {tab.header.external}")
      tab = tabs[i]
      df = tab.to_pandas()
      #Add the table to a string to be used in the prompt
      content_of_pdf += df.to_string()

In [None]:
#Make a function out of that
spans = pd.DataFrame(columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'tag'])
rows = []
for page_num, blocks in block_dict.items():
    for block in blocks:
        if block['type'] == 0:
            for line in block['lines']:
                for span in line['spans']:
                    xmin, ymin, xmax, ymax = list(span['bbox'])
                    font_size = span['size']
                    text = unidecode(span['text'])
                    span_font = span['font']
                    is_bold = False
                    if "bold" in span_font.lower():
                        is_bold = True
                    if text.replace(" ","") !=  "":
                        rows.append((xmin, ymin, xmax, ymax, text, is_bold, span_font, font_size, page_num))
                        span_df = pd.DataFrame(rows, columns=['xmin','ymin','xmax','ymax', 'text','is_bold','span_font', 'font_size', 'page_num'])

In [None]:
#Find text with table of content in span_df
toc = span_df[span_df['text'].str.contains("table of content", case=False)]


In [None]:
#About 30 tokens
prompt_text = """You are an assistant tasked with summarizing texts. \ 
Give a concise summary of the text. Text chunk: {element} """

In [None]:
# Model used for summarising the whole book chunk by chunk
model_path = WORKSPACE_DIC + "/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf"
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=-1,
    n_batch=4096,
    n_ctx=4096,
    temperature=1,
    top_p=1,
    # Prompt tokens and generated tokens share the n_ctx window, so the
    # generation limit has to stay well below n_ctx. 2048 matches the other
    # LlamaCpp cells of this notebook.
    max_tokens=2048,
    verbose=True,
)

In [None]:
# Summarise the complete book text chunk by chunk.
prompt_template = PromptTemplate.from_template(PROMPT)
prompt = PromptTemplate.from_template(prompt_text)

# Cut the text into overlapping chunks; chunk_size is measured with the
# splitter's default length function (characters, not tokens)
splitter_settings = {"chunk_size": 4050, "chunk_overlap": 128}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
texts = text_splitter.split_text(content_of_pdf)

# Each chunk is passed through unchanged as the {element} value of the prompt
summarize_chain = {"element": RunnablePassthrough()} | prompt | llm | StrOutputParser()

text_summaries = summarize_chain.batch(texts, config={"max_concurrency": 50})

In [None]:
#Save the summaries to a file
with open('summaries.txt', 'w') as f:
    for item in text_summaries:
        f.write("%s\n" % item)

In [None]:
#Load the summaries from the file
with open('summaries.txt', 'r') as f:
    text_summaries = f.readlines()

In [None]:
# Embed the summaries and index them in a Chroma vector store.
embedding_function = SentenceTransformerEmbeddings(model_name= "sentence-transformers/all-mpnet-base-v2")

db = Chroma.from_texts(text_summaries, embedding_function)

# NOTE(review): this retriever (k=6) is replaced further down by a plain
# db.as_retriever() before rag_chain is built, so the k=6 setting is not used
# by the chain - decide which of the two is intended and remove the other.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 6})

print(retriever)

# NOTE(review): template2 and template are defined here but not referenced by
# rag_chain, which uses the prompt pulled from the hub instead.
template2 = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""

template = """Create multiple choice question in one of the following format:
Question: Which two options are the best reasons to use an IPV4 private IP space? (Choose two.)
A. to enable intra-enterprise communication
B. to implement NAT
C. to connect applications
D. to conserve global address space
E. to manage routing overhead
Answer: AD

Question: The corporate security policy requires multiple elements to be matched in an authorization policy. Which elements can be combined to meet the requirement?
A. Device registration status and device activation status
B. Network access device and time condition
C. User credentials and server certificate
D. Built-in profile and custom profile
Answer: B

using the following context:
{context}
"""

# Retriever and prompt that rag_chain actually uses. The retriever falls back
# to the vector store's default search settings; the prompt is fetched from
# the hub under the name "rlm/rag-prompt".
retriever = db.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs) -> str:
    """Join the ``page_content`` of the given documents, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# The input string is used twice: as retrieval query (its hits are formatted
# into the context) and, unchanged, as the question.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("""Create multiple choice question in one in the following format out of the provided context""")

In [None]:
display(llm_answer)