# Unstructured PDF Extractuion Pipeline Setup and Testing
This file serves as a playground/workspace to demo Unstructured functionality before it is put into production. This is where we will test the extraction, chunking, metadata extraction, metadata tagging, and vector embedding of PDFs for CASSIE

#### Changelog
**May 18th, 2024**  
**7:00 AM**  
* File created  
* Metadata extraction testing  
**7:30 AM**
* PDF Element Extraction

### Metadata Extraction from PDF Files

#### Imports

In [116]:
import pikepdf
import sys

#### Initial Extraction of Metadata

In [117]:
pdf_file = 'C:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs/Security Docs/5000-00-CTSC-48PA-0004_01.pdf'

pdf = pikepdf.Pdf.open(pdf_file)
docInfo = pdf.docinfo

for key, value in docInfo.items():
    print(f'{key}: {value}')

/CreationDate: D:20181004144423-04'00'
/Creator: Microsoft® Word 2013
/ModDate: D:20181004155108-04'00'
/Producer: Microsoft® Word 2013


#### Transform Date Function

In [118]:
import pikepdf
import datetime
import re
from dateutil.tz import tzutc, tzoffset
import sys

In [119]:
pdf_date_pattern = re.compile(''.join([
    r"(D:)?",
    r"(?P<year>/d/d/d/d)",
    r"(?P<month>/d/d)",
    r"(?P<day>/d/d)",
    r"(?P<hour>/d/d)",
    r"(?P<minute>/d/d)",
    r"(?P<second>/d/d)",
    r"(?P<tz_offset>[+-zZ])?",
    r"(?P<tz_hour>/d/d)?",
    r"'?(?P<tz_minute>/d/d)?'?"]))

In [120]:
def transform_date(date_str):
    """
    Convert a pdf date such as "D:20120321183444+07'00'" into a usable datetime
    http://www.verypdf.com/pdfinfoeditor/pdf-date-format.htm
    (D:YYYYMMDDHHmmSSOHH'mm')
    :param date_str: pdf date string
    :return: datetime object
    """
    global pdf_date_pattern
    match = re.match(pdf_date_pattern, date_str)
    if match:
        date_info = match.groupdict()

        for k, v in date_info.items():  # transform values
            if v is None:
                pass
            elif k == 'tz_offset':
                date_info[k] = v.lower()  # so we can treat Z as z
            else:
                date_info[k] = int(v)

        if date_info['tz_offset'] in ('z', None):  # UTC
            date_info['tzinfo'] = tzutc()
        else:
            multiplier = 1 if date_info['tz_offset'] == '+' else -1
            date_info['tzinfo'] = tzoffset(None, multiplier*(3600 * date_info['tz_hour'] + 60 * date_info['tz_minute']))

        for k in ('tz_offset', 'tz_hour', 'tz_minute'):  # no longer needed
            del date_info[k]

        return datetime.datetime(**date_info)

#### Extract Dates from PDF

In [121]:
pdf = pikepdf.Pdf.open(pdf_file)
docinfo = pdf.docinfo
for key, value in docinfo.items():
    if str(value).startswith("D:"):
        # pdf datetime format, convert to python datetime
        value = transform_date(str(pdf.docinfo["/CreationDate"]))
    print(key, ":", value)

/CreationDate : None
/Creator : Microsoft® Word 2013
/ModDate : None
/Producer : Microsoft® Word 2013


So it looks like the created and modified dates can be extracted from PDFs, even though I've downloaded them much later. I will try this for all the PDF files in the CSDocs folder to ensure it works properly

In [122]:
import os

In [123]:
CSDocs_path = 'C:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs/'

doc_info = []


for file in os.listdir(CSDocs_path):
        try:
            doc = {}
            if file.endswith('.pdf'):
                doc = {'filename': file}
                pdf = pikepdf.Pdf.open(CSDocs_path + file)
                docInfo = pdf.docinfo
                for key, value in docInfo.items():
                    print(f'{key}: {value}')
                    if str(value).startswith("D:"):
                        # pdf datetime format, convert to python datetime
                        value = transform_date(str(pdf.docinfo["/CreationDate"]))
                    if key == '/CreationDate':
                        key = 'Creation Date'
                    if key == '/Creator':
                        key = 'Creator'
                    if key == '/ModDate':
                        key = 'Modification Date'
                    if key == '/Producer':
                        key = 'Producer'
                    if key == '/Title':
                        key = 'Title'
                    if key == '/Subject':
                        key = 'Subject'
                    if key == '/Author':
                        key = 'Author'
                    doc[key] = value
                print('\n')
            if doc['filename']:
                doc_info.append(doc)
        except Exception as e:
            print(e)

/Author: Yousef Kimiagar
/CreationDate: D:20200210124719-05'00'
/Creator: Microsoft® Word for Office 365
/ModDate: D:20200210124719-05'00'
/Producer: Microsoft® Word for Office 365
/Title: SCHEDULE 15-2


/Author: David Robson/Duncan Robb
/CreationDate: D:20170927074532-04'00'
/Creator: Microsoft® Word 2013
/ModDate: D:20170927083115-04'00'
/Producer: Microsoft® Word 2013
/Subject: Eglinton Crosstown LRT Project
/Title: ECLRT Cyber Security Management Plan


'filename'
/Author: 
/CreationDate: D:20220506210237-04'00'
/Creator: Aspose Ltd.
/ModDate: D:20220506210237-04'00'
/Producer: Aspose.Pdf for .NET 11.4.0
/Subject: 
/Title: 


/CreationDate: D:20210831101215-04'00'
/Creator: Microsoft® Word for Microsoft 365
/ModDate: D:20220224143709-05'00'
/Producer: Microsoft® Word for Microsoft 365


/CreationDate: D:20210714131927-04'00'
/Creator: Microsoft® Word for Microsoft 365
/ModDate: D:20220224145956-05'00'
/Producer: Microsoft® Word for Microsoft 365


/CreationDate: D:20210715181444-0

In [124]:
for i in doc_info:
    for key, value in i.items():
        print(f'{key}: {value}')
    print('\n')



filename: 15-2 Conformed.pdf
Author: Yousef Kimiagar
Creation Date: None
Creator: Microsoft® Word for Office 365
Modification Date: None
Producer: Microsoft® Word for Office 365
Title: SCHEDULE 15-2


filename: 5000-00-CTSC-48PA-0002 - Cyber Security Management Plan-5000-00-CTSC-48PA-0002_PA.pdf
Author: David Robson/Duncan Robb
Creation Date: None
Creator: Microsoft® Word 2013
Modification Date: None
Producer: Microsoft® Word 2013
Subject: Eglinton Crosstown LRT Project
Title: ECLRT Cyber Security Management Plan


filename: 5000-00-WGD-48PA- 2001 - Internal Handover Plan – Cyber Systems-2700-67ASA1-02-CTSC-0004_01.pdf
Author: 
Creation Date: None
Creator: Aspose Ltd.
Modification Date: None
Producer: Aspose.Pdf for .NET 11.4.0
Subject: 
Title: 


filename: 5000-00-WGD-48PA-1001 - Baseline Controls Document - Management Controls Policy.pdf
Creation Date: None
Creator: Microsoft® Word for Microsoft 365
Modification Date: None
Producer: Microsoft® Word for Microsoft 365


filename: 5000-

In [125]:
for i in doc_info:
    print(i)

{'filename': '15-2 Conformed.pdf', 'Author': pikepdf.String("Yousef Kimiagar"), 'Creation Date': None, 'Creator': pikepdf.String("Microsoft® Word for Office 365"), 'Modification Date': None, 'Producer': pikepdf.String("Microsoft® Word for Office 365"), 'Title': pikepdf.String("SCHEDULE 15-2")}
{'filename': '5000-00-CTSC-48PA-0002 - Cyber Security Management Plan-5000-00-CTSC-48PA-0002_PA.pdf', 'Author': pikepdf.String("David Robson/Duncan Robb"), 'Creation Date': None, 'Creator': pikepdf.String("Microsoft® Word 2013"), 'Modification Date': None, 'Producer': pikepdf.String("Microsoft® Word 2013"), 'Subject': pikepdf.String("Eglinton Crosstown LRT Project"), 'Title': pikepdf.String("ECLRT Cyber Security Management Plan")}
{'filename': '5000-00-WGD-48PA- 2001 - Internal Handover Plan – Cyber Systems-2700-67ASA1-02-CTSC-0004_01.pdf', 'Author': pikepdf.String(""), 'Creation Date': None, 'Creator': pikepdf.String("Aspose Ltd."), 'Modification Date': None, 'Producer': pikepdf.String("Aspose.P

So it looks like there is a good amount of metadata already stored in the metadata of the PDF. We will aim to enhance this metadata using PDF extraction and element recognition. We'll start by reading a PDF with unstructured and seeing what elements exist on the first page/author block.

It is notable that the title metadata isn't always descriptive. I'm not sure how we can get a document description using an LLM, but I think either RAPTOR (Continuously shrinking documents (Subsection Summaries => Section Summaries => Doc Summaries) could work to shorten the tokens needed, or passing to Gemini (The 1M, soon 2M, token context window should be enough to summarize most documents) could be a good solution.)

We will start with Pulling Titles and Authors which starts with Unstructured extraction

### Unstuctured Extraction

In [126]:
pdf_file = "c:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs/5000-00-WGD-48PA-1020 - Change Control and Configuration Management Process.pdf"

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    pdf_file,
    languages=['en'],
    hi_res_model_name='yolox',
    infer_table_structure=True,
    strategy='hi_res'
    )

In [127]:
print(len(elements))
metadata = elements[0].metadata.to_dict()
el = elements[1].to_dict()

for element in elements:
    element_dict = element.to_dict()
    if element_dict['type'] == 'Title':
        print(element_dict['text'])
        print('\n')
        first_title_element = element_dict
        break

for key, value in metadata.items():
    print(f'{key}: {value}')

print('\n')
for key, value in first_title_element.items():
    print(f'{key}: {value}')

504
Cyber Security – Change Control and Configuration Management Process


coordinates: {'points': ((192.29166666666666, 310.55555555555543), (192.29166666666666, 532.3611111111111), (749.2361111111112, 532.3611111111111), (749.2361111111112, 310.55555555555543)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}
last_modified: 2023-03-28T16:34:32
filetype: application/pdf
languages: ['en']
page_number: 1
file_directory: c:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs
filename: 5000-00-WGD-48PA-1020 - Change Control and Configuration Management Process.pdf


type: Title
element_id: 8df1b47d06638e6081109dc18f6b42d0
text: Cyber Security – Change Control and Configuration Management Process
metadata: {'detection_class_prob': 0.48540282249450684, 'coordinates': {'points': ((322.7745666503906, 599.533203125), (322.7745666503906, 751.4722222222222), (1370.8916015625, 751.4722222222222), (1370.8916015625, 599.533203125)), 'system': 'PixelSpace', 'layout_width': 1700, 'layo

The file has been partitioned using the yolox model. 
filename: 5000-00-WGD-48PA-1020 - Change Control and Configuration Management Process.pdf  
elements: 504  
first title element text: Cyber Security – Change Control and Configuration Management Process  
metadata:  
    **coordinates**:   
        {'points': ((192.29166666666666, 310.55555555555543), (192.29166666666666, 532.3611111111111), (749.2361111111112, 532.3611111111111), (749.2361111111112, 310.55555555555543)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}
    **last_modified**:   
        2023-03-28T16:34:32
    **filetype**:   
        application/pdf
    **languages**:   
        ['en']
    **page_number**:   
        1
    **file_directory**:  
        c:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs
    **filename**:   
        5000-00-WGD-48PA-1020 - Change Control and Configuration Management Process.pdf

#### Examining Headers and Footers

In [128]:
for i in elements:
    print(i.to_dict()['type'])
    if i.to_dict()['type'] == 'Table':
        print(i.metadata.text_as_html)
    elif i.to_dict()['type'] == 'NarrativeText':
        print(i.to_dict()['text'])
    elif i.to_dict()['type'] == 'Title':
        print(i.to_dict()['text'])
    elif i.to_dict()['type'] == 'ListItem':
        print(i.to_dict()['text'])
    elif i.to_dict()['type'] == 'UncategorizedText':
        print(i.to_dict()['text'])
    else:
        print(i.to_dict()['type'])

Image
Image
Image
Image
Title
Cyber Security – Change Control and Configuration Management Process
Image
Image
Image
Image
Image
Image
NarrativeText
© 2021 Crosslinx Transit Solutions This document is licensed to Crosslinx Transit Solutions and cannot be used, reproduced, published, and/or revealed without prior written authorization. This document has been classified as Cyber Security - Confidential. Unauthorized use or distribution is prohibited. Printed copies of this document are uncontrolled. Please access the electronic version for the most current information.
Image
Image
UncategorizedText
5000-00-WGD-48PA-1020
UncategorizedText
03
Title
Cyber Security – Change Control and Configuration Management Process 2021-03-01
Title
Doc Owner
NarrativeText
WGD Consulting
Title
DOCUMENT REVISIONS INDEX
Table
<table><thead><th>Revision</th><th>Revision Date</th><th>Author(s)</th><th>Reviewer(s)</th><th>Description of Changes</th></thead><tr><td>PA</td><td>April 8, 2020</td><td>Chris Goulopou

In [129]:
headers_list = [el for el in elements if el.to_dict()['type'] == 'Header']
footers_list = [el for el in elements if el.to_dict()['type'] == 'Footer']

In [130]:
print(f"Number of Headers: {len(headers_list)}")
print(f"Number of Footers: {len(footers_list)}")

Number of Headers: 6
Number of Footers: 4


In [131]:
for i in headers_list:
    print(i.to_dict()['text'])
for i in footers_list:
    print(i.to_dict()['text'])

Doc Owner
Doc Owner
WGD Consulting
Doc Owner
WGD Consulting
Doc Owner
Page | 22 of 25
Document Classification: CTSC/ECLRT Cyber Security - Confidential. Unauthorized use or distribution is prohibited. Printed copies of this document are uncontrolled. Please access the electronic version for the most current information.
Page | 23 of 25
Page | 24 of 25


In [132]:
tables_list = [el for el in elements if el.to_dict()['type'] == 'Table']

for i in tables_list:
    print(i.metadata.text_as_html)

<table><thead><th>Revision</th><th>Revision Date</th><th>Author(s)</th><th>Reviewer(s)</th><th>Description of Changes</th></thead><tr><td>PA</td><td>April 8, 2020</td><td>Chris Goulopoulos</td><td>Michael Godfrey Other CTSC/D/M participants</td><td>Initial draft for review</td></tr><tr><td>00</td><td>April 24, 2020</td><td>Chris Goulopoulos</td><td>Michael Godfrey Other CTSC/D/M participants</td><td>Updated to reflect comments from review</td></tr><tr><td>01</td><td>July 9, 2020</td><td>Chris Goulopoulos_</td><td>| Mark Salsberg</td><td>Updated to reflect comments from 2700- 67ASA1-01-CTSC-0012_02.pdf</td></tr><tr><td>02</td><td>August 27, 2020</td><td>Mark Salsberg</td><td>Chris Goulopoulos</td><td>Minor corrections and edits</td></tr><tr><td>03</td><td>June 29, 2021</td><td>. Chris Goulopoulos_</td><td>| Mark Salsberg</td><td>Adjusted wording for responsibilities based on CTS contractual agreements</td></tr></table>
<table><tr><td>PULDOSEC</td><td>.... ee</td><td>eeeeecceecceeceeeces

In [46]:
def count_elements(elements):
    element_count = {}
    for element in elements:
        element_type = element.category
        if element_type not in element_count:
            element_count[element_type] = 0
        element_count[element_type] += 1
    return element_count

In [134]:
element_count = count_elements(elements)
for i in element_count:
    print(f'{i}: {element_count[i]}')

Image: 37
Title: 104
NarrativeText: 139
Text: 59
Table: 9
ListItem: 146
Header: 6
Footer: 4


In [135]:
titles = [el for el in elements if el.to_dict()['type'] == 'Title']
for i in titles:
    print(i.to_dict()['text'])

Cyber Security – Change Control and Configuration Management Process
Cyber Security – Change Control and Configuration Management Process 2021-03-01
Doc Owner
DOCUMENT REVISIONS INDEX
REVISION AND CONTROL
Cyber Security – Change Control and Configuration Management Process 2021-03-01
Doc Owner
Contents
Appendix A
Appendix B
Appendix C
Appendix D
Appendix E
Appendix F
Cyber Security – Change Control and Configuration Management Process 2021-03-01
Doc Owner
Cyber Security – Change Control and Configuration Management Process 2021-03-01
Doc Owner
1 Introduction
2 Purpose
3 Objectives
4 Scope and Applicability
Cyber Security – Change Control and Configuration Management Process 2021-03-01
4.1 Applicability
4.2 Exclusions
4.3 Associated Documents
Cyber Security – Change Control and Configuration Management Process 2021-03-01
Doc Owner
5 Roles and Responsibilities
1. Cyber Security Senior Manager
2. Delegates (Functional Delegates)
3. Technical and Compliance Oversight
Change Review Group
4.

In [136]:
from io import StringIO 
from lxml import etree

table_html = tables_list[0].metadata.text_as_html

parser = etree.XMLParser(remove_blank_text=True)
file_obj = StringIO(table_html)
tree = etree.parse(file_obj, parser)
print(etree.tostring(tree, pretty_print=True).decode())

<table>
  <thead>
    <th>Revision</th>
    <th>Revision Date</th>
    <th>Author(s)</th>
    <th>Reviewer(s)</th>
    <th>Description of Changes</th>
  </thead>
  <tr>
    <td>PA</td>
    <td>April 8, 2020</td>
    <td>Chris Goulopoulos</td>
    <td>Michael Godfrey Other CTSC/D/M participants</td>
    <td>Initial draft for review</td>
  </tr>
  <tr>
    <td>00</td>
    <td>April 24, 2020</td>
    <td>Chris Goulopoulos</td>
    <td>Michael Godfrey Other CTSC/D/M participants</td>
    <td>Updated to reflect comments from review</td>
  </tr>
  <tr>
    <td>01</td>
    <td>July 9, 2020</td>
    <td>Chris Goulopoulos_</td>
    <td>| Mark Salsberg</td>
    <td>Updated to reflect comments from 2700- 67ASA1-01-CTSC-0012_02.pdf</td>
  </tr>
  <tr>
    <td>02</td>
    <td>August 27, 2020</td>
    <td>Mark Salsberg</td>
    <td>Chris Goulopoulos</td>
    <td>Minor corrections and edits</td>
  </tr>
  <tr>
    <td>03</td>
    <td>June 29, 2021</td>
    <td>. Chris Goulopoulos_</td>
    <td>| Mark

In [137]:
from IPython.core.display import HTML
HTML(table_html)

Revision,Revision Date,Author(s),Reviewer(s),Description of Changes
PA,"April 8, 2020",Chris Goulopoulos,Michael Godfrey Other CTSC/D/M participants,Initial draft for review
00,"April 24, 2020",Chris Goulopoulos,Michael Godfrey Other CTSC/D/M participants,Updated to reflect comments from review
01,"July 9, 2020",Chris Goulopoulos_,| Mark Salsberg,Updated to reflect comments from 2700- 67ASA1-01-CTSC-0012_02.pdf
02,"August 27, 2020",Mark Salsberg,Chris Goulopoulos,Minor corrections and edits
03,"June 29, 2021",. Chris Goulopoulos_,| Mark Salsberg,Adjusted wording for responsibilities based on CTS contractual agreements


In [138]:
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain.chains.summarize import load_summarize_chain

In [139]:
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [140]:
os.environ['OPENAI_API_KEY'] = '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'
llm = ChatOpenAI(temperature=0,openai_api_key=openai_api_key, model_name="gpt-4o")
chain = load_summarize_chain(llm, chain_type="stuff")
chain.invoke([Document(page_content=table_html)])

{'input_documents': [Document(page_content='<table><thead><th>Revision</th><th>Revision Date</th><th>Author(s)</th><th>Reviewer(s)</th><th>Description of Changes</th></thead><tr><td>PA</td><td>April 8, 2020</td><td>Chris Goulopoulos</td><td>Michael Godfrey Other CTSC/D/M participants</td><td>Initial draft for review</td></tr><tr><td>00</td><td>April 24, 2020</td><td>Chris Goulopoulos</td><td>Michael Godfrey Other CTSC/D/M participants</td><td>Updated to reflect comments from review</td></tr><tr><td>01</td><td>July 9, 2020</td><td>Chris Goulopoulos_</td><td>| Mark Salsberg</td><td>Updated to reflect comments from 2700- 67ASA1-01-CTSC-0012_02.pdf</td></tr><tr><td>02</td><td>August 27, 2020</td><td>Mark Salsberg</td><td>Chris Goulopoulos</td><td>Minor corrections and edits</td></tr><tr><td>03</td><td>June 29, 2021</td><td>. Chris Goulopoulos_</td><td>| Mark Salsberg</td><td>Adjusted wording for responsibilities based on CTS contractual agreements</td></tr></table>')],
 'output_text': 'The

In [None]:
from unstructured.chunking.title import chunk_by_title

In [158]:
images = [el for el in elements if el.to_dict()['type'] == 'Image']

for i in images:
    print(i)

CROSSLIN TRANSIT SOLUTIONS
Oct 20, 2021 
Prepared by:   Reviewed by:   Reviewed by:   Approved by:   Accepted by:   For   Eglinton Crosstown LRT   Chris Goulopoulos  Consultant, CTS   William VanRyswyk, CPP  Chief Security Officer, CTS   Evan Harben   Cyber Security & OT Network  Specialist, CTSC  Hadi Meshgi   Technical Lead - Communication &  Cybersecurity, CTSC   Oct 20, 2021  Kyle Booth Systems Integration  Authority, CTSC   Name, Title   Signature/Date   Document No.   5000-00-WGD-48PA-1020   Rev.   03   July 26, 2021  
if iA vo hay ~ Gi | x 4 July Lp >
Nee gle “dr
     
     
     
     
     
     
     
     
     
     
     
     
     
     
     
     
     
     
     
     
  
Baseline Configuration Struc
Cyber Asset Baseline Configuration List |Review/Update Date: aor Document Classification Level: Cybersecurity Information - Confidential ‘Configuration Change st = pom i a — A cert eta pan EM Pin et ug “ 23 _|LAN Switch [Network switch '- Network Switches 24_|LAN Switch 

In [152]:
doc_text = '' 
for el in elements:
    if el.to_dict()['type'] != 'Header' and el.to_dict()['type'] != 'Footer':
        if el.to_dict()['type'] == 'Table':
            table_html = el.metadata.text_as_html

            parser = etree.XMLParser(remove_blank_text=True)
            file_obj = StringIO(table_html)
            tree = etree.parse(file_obj, parser)
            print(etree.tostring(tree, pretty_print=True).decode())
            doc_text += etree.tostring(tree, pretty_print=True).decode() + '\n'
        else:
            doc_text += el.to_dict()['text'] + '\n'

<table>
  <thead>
    <th>Revision</th>
    <th>Revision Date</th>
    <th>Author(s)</th>
    <th>Reviewer(s)</th>
    <th>Description of Changes</th>
  </thead>
  <tr>
    <td>PA</td>
    <td>April 8, 2020</td>
    <td>Chris Goulopoulos</td>
    <td>Michael Godfrey Other CTSC/D/M participants</td>
    <td>Initial draft for review</td>
  </tr>
  <tr>
    <td>00</td>
    <td>April 24, 2020</td>
    <td>Chris Goulopoulos</td>
    <td>Michael Godfrey Other CTSC/D/M participants</td>
    <td>Updated to reflect comments from review</td>
  </tr>
  <tr>
    <td>01</td>
    <td>July 9, 2020</td>
    <td>Chris Goulopoulos_</td>
    <td>| Mark Salsberg</td>
    <td>Updated to reflect comments from 2700- 67ASA1-01-CTSC-0012_02.pdf</td>
  </tr>
  <tr>
    <td>02</td>
    <td>August 27, 2020</td>
    <td>Mark Salsberg</td>
    <td>Chris Goulopoulos</td>
    <td>Minor corrections and edits</td>
  </tr>
  <tr>
    <td>03</td>
    <td>June 29, 2021</td>
    <td>. Chris Goulopoulos_</td>
    <td>| Mark

In [153]:
print(doc_text)

CROSSLIN TRANSIT SOLUTIONS
Oct 20, 2021 
Cyber Security – Change Control and Configuration Management Process
Prepared by:   Reviewed by:   Reviewed by:   Approved by:   Accepted by:   For   Eglinton Crosstown LRT   Chris Goulopoulos  Consultant, CTS   William VanRyswyk, CPP  Chief Security Officer, CTS   Evan Harben   Cyber Security & OT Network  Specialist, CTSC  Hadi Meshgi   Technical Lead - Communication &  Cybersecurity, CTSC   Oct 20, 2021  Kyle Booth Systems Integration  Authority, CTSC   Name, Title   Signature/Date   Document No.   5000-00-WGD-48PA-1020   Rev.   03   July 26, 2021  
if iA vo hay ~ Gi | x 4 July Lp >
Nee gle “dr
© 2021 Crosslinx Transit Solutions This document is licensed to Crosslinx Transit Solutions and cannot be used, reproduced, published, and/or revealed without prior written authorization. This document has been classified as Cyber Security - Confidential. Unauthorized use or distribution is prohibited. Printed copies of this document are uncontrolled. 

In [154]:
chatInstance = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, model_name="gpt-4o")
chatInstance.get_num_tokens(doc_text)

14452

In [155]:
response = chatInstance.invoke(f"Clean the following document to be embedded in a vector database. Return only the cleaned document text: {doc_text}")

In [157]:
print(response.content)

Crosslinx Transit Solutions (CTS) has implemented a Cyber Security Plan (CSP) and associated policies and processes for protecting the Eglinton Crosstown Light Rail Transit (ECLRT’s) Cyber Systems from cyber security threats that could compromise the safe and reliable operation of the ECLRT.

The CTS Cyber Security - Management Controls Policy, establishes the management controls and the framework for cyber security standards, plans and processes for securing those Cyber Systems that are essential for the safe and reliable operation of the ECLRT.

The Cyber Security – Change Control and Configuration Management Standard, defined the cyber security controls and required processes for managing changes to ECLRT’s operational Cyber Systems and associated Cyber Assets and their configurations.

This document presents the processes for implementing, sustaining, and adhering to the cyber security controls for managing and controlling changes to ECLRT’s Cyber Systems and changes to the baselin

#### Conclusions
**Good Information**  
It seems like we can extract all the information with a good deal of clarity. Combining Unstructured with GPT-4o could prove incredibly useful. Basically, we would extract all the chunks, and then, for all chunks that arent headers or footers, we would pass the elements text property (text_as_html property for tables) to GPT-4o to clean the text for text elements, and summarize the tables for table elements. We would then concatenate all the information into one document, then pass that to GPT-4o to pull Title, Doc type, section headings, topics, authors, revision dates, 

In [159]:
response_2 = chatInstance.invoke(f"Extract the document title, along with the section headers from the following document: {doc_text}")

In [160]:
print(response_2.content)

**Document Title:**
Cyber Security – Change Control and Configuration Management Process

**Section Headers:**
1. Introduction
2. Purpose
3. Objectives
4. Scope and Applicability
   - 4.1 Applicability
   - 4.2 Exclusions
   - 4.3 Associated Documents
5. Roles and Responsibilities
6. Definitions and Acronyms
7. Change and Configuration Management Overview
8. Change Control and Configuration Management Process
   - 8.1 Configuration Baseline Structure/Attributes
   - 8.2 Establishing the Configuration Baseline
     - 8.2.1 Test and Commissioning/As Build - Configuration Baseline
     - 8.2.2 Production/Commercial Operations - Configuration Baseline
     - 8.2.3 Configuration Management Tools
   - 8.3 Guidelines for Establishing the Baseline Configuration
     - 8.3.1 Pre-Commissioning - Scanning for Configurations
     - 8.3.2 Production - Scanning Cyber Asset Configurations
       - 8.3.2.1 Non-Production Environment Configuration Scanning Method
       - 8.3.2.2 Production Environment

In [163]:
chat_instance_json = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, model_name="gpt-4o",response_format={ "type": "json_object" })

                response_format was transferred to model_kwargs.
                Please confirm that response_format is what you intended.


In [173]:
response_2 = chat_instance_json.invoke(f"Extract the document title (as document_title), along with the section headers (as section_headers) from the following document along with document type (as document_type) (procedure, technical document, system specification, project agreement, contract, request for proposal (RFP), bid, etc.) and all the tags of the document (as tags) (Use your judgement for this. The tags could be (the tags don't have to be in this list): cybersecurity, ECLRT, change management, configuration management, *System names*, schematic, process, architecture document, As-Built, etc.) and the authors (as authors) of the document and the revision number (as revision). Return them in a JSON object: {doc_text}", response_format={ "type": "json_object" })

In [195]:
print(response_2.content)

{
  "document_title": "Cyber Security – Change Control and Configuration Management Process",
  "section_headers": [
    "Introduction",
    "Purpose",
    "Objectives",
    "Scope and Applicability",
    "Applicability",
    "Exclusions",
    "Associated Documents",
    "Roles and Responsibilities",
    "Definitions and Acronyms",
    "Change and Configuration Management Overview",
    "Change Control and Configuration Management Process",
    "Configuration Baseline Structure/Attributes",
    "Establishing the Configuration Baseline",
    "Test and Commissioning/As Build - Configuration Baseline",
    "Production/Commercial Operations - Configuration Baseline",
    "Configuration Management Tools",
    "Guidelines for Establishing the Baseline Configuration",
    "Pre-Commissioning - Scanning for Configurations",
    "Production - Scanning Cyber Asset Configurations",
    "Non-Production Environment Configuration Scanning Method",
    "Production Environment Configuration Scanning Me

### Process Flow

1. Extract doc elements
2. Remove headers and footers
3. Pass to LLM  
    a. summarize if table, extract topics in json mode  
    b. clean if else, extract topics in json mode
4. Concatenate elements
5. Extract doc metadata
6. Assign section titles to elements
7. Assign doc_wide metadata to all elements
8. Embed and store

In [171]:
## Extract PDF elements

from unstructured.partition.pdf import partition_pdf

pdf_file = "c:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs/15-2 Conformed.pdf"

elements = partition_pdf(
    pdf_file,
    languages=['en'],
    hi_res_model_name='yolox',
    infer_table_structure=True,
    strategy='hi_res'
    )

In [172]:
for v in elements:
    print(v)

Eglinton Crosstown LRT Project
Project Agreement - Schedule 15-2 Part 4 Conformed Version – Rev 2.0 (December 31, 2019)
The following Variations have been conformed to this Schedule 15-2 Part 4. Variations not listed in the table below are not included in this Schedule 15-2 Part 4.
Variation Number VC0005 VC0144 VC0152 VC0235 VC0271 VC0263 VC0315 VC0234 VC0305 VC0323 Title BOCC Relocation Tee Rail Head Hardness Advertising Display System Systemwide – Amendment to Requirements for Fare Collection System Equipment Project Agreement Amendments Systemwide – Correction of Reference in Schedule 15-2 Part 4 Article 19.1 (d) Laird Secondary Entrance Overbuild Systemwide – Replacement of the Fusible MV Switches on the Primary Feeds to the Unit Substation and Use of Gas Insulated Switchgear Removal from the Project Agreement of References to Single Revenue Vehicle Train Consists. Underground Distributed Antenna System User Agency Correction OCS Maximum Line Speed for Design Date Executed Februar

In [173]:
## Remove Header and Footer elements

def remove_headers_and_footers(elements):
    cleaned_elements = [element for element in elements if element.to_dict()['type'] not in ['Header', 'Footer']]
    return cleaned_elements

# Test header/footer removal
print(len(elements))  # Display the number of elements before cleaning

# Create a copy of the elements list to avoid modifying the original list
import copy

cleaned_elements = copy.deepcopy(remove_headers_and_footers(elements))
print(f"Number of elements after cleaning: {len(cleaned_elements)}")  # Display the number of elements after cleaning

5787
Number of elements after cleaning: 5257


In [16]:
## LLM Cleaning and Topic Extraction
import json
from langchain_openai import ChatOpenAI
import os
os.environ['OPENAI_API_KEY'] = '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'
openai_api_key = os.environ.get('OPENAI_API_KEY')
OpenCleaner = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, model_name="gpt-4o")

for element in cleaned_elements:
    if element.category == 'Table':
        prompt = f"Summarize the following table in detail and return only the text summary with no extra characters:\n\n{element.metadata.text_as_html}"
    else:
        prompt = f"Clean and return the following text. Do not modify the content of the text or add any information, just remove erroneous characters, and adjust the formatting. If there is no text, return 'N/A':\n\n{element.text}"

    response = OpenCleaner.invoke(prompt)
    print(element.text)
    element.text = response.content


CROSSLIN TRANSIT SOLUTIONS
Oct 20, 2021 
Cyber Security – Change Control and Configuration Management Process
Prepared by:   Reviewed by:   Reviewed by:   Approved by:   Accepted by:   For   Eglinton Crosstown LRT   Chris Goulopoulos  Consultant, CTS   William VanRyswyk, CPP  Chief Security Officer, CTS   Evan Harben   Cyber Security & OT Network  Specialist, CTSC  Hadi Meshgi   Technical Lead - Communication &  Cybersecurity, CTSC   Oct 20, 2021  Kyle Booth Systems Integration  Authority, CTSC   Name, Title   Signature/Date   Document No.   5000-00-WGD-48PA-1020   Rev.   03   July 26, 2021  
if iA vo hay ~ Gi | x 4 July Lp >
Nee gle “dr
© 2021 Crosslinx Transit Solutions This document is licensed to Crosslinx Transit Solutions and cannot be used, reproduced, published, and/or revealed without prior written authorization. This document has been classified as Cyber Security - Confidential. Unauthorized use or distribution is prohibited. Printed copies of this document are uncontrolled. 

In [17]:
for element in cleaned_elements:
    print(element.text)

CROSSLIN TRANSIT SOLUTIONS
Oct 20, 2021
Cyber Security – Change Control and Configuration Management Process
Prepared by:   
Reviewed by:   
Reviewed by:   
Approved by:   
Accepted by:   
For Eglinton Crosstown LRT   
Chris Goulopoulos  
Consultant, CTS   
William VanRyswyk, CPP  
Chief Security Officer, CTS   
Evan Harben  
Cyber Security & OT Network Specialist, CTSC  
Hadi Meshgi  
Technical Lead - Communication & Cybersecurity, CTSC   
Oct 20, 2021  
Kyle Booth  
Systems Integration Authority, CTSC   
Name, Title   
Signature/Date   
Document No. 5000-00-WGD-48PA-1020   
Rev. 03   
July 26, 2021
if I A vo hay Gi x 4 July Lp
Nee gle "dr"
© 2021 Crosslinx Transit Solutions

This document is licensed to Crosslinx Transit Solutions and cannot be used, reproduced, published, and/or revealed without prior written authorization. This document has been classified as Cyber Security - Confidential. Unauthorized use or distribution is prohibited. Printed copies of this document are uncontrol

In [174]:
cleaned_elements_2 = copy.deepcopy(cleaned_elements)

In [175]:
## Concatenate cleaned elements into a single document text
doc_text = ''
for element in cleaned_elements_2:
    doc_text += element.text + '\n'

print(doc_text)

The following Variations have been conformed to this Schedule 15-2 Part 4. Variations not listed in the table below are not included in this Schedule 15-2 Part 4.
Variation Number VC0005 VC0144 VC0152 VC0235 VC0271 VC0263 VC0315 VC0234 VC0305 VC0323 Title BOCC Relocation Tee Rail Head Hardness Advertising Display System Systemwide – Amendment to Requirements for Fare Collection System Equipment Project Agreement Amendments Systemwide – Correction of Reference in Schedule 15-2 Part 4 Article 19.1 (d) Laird Secondary Entrance Overbuild Systemwide – Replacement of the Fusible MV Switches on the Primary Feeds to the Unit Substation and Use of Gas Insulated Switchgear Removal from the Project Agreement of References to Single Revenue Vehicle Train Consists. Underground Distributed Antenna System User Agency Correction OCS Maximum Line Speed for Design Date Executed February 15, 2017 June 23, 2017 July 7, 2017 June 29, 2018 September 24, 2018 October 31, 2018 March 22, 2019 April 5, 2019 May

In [176]:
from io import StringIO 
from lxml import etree

doc_text = '' 
for el in cleaned_elements_2:
    if el.to_dict()['type'] != 'Header' and el.to_dict()['type'] != 'Footer':
        if el.category == 'Table':
            table_html = el.metadata.text_as_html

            parser = etree.XMLParser(remove_blank_text=True)
            file_obj = StringIO(table_html)
            tree = etree.parse(file_obj, parser)
            # print(etree.tostring(tree, pretty_print=True).decode())
            el.html = etree.tostring(tree, pretty_print=True).decode()
            print(el.html)
            doc_text += etree.tostring(tree, pretty_print=True).decode() + '\n'
        else:
            doc_text += el.to_dict()['text'] + '\n'


<table>
  <thead>
    <th>Variation Number</th>
    <th>Title</th>
    <th>Date Executed</th>
  </thead>
  <tr>
    <td>VC0005</td>
    <td>BOCC Relocation</td>
    <td>February 15, 2017</td>
  </tr>
  <tr>
    <td>VC0144</td>
    <td>Tee Rail Head Hardness</td>
    <td>June 23, 2017</td>
  </tr>
  <tr>
    <td>VCO0152</td>
    <td>Advertising Display System</td>
    <td>July 7, 2017</td>
  </tr>
  <tr>
    <td>VC0235</td>
    <td>Systemwide &#8212; Amendment to Requirements for Fare Collection System Equipment</td>
    <td>June 29, 2018</td>
  </tr>
  <tr>
    <td>VvC0271</td>
    <td>Project Agreement Amendments</td>
    <td>September 24, 2018</td>
  </tr>
  <tr>
    <td>VC0263</td>
    <td>Systemwide &#8212; Correction of Reference in Schedule 15-2 Part 4 Article 19.1 (d)</td>
    <td>October 31, 2018</td>
  </tr>
  <tr>
    <td>VC0315</td>
    <td>Laird Secondary Entrance Overbuild</td>
    <td>March 22, 2019</td>
  </tr>
  <tr>
    <td>VC0234</td>
    <td>Systemwide &#8212; Replac

In [177]:
print(doc_text)

The following Variations have been conformed to this Schedule 15-2 Part 4. Variations not listed in the table below are not included in this Schedule 15-2 Part 4.
<table>
  <thead>
    <th>Variation Number</th>
    <th>Title</th>
    <th>Date Executed</th>
  </thead>
  <tr>
    <td>VC0005</td>
    <td>BOCC Relocation</td>
    <td>February 15, 2017</td>
  </tr>
  <tr>
    <td>VC0144</td>
    <td>Tee Rail Head Hardness</td>
    <td>June 23, 2017</td>
  </tr>
  <tr>
    <td>VCO0152</td>
    <td>Advertising Display System</td>
    <td>July 7, 2017</td>
  </tr>
  <tr>
    <td>VC0235</td>
    <td>Systemwide &#8212; Amendment to Requirements for Fare Collection System Equipment</td>
    <td>June 29, 2018</td>
  </tr>
  <tr>
    <td>VvC0271</td>
    <td>Project Agreement Amendments</td>
    <td>September 24, 2018</td>
  </tr>
  <tr>
    <td>VC0263</td>
    <td>Systemwide &#8212; Correction of Reference in Schedule 15-2 Part 4 Article 19.1 (d)</td>
    <td>October 31, 2018</td>
  </tr>
  <tr>
 

In [187]:
prompt = f"Extract the document title (as document_title), the document authors (as authors), the date of publication (as date), document type (as document_type) (i.e. project agreement, process, contract, request for proposal, interface control document (ICD), as-built, etcetera), the revision number (as revision_number), and all section headings (as section_headings) EXACTLY as they appear in the text (they will be used to classify other lines of text in the document as being part of that section). Return all of these in JSON format:\n\n{doc_text}"

doc_metadata = OpenCleaner.invoke(prompt, response_format={ "type": "json_object" })

In [188]:
print(doc_metadata.content)

{
  "document_title": "SCHEDULE 15-2 DESIGN AND CONSTRUCTION",
  "authors": "Confidential – Economic Interests of Ontario",
  "date": "© Copyright 2015",
  "document_type": "Project Agreement",
  "revision_number": "N/A",
  "section_headings": [
    "ARTICLE 1 INTRODUCTION",
    "1.1 General Overview",
    "1.2 Systems Element Summary",
    "1.4 Cyber-Security",
    "Backup Power Supply for Systems Elements",
    "ARTICLE 2 TRACTION POWER SYSTEM (TPS)",
    "2.1 Scope of Work",
    "2.2 General Requirements",
    "2.3 Codes, Standards and Manuals",
    "2.4 Performance Requirements",
    "2.5 Traction Power Substation Locations",
    "2.6 Traction Power Requirements",
    "ARTICLE 3 OVERHEAD CATENARY SYSTEM",
    "3.1 Scope of work",
    "3.2 General Requirements",
    "3.3 Operational Requirements",
    "3.4 Design Parameters",
    "3.5 Electrical Clearances",
    "3.6 Height, Depth and Gradient Requirements",
    "3.7 Horizontal and Vertical Clearances",
    "3.8 Span Lengths, Stagge

In [189]:
doc_metadata_dict = json.loads(doc_metadata.content)
print(doc_metadata_dict.keys())

dict_keys(['document_title', 'authors', 'date', 'document_type', 'revision_number', 'section_headings'])


In [190]:
section_headings = doc_metadata_dict['section_headings']

print(section_headings)

['ARTICLE 1 INTRODUCTION', '1.1 General Overview', '1.2 Systems Element Summary', '1.4 Cyber-Security', 'Backup Power Supply for Systems Elements', 'ARTICLE 2 TRACTION POWER SYSTEM (TPS)', '2.1 Scope of Work', '2.2 General Requirements', '2.3 Codes, Standards and Manuals', '2.4 Performance Requirements', '2.5 Traction Power Substation Locations', '2.6 Traction Power Requirements', 'ARTICLE 3 OVERHEAD CATENARY SYSTEM', '3.1 Scope of work', '3.2 General Requirements', '3.3 Operational Requirements', '3.4 Design Parameters', '3.5 Electrical Clearances', '3.6 Height, Depth and Gradient Requirements', '3.7 Horizontal and Vertical Clearances', '3.8 Span Lengths, Staggers and Sweeps', '3.9 Loading and Overload Factors', '3.10 Strength Requirements', '3.11 OCS Wire Tensions and Tension Lengths', '3.12 OCS Poles and Foundations', '3.13 Sectionalizing Requirements', '3.14 System Safety and System Assurance Requirements', '3.15 Pole Deflection', '3.16 OCS Pole Grounding and Bonding', '3.17 Bridge

In [197]:
test_cleaned_elements = copy.deepcopy(cleaned_elements_2)

In [198]:
print(f"Number of Title elements before reclassification: {len([element for element in test_cleaned_elements if element.category == 'Title'])}")

count = 0
for element in test_cleaned_elements:
    if element.category == 'Title':
        if element.text not in section_headings:
            element.category = 'NarrativeText'
            print(element.category)
    else:
        print('Title Found')

print(f"Number of Title elements after reclassification: {len([element for element in test_cleaned_elements if element.category == 'Title'])}")
print(f"Number of Title elements after reclassification: {len([element for element in cleaned_elements if element.category == 'Title'])}")

Number of Title elements before reclassification: 503
Title Found
Title Found
NarrativeText
Title Found
Title Found
NarrativeText
Title Found
Title Found
NarrativeText
NarrativeText
NarrativeText
NarrativeText
NarrativeText
Title Found
Title Found
NarrativeText
Title Found
NarrativeText
Title Found
Title Found
Title Found
NarrativeText
NarrativeText
NarrativeText
NarrativeText
Title Found
Title Found
NarrativeText
Title Found
NarrativeText
Title Found
Title Found
Title Found
NarrativeText
NarrativeText
NarrativeText
NarrativeText
Title Found
NarrativeText
NarrativeText
NarrativeText
Title Found
Title Found
Title Found
Title Found
NarrativeText
NarrativeText
NarrativeText
NarrativeText
Title Found
NarrativeText
Title Found
NarrativeText
Title Found
Title Found
Title Found
NarrativeText
Title Found
NarrativeText
Title Found
Title Found
NarrativeText
NarrativeText
Title Found
Title Found
Title Found
Title Found
Title Found
Title Found
Title Found
Title Found
Title Found
Title Found
Title 

In [199]:
print (len(section_headings))

164


In [200]:
element_count = count_elements(test_cleaned_elements)
print(element_count)
print(len(section_headings))


{'NarrativeText': 924, 'Table': 14, 'UncategorizedText': 240, 'Title': 159, 'ListItem': 3920}
164


In [194]:
for element in test_cleaned_elements:
    if element.category == 'Title':
        print(element.to_dict())

{'type': 'Title', 'element_id': '1483eb118a81e6650133376e2cebaae7', 'text': 'ARTICLE 1 INTRODUCTION', 'metadata': {'detection_class_prob': 0.6244787573814392, 'coordinates': {'points': ((197.59585571289062, 463.0555555555554), (197.59585571289062, 496.5888888888887), (774.4722222222222, 496.5888888888887), (774.4722222222222, 463.0555555555554)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-03-17T21:01:46', 'filetype': 'application/pdf', 'languages': ['en'], 'page_number': 7, 'parent_id': 'd2f1becc030d43dfb238d43631102c40', 'file_directory': 'c:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs', 'filename': '15-2 Conformed.pdf'}}
{'type': 'Title', 'element_id': '97fbca1f90258036964ec3a97060a611', 'text': '1.1 General Overview', 'metadata': {'detection_class_prob': 0.8238425254821777, 'coordinates': {'points': ((195.7701416015625, 534.7222222222222), (195.7701416015625, 568.2555555555556), (568.3888888888889, 568.2555555555556), (568.38888888

In [228]:
chunks = chunk_by_title(test_cleaned_elements)

In [230]:
print(len(chunks))

161


In [229]:
for chunk in chunks:
    print(chunk)
    print("\n\n" + "-"*80)
    input()
    

CROSSLIN TRANSIT SOLUTIONS

Oct 20, 2021


--------------------------------------------------------------------------------
Cyber Security – Change Control and Configuration Management Process

Prepared by: Chris Goulopoulos Consultant, CTS
Reviewed by: William VanRyswyk, CPP Chief Security Officer, CTS
Reviewed by: Evan Harben Cyber Security & OT Network Specialist, CTSC
Approved by: Hadi Meshgi Technical Lead - Communication & Cybersecurity, CTSC Oct 20, 2021
Accepted by: Kyle Booth Systems Integration Authority, CTSC
For Eglinton Crosstown LRT
Document No. 5000-00-WGD-48PA-1020
Rev. 03
July 26, 2021


--------------------------------------------------------------------------------
if iA vo hay Gi x 4 July Lp

Nee gle “dr

This document has been classified as Cyber Security - Confidential. Unauthorized use or distribution is prohibited. Printed copies of this document are uncontrolled. Please access the electronic version for the most current information.

Sure, please provide the te

In [195]:
element_count = count_elements(test_cleaned_elements)
for i in element_count:
    print(f'{i}: {element_count[i]}')

NarrativeText: 924
Table: 14
UncategorizedText: 240
Title: 159
ListItem: 3920


In [49]:
from unstructured.chunking.title import chunk_by_title

chunks = chunk_by_title(test_cleaned_elements)

In [68]:
element_count = count_elements(test_cleaned_elements)
for i in element_count:
    print(f'{i}: {element_count[i]}')

Image: 37
NarrativeText: 205
UncategorizedText: 59
Table: 9
Title: 38
ListItem: 146


In [114]:
for element in test_cleaned_elements:
    print(element.page_number)

AttributeError: 'Image' object has no attribute 'page_number'

In [201]:
for element in test_cleaned_elements:
    if element.category == 'Table':
        print(element.html)

<table>
  <thead>
    <th>Variation Number</th>
    <th>Title</th>
    <th>Date Executed</th>
  </thead>
  <tr>
    <td>VC0005</td>
    <td>BOCC Relocation</td>
    <td>February 15, 2017</td>
  </tr>
  <tr>
    <td>VC0144</td>
    <td>Tee Rail Head Hardness</td>
    <td>June 23, 2017</td>
  </tr>
  <tr>
    <td>VCO0152</td>
    <td>Advertising Display System</td>
    <td>July 7, 2017</td>
  </tr>
  <tr>
    <td>VC0235</td>
    <td>Systemwide &#8212; Amendment to Requirements for Fare Collection System Equipment</td>
    <td>June 29, 2018</td>
  </tr>
  <tr>
    <td>VvC0271</td>
    <td>Project Agreement Amendments</td>
    <td>September 24, 2018</td>
  </tr>
  <tr>
    <td>VC0263</td>
    <td>Systemwide &#8212; Correction of Reference in Schedule 15-2 Part 4 Article 19.1 (d)</td>
    <td>October 31, 2018</td>
  </tr>
  <tr>
    <td>VC0315</td>
    <td>Laird Secondary Entrance Overbuild</td>
    <td>March 22, 2019</td>
  </tr>
  <tr>
    <td>VC0234</td>
    <td>Systemwide &#8212; Replac

In [207]:
def chunk_by_previous_title(elements):
    chunks = []
    current_chunk = {}
    current_title = "Front Page (No Title Found)"

    for element in elements:
        if not chunks and not current_chunk:
            current_chunk['section_title'] = current_title
            current_chunk['page'] = element.metadata.page_number
            current_chunk['authors'] = doc_metadata_dict['authors']
            current_chunk['published_date'] = doc_metadata_dict['date']
            current_chunk['document_title'] = doc_metadata_dict['document_title']
            current_chunk['document_type'] = doc_metadata_dict['document_type']
            current_chunk['revision_number'] = doc_metadata_dict['revision_number']
            current_chunk['content'] = ''
            current_chunk['elementIDs'] = []

        if element.category == 'Title':
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = {}
                current_title = element.text
                current_chunk['section_title'] = current_title
                current_chunk['page'] = element.metadata.page_number
                current_chunk['authors'] = doc_metadata_dict['authors']
                current_chunk['published_date'] = doc_metadata_dict['date']
                current_chunk['document_title'] = doc_metadata_dict['document_title']
                current_chunk['document_type'] = doc_metadata_dict['document_type']
                current_chunk['revision_number'] = doc_metadata_dict['revision_number']
                current_chunk['content'] = ''
                current_chunk['elementIDs'] = []

        if element.category == 'Table':
            current_chunk['content'] += element.html + '\n'
            current_chunk['elementIDs'].append(element.id)
        else:
            current_chunk['content'] += element.text + '\n'
            current_chunk['elementIDs'].append(element.id)

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [208]:
# Chunk the elements
chunks = chunk_by_previous_title(test_cleaned_elements)

In [209]:
# Print the chunks
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1} \n")
    print(chunk)
    print("Content: \n")
    print(chunk['content'])
    print("\n\n" + "End of Chunk" + "-"*80 + "\n\n")

Chunk 1 

{'section_title': 'Front Page (No Title Found)', 'page': 1, 'authors': 'Confidential – Economic Interests of Ontario', 'published_date': '© Copyright 2015', 'document_title': 'SCHEDULE 15-2 DESIGN AND CONSTRUCTION', 'document_type': 'Project Agreement', 'revision_number': 'N/A', 'content': 'The following Variations have been conformed to this Schedule 15-2 Part 4. Variations not listed in the table below are not included in this Schedule 15-2 Part 4.\n<table>\n  <thead>\n    <th>Variation Number</th>\n    <th>Title</th>\n    <th>Date Executed</th>\n  </thead>\n  <tr>\n    <td>VC0005</td>\n    <td>BOCC Relocation</td>\n    <td>February 15, 2017</td>\n  </tr>\n  <tr>\n    <td>VC0144</td>\n    <td>Tee Rail Head Hardness</td>\n    <td>June 23, 2017</td>\n  </tr>\n  <tr>\n    <td>VCO0152</td>\n    <td>Advertising Display System</td>\n    <td>July 7, 2017</td>\n  </tr>\n  <tr>\n    <td>VC0235</td>\n    <td>Systemwide &#8212; Amendment to Requirements for Fare Collection System Equi

In [205]:
unstructured_chunking = chunk_by_title(test_cleaned_elements)

for i in unstructured_chunking:
    print(i.to_dict())

print(len(unstructured_chunking))

{'type': 'CompositeElement', 'element_id': 'e43e65224c5a2af8c98f66b42c1f5d45', 'text': 'The following Variations have been conformed to this Schedule 15-2 Part 4. Variations not listed in the table below are not included in this Schedule 15-2 Part 4.', 'metadata': {'file_directory': 'c:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs', 'filename': '15-2 Conformed.pdf', 'filetype': 'application/pdf', 'languages': ['en'], 'last_modified': '2024-03-17T21:01:46', 'page_number': 1, 'orig_elements': 'eJx1UcGu2yAQ/BXE+cUG7CR2rsm5fVLSXqLIwrCOkTBYgPMSPb1/L6Ck7aG9scPM7uzs+RODhglM6JTEO4Rb3pOG0KpdE0LaddsQBv1AKNBN0wpZ4TeEJwhc8sAj/xMLa51Uhgfwudb8YZfQjaCuY4gIY4REzRP+UDKMEaXbjM5WmZB05zNt26J9Q4xUBbu8od/1dluQXK9JU9T/ArIiItg/fIApbfGu7qCPMxeAv+KHhAAiKGs6obn33exsH2mkaGm7aSJhUBo6qVxkWfdIHcSu/OHB+fLATfndwMGpG5QHK5YUli/3znqvlbmX+2NEPX52MXyCpKfrFUN7awbrJpDFLIcXIzzmzODzrJXgyVb5/NbcXBd+zUmeMRh8yaAP3WSlGhTkEzHC6hWpVnR7YnRH6K7eJPEchZ1Zph5cCjgj7s9hJRtoD0KQisi6kkPPqiY+NhWlhIma5KAC3NPN8GkENFit7YcyV/STO5V9ejTyG6AewCDx2g

In [210]:
for i in chunks:
    print(i)

{'section_title': 'Front Page (No Title Found)', 'page': 1, 'authors': 'Confidential – Economic Interests of Ontario', 'published_date': '© Copyright 2015', 'document_title': 'SCHEDULE 15-2 DESIGN AND CONSTRUCTION', 'document_type': 'Project Agreement', 'revision_number': 'N/A', 'content': 'The following Variations have been conformed to this Schedule 15-2 Part 4. Variations not listed in the table below are not included in this Schedule 15-2 Part 4.\n<table>\n  <thead>\n    <th>Variation Number</th>\n    <th>Title</th>\n    <th>Date Executed</th>\n  </thead>\n  <tr>\n    <td>VC0005</td>\n    <td>BOCC Relocation</td>\n    <td>February 15, 2017</td>\n  </tr>\n  <tr>\n    <td>VC0144</td>\n    <td>Tee Rail Head Hardness</td>\n    <td>June 23, 2017</td>\n  </tr>\n  <tr>\n    <td>VCO0152</td>\n    <td>Advertising Display System</td>\n    <td>July 7, 2017</td>\n  </tr>\n  <tr>\n    <td>VC0235</td>\n    <td>Systemwide &#8212; Amendment to Requirements for Fare Collection System Equipment</td>

In [212]:
chunk_tags = []
for i, chunk in enumerate(chunks):
    response = OpenCleaner.invoke(f"This is a chunk of a larger document. Please extract tags from the following text to be used as metadata for filtering and search purposes. Return the tags (as 'tags') in JSON format. Limit the number of tags to the 10 most relevant. This is the chunk: {chunk['content']}", response_format={ "type": "json_object" })
    tags = json.loads(response.content)['tags']
    print(tags)
    chunk_tags.append({'Chunk': i, 'tags': json.loads(response.content)['tags']})

print(chunk_tags)


['Variations', 'Schedule 15-2', 'Design and Construction', 'Systems', 'Fare Collection System', 'Advertising Display System', 'OCS Maximum Line Speed', 'Underground Distributed Antenna System', 'Traction Power System', 'Cyber-Security']
['article', 'introduction', 'document', 'section', 'content', 'metadata', 'filtering', 'search', 'text', 'chunk']
['Design and Construction Requirements', 'North America design standards', 'Systems designs', 'ECLRT extension', 'Traction Power System (TPS)', 'Overhead Catenary System (OCS)', 'Signalling and Train Control System (S&TCS)', 'Communications and Controls', 'Fare Collection system', 'Training support elements']
['Traction Power System', 'Overhead Catenary System', 'Revenue Vehicles', 'Maintenance Vehicles', 'Signalling and Train Control System', 'Communications Systems', 'Operations Control Centre', 'Security Operations Office', 'Fare Collection', 'Trackwork']
['Cyber-Security', 'Control and Communication Systems', 'Rail Transit', 'APTA Recomm

In [213]:
print(f'chunk_tags: {len(chunk_tags)}')
print(f'chunks: {len(chunks)}')

chunk_tags: 160
chunks: 160


In [214]:
for i, chunk in enumerate(chunks):
    chunk['tags'] = chunk_tags[i]['tags']

In [126]:
for chunk in chunks:
    print(chunk)

{'section_title': 'Front Page (No Title Found)', 'page': 1, 'authors': ['Chris Goulopoulos', 'William VanRyswyk, CPP', 'Evan Harben', 'Hadi Meshgi', 'Kyle Booth'], 'published_date': '2021-10-20', 'document_title': 'Cyber Security – Change Control and Configuration Management Process', 'revision_number': '03', 'content': 'CROSSLIN TRANSIT SOLUTIONS\nOct 20, 2021 \nCyber Security – Change Control and Configuration Management Process\nPrepared by:   Reviewed by:   Reviewed by:   Approved by:   Accepted by:   For   Eglinton Crosstown LRT   Chris Goulopoulos  Consultant, CTS   William VanRyswyk, CPP  Chief Security Officer, CTS   Evan Harben   Cyber Security & OT Network  Specialist, CTSC  Hadi Meshgi   Technical Lead - Communication &  Cybersecurity, CTSC   Oct 20, 2021  Kyle Booth Systems Integration  Authority, CTSC   Name, Title   Signature/Date   Document No.   5000-00-WGD-48PA-1020   Rev.   03   July 26, 2021  \nif iA vo hay ~ Gi | x 4 July Lp >\nNee gle “dr\n© 2021 Crosslinx Transit 

In [127]:
prompt = f"""You are an advanced language model with expertise in document analysis. Your task is to analyze the provided chunk of text and extract the following metadata:

7. **Keywords:** Key phrases or terms that summarize the content.
8. **Summary:** A brief summary or abstract of the content.
9. **Category:** The type or category of the content (e.g., "Technical Details," "Introduction," "Conclusion").
10. **References:** Any referenced documents or sources within the content.
14. **Related Sections:** Titles or identifiers of related sections.

Return these metadata in JSON format.
Here is the chunk of text for you to analyze: {chunk['content']}"""

In [215]:
more_meta = []
for chunk in chunks:
    response = OpenCleaner.invoke(f"""You are an advanced language model with expertise in document analysis. Your task is to analyze the provided chunk of text and extract the following metadata:

7. **Keywords:** Key phrases or terms that summarize the content.
8. **Summary:** A brief summary or abstract of the content.
9. **Category:** The type or category of the content (e.g., "Technical Details," "Introduction," "Conclusion").
10. **References:** Any referenced documents or sources within the content.
14. **Related Sections:** Titles or identifiers of related sections.

Return these metadata in JSON format.
Here is the chunk of text for you to analyze: {chunk['content']}""", response_format={ "type": "json_object" })
    metadata = json.loads(response.content)
    print(metadata)
    more_meta.append(metadata)

{'Keywords': ['Variations', 'Schedule 15-2 Part 4', 'Design and Construction Requirements', 'Systems', 'Confidential', 'Economic Interests of Ontario'], 'Summary': 'This document outlines the variations conformed to Schedule 15-2 Part 4, detailing various design and construction requirements for systems. It includes a table listing specific variations with their titles and execution dates. The document also contains a comprehensive table of contents covering various articles and sections related to design, construction, and operational requirements for different systems, including traction power, overhead catenary, signalling, communications, and more. The document is marked as confidential and emphasizes the economic interests of Ontario.', 'Category': 'Technical Details', 'References': ['Ontario Infrastructure and Lands Corporation', 'Queen’s Printer for Ontario © Copyright 2015'], 'Related Sections': ['ARTICLE 1 INTRODUCTION', 'ARTICLE 2 TRACTION POWER SYSTEM (TPS)', 'ARTICLE 3 OVER

In [216]:
chunks_clone = copy.deepcopy(chunks)

In [217]:
for i, chunk in enumerate(chunks_clone):
    for key, value in more_meta[i].items():
        chunk[key] = value

    print(chunk)

{'section_title': 'Front Page (No Title Found)', 'page': 1, 'authors': 'Confidential – Economic Interests of Ontario', 'published_date': '© Copyright 2015', 'document_title': 'SCHEDULE 15-2 DESIGN AND CONSTRUCTION', 'document_type': 'Project Agreement', 'revision_number': 'N/A', 'content': 'The following Variations have been conformed to this Schedule 15-2 Part 4. Variations not listed in the table below are not included in this Schedule 15-2 Part 4.\n<table>\n  <thead>\n    <th>Variation Number</th>\n    <th>Title</th>\n    <th>Date Executed</th>\n  </thead>\n  <tr>\n    <td>VC0005</td>\n    <td>BOCC Relocation</td>\n    <td>February 15, 2017</td>\n  </tr>\n  <tr>\n    <td>VC0144</td>\n    <td>Tee Rail Head Hardness</td>\n    <td>June 23, 2017</td>\n  </tr>\n  <tr>\n    <td>VCO0152</td>\n    <td>Advertising Display System</td>\n    <td>July 7, 2017</td>\n  </tr>\n  <tr>\n    <td>VC0235</td>\n    <td>Systemwide &#8212; Amendment to Requirements for Fare Collection System Equipment</td>

In [218]:
print(test_cleaned_elements[0].metadata.to_dict())

{'detection_class_prob': 0.9196842908859253, 'coordinates': {'points': ((199.92169189453125, 203.1963653564453), (199.92169189453125, 277.03338623046875), (1508.4277777777777, 277.03338623046875), (1508.4277777777777, 203.1963653564453)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-03-17T21:01:46', 'filetype': 'application/pdf', 'languages': ['en'], 'page_number': 1, 'parent_id': 'd2f1becc030d43dfb238d43631102c40', 'file_directory': 'c:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs', 'filename': '15-2 Conformed.pdf'}


In [219]:
for chunk in chunks_clone:
    chunk['filename'] = test_cleaned_elements[0].metadata.to_dict()['filename']
    chunk['filetype'] = test_cleaned_elements[0].metadata.to_dict()['filetype']
    chunk['languages'] = test_cleaned_elements[0].metadata.to_dict()['languages']

In [220]:
print(chunks_clone[0].keys())

dict_keys(['section_title', 'page', 'authors', 'published_date', 'document_title', 'document_type', 'revision_number', 'content', 'elementIDs', 'tags', 'Keywords', 'Summary', 'Category', 'References', 'Related Sections', 'filename', 'filetype', 'languages'])


In [154]:
from pinecone import Pinecone, ServerlessSpec, PodSpec  
import openai
import time  

os.environ['PINECONE_API_KEY'] = '57208fe4-cd6b-45a2-83fd-12ee06690b67'
pinecone_api_key = os.environ.get('PINECONE_API_KEY')

pc = Pinecone(api_key=pinecone_api_key)
spec = ServerlessSpec(cloud='aws', region='us-east-1')

index_name = 'cassie-unstructured-advanced'

pc.create_index(  
    index_name,  
    dimension=1536,  # dimensionality of text-embedding-ada-002  
    metric='cosine',  
    spec=spec  
) 

while not pc.describe_index(index_name).status['ready']:  
    time.sleep(1) 

In [146]:
from langchain_openai import OpenAIEmbeddings  
# get openai api key from platform.openai.com  
model_name = 'text-embedding-3-small'  
embeddings = OpenAIEmbeddings(  
    model=model_name,  
    openai_api_key=openai_api_key  
)  

In [221]:
index = pc.Index(index_name)

In [158]:
from langchain_pinecone import PineconeVectorStore  
text_field = "text"  
vectorstore = PineconeVectorStore(
    index, embeddings, text_field  
)

In [234]:
double_chunks_clone = copy.deepcopy(chunks_clone)

In [235]:
print(double_chunks_clone[10])



In [236]:
def clean_metadata(metadata):
    cleaned_metadata = {}
    for key, value in metadata.items():
        if isinstance(value, (str, int, float, bool)):
            cleaned_metadata[key] = value
        elif isinstance(value, list):
            cleaned_metadata[key] = [str(v) for v in value]
        else:
            cleaned_metadata[key] = str(value)
    return cleaned_metadata

In [237]:
for chunk in double_chunks_clone:
    content = chunk.pop('content')
    keywords = chunk.pop('tags')
    keywords = chunk.pop('elementIDs')
    metadata = clean_metadata(chunk)
    try:
        vectorstore.add_texts([content], metadatas=[metadata])
        print(f"Added chunk: {metadata['section_title']}")
    except:
        print(f"Failed to add chunk: {metadata['section_title']}")

Added chunk: Front Page (No Title Found)
Added chunk: ARTICLE 1 INTRODUCTION
Added chunk: 1.1 General Overview
Added chunk: 1.2 Systems Element Summary
Added chunk: 1.4 Cyber-Security
Added chunk: Backup Power Supply for Systems Elements
Added chunk: ARTICLE 2 TRACTION POWER SYSTEM (TPS)
Added chunk: 2.1 Scope of Work
Added chunk: 2.2 General Requirements
Added chunk: 2.3 Codes, Standards and Manuals
Added chunk: 2.4 Performance Requirements
Added chunk: ARTICLE 3 OVERHEAD CATENARY SYSTEM
Added chunk: 3.1 Scope of work
Added chunk: 3.2 General Requirements
Added chunk: 3.3 Operational Requirements
Added chunk: 3.4 Design Parameters
Added chunk: 3.5 Electrical Clearances
Added chunk: 3.6 Height, Depth and Gradient Requirements
Added chunk: 3.7 Horizontal and Vertical Clearances
Added chunk: 3.10 Strength Requirements
Added chunk: 3.11 OCS Wire Tensions and Tension Lengths
Added chunk: 3.12 OCS Poles and Foundations
Added chunk: 3.13 Sectionalizing Requirements
Added chunk: 3.14 System S