# Unstructured PDF Extractuion Pipeline Setup and Testing
### This file serves as a playground/workspace to demo Unstructured functionality before it is put into production. This is where we will test the extraction, chunking, metadata extraction, metadata tagging, and vector embedding of PDFs for CASSIE

#### Changelog
##### May 18th, 2024
**7:00 AM**  
* File created  
* Metadata extraction testing  

### Metadata Extraction from PDF Files

#### Imports

In [1]:
import pikepdf
import sys

#### Initial Extraction of Metadata

In [5]:
pdf_file = 'C:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs/Security Docs/5000-00-CTSC-48PA-0004_01.pdf'

pdf = pikepdf.Pdf.open(pdf_file)
docInfo = pdf.docinfo

for key, value in docInfo.items():
    print(f'{key}: {value}')

/CreationDate: D:20181004144423-04'00'
/Creator: Microsoft® Word 2013
/ModDate: D:20181004155108-04'00'
/Producer: Microsoft® Word 2013


#### Transform Date Function

In [6]:
import pikepdf
import datetime
import re
from dateutil.tz import tzutc, tzoffset
import sys

In [7]:
pdf_date_pattern = re.compile(''.join([
    r"(D:)?",
    r"(?P<year>/d/d/d/d)",
    r"(?P<month>/d/d)",
    r"(?P<day>/d/d)",
    r"(?P<hour>/d/d)",
    r"(?P<minute>/d/d)",
    r"(?P<second>/d/d)",
    r"(?P<tz_offset>[+-zZ])?",
    r"(?P<tz_hour>/d/d)?",
    r"'?(?P<tz_minute>/d/d)?'?"]))

In [8]:
def transform_date(date_str):
    """
    Convert a pdf date such as "D:20120321183444+07'00'" into a usable datetime
    http://www.verypdf.com/pdfinfoeditor/pdf-date-format.htm
    (D:YYYYMMDDHHmmSSOHH'mm')
    :param date_str: pdf date string
    :return: datetime object
    """
    global pdf_date_pattern
    match = re.match(pdf_date_pattern, date_str)
    if match:
        date_info = match.groupdict()

        for k, v in date_info.items():  # transform values
            if v is None:
                pass
            elif k == 'tz_offset':
                date_info[k] = v.lower()  # so we can treat Z as z
            else:
                date_info[k] = int(v)

        if date_info['tz_offset'] in ('z', None):  # UTC
            date_info['tzinfo'] = tzutc()
        else:
            multiplier = 1 if date_info['tz_offset'] == '+' else -1
            date_info['tzinfo'] = tzoffset(None, multiplier*(3600 * date_info['tz_hour'] + 60 * date_info['tz_minute']))

        for k in ('tz_offset', 'tz_hour', 'tz_minute'):  # no longer needed
            del date_info[k]

        return datetime.datetime(**date_info)

#### Extract Dates from PDF

In [15]:
pdf = pikepdf.Pdf.open(pdf_file)
docinfo = pdf.docinfo
for key, value in docinfo.items():
    if str(value).startswith("D:"):
        # pdf datetime format, convert to python datetime
        value = transform_date(str(pdf.docinfo["/CreationDate"]))
    print(key, ":", value)

/CreationDate : 2018-10-04 14:44:23-04:00
/Creator : Microsoft® Word 2013
/ModDate : 2018-10-04 14:44:23-04:00
/Producer : Microsoft® Word 2013


So it looks like the created and modified dates can be extracted from PDFs, even though I've downloaded them much later. I will try this for all the PDF files in the CSDocs folder to ensure it works properly

In [10]:
import os

In [45]:
CSDocs_path = 'C:/Users/Dan/OneDrive/Documents/Crosslinx/CSDocs/'

doc_info = []


for file in os.listdir(CSDocs_path):
        try:
            doc = {}
            if file.endswith('.pdf'):
                doc = {'filename': file}
                pdf = pikepdf.Pdf.open(CSDocs_path + file)
                docInfo = pdf.docinfo
                for key, value in docInfo.items():
                    print(f'{key}: {value}')
                    if str(value).startswith("D:"):
                        # pdf datetime format, convert to python datetime
                        value = transform_date(str(pdf.docinfo["/CreationDate"]))
                    if key == '/CreationDate':
                        key = 'Creation Date'
                    if key == '/Creator':
                        key = 'Creator'
                    if key == '/ModDate':
                        key = 'Modification Date'
                    if key == '/Producer':
                        key = 'Producer'
                    if key == '/Title':
                        key = 'Title'
                    if key == '/Subject':
                        key = 'Subject'
                    if key == '/Author':
                        key = 'Author'
                    doc[key] = value
                print('\n')
            if doc['filename']:
                doc_info.append(doc)
        except Exception as e:
            print(e)

/Author: Yousef Kimiagar
/CreationDate: D:20200210124719-05'00'
/Creator: Microsoft® Word for Office 365
/ModDate: D:20200210124719-05'00'
/Producer: Microsoft® Word for Office 365
/Title: SCHEDULE 15-2


/Author: David Robson/Duncan Robb
/CreationDate: D:20170927074532-04'00'
/Creator: Microsoft® Word 2013
/ModDate: D:20170927083115-04'00'
/Producer: Microsoft® Word 2013
/Subject: Eglinton Crosstown LRT Project
/Title: ECLRT Cyber Security Management Plan


'filename'
/Author: 
/CreationDate: D:20220506210237-04'00'
/Creator: Aspose Ltd.
/ModDate: D:20220506210237-04'00'
/Producer: Aspose.Pdf for .NET 11.4.0
/Subject: 
/Title: 


/CreationDate: D:20210831101215-04'00'
/Creator: Microsoft® Word for Microsoft 365
/ModDate: D:20220224143709-05'00'
/Producer: Microsoft® Word for Microsoft 365


/CreationDate: D:20210714131927-04'00'
/Creator: Microsoft® Word for Microsoft 365
/ModDate: D:20220224145956-05'00'
/Producer: Microsoft® Word for Microsoft 365


/CreationDate: D:20210715181444-0

In [46]:
for i in doc_info:
    for key, value in i.items():
        print(f'{key}: {value}')
    print('\n')



filename: 15-2 Conformed.pdf
Author: Yousef Kimiagar
Creation Date: 2020-02-10 12:47:19-05:00
Creator: Microsoft® Word for Office 365
Modification Date: 2020-02-10 12:47:19-05:00
Producer: Microsoft® Word for Office 365
Title: SCHEDULE 15-2


filename: 5000-00-CTSC-48PA-0002 - Cyber Security Management Plan-5000-00-CTSC-48PA-0002_PA.pdf
Author: David Robson/Duncan Robb
Creation Date: 2017-09-27 07:45:32-04:00
Creator: Microsoft® Word 2013
Modification Date: 2017-09-27 07:45:32-04:00
Producer: Microsoft® Word 2013
Subject: Eglinton Crosstown LRT Project
Title: ECLRT Cyber Security Management Plan


filename: 5000-00-WGD-48PA- 2001 - Internal Handover Plan – Cyber Systems-2700-67ASA1-02-CTSC-0004_01.pdf
Author: 
Creation Date: 2022-05-06 21:02:37-04:00
Creator: Aspose Ltd.
Modification Date: 2022-05-06 21:02:37-04:00
Producer: Aspose.Pdf for .NET 11.4.0
Subject: 
Title: 


filename: 5000-00-WGD-48PA-1001 - Baseline Controls Document - Management Controls Policy.pdf
Creation Date: 2021-08

In [None]:
for i in doc_info:
    print(i)