# Import libraries

In [3]:
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine

load_dotenv()
import PyPDF2


In [2]:
def extract_pdf_metadata(pdf_path):
    """
    Extracts metadata from a PDF file using PyPDF2 library.

    Parameters:
    - pdf_path (str): The path to the PDF file.

    Returns:
    str: Metadata information from the PDF file.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        metadata = pdf_reader.metadata
        print(metadata)
    return metadata

In [1]:
bucket_links = {
    "../resources/Level1_combined.pdf" : "s3://cfainstitute-learning-outcomes-raw/grobid/Grobid_RR_2024_Level1_combined.txt",
    "../resources/Level2_combined.pdf": "s3://cfainstitute-learning-outcomes-raw/grobid/Grobid_RR_2024_Level2_combined.txt",
    "../resources/Level3_combined.pdf": "s3://cfainstitute-learning-outcomes-raw/grobid/Grobid_RR_2024_Level3_combined.txt"
}

In [4]:
pdf_paths = ['../resources/Level1_combined.pdf', '../resources/Level2_combined.pdf', '../resources/Level3_combined.pdf']
metadata = []
for pdf_path in pdf_paths:
    extracted_text = extract_pdf_metadata(pdf_path)
    extracted_text = {key[1:]: value for key, value in extracted_text.items()}
    extracted_text['s3_bucket_link'] = bucket_links[pdf_path]
    metadata.append(extracted_text)

{'/CreationDate': "D:20230130124011-05'00'", '/Creator': 'Adobe InDesign 17.4 (Windows)', '/ModDate': "D:20230208154528-05'00'", '/Producer': 'Adobe PDF Library 16.0.7', '/Trapped': '/False'}
{'/CreationDate': "D:20230410072753-04'00'", '/Creator': 'Adobe InDesign 17.4 (Windows)', '/ModDate': "D:20230410115010-04'00'", '/Producer': 'Adobe PDF Library 16.0.7', '/Trapped': '/False'}
{'/CreationDate': "D:20230615133905-04'00'", '/Creator': 'Adobe InDesign 17.4 (Windows)', '/ModDate': "D:20230615134206-04'00'", '/Producer': 'Adobe PDF Library 16.0.7', '/Trapped': '/False'}


In [5]:
import pandas as pd

md = pd.DataFrame(metadata)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [6]:
md.head()

Unnamed: 0,CreationDate,Creator,ModDate,Producer,Trapped,s3_bucket_link
0,D:20230130124011-05'00',Adobe InDesign 17.4 (Windows),D:20230208154528-05'00',Adobe PDF Library 16.0.7,/False,s3://cfainstitute-learning-outcomes-raw/grobid...
1,D:20230410072753-04'00',Adobe InDesign 17.4 (Windows),D:20230410115010-04'00',Adobe PDF Library 16.0.7,/False,s3://cfainstitute-learning-outcomes-raw/grobid...
2,D:20230615133905-04'00',Adobe InDesign 17.4 (Windows),D:20230615134206-04'00',Adobe PDF Library 16.0.7,/False,s3://cfainstitute-learning-outcomes-raw/grobid...


# Upload to snowflake

In [7]:
# Connect to Snowflake using the extracted credentials

snowflake_account = os.getenv('snowflake_account')
snowflake_user = os.getenv('snowflake_user')
snowflake_password = os.getenv('snowflake_password')
snowflake_warehouse = os.getenv('snowflake_warehouse')
snowflake_schema = os.getenv('snowflake_schema')

engine = create_engine(
    'snowflake://{user}:{password}@{account_identifier}/'.format(
        user=snowflake_user,
        password=snowflake_password,
        account_identifier=snowflake_account,
    )
)
connection = engine.connect()


### Create table and db

In [8]:
target_table = 'metadata_pypdf'

target_database = 'MetadataDB'


create_database_query = f"CREATE DATABASE IF NOT EXISTS {target_database}"

connection.execute(create_database_query)
use_database_query = f"USE DATABASE {target_database}"
connection.execute(use_database_query)
connection.execute("USE WAREHOUSE TEST")


  connection.execute(create_database_query)


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x107d71e10>

In [9]:
headers = md.columns.tolist()
columns_definition = ', '.join([f"{header} STRING" for header in headers])
connection.execute(f"""
    CREATE TABLE IF NOT EXISTS {target_table} (
        {columns_definition}
    )
    """)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x117515190>

In [10]:
md.to_csv('../resources/metadata/metadata_pypdf.csv', index=False)

# Stage the file

In [12]:
connection.execute(f"TRUNCATE TABLE {target_table}")

connection.execute(f"PUT file://../resources/metadata/metadata_pypdf.csv @%{target_table}")


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x152f10e90>

# Load data from stage

In [13]:
connection.execute(f"COPY INTO {target_table} ON_ERROR=CONTINUE FILE_FORMAT = (FIELD_OPTIONALLY_ENCLOSED_BY = '\"' SKIP_HEADER=1 PARSE_HEADER = FALSE)")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x127642050>