In [1]:
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
import pandas as pd

load_dotenv()
from lxml import etree
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:

def extract(path, pdf_content_list,bucket_links):
    xml_file_path = os.path.abspath(path)

    if os.path.exists(xml_file_path):
        tree = etree.parse(xml_file_path)
        root = tree.getroot()
        namespaces = {
            'tei': 'http://www.tei-c.org/ns/1.0',
            'xlink': 'http://www.w3.org/1999/xlink'
        }

        def get_first_item(xpath_result):
            if xpath_result:
                xpath_result[0] = xpath_result[0].replace('\n', '').replace('\t','')
                return f"{xpath_result[0]}"  
            else:
                return "No Data"

        # Extract metadata using XPath expressions
        metadata_dict = {
            "Title": get_first_item(root.xpath('//tei:titleStmt/tei:title[@level="a" and @type="main"]/text()', namespaces=namespaces)),
            "Publisher": get_first_item(root.xpath('//tei:publicationStmt/tei:publisher/text()', namespaces=namespaces)),
            "AvailabilityStatus": get_first_item(root.xpath('//tei:availability/@status', namespaces=namespaces)),
            "Analytic": get_first_item(root.xpath('//tei:analytic/text()', namespaces=namespaces)),
            "ImprintedDate": get_first_item(root.xpath('//tei:imprint/tei:date/text()', namespaces=namespaces)),
            "AppInfoDescription": get_first_item(root.xpath('//tei:application/tei:desc/text()', namespaces=namespaces)),
            "Abstract": get_first_item(root.xpath('//tei:profileDesc/tei:abstract/tei:p/text()', namespaces=namespaces)),
        }
        metadata_dict['s3_bucket_link'] = bucket_links[path]

        pdf_content_list.append(metadata_dict)
    else:
        print(f"The file {xml_file_path} does not exist.")



In [3]:
paths = ['./resources/metadata/Level1_combined.grobid.tei.xml',
         './resources/metadata/Level2_combined.grobid.tei.xml',
         './resources/metadata/Level3_combined.grobid.tei.xml',
        ]
bucket_links = {
    "./resources/metadata/Level1_combined.grobid.tei.xml" : "s3://cfainstitute-learning-outcomes-raw/grobid/Grobid_RR_2024_Level1_combined.txt",
    "./resources/metadata/Level2_combined.grobid.tei.xml": "s3://cfainstitute-learning-outcomes-raw/grobid/Grobid_RR_2024_Level2_combined.txt",
    "./resources/metadata/Level3_combined.grobid.tei.xml": "s3://cfainstitute-learning-outcomes-raw/grobid/Grobid_RR_2024_Level3_combined.txt"
}
pdf_content_list=[]
for path in paths:
        extract(path,pdf_content_list,bucket_links)

In [4]:

md = pd.DataFrame(pdf_content_list)

In [5]:
snowflake_account = os.getenv('snowflake_account')
snowflake_user = os.getenv('snowflake_user')
snowflake_password = os.getenv('snowflake_password')
snowflake_warehouse = os.getenv('snowflake_warehouse')
snowflake_schema = os.getenv('snowflake_schema')
snowflake_database = 'your_database'

engine = create_engine(
    'snowflake://{user}:{password}@{account_identifier}/'.format(
        user=snowflake_user,
        password=snowflake_password,
        account_identifier=snowflake_account,
    )
)
connection = engine.connect()


DBAPIError: (snowflake.connector.errors.ForbiddenError) 250001 (08001): Failed to connect to DB. Verify the account name is correct: None.snowflakecomputing.com:443. HTTP 403: Forbidden
(Background on this error at: https://sqlalche.me/e/14/dbapi)

In [None]:
target_table = 'metadata_grobid'

target_database = 'MetadataDB'


create_database_query = f"CREATE DATABASE IF NOT EXISTS {target_database}"

connection.execute(create_database_query)
use_database_query = f"USE DATABASE {target_database}"
connection.execute(use_database_query)
connection.execute("USE WAREHOUSE TEST")


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x135e78050>

In [None]:
headers = md.columns.tolist()
columns_definition = ', '.join([f"{header} STRING" for header in headers])
connection.execute(f"""
    CREATE TABLE IF NOT EXISTS {target_table} (
        {columns_definition}
    )
    """)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x135e9e3d0>

In [None]:
md.to_csv('./resources/metadata/metadata_grobid.csv', index=False)

In [None]:
connection.execute(f"TRUNCATE TABLE {target_table}")

connection.execute(f"PUT file://resources/metadata/metadata_grobid.csv @%{target_table}")


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x136c5f650>

In [None]:
connection.execute(f"COPY INTO {target_table} ON_ERROR=CONTINUE FILE_FORMAT = (FIELD_OPTIONALLY_ENCLOSED_BY = '\"' SKIP_HEADER=1 PARSE_HEADER = FALSE)")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x136b84c10>