In [2]:
import os
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
import boto3
import csv
import xml.etree.ElementTree as ET

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [1]:


# List of XML file paths
xml_file_paths = ['resources/grobid/xml/Level1_combined.grobid.tei.xml', 'resources/grobid/xml/Level2_combined.grobid.tei.xml','resources/grobid/xml/Level3_combined.grobid.tei.xml']

# Create a CSV file
csv_filename = 'resources/grobid/outcomes.csv'

with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    
    # Write header
    csv_writer.writerow(['Topic', 'Learning Outcomes Section'])
    
    # Iterate through each XML file
    for xml_file_path in xml_file_paths:
        print(xml_file_path)
        # Read the XML data from the file
        with open(xml_file_path, 'r') as file:
            xml_data = file.read()
        # Parse the XML data
        root = ET.fromstring(xml_data)

        for div_element in root.findall('.//{http://www.tei-c.org/ns/1.0}div'):
                head_element = div_element.find('.//{http://www.tei-c.org/ns/1.0}head')

                p_elements = div_element.findall('.//{http://www.tei-c.org/ns/1.0}p')

                combined_p_text = ' '.join(p_element.text for p_element in p_elements if p_element.text)
                if combined_p_text != '':
                    csv_writer.writerow([head_element.text if head_element is not None else '', combined_p_text])

print(f"Combined CSV file '{csv_filename}' created successfully.")



resources/grobid/xml/Level1_combined.grobid.tei.xml
resources/grobid/xml/Level2_combined.grobid.tei.xml
resources/grobid/xml/Level3_combined.grobid.tei.xml
Combined CSV file 'resources/grobid/outcomes.csv' created successfully.


In [None]:
def remove_extra_whitespaces(value):
    if isinstance(value, str):
        return ' '.join(value.split())
    else:
        return value

def preprocess_text(value):
    if isinstance(value, str):
        return f'"{value}"'
    else:
        return value

In [None]:

aws_access_key_id = os.getenv('aws_access_key_id')
aws_secret_access_key = os.getenv('aws_secret_access_key')
bucket_name = 'cfainstitute-learning-outcomes-raw'

df = pd.read_csv('resources/grobid/outcomes.csv', dtype=str)
df = df.map(remove_extra_whitespaces)
df = df.map(preprocess_text)

df.to_csv('resources/grobid/outcomes_processed.csv', index=False)

s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

csv_data = df.to_csv(index=False)

s3_key = 'outcomes_processed.csv'

s3.put_object(Body=csv_data, Bucket=bucket_name, Key=s3_key)

print(f"CSV file has been uploaded to S3 at s3://{bucket_name}/{s3_key}")


# Upload to snowflake from local

In [4]:
from sqlalchemy import create_engine

snowflake_account = os.getenv('snowflake_account')
snowflake_user = os.getenv('snowflake_user')
snowflake_password = os.getenv('snowflake_password')
snowflake_warehouse = os.getenv('snowflake_warehouse')
snowflake_schema = os.getenv('snowflake_schema')
snowflake_database = 'your_database'

engine = create_engine(
    'snowflake://{user}:{password}@{account_identifier}/'.format(
        user=snowflake_user,
        password=snowflake_password,
        account_identifier=snowflake_account,
    )
)
connection = engine.connect()


## Create table and db

In [5]:
target_database = 'CFAInstitute'
target_table = 'Learning_Outcomes'

create_database_query = f"CREATE DATABASE IF NOT EXISTS {target_database}"
connection.execute(create_database_query)

use_database_query = f"USE DATABASE {target_database}"
connection.execute(use_database_query)
connection.execute("USE WAREHOUSE TEST")

create_table_query = f"""
CREATE TABLE IF NOT EXISTS {target_table} (
    "Learning Outcomes Section" VARCHAR,
    Topic VARCHAR
)
"""

connection.execute(create_table_query)


  connection.execute(create_database_query)


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x11f9ee9d0>

In [6]:
connection.execute(f"TRUNCATE TABLE {target_table}")

connection.execute(f"PUT file://resources/grobid/outcomes_processed.csv @%{target_table}")

connection.execute(f"COPY INTO {target_table} ON_ERROR=CONTINUE FILE_FORMAT = (FIELD_OPTIONALLY_ENCLOSED_BY = '\"' SKIP_HEADER=1 PARSE_HEADER = FALSE)")


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x11f80cb10>

# upload to snowflake from s3

In [12]:
connection.execute("""CREATE OR REPLACE FILE FORMAT mycsvformat
   TYPE = 'CSV'
   FIELD_DELIMITER = '|'
   SKIP_HEADER = 1;""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x120c4f3d0>

 Goto iam of aws and create a role
 use above role arn to create storage integration
 Create STORAGE INTEGRATION that can connect to aws account

In [32]:
# cursor.execute("""CREATE OR REPLACE STORAGE INTEGRATION s3_int2
#   TYPE = EXTERNAL_STAGE
#   STORAGE_PROVIDER = 'S3'
#   STORAGE_AWS_ROLE_ARN = 'arn:aws:iam::640055273174:role/s3-read'
#   ENABLED = TRUE
#   STORAGE_ALLOWED_LOCATIONS = ('*')
# """)

<snowflake.connector.cursor.SnowflakeCursor at 0x12a118490>

 go to trusted relationships of iam role created in above step from aws console and change it to below template
```
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "s3.amazonaws.com",
                "AWS": "<user arn>"
            },
            "Action": "sts:AssumeRole",
            "Condition": {
                "StringEquals": {
                    "sts:ExternalId": "<external id>"
                }
            }
        }
    ]
}
```
run "DESC Integration s3_int2"  inside snowflake to get STORAGE_AWS_IAM_USER_ARN and STORAGE_AWS_EXTERNAL_ID of storage integration crreated

In [65]:
# connection.execute("""CREATE OR REPLACE STAGE external_stage
#   FILE_FORMAT = mycsvformat
#   URL = 's3://cfainstitute-learning-outcomes-raw/outcomes.csv'
#   STORAGE_INTEGRATION = s3_int2;
# """)

<snowflake.connector.cursor.SnowflakeCursor at 0x10940f950>

In [None]:
# Upload to snowflake from s3

In [13]:
connection.execute("""
copy into Learning_Outcomes
from 's3://cfainstitute-learning-outcomes-raw/outcomes_processed.csv'
storage_integration = s3_int2
FORCE = TRUE
ON_ERROR = CONTINUE
  file_format = (type = csv FIELD_OPTIONALLY_ENCLOSED_BY = '"' SKIP_HEADER=1 PARSE_HEADER = FALSE);
""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x120d23d50>