In [38]:
import warnings
warnings.filterwarnings("ignore")
import boto3
from botocore.exceptions import ClientError
from sqlalchemy import MetaData, Table, Column, String, Integer
from snowflake_helper import getSnowflakeEngine
from snowflake.connector import connect
import pandas as pd

In [2]:
# function to upload files to S3
def uploadFiletoS3(file_name, bucket, object_name):
    """Upload a local file to an S3 bucket.

    Args:
        file_name: path of the local file to upload.
        bucket: name of the destination S3 bucket.
        object_name: key under which the object is stored in the bucket.

    Returns:
        True if the upload call completed, False if S3 reported a
        ClientError (the error is printed, not raised).

    Raises:
        OSError: if the local file cannot be opened; this is not caught here.
    """
    try:
        # get S3 client
        s3 = boto3.client('s3')

        # stream the file in binary mode so its content is uploaded unchanged
        with open(file_name, "rb") as f:
            s3.upload_fileobj(f, bucket, object_name)
    except ClientError as e:
        print("Error in S3 Upload")
        print(e)
        # report the failure instead of silently returning None
        return False
    return True

In [4]:
# function to upload the web-scraping csv and log files (plus the Snowflake upload log) to S3
def upload_scrape_file():
    """Upload the scraped CSV and the two log files to the S3 bucket.

    Each local file is sent through uploadFiletoS3 to the "cfa-data-t2"
    bucket under the key listed next to it. Nothing is returned.
    """
    # S3 bucket name
    bucket = "cfa-data-t2"

    # (local source path, S3 object key) pairs.
    # Forward slashes are used so the paths are valid on Windows and POSIX
    # alike and contain no backslash escape sequences.
    uploads = [
        # scraped data csv
        ("../../data/scrape-data/cfa-data.csv", "csv-files/cfa-data.csv"),
        # web scraping log
        ("../../logs/scrape-log/webscrapping.log", "log-files/webscrapping.log"),
        # snowflake data-load log
        ("../../logs/data-load-log/snowflake-upload.log", "log-files/snowflake-upload.log"),
    ]

    for source, destination in uploads:
        uploadFiletoS3(source, bucket, destination)

In [26]:
# function to upload the text files extracted from PDFs with PyPDF2 and Grobid to S3
def upload_extracted_data_files():
    """Upload the PyPDF2 and Grobid extracted text files to S3.

    PyPDF2 files are stored under the "pypdf2-files/" prefix and Grobid files
    under the "grobid-files/" prefix of the "cfa-data-t2" bucket.

    Returns:
        list of str: one URL per Grobid file, in the order the Grobid files
        are listed below. Each URL is built from the same object key the file
        was uploaded under.
    """
    # S3 bucket name
    bucket = "cfa-data-t2"

    # local source directories (forward slashes work on Windows and POSIX)
    pypdf2_dir = "../../data/extracted-pdf-data_PyPDF2/"
    grobid_dir = "../../data/extracted-pdf-data_Grobid/"

    # S3 key prefixes
    pypdf2_prefix = "pypdf2-files/"
    grobid_prefix = "grobid-files/"

    pypdf2_names = [
        "PyPDF_RR__2024_levelI_combined.txt",
        "PyPDF_RR__2024_levelII_combined.txt",
        "PyPDF_RR__2024_levelIII_combined.txt",
    ]

    # NOTE(review): the 2, 1, 3 order is kept from the original list. UpdateCSV
    # assigns the returned URLs to rows positionally, so confirm this order
    # matches the row order of grobid_metadata.csv.
    grobid_names = [
        "Grobid_RR_2024_2_combined.txt",
        "Grobid_RR_2024_1_combined.txt",
        "Grobid_RR_2024_3_combined.txt",
    ]

    # Uploading extracted data files for PyPDF2
    for name in pypdf2_names:
        uploadFiletoS3(pypdf2_dir + name, bucket, pypdf2_prefix + name)

    # Uploading extracted data files for Grobid and recording their URLs
    s3_urls = []
    for name in grobid_names:
        object_key = grobid_prefix + name
        uploadFiletoS3(grobid_dir + name, bucket, object_key)
        # the URL must use the uploaded key, not a separate hard-coded prefix
        s3_urls.append("https://{}.s3.amazonaws.com/{}".format(bucket, object_key))

    return s3_urls

In [41]:
# update S3 url in csv
def UpdateCSV(s3_urls):
    """Add an "S3 URL" column to the Grobid metadata and save a new file.

    Reads grobid_metadata.csv, assigns s3_urls to the "S3 URL" column row by
    row, and writes the result as tab-separated text (no index column) to
    grobid_metadata_new.csv in the same directory.

    Args:
        s3_urls: sequence of URLs, one per row of the metadata file, in row
            order. pandas rejects a list whose length differs from the number
            of rows.
    """
    # Same directory for input and output; forward slashes keep the path
    # valid on Windows and POSIX and avoid backslash escape sequences.
    metadata_dir = "../../data/extracted-pdf-data_Grobid/"

    df = pd.read_csv(metadata_dir + "grobid_metadata.csv")
    df["S3 URL"] = s3_urls

    csv_location = metadata_dir + "grobid_metadata_new.csv"
    df.to_csv(csv_location, index=False, sep="\t")

In [42]:
# create metadata table
def createMetadataTable(engine):
    """Drop (if present) and recreate the 'meta-data' table.

    Args:
        engine: object passed to Table.drop and Table.create as the bind
            (presumably the SQLAlchemy engine from getSnowflakeEngine; verify
            against caller).

    Returns:
        The table name ('meta-data') on success, or False if any step raised
        an Exception. The error is printed, not re-raised.
    """
    try:
        print('-------Starting Snowflake table creation-------')
        # Define metadata
        metadata = MetaData()

        # Define table structure: every column is stored as a string
        table_name = 'meta-data'
        topics_table = Table(
            table_name,
            metadata,
            Column('Filename', String),
            Column('Time', String),
            Column('MD5 Identifier', String),
            Column('Encoding Version', String),
            Column('Lang', String),
            Column('Application Identifier', String),
            Column('Application Description', String),
            Column('Application Version', String),
            Column('Application Reference URL', String),
            Column('S3 URL', String)
        )

        # create or replace table in Snowflake
        topics_table.drop(engine, checkfirst=True)  # Drop table if exists
        topics_table.create(engine)
        print('Table created')
        print('-------Ending Snowflake table creation-------')
        return table_name
    except Exception as e:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate, and show the cause of the failure.
        print("Error creating Snowflake Table")
        print(e)
        return False

In [39]:
def uploadDataToSnowflake(engine, table_name):
    """Load the tab-separated Grobid metadata file into a Snowflake table.

    Runs four statements in order on one connection from engine: create the
    file format, create the stage, PUT the local file into the stage, and
    COPY the staged file into table_name. A progress line is printed after
    each statement.

    Args:
        engine: object whose connect() returns a context-managed connection
            with an execute(sql) method.
        table_name: name of the target table. A falsy value (for example the
            False that createMetadataTable returns on failure) skips the
            upload.

    Returns:
        None. Errors are printed, not raised.
    """
    # Nothing to load into if table creation failed upstream; bail out before
    # creating the file format and stage.
    if not table_name:
        print("Error creating Uploading to Snowflake")
        print("No target table name supplied; upload skipped")
        return

    file_format_name = 'meta_file_format'
    field_delimiter = '\t'
    skip_header = 1
    skip_blank_lines = True
    trim_space = True

    file_path = "../../data/extracted-pdf-data_Grobid/grobid_metadata_new.csv"
    stage_name = "metadata_csv_stage"

    # Create or replace file format
    create_file_format_sql = f"""
    CREATE OR REPLACE FILE FORMAT {file_format_name}
    TYPE = 'CSV'
    FIELD_DELIMITER = '{field_delimiter}'
    SKIP_HEADER = {skip_header}
    SKIP_BLANK_LINES = {skip_blank_lines}
    TRIM_SPACE = {trim_space}
    """

    # create or replace Stage
    create_stage = f"""CREATE OR REPLACE STAGE {stage_name} DIRECTORY = ( ENABLE = true );"""

    # Put local file into the stage
    put_command = f"""PUT 'file://{file_path}' @{stage_name}"""

    # Copy staged file into the table
    copy_sql = f"""
        COPY INTO "{table_name}" FROM '@{stage_name}'
        FILE_FORMAT = (FORMAT_NAME = {file_format_name})
        """

    # statements in execution order, each with its progress message
    steps = [
        (create_file_format_sql, 'File Format created'),
        (create_stage, 'Stage created'),
        (put_command, 'Put file into stage'),
        (copy_sql, 'Copied file into table'),
    ]

    try:
        print('-------Starting Data Upload to Snowflake-------')

        with engine.connect() as connection:
            for sql, done_message in steps:
                connection.execute(sql)
                print(done_message)

        print('-------Ending Data Upload to Snowflake-------')
    except Exception as e:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate, and show the cause of the failure.
        print("Error creating Uploading to Snowflake")
        print(e)

In [43]:
# upload MetaData to Snowflake
def upload_metadata():
    """Create the metadata table in Snowflake and load the metadata into it.

    Builds an engine for database ASSIGNMENT_2, schema RR_SCHEMA and
    warehouse WH_2 via getSnowflakeEngine, recreates the table with
    createMetadataTable, then loads the data with uploadDataToSnowflake.
    The load is skipped when table creation reports failure.
    """
    snowflake_database = "ASSIGNMENT_2"
    snowflake_schema = "RR_SCHEMA"
    snowflake_warehouse = "WH_2"
    engine = getSnowflakeEngine(snowflake_database, snowflake_schema, snowflake_warehouse)

    # create table
    table_name = createMetadataTable(engine)

    # createMetadataTable returns False on failure; do not try to load data
    # into a table that was never created.
    if not table_name:
        print("Skipping Snowflake data upload because table creation failed")
        return

    # uploading data to snowflake
    uploadDataToSnowflake(engine, table_name)

In [44]:
if __name__ == "__main__":
    # Step 1: push the scraped csv and the log files to S3
    upload_scrape_file()

    # Step 2: push the extracted PDF text files to S3, then write the
    # returned URLs into the metadata csv
    s3_urls = upload_extracted_data_files()
    UpdateCSV(s3_urls)

    # Step 3: create the Snowflake table and load the metadata into it
    upload_metadata()

-------Starting Snowflake table creation-------
Table created
-------Ending Snowflake table creation-------
-------Starting Data Upload to Snowflake-------
File Format created
Stage created
Put file into stage
Copied file into table
-------Ending Data Upload to Snowflake-------
