## Loading files to S3

In [1]:
# Imports
import logging
import os
from urllib.parse import quote

from botocore.exceptions import ClientError

from aws_snowflake_config import aws_s3_connection, snowflake_connection

In [2]:
s3_client, bucket_name = aws_s3_connection()

In [6]:
# Get a list of all the files with the extension in the directory_path
def get_files_in_directory(directory_path, extension):
    """Return the names of the entries in ``directory_path`` ending with ``extension``.

    Only bare entry names are returned (they are not joined with
    ``directory_path``), in whatever order ``os.listdir`` yields them.
    """
    return [entry for entry in os.listdir(directory_path) if entry.endswith(extension)]

In [7]:
# Upload every file with the given extension from a local directory to the S3 bucket
def upload_file_to_s3(file_path, type, extension):
    """Upload every file in a local directory that has the given extension to S3.

    Uses the notebook-level ``s3_client`` and ``bucket_name``. Each file is
    stored under the key ``type + <file name>``. Progress is printed; failures
    are reported (logged or printed) and never raised to the caller.

    Args:
        file_path: Local directory to scan (a directory, despite the name).
        type: Key prefix inside the bucket, e.g. ``'PyPDF/'``. It is
            concatenated with the file name as-is, so it should end with
            ``/``. The name shadows the builtin ``type`` but is kept so
            existing callers continue to work.
        extension: File-name suffix used to select files, e.g. ``'.txt'``.

    Returns:
        None.
    """
    try:
        # Fetching all files in the directory with the given extension
        list_of_files = get_files_in_directory(file_path, extension)
        print("\n", type.rstrip("/"), "files:")
        print(list_of_files)

        if not list_of_files:
            print(f'No files found in the {file_path} directory.')
            return

        for file in list_of_files:
            file_full_path = os.path.join(file_path, file)    # file path in local directory
            key_value = type + file   # file path in S3 bucket
            try:
                # Upload the file to S3 using boto3
                s3_client.upload_file(file_full_path, bucket_name, key_value)
            except ClientError as e:
                logging.error(e)
            except Exception as e:
                # boto3 reports failed transfers as S3UploadFailedError, which
                # is not a ClientError. Handle it here so one bad file does not
                # abort the remaining uploads.
                logging.error(
                    f'Failed to upload {file_full_path} to '
                    f's3://{bucket_name}/{key_value}: {e}'
                )
            else:
                print(f'File uploaded successfully: {file_full_path} -> s3://{bucket_name}/{key_value}')
    except Exception as e:
        # Reached for failures outside the per-file upload, e.g. the directory
        # cannot be listed.
        print(f'Error uploading file {file_path}: {e}')

# (local directory, S3 key prefix, file extension) for each upload batch
for local_dir, key_prefix, file_ext in (
    ('../web-scraping-and-dataset/', 'CSV_Data/', '.csv'),
    ('../pdf-extractions/pypdf', 'PyPDF/', '.txt'),
    ('../pdf-extractions/grobid', 'Grobid/', '.txt'),
):
    upload_file_to_s3(local_dir, key_prefix, file_ext)



 CSV_Data files:
['scraped_data.csv']
File uploaded successfully: ../web-scraping-and-dataset/scraped_data.csv -> s3://bigdata-group3-assignment2/CSV_Data/scraped_data.csv

 PyPDF files:
['PyPDF_RR_2024_l1_combined.txt', 'PyPDF_RR_2024_l2_combined.txt', 'PyPDF_RR_2024_l3_combined.txt']
File uploaded successfully: ../pdf-extractions/pypdf\PyPDF_RR_2024_l1_combined.txt -> s3://bigdata-group3-assignment2/PyPDF/PyPDF_RR_2024_l1_combined.txt
File uploaded successfully: ../pdf-extractions/pypdf\PyPDF_RR_2024_l2_combined.txt -> s3://bigdata-group3-assignment2/PyPDF/PyPDF_RR_2024_l2_combined.txt
File uploaded successfully: ../pdf-extractions/pypdf\PyPDF_RR_2024_l3_combined.txt -> s3://bigdata-group3-assignment2/PyPDF/PyPDF_RR_2024_l3_combined.txt

 Grobid files:
['Grobid_RR_2024_l1_combined.txt', 'Grobid_RR_2024_l2_combined.txt', 'Grobid_RR_2024_l3_combined.txt']
File uploaded successfully: ../pdf-extractions/grobid\Grobid_RR_2024_l1_combined.txt -> s3://bigdata-group3-assignment2/Grobid/Grob

## Loading metadata of text files to Snowflake

In [8]:
from sqlalchemy import create_engine, text
import datetime as dt
import mimetypes
import pandas as pd

In [9]:
# Getting metadata of the text files

def getting_metadata(path_to_txt_file, s3_prefix='Grobid/'):
    """Collect per-file metadata for the ``.txt`` files in a directory.

    Uses the notebook-level ``bucket_name`` to build each file's S3 URL.

    Args:
        path_to_txt_file: Directory containing the text files (a directory,
            despite the name).
        s3_prefix: Key prefix under which the files are stored in the bucket;
            only used to build ``s3_url``. Defaults to ``'Grobid/'``, the
            value that was previously hard-coded.

    Returns:
        pandas.DataFrame with the columns ``file_name``, ``type``,
        ``creation_date``, ``size`` and ``s3_url``: one row per file, or an
        empty frame with the same columns when no ``.txt`` file is found.
    """
    columns = ['file_name', 'type', 'creation_date', 'size', 's3_url']
    records = []
    for f in get_files_in_directory(path_to_txt_file, '.txt'):
        file_full_path = os.path.join(path_to_txt_file, f)
        file_stats = os.stat(file_full_path)

        # MIME type guessed from the file name; 'Unknown' when it cannot be guessed
        file_type = mimetypes.guess_type(file_full_path)[0] or 'Unknown'

        # NOTE: st_ctime is the creation time on Windows; on Unix it is the
        # time of the last metadata change.
        creation_date = dt.datetime.fromtimestamp(
            int(file_stats.st_ctime)
        ).strftime('%Y-%m-%d %H:%M:%S')

        f_size = file_stats.st_size     # file size in bytes
        f_url = f"https://{bucket_name}.s3.amazonaws.com/{s3_prefix}{f}"  # file url

        records.append([f, file_type, creation_date, f_size, f_url])

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)
    
# Build the metadata table for the text files in the Grobid extraction directory and print it
f_metadata = getting_metadata('../pdf-extractions/grobid')
print(f_metadata)

                        file_name        type        creation_date   size  \
0  Grobid_RR_2024_l1_combined.txt  text/plain  2024-02-15 19:23:24  43570   
1  Grobid_RR_2024_l2_combined.txt  text/plain  2024-02-15 19:23:24  44561   
2  Grobid_RR_2024_l3_combined.txt  text/plain  2024-02-15 19:23:24  28088   

                                              s3_url  
0  https://bigdata-group3-assignment2.s3.amazonaw...  
1  https://bigdata-group3-assignment2.s3.amazonaw...  
2  https://bigdata-group3-assignment2.s3.amazonaw...  


In [18]:
# Write the metadata DataFrame to a local CSV file (header row included, index column omitted)
f_metadata.to_csv('grobid_text_metadata.csv', header=True, index=False)

In [15]:
# Creating SQLAlchemy connection for snowflake
user, password, account, warehouse, database, schema = snowflake_connection()
# Percent-encode the credentials: characters such as '@', ':', '/' or '%' in a
# user name or password would otherwise be parsed as URL delimiters.
sql_engine = create_engine(
    f'snowflake://{quote(user, safe="")}:{quote(password, safe="")}@{account}/'
    f'?warehouse={warehouse}&database={database}&schema={schema}'
)

Successfully Connected to Snowflake


In [16]:
def create_internal_stage(engine, stage_name):
    """Run ``CREATE STAGE IF NOT EXISTS`` for ``stage_name``.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        stage_name: Stage identifier; interpolated into the SQL text as-is.
    """
    stage_ddl = text(f"""
    CREATE STAGE IF NOT EXISTS {stage_name};
    """)
    with engine.connect() as conn:
        conn.execute(stage_ddl)

# Create the 'grobid_metadata_stage' stage unless it already exists
create_internal_stage(sql_engine, 'grobid_metadata_stage')

In [19]:
def put_data_into_stage(engine, csv_file_path, stage_name):
    """Run a ``PUT`` statement that sends a local file to the given stage.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        csv_file_path: Local path of the file; placed after ``file://`` as-is.
        stage_name: Target stage identifier; placed after ``@`` as-is.
    """
    put_statement = text(f"""
    PUT file://{csv_file_path} @{stage_name};
    """)
    with engine.connect() as conn:
        conn.execute(put_statement)

# PUT the local metadata CSV into the 'grobid_metadata_stage' stage
put_data_into_stage(sql_engine, './grobid_text_metadata.csv', 'grobid_metadata_stage')

In [24]:
def create_table_with_csv_structure(engine, table_name):
    """Create (or replace) a table whose columns mirror the metadata CSV.

    The table gets the columns ``file_name``, ``type``, ``creation_date``,
    ``size`` and ``s3_url``. Because the statement is ``CREATE OR REPLACE``,
    an existing table with the same name is replaced.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        table_name: Table identifier; interpolated into the SQL text as-is.
    """
    table_ddl = text(f"""
    CREATE OR REPLACE TABLE {table_name} (
        file_name STRING,
        type STRING,
        creation_date DATETIME,
        size NUMBER,
        s3_url STRING
    );
    """)
    with engine.connect() as conn:
        conn.execute(table_ddl)
        
# (Re)create the 'grobid_metadata' table; CREATE OR REPLACE replaces any existing table of that name
create_table_with_csv_structure(sql_engine, 'grobid_metadata')

In [25]:
def load_data_from_stage_to_table(engine, table_name, stage_name):
    """Load the staged CSV data into a table with ``COPY INTO``.

    The first line of each staged file is skipped as a header. Rows that fail
    to load are skipped rather than aborting the load
    (``ON_ERROR = 'CONTINUE'``).

    Args:
        engine: SQLAlchemy engine used to open the connection.
        table_name: Target table identifier; interpolated into the SQL as-is.
        stage_name: Source stage identifier; placed after ``@`` as-is.
    """
    # FIELD_OPTIONALLY_ENCLOSED_BY lets the loader read fields that the CSV
    # writer wrapped in double quotes (e.g. a value containing a comma).
    # Without it such rows are mis-split and silently dropped by
    # ON_ERROR = 'CONTINUE'.
    copy_into_query = f"""
    COPY INTO {table_name} FROM @{stage_name}
    FILE_FORMAT = (TYPE = CSV SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"')
    ON_ERROR = 'CONTINUE';
    """
    # engine.begin() commits on successful exit, so the load does not depend
    # on the driver's autocommit behaviour.
    with engine.begin() as connection:
        connection.execute(text(copy_into_query))
        
# Copy the staged CSV data from 'grobid_metadata_stage' into the 'grobid_metadata' table
load_data_from_stage_to_table(sql_engine, 'grobid_metadata','grobid_metadata_stage')