## Upload files to S3 Bucket

### Importing required packages

In [8]:
import os
import boto3
from dotenv import load_dotenv

In [9]:
load_dotenv('../config/.env',override=True)

True

In [10]:
def loadenv():
    s3_bucket_name = os.getenv("s3_bucket_name")
    s3_pypdf = os.getenv("s3_pypdf")
    s3_grobid = os.getenv("s3_grobid")
    access_key = os.getenv("access_key")
    secret_key = os.getenv("secret_key")
    region = os.getenv("region")
    return s3_bucket_name, s3_pypdf, s3_grobid, access_key, secret_key, region

In [11]:
s3_bucket_name, s3_pypdf, s3_grobid, access_key, secret_key, region = loadenv()

### Function to upload .txt files to a particular folder inside s3 bucket

In [12]:
def upload_text_files_to_s3_folder(local_path, bucket_name, s3_folder):
    # Create an S3 client
    s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name = region)

    # Iterate through all files in the local directory
    for filename in os.listdir(local_path):
        if filename.endswith(".txt"):
            local_file_path = os.path.join(local_path, filename)
            s3_object_key = f"{s3_folder}/{filename}"

            # Check if the file already exists in S3
            try:
                s3.head_object(Bucket=bucket_name, Key=s3_object_key)
                print(f"File {filename} already exists in S3. Overwriting...")
            except Exception as e:
                # If the file doesn't exist, upload it
                try:
                    s3.upload_file(local_file_path, bucket_name, s3_object_key)
                    print(f"File {filename} uploaded successfully to S3: s3://{bucket_name}/{s3_object_key}")
                except Exception as upload_error:
                    print(f"Error uploading file {filename} to S3: {upload_error}")
        elif filename == "metadata_output.csv":
            local_file_path = os.path.join(local_path, filename)
            s3_object_key = f"{s3_folder}/{filename}"

            # Check if the file already exists in S3
            try:
                s3.head_object(Bucket=bucket_name, Key=s3_object_key)
                print(f"File {filename} already exists in S3. Overwriting...")
            except Exception as e:
                # If the file doesn't exist, upload it
                try:
                    s3.upload_file(local_file_path, bucket_name, s3_object_key)
                    print(f"File {filename} uploaded successfully to S3: s3://{bucket_name}/{s3_object_key}")
                except Exception as upload_error:
                    print(f"Error uploading file {filename} to S3: {upload_error}")

In [13]:
def upload_text_files_to_s3_root(local_path, s3_bucket_name):
    # Create an S3 client
    s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name = region)

    # List all files in the local path
    local_files = os.listdir(local_path)

    for file_name in local_files:
        if file_name == '224_links.txt':  # Upload only text files, adjust the condition based on your file types
            local_file_path = os.path.join(local_path, file_name)

            # Specify the S3 key (file path within the bucket)
            s3_key = file_name  # This will upload directly to the root of the S3 bucket

            # Upload the file to S3
            try:
                s3.upload_file(local_file_path, s3_bucket_name, s3_key)
                print(f"Successfully uploaded {file_name} to S3 bucket {s3_bucket_name}")
            except Exception as e:
                print(f"Error uploading {file_name} to S3: {e}")
        elif file_name == 'metadata_output.csv':  # Upload only text files, adjust the condition based on your file types
            local_file_path = os.path.join(local_path, file_name)

            # Specify the S3 key (file path within the bucket)
            s3_key = file_name  # This will upload directly to the root of the S3 bucket

            # Upload the file to S3
            try:
                s3.upload_file(local_file_path, s3_bucket_name, s3_key)
                print(f"Successfully uploaded {file_name} to S3 bucket {s3_bucket_name}")
            except Exception as e:
                print(f"Error uploading {file_name} to S3: {e}")

### Uploading .txt files generated using PyPDF

In [14]:
local_path = '../sample_output/PyPDF'

# Upload only new text files or overwrite existing ones in the specified S3 folder
upload_text_files_to_s3_folder(local_path, s3_bucket_name, s3_pypdf)

File PyPDF_RR_2024_l1_combined.txt already exists in S3. Overwriting...
File PyPDF_RR_2024_l2_combined.txt already exists in S3. Overwriting...
File PyPDF_RR_2024_l3_combined.txt already exists in S3. Overwriting...


### Uploading .txt files generated using Grobid

In [15]:
local_path = '../sample_output/Grobid'

# Upload only new text files or overwrite existing ones in the specified S3 folder
upload_text_files_to_s3_folder(local_path, s3_bucket_name, s3_grobid)

File Grobid_RR_2024_l1_combined.txt already exists in S3. Overwriting...
File Grobid_RR_2024_l2_combined.txt already exists in S3. Overwriting...
File Grobid_RR_2024_l3_combined.txt already exists in S3. Overwriting...


### Uploading metadata.csv and 224_Links.txt

In [16]:
local_path = '../sample_output/'
upload_text_files_to_s3_root(local_path, s3_bucket_name)

Successfully uploaded 224_links.txt to S3 bucket cfa-pdfs
Successfully uploaded metadata_output.csv to S3 bucket cfa-pdfs
