# Computing metadata for S3 files

## Importing required packages

In [25]:
import csv
from collections import defaultdict
import boto3
import os
from dotenv import load_dotenv

## Setting up environment

In [26]:
# Load settings from the project's .env file; override=True lets values from
# the file replace variables that are already set in the process environment.
load_dotenv('../config/.env',override=True)

True

In [27]:
def loadenv():
    """Read the S3 settings from the process environment.

    Returns:
        A 6-tuple ``(s3_bucket_name, s3_pypdf, s3_grobid, access_key,
        secret_key, region)``. Any variable that is not set is returned
        as ``None``.
    """
    # Order matters: callers unpack the result positionally.
    variable_names = (
        "s3_bucket_name",
        "s3_pypdf",
        "s3_grobid",
        "access_key",
        "secret_key",
        "region",
    )
    return tuple(os.getenv(name) for name in variable_names)

In [28]:
# Pull the bucket name, the two folder prefixes and the AWS credentials
# from the environment in one go.
(
    bucket_name,
    pypdf_folder_key,
    grobid_folder_key,
    access_key,
    secret_key,
    region,
) = loadenv()

# Destination of the generated metadata report.
output_csv_file = '../sample_output/metadata_output.csv'

# S3 client authenticated with the credentials loaded above.
s3 = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=region,
)

## Generating metadata

In [29]:
def list_all_objects(client, bucket, prefix):
    """Return every object entry stored under *prefix* in *bucket*.

    A single ``list_objects_v2`` call returns at most 1,000 keys, so the
    client's paginator is used to walk all result pages.

    Args:
        client: S3 client exposing ``get_paginator``.
        bucket: Name of the bucket to list.
        prefix: Key prefix (folder) to restrict the listing to.

    Returns:
        A list with the ``Contents`` entries of all pages, in listing order.
        Empty when nothing matches the prefix.
    """
    found = []
    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # Pages without matches carry no 'Contents' key at all.
        found.extend(page.get('Contents', []))
    return found


# List objects for pypdf folder.
# Best effort: a listing failure is reported and treated as "no objects" so
# the other folder can still be processed.
try:
    pypdf_objects = list_all_objects(s3, bucket_name, pypdf_folder_key)
except Exception as e:
    print(f"Error listing pypdf objects: {e}")
    pypdf_objects = []

# List objects for grobid folder
try:
    grobid_objects = list_all_objects(s3, bucket_name, grobid_folder_key)
except Exception as e:
    print(f"Error listing grobid objects: {e}")
    grobid_objects = []

# Combine objects from both folders
objects = pypdf_objects + grobid_objects

# Process objects and store metadata.
# Both mappings are keyed by the part of the object key that follows the
# first underscore, so the Grobid and PyPDF outputs of the same document
# end up under the same row name.
pypdf_metadata = defaultdict(dict)
grobid_metadata = defaultdict(dict)

for obj in objects:
    file_key = obj['Key']

    # Pick the destination mapping; 'Grobid' takes precedence when a key
    # happens to contain both markers. Keys with neither marker are ignored.
    if 'Grobid' in file_key:
        target_metadata = grobid_metadata
    elif 'PyPDF' in file_key:
        target_metadata = pypdf_metadata
    else:
        continue

    # Everything after the first underscore is the row name. A key without
    # an underscore cannot be paired with its counterpart, so it is skipped
    # instead of aborting the whole run with an IndexError.
    _, separator, row_name = file_key.partition('_')
    if not separator:
        print(f"Skipping object without '_' in its key: {file_key}")
        continue

    target_metadata[row_name].update({
        'File Key': row_name,
        'Last Modified': obj['LastModified'],
        'ETag': obj['ETag'],
        'Size': obj['Size'],
        'S3 Link': f"https://{bucket_name}.s3.amazonaws.com/{file_key}",
        # Extension of the object key, lower-cased.
        'File Type': file_key.split('.')[-1].lower(),
    })

# Combine PyPDF and Grobid metadata into rows.
# Each tool's values go into their own suffixed columns. Merging both info
# dicts into one dict would let the PyPDF values overwrite the Grobid ones,
# because both use the same key names.
metadata_fields = ['Last Modified', 'ETag', 'Size', 'S3 Link', 'File Type']
fieldnames = (
    ['File Key']
    + [f'{field} (Grobid)' for field in metadata_fields]
    + [f'{field} (PyPDF)' for field in metadata_fields]
)

# Grobid rows first (original order), followed by files that only have a
# PyPDF output so they are not silently left out of the report.
row_names = list(grobid_metadata)
row_names += [name for name in pypdf_metadata if name not in grobid_metadata]

rows = []
for row_name in row_names:
    grobid_info = grobid_metadata.get(row_name, {})
    pypdf_info = pypdf_metadata.get(row_name, {})
    row = {'File Key': row_name}
    for field in metadata_fields:
        # A missing counterpart leaves its columns empty.
        row[f'{field} (Grobid)'] = grobid_info.get(field, '')
        row[f'{field} (PyPDF)'] = pypdf_info.get(field, '')
    rows.append(row)

# Make sure the destination folder exists before opening the file.
output_dir = os.path.dirname(output_csv_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save metadata to CSV file
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Write header row
    csv_writer.writeheader()

    # Write metadata for each row
    csv_writer.writerows(rows)

print(f"Metadata saved to {output_csv_file}")


Metadata saved to ../sample_output/metadata_output.csv
