In [1]:
""" upload one directory from the current working directory to aws """
from pathlib import Path
import os
import glob
import boto3


def upload_dir(localDir, awsInitDir, bucketName, tag, prefix='/'):
    """
    from current working directory, upload a 'localDir' with all its subcontents (files and subdirectories...)
    to a aws bucket
    Parameters
    ----------
    localDir :   localDirectory to be uploaded, with respect to current working directory
    awsInitDir : prefix 'directory' in aws
    bucketName : bucket in aws
    tag :        tag to select files, like *png
                 NOTE: if you use tag it must be given like --tag '*txt', in some quotation marks... for argparse
    prefix :     to remove initial '/' from file names

    Returns
    -------
    None
    """
    s3 = boto3.resource('s3')
    cwd = str(Path.cwd())
    p = Path(os.path.join(Path.cwd(), localDir))
    mydirs = list(p.glob('**'))
    for mydir in mydirs:
        fileNames = glob.glob(os.path.join(mydir, tag))
        fileNames = [f for f in fileNames if not Path(f).is_dir()]
        rows = len(fileNames)
        for i, fileName in enumerate(fileNames):
            fileName = str(fileName).replace(cwd, '')
            if fileName.startswith(prefix):  # only modify the text if it starts with the prefix
                fileName = fileName.replace(prefix, "", 1) # remove one instance of prefix
            print(f"fileName {fileName}")

            awsPath = os.path.join(awsInitDir, str(fileName))
            s3.meta.client.upload_file(fileName, bucketName, awsPath)

In [2]:
upload_dir('imagenet-mini', '', 'zhuangwei-bucket', '*.JPEG')

In [3]:
bucket_name = 'zhuangwei-bucket'
client = boto3.client('s3')
s3 = boto3.resource('s3')
bucket_obj = s3.Bucket(bucket_name)

response = client.list_objects(Bucket=bucket_name, Prefix='imagenet-mini/train/n01440764')['Contents']

In [4]:
response

[{'Key': 'imagenet-mini/train/n01440764/n01440764_10043.JPEG',
  'LastModified': datetime.datetime(2022, 6, 27, 21, 32, 45, tzinfo=tzutc()),
  'ETag': '"1e2d14641e2091e222e80fe4a309a3f5"',
  'Size': 68487,
  'StorageClass': 'STANDARD',
  'Owner': {'DisplayName': 'usrdaws4alnair',
   'ID': 'eecea9d3a80c45e553445e77d78c203925fe9b169a35da7089fd4e9332fb9b74'}},
 {'Key': 'imagenet-mini/train/n01440764/n01440764_10470.JPEG',
  'LastModified': datetime.datetime(2022, 6, 27, 21, 32, 49, tzinfo=tzutc()),
  'ETag': '"1de77225c793ce27769db6338135d3c4"',
  'Size': 127866,
  'StorageClass': 'STANDARD',
  'Owner': {'DisplayName': 'usrdaws4alnair',
   'ID': 'eecea9d3a80c45e553445e77d78c203925fe9b169a35da7089fd4e9332fb9b74'}},
 {'Key': 'imagenet-mini/train/n01440764/n01440764_10744.JPEG',
  'LastModified': datetime.datetime(2022, 6, 27, 21, 32, 50, tzinfo=tzutc()),
  'ETag': '"ae78f8d3de8dd50e0653d099ff218c1b"',
  'Size': 106594,
  'StorageClass': 'STANDARD',
  'Owner': {'DisplayName': 'usrdaws4alnair

In [58]:
data = []
for k in list(map(lambda x: x['Key'], response)):
    data.append(client.get_object(Bucket=bucket_name, Key=k)['Body'].read())

In [66]:
bucket_obj.download_file(Key='imagenet-mini/train/n01440764', Filename='/tmp/n01440764')

ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

In [44]:
paginator = client.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=bucket_name)

dataset_info = {}
for bucket in page_iterator:
    for obj in bucket['Contents']:
        if obj['Key'] not in keys: continue
        metadata = client.head_object(Bucket=bucket_name, Key=obj['Key'])
        dataset_info.update({obj['Key']: metadata})


In [46]:
len(dataset_info)

1000

In [33]:
import sys

sys.getsizeof(data)

336