# Download files from AWS Registry of Open Data

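This notebook lets you download files from a particular dataset stored in AWS S3, given the name of its bucket (the last part of the dataset's Amazon Resource Name (ARN), e.g. `noaa-gsod-pds` in `arn:aws:s3:::noaa-gsod-pds`).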

See https://registry.opendata.aws/

The download process relies on the boto3 package. See also:

https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-examples.html


PS. Feel free to adjust the code

In [7]:
# Uncomment to install boto3 into the kernel's environment:
# %pip install boto3

In [8]:
import os
import boto3

In [9]:
from botocore import UNSIGNED
from botocore.client import Config

In [10]:
# List the key of every object stored in a bucket, given the s3 resource.
# An optional substring restricts the listing to matching keys.

def print_all_objects_name(s3_resource, bucket_name, name_word = ""):
    """Print the key of each object found in ``bucket_name``.

    Parameters
    ----------
    s3_resource : object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
    bucket_name : name of the bucket to list.
    name_word : when non-empty, only keys containing this substring are printed.

    Returns
    -------
    None. Keys are written to standard output, one per line.
    """
    every_object = s3_resource.Bucket(bucket_name).objects.all()

    for entry in every_object:
        key = entry.key
        # Guard clause: skip keys rejected by a non-empty filter word.
        if name_word != "" and name_word not in key:
            continue
        print(key)
        

In [11]:
# List the keys of the objects located under a prefix ("sub-folder") of a
# S3 bucket, given the s3 resource. An optional substring narrows the listing.

def print_all_objects_name_in_folder(s3_resource, bucket_name, prefix, name_word = ""):
    """Print the key of each object in ``bucket_name`` selected by ``prefix``.

    Parameters
    ----------
    s3_resource : object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
    bucket_name : name of the bucket to list.
    prefix : value passed as ``Prefix`` to the bucket's object filter.
    name_word : when non-empty, only keys containing this substring are printed.

    Returns
    -------
    None. Keys are written to standard output, one per line.
    """
    bucket = s3_resource.Bucket(bucket_name)

    # Lazy pipeline: keys are produced (and printed) one at a time.
    keys = (summary.key for summary in bucket.objects.filter(Prefix=prefix))
    if name_word != "":
        keys = (key for key in keys if name_word in key)

    for key in keys:
        print(key)
            

In [12]:
# Download all objects in a sub-folder of a S3 bucket, given the s3 resource.
# Each file is saved under its base name. Optionally, filters specific names

def download_all_objects_in_folder(s3_resource, bucket_name, prefix, name_word = ""):
    """Download the objects selected by ``prefix`` into the current directory.

    Every object is saved under its base name (the part of the key after the
    last '/'). The key's folder structure is not reproduced locally, so two
    objects that share a base name end up in the same local file.

    Parameters
    ----------
    s3_resource : object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
    bucket_name : name of the bucket to download from.
    prefix : value passed as ``Prefix`` to the bucket's object filter.
    name_word : when non-empty, only objects whose base name contains this
        substring are downloaded.

    Returns
    -------
    None.
    """
    # select bucket and objects
    my_bucket = s3_resource.Bucket(bucket_name)

    for obj in my_bucket.objects.filter(Prefix=prefix):
        filename = os.path.basename(obj.key)
        if not filename:
            # Key ends with '/': a "folder" placeholder with no file name.
            # Passing an empty local path to download_file would fail.
            continue
        if name_word == "" or name_word in filename:
            my_bucket.download_file(obj.key, filename)
        

In [13]:
# Download all objects in a sub-folder of a S3 bucket, given the s3 resource.
# Each file is saved under a flat name built from its complete key
# (e.g. '2018/sub/a.csv' -> '2018|sub|a.csv'). Optionally, filters specific names

def download_all_objects_in_folder_with_complete_name(s3_resource, bucket_name, prefix, name_word = "", separator = "|"):
    """Download the objects selected by ``prefix`` under flattened local names.

    The local file name is the object's key with every '/' replaced by
    ``separator``, so objects from different sub-folders that share a base
    name do not overwrite each other in the current directory.

    Parameters
    ----------
    s3_resource : object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
    bucket_name : name of the bucket to download from.
    prefix : value passed as ``Prefix`` to the bucket's object filter.
    name_word : when non-empty, only objects whose base name contains this
        substring are downloaded.
    separator : string that replaces '/' in the local file name. Defaults to
        '|'; choose another value on file systems that reject '|' in names.

    Returns
    -------
    None.
    """
    # select bucket and objects
    my_bucket = s3_resource.Bucket(bucket_name)

    for obj in my_bucket.objects.filter(Prefix=prefix):
        path, filename = os.path.split(obj.key)
        if not filename:
            # Key ends with '/': a "folder" placeholder, nothing to save.
            continue
        if name_word == "" or name_word in filename:
            if path:
                complete_filename = path.replace('/', separator) + separator + filename
            else:
                # Object at the bucket root: no folder part, so no leading separator.
                complete_filename = filename
            my_bucket.download_file(obj.key, complete_filename)
        

In [15]:
# Download all S3 objects in a specified bucket, given the s3 resource.
# The key's folder structure is reproduced under the current directory.
# Optionally, filters specific names

def download_all_objects(s3_resource, bucket_name, name_word = ""):
    """Download every object of ``bucket_name`` below the current directory.

    Each object is saved at a local path equal to its key, so a key such as
    '1929/03005099999.csv' is written to the file '1929/03005099999.csv'.
    Missing parent directories are created before the download starts.

    Parameters
    ----------
    s3_resource : object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
    bucket_name : name of the bucket to download from.
    name_word : when non-empty, only objects whose key contains this
        substring are downloaded.

    Returns
    -------
    None.
    """
    # select bucket
    my_bucket = s3_resource.Bucket(bucket_name)

    for s3_object in my_bucket.objects.all():
        key = s3_object.key
        if key.endswith('/'):
            # "Folder" placeholder object: there is no file to write.
            continue
        if name_word == "" or name_word in key:
            parent_dir = os.path.dirname(key)
            if parent_dir:
                # The local path mirrors the key, so its directories must
                # exist before the file can be written.
                os.makedirs(parent_dir, exist_ok=True)
            my_bucket.download_file(key, key)
        

In [16]:
# Create the S3 resource that is passed to the helper functions above.
# signature_version=UNSIGNED sends the requests unsigned, so no AWS credentials
# are configured here; presumably this only works for publicly readable
# buckets such as those of the Registry of Open Data.

s3_resource = boto3.resource('s3', config=Config(signature_version=UNSIGNED))


In [17]:
# Name of the S3 bucket of interest (the helper functions pass it to s3_resource.Bucket)

BUCKET_NAME = 'noaa-gsod-pds' 


In [18]:
# Substring that an object's name must contain to be listed or downloaded,
# e.g. csv, json, parquet. Leave it empty ('') to select every object.

WORD_IN_FILENAME = '' 


In [19]:
# List the objects of the bucket whose key contains WORD_IN_FILENAME.
# NOTE: this iterates over every object in the bucket, which can take a very
# long time on a large bucket; the recorded run below was stopped with a
# KeyboardInterrupt. print_all_objects_name_in_folder lists one prefix only.
#print_all_objects_name(s3_resource, BUCKET_NAME)

print_all_objects_name(s3_resource, BUCKET_NAME, WORD_IN_FILENAME)


1929/03005099999.csv
1929/03075099999.csv
1929/03091099999.csv
1929/03159099999.csv
1929/03262099999.csv
1929/03311099999.csv
1929/03379099999.csv
1929/03396099999.csv
1929/03497099999.csv
1929/03601099999.csv
1929/03777099999.csv
1929/03795099999.csv
1929/03804099999.csv
1929/03811099999.csv
1929/03856099999.csv
1929/03864099999.csv
1929/03894099999.csv
1929/03953099999.csv
1929/03973099999.csv
1929/03980099999.csv
1929/99006199999.csv
1930/03005099999.csv
1930/03026099999.csv
1930/03075099999.csv
1930/03091099999.csv
1930/03159099999.csv
1930/03262099999.csv
1930/03311099999.csv
1930/03379099999.csv
1930/03396099999.csv
1930/03497099999.csv
1930/03559099999.csv
1930/03601099999.csv
1930/03777099999.csv
1930/03795099999.csv
1930/03804099999.csv
1930/03811099999.csv
1930/03856099999.csv
1930/03864099999.csv
1930/03894099999.csv
1930/03953099999.csv
1930/03973099999.csv
1930/03980099999.csv
1930/99006199999.csv
1931/03005099999.csv
1931/03026099999.csv
1931/03075099999.csv
1931/03091099

1937/35663099999.csv
1937/35700099999.csv
1937/35791099999.csv
1937/35849099999.csv
1937/36003099999.csv
1937/36046099999.csv
1937/36177099999.csv
1937/36335099999.csv
1937/36714099999.csv
1937/36870099999.csv
1937/36974099999.csv
1937/37000099999.csv
1937/37031099999.csv
1937/37050099999.csv
1937/37099099999.csv
1937/37193099999.csv
1937/37228099999.csv
1937/37260099999.csv
1937/37395099999.csv
1937/37472099999.csv
1937/37484099999.csv
1937/37515099999.csv
1937/37545099999.csv
1937/37735099999.csv
1937/37789099999.csv
1937/37850099999.csv
1937/37907099999.csv
1937/37989099999.csv
1937/38001099999.csv
1937/38062099999.csv
1937/38198099999.csv
1937/38262099999.csv
1937/38457099999.csv
1937/38507099999.csv
1937/38683099999.csv
1937/38750099999.csv
1937/38763099999.csv
1937/38836099999.csv
1937/38862099999.csv
1937/38880099999.csv
1937/38895099999.csv
1937/38911099999.csv
1937/38987099999.csv
1937/69117499999.csv
1937/72226513821.csv
1937/72248513944.csv
1937/72253512909.csv
1937/72264093

1941/10609099999.csv
1941/10633099999.csv
1941/10635099999.csv
1941/10637099999.csv
1941/10642099999.csv
1941/10653099999.csv
1941/10655099999.csv
1941/10657099999.csv
1941/10659099999.csv
1941/10677099999.csv
1941/10685099999.csv
1941/10687499999.csv
1941/10707099999.csv
1941/10710099999.csv
1941/10711099999.csv
1941/10726099999.csv
1941/10727099999.csv
1941/10728099999.csv
1941/10732099999.csv
1941/10737099999.csv
1941/10738099999.csv
1941/10745099999.csv
1941/10746099999.csv
1941/10752099999.csv
1941/10755099999.csv
1941/10763099999.csv
1941/10765099999.csv
1941/10776099999.csv
1941/10803099999.csv
1941/10835499999.csv
1941/10839099999.csv
1941/10840099999.csv
1941/10845099999.csv
1941/10846099999.csv
1941/10852099999.csv
1941/10853099999.csv
1941/10856099999.csv
1941/10857099999.csv
1941/10858099999.csv
1941/10860099999.csv
1941/10864099999.csv
1941/10865099999.csv
1941/10866099999.csv
1941/10868099999.csv
1941/10894099999.csv
1941/10935099999.csv
1941/10947099999.csv
1941/10953099

KeyboardInterrupt: 

In [None]:
# Key prefix ("sub-folder") inside the bucket to list or download from;
# it is passed as the `prefix` argument of the *_in_folder helpers.

PATH_NAME = '2018/'


In [None]:
# List the objects of the bucket selected by the PATH_NAME prefix
print_all_objects_name_in_folder(s3_resource, BUCKET_NAME, PATH_NAME)

# Same listing, additionally restricted to names containing WORD_IN_FILENAME:
# print_all_objects_name_in_folder(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)


In [None]:
# Show the current working directory; the download helpers save files relative to it
! pwd

In [None]:
# Directory content before the download
! ls -la

In [None]:
# Download the objects selected by PATH_NAME into the current directory.
# Alternatives that save each file under its base name only:
#download_all_objects_in_folder(s3_resource, BUCKET_NAME, PATH_NAME)

#download_all_objects_in_folder(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

# Active choice: the key's folder path is flattened into the local file name
download_all_objects_in_folder_with_complete_name(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

In [None]:
# Directory content after the download, to check which files were written
! ls -la

Some notes