# Download files from AWS Registry of Open Data

This notebook lets you download files from a dataset stored in AWS S3, given its Amazon Resource Name (ARN). The bucket name used in the code below is the last part of the ARN (`arn:aws:s3:::<bucket-name>`).

See https://registry.opendata.aws/

The download relies on the boto3 package. See also:

https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-examples.html


P.S. Feel free to adjust the code.

In [1]:
import os
import boto3
from botocore import UNSIGNED
from botocore.client import Config

In [3]:
def print_all_objects_name(s3_resource, bucket_name, name_word=""):
    """Print the key of every object stored in a bucket.

    Args:
        s3_resource: S3 resource object exposing ``Bucket(name)``.
        bucket_name: Name of the bucket whose objects are listed.
        name_word: Substring a key must contain to be printed. The default
            empty string disables the filter, so every key is printed.
    """
    bucket = s3_resource.Bucket(bucket_name)

    for entry in bucket.objects.all():
        key = entry.key
        # Skip keys that do not contain the requested word (if one was given)
        if name_word != "" and name_word not in key:
            continue
        print(key)
        

In [4]:
def print_all_objects_name_in_folder(s3_resource, bucket_name, prefix, name_word=""):
    """Print the keys of the objects found under a key prefix of a bucket.

    Args:
        s3_resource: S3 resource object exposing ``Bucket(name)``.
        bucket_name: Name of the bucket to inspect.
        prefix: Key prefix (the "sub-folder") passed to the object filter.
        name_word: Substring a key must contain to be printed. The default
            empty string disables the filter.
    """
    bucket = s3_resource.Bucket(bucket_name)
    show_everything = name_word == ""

    for entry in bucket.objects.filter(Prefix=prefix):
        key = entry.key
        if show_everything or name_word in key:
            print(key)
            

In [5]:
def download_all_objects_in_folder(s3_resource, bucket_name, prefix, name_word=""):
    """Download the objects under a key prefix into the current directory.

    Each object is saved under its base name only (the part of the key after
    the last ``/``), so objects from different sub-folders that share a base
    name overwrite each other locally.

    Args:
        s3_resource: S3 resource object exposing ``Bucket(name)``.
        bucket_name: Name of the bucket to download from.
        prefix: Key prefix (the "sub-folder") passed to the object filter.
        name_word: Substring the base name must contain for the object to be
            downloaded. The default empty string disables the filter.
    """
    my_bucket = s3_resource.Bucket(bucket_name)

    for obj in my_bucket.objects.filter(Prefix=prefix):
        filename = os.path.basename(obj.key)

        # Keys ending in '/' are folder placeholders: they have no base name,
        # so there is no local file to write them to.
        if not filename:
            continue

        if name_word == "" or name_word in filename:
            my_bucket.download_file(obj.key, filename)
        

In [6]:
def download_all_objects_in_folder_with_complete_name(s3_resource, bucket_name, prefix, name_word=""):
    """Download the objects under a key prefix, keeping the folder in the local name.

    The local file name is the object's folder path with every ``/`` replaced
    by ``|``, followed by ``|`` and the base name (e.g. ``2014/a.csv`` is saved
    as ``2014|a.csv``). This keeps objects with the same base name in
    different folders from overwriting each other.

    Args:
        s3_resource: S3 resource object exposing ``Bucket(name)``.
        bucket_name: Name of the bucket to download from.
        prefix: Key prefix (the "sub-folder") passed to the object filter.
        name_word: Substring the base name must contain for the object to be
            downloaded. The default empty string disables the filter.
    """
    my_bucket = s3_resource.Bucket(bucket_name)

    for obj in my_bucket.objects.filter(Prefix=prefix):
        path, filename = os.path.split(obj.key)

        # Keys ending in '/' are folder placeholders; downloading them would
        # only create junk local files such as "2014|".
        if not filename:
            continue

        if name_word == "" or name_word in filename:
            complete_filename = path.replace('/', '|') + '|' + filename
            my_bucket.download_file(obj.key, complete_filename)
        

In [8]:
def download_all_objects_in_folder_exact_match(s3_resource, bucket_name, prefix, name_word=""):
    """Download the objects under a key prefix whose base name equals ``name_word``.

    Each object is saved in the current directory under its base name (the
    part of the key after the last ``/``).

    Args:
        s3_resource: S3 resource object exposing ``Bucket(name)``.
        bucket_name: Name of the bucket to download from.
        prefix: Key prefix (the "sub-folder") passed to the object filter.
        name_word: Exact base name to download. The default empty string
            disables the filter, so every object under the prefix is downloaded.
    """
    my_bucket = s3_resource.Bucket(bucket_name)

    for obj in my_bucket.objects.filter(Prefix=prefix):
        filename = os.path.basename(obj.key)

        # Keys ending in '/' are folder placeholders: they have no base name,
        # so there is no local file to write them to.
        if not filename:
            continue

        if name_word == "" or name_word == filename:
            my_bucket.download_file(obj.key, filename)

In [9]:
# Create the S3 resource with request signing disabled (UNSIGNED), so that
# public buckets can be read without AWS credentials.

s3_resource = boto3.resource(
    's3',
    config=Config(signature_version=UNSIGNED),
)


In [10]:
# Name of the S3 bucket that holds the dataset of interest

BUCKET_NAME = 'noaa-gsod-pds'


In [15]:
# Folder where the downloaded files will be saved.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.

%cd /home/saltedcookie/Desktop/projetoABD/data14-15
# Alternative target folder (disabled):
#%cd /home/saltedcookie/Desktop/projetoABD/dataPT

/home/saltedcookie/Desktop/projetoABD/data14-15


In [13]:
# Show the current working directory
! pwd

/home/saltedcookie/Documents/GitHub/ABD_Andre-Nuno


In [25]:
# List the current directory in long format, including hidden entries
! ls -la

total 1684
drwxrwxr-x 3 saltedcookie saltedcookie    4096 mai 10 14:36 .
drwxr-xr-x 7 saltedcookie saltedcookie    4096 mai  6 14:36 ..
-rw-rw-r-- 1 saltedcookie saltedcookie 1710754 mai 10 14:36 Download-AWS-Data.ipynb
drwxrwxr-x 2 saltedcookie saltedcookie    4096 mai 10 12:52 .ipynb_checkpoints


In [16]:
# Earlier run, kept for reference (disabled): one prefix per year from 1920 to 2022.
# NOTE(review): PATH_NAME is not defined anywhere in the visible notebook.

#for i in range(1920, 2023):
#    download_all_objects_in_folder_with_complete_name(s3_resource, BUCKET_NAME, str(i) + "/" + PATH_NAME)


# Download every object whose key starts with one of the selected years

for year in (2014, 2015):
    download_all_objects_in_folder_with_complete_name(s3_resource, BUCKET_NAME, str(year))

In [None]:
# Concatenate all downloaded files into combined.csv (the file the header-removal
# step reads). The downloaded files carry a '|' in their name, so this glob does
# not pick up combined.csv or noaa.csv when the cell is run again.

! cat *'|'*.csv > combined.csv

In [None]:
# Remove the extra header rows left by the naive concatenation:
# keep the first line of combined.csv and drop every later line that
# starts with "STATION" (including the quotes), writing the result to noaa.csv.

! awk 'NR==1 || !/^"STATION"/' combined.csv > noaa.csv

In [None]:
# List the current directory again (long format, including hidden entries)
! ls -la