## A quick tutorial to demonstrate how to access and use Airborne SMCE s3 buckets in Python

Note: this is a work in progress and others with more experience should feel free to expand this tutorial

**sources:** <br>
https://sciwiki.fredhutch.org/compdemos/aws-s3/

### 1. Import libriaries and data

In [1]:
# Import modules 
import os, time, sys
from pathlib import Path
import boto3
import io # for StringIO, BytesIO. or could just be: from io import StringIO, BytesIO
# load pandas and dask to demo how to read/write to s3 tabular datasets
import numpy as np
import pandas as pd
import dask.dataframe as dd

### 2. Setup s3 client connection

In [2]:
s3 = boto3.client("s3")
s3_resource = boto3.resource('s3')

# set SMCE username - used later when creating a scratch space on s3
user_name = os.popen('whoami').read().strip()
print("*** SMCE username: " + user_name + " ***")

*** SMCE username: sserbin ***


### 3. Show all availible buckets

List the bucket names instead of displaying all of the metadata.  If you would like to show all the metadata you can use:

```print(response)```


In [3]:
response = s3.list_buckets() # show all buckets
for bucket in response['Buckets']:
    print(bucket['Name'])

aeronet-disaster-backup
airborne-data-transfer
airborne-pcluster
airborne-scratch
airborne-smce-dev-state
airborne-smce-prod-conda-store-bucket
airborne-smce-prod-state
airborne-smce-prod-user-bucket
config-bucket-445567107118
config-bucket-smce-445567107118
cur-reports-smce-445567107118
gliht
gliht-processed
gliht-raw
isofit-data
parallelcluster-0895790225836e61-v1-do-not-delete
parallelcluster-3cbb946982171420-v1-do-not-delete
smdc-prod-ap-southeast-2-445567107118-obs-outbound-bucket
smdc-prod-eu-north-1-445567107118-obs-outbound-bucket
smdc-prod-me-south-1-445567107118-obs-outbound-bucket
smdc-prod-us-east-1-445567107118-obs-outbound-bucket
smdc-prod-us-east-2-445567107118-obs-outbound-bucket
smdc-prod-us-west-1-445567107118-obs-outbound-bucket
smdc-prod-us-west-2-445567107118-obs-outbound-bucket
uas-data-storage
whymsie
whymsie-processed
whymsie-raw


### 3. Define an s3 bucket you want to work with and list the contents

In [4]:
bucket_name = "airborne-smce-prod-user-bucket" # substitute your actual bucket name

# List objects in a bucket
bucket_ls = s3.list_objects_v2(Bucket=bucket_name)
for item in bucket_ls['Contents']:
    print(item['Key'])

/scratch/sserbin/dummy_csv_s3.csv
aist21/Matlab_data_format.zip
aist21/gliht/bnl2016/Brookhaven_Jun2016_mosaicked_refl_VIs.tar.gz
dask_data_parts/0.part
dask_data_parts/1.part
dask_data_parts/2.part
edlang/.ipynb_checkpoints/untitled-checkpoint.txt
edlang/.~test.txt
edlang/.~untitled.txt
edlang/test.txt
hyrsense_data/hyrsense.tar.gz
nquinteros/.ipynb_checkpoints/test-checkpoint.txt
nquinteros/.~test.txt
nquinteros/.~untitled.txt
nquinteros/test.txt
scratch/sserbin/.ipynb_checkpoints/dummy_csv_s3-checkpoint.csv
scratch/sserbin/dask_data_parts/0.part
scratch/sserbin/dask_data_parts/1.part
scratch/sserbin/dask_data_parts/2.part
scratch/sserbin/dummy_csv_s3.csv
scratch/sserbin/dummy_csv_s3_local.csv
sserbin/.ipynb_checkpoints/test-checkpoint.txt
sserbin/.test.txt.swp
sserbin/.test.txt.swx
sserbin/.~test.txt
sserbin/test.txt
tutorial_data/aiml_tutorials/aqi/AirQualityUCI.csv
tutorial_data/aiml_tutorials/aqi/AirQualityUCI.xlsx
tutorial_data/hyrsense/hyrsense.tar.gz


### 4. Read some data from s3. For this we will source data from the an AI/ML tutorial csv file

In [5]:
# Create the connection to the csv file stored on s3
csv_filename = 'AirQualityUCI.csv'
csv_file_path = 'tutorial_data/aiml_tutorials/aqi/'
csv_file = s3.get_object(Bucket=bucket_name, Key=os.path.join(csv_file_path,csv_filename))
#print(csv_file)

# Now read the data into a Pandas dataframe
df = pd.read_csv(io.BytesIO(csv_file['Body'].read()), sep=';', decimal=',')
df.head()

# Finally display the data below confirming you have read in the data from s3

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,,
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,,
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,,
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,,
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,,


### 5. Write data to s3

In [6]:
# generate a pandas data frame of random numbers:
df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))
df.head()

Unnamed: 0,A,B,C,D
0,5,68,51,47
1,21,2,68,37
2,89,91,65,82
3,31,83,61,35
4,68,7,70,70


In [7]:
# First set s3 location and file name of the csv file
bucket_name = bucket_name # same bucket name from above
#print(bucket_name)
s3_object_key = os.path.join('scratch',user_name,'dummy_csv_s3.csv') # this defines the full path
#print(s3_object_key)

# Create the csv file from memory
csv_buffer = io.StringIO()
#print(csv_buffer)
df.to_csv(csv_buffer, index=False) # Write DataFrame to CSV string in buffer. Drop the index column
#df.to_csv(csv_buffer) # Write DataFrame to CSV string in buffer
df.head()

Unnamed: 0,A,B,C,D
0,5,68,51,47
1,21,2,68,37
2,89,91,65,82
3,31,83,61,35
4,68,7,70,70


In [8]:
# Show the csv_buffer
print(csv_buffer.getvalue())

A,B,C,D
5,68,51,47
21,2,68,37
89,91,65,82
31,83,61,35
68,7,70,70
81,28,2,83
12,14,44,0
94,78,9,72
10,24,8,76
49,42,66,8
79,39,97,46
56,48,45,26
35,6,49,27
63,18,29,88
48,26,60,18
86,87,7,23
54,86,58,46
3,50,53,96
50,90,38,39
22,51,97,98
5,77,80,98
12,97,96,29
75,65,95,98
78,74,99,6
95,0,10,89
35,41,1,65
88,68,79,81
74,11,93,73
73,84,76,4
52,28,78,22
93,56,99,85
31,98,1,0
75,38,73,79
89,29,70,26
50,59,46,92
5,74,15,25
99,98,76,39
54,63,21,47
40,30,98,25
13,72,84,45
74,19,23,27
33,31,22,6
87,24,1,96
47,32,32,2
71,15,33,7
39,66,28,70
46,85,67,20
78,66,47,43
68,64,30,69
38,18,30,86
89,30,90,23
33,64,52,57
29,93,91,8
47,26,10,17
25,2,33,2
97,77,21,23
96,91,97,57
15,56,21,23
35,41,47,10
24,53,14,74
45,4,8,2
53,77,53,50
91,19,82,87
96,52,17,44
67,34,79,76
57,63,21,44
2,79,11,71
6,39,52,97
84,27,80,21
73,31,35,81
22,26,85,55
68,81,18,68
97,13,17,96
46,20,48,45
22,15,31,38
0,8,4,55
57,34,83,66
71,65,88,93
67,93,12,72
11,2,79,78
46,77,62,13
3,13,39,56
77,33,68,24
80,30,39,89
41,81,58,3
80,22,10,

In [9]:
### save it in s3:
#s3_object = s3_resource.Object(bucket_name, s3_object_key)
#print(s3_object)
s3_resource.Object(bucket_name, s3_object_key).put(Body=csv_buffer.getvalue())
print(f"Successfully wrote data to 's3://{bucket_name}/{s3_object_key}'")

Successfully wrote data to 's3://airborne-smce-prod-user-bucket/scratch/sserbin/dummy_csv_s3.csv'


In [10]:
### convert data frame to dask:
dask_df = dd.from_pandas(df, 3)

# save dask data frame to s3 in parts:
save_dask_loc = os.path.join('s3://{}/','scratch',user_name,'dask_data_parts')
print(save_dask_loc)
dask_df.to_csv(save_dask_loc.format(bucket_name))

s3://{}/scratch/sserbin/dask_data_parts


['airborne-smce-prod-user-bucket/scratch/sserbin/dask_data_parts/0.part',
 'airborne-smce-prod-user-bucket/scratch/sserbin/dask_data_parts/1.part',
 'airborne-smce-prod-user-bucket/scratch/sserbin/dask_data_parts/2.part']

### 6. Read dask from s3

In [11]:
# We will create a dask df using the dask dataframe you just saved on the scratch space on s3
save_dask_loc = os.path.join('s3://{}/','scratch',user_name,'dask_data_parts','*')
print(save_dask_loc)
dask_df2 = dd.read_csv("s3://{}/dask_data_parts/*".format(bucket_name))

s3://{}/scratch/sserbin/dask_data_parts/*


In [12]:
# Now display the new datagframe
dask_df2.head()

Unnamed: 0.1,Unnamed: 0,A,B,C,D
0,0,97,91,18,17
1,1,51,87,32,50
2,2,0,49,38,90
3,3,99,32,39,39
4,4,72,18,40,10


In [13]:
print(dask_df2)

Dask DataFrame Structure:
              Unnamed: 0      A      B      C      D
npartitions=3                                       
                   int64  int64  int64  int64  int64
                     ...    ...    ...    ...    ...
                     ...    ...    ...    ...    ...
                     ...    ...    ...    ...    ...
Dask Name: to_string_dtype, 2 expressions
Expr=ArrowStringConversion(frame=FromMapProjectable(296386a))


### 7. Upload a file to s3

In [18]:
# write the example data frame to a local file. We will use our local tutorial folder and make a file in scratch space
tutorials_scratch = os.path.join(os.path.expanduser("~"),"tutorials","scratch")
if not os.path.exists(tutorials_scratch):
    dest_dir_path = Path(tutorials_scratch)
    dest_dir_path.mkdir(parents=True, mode=0o777, exist_ok=True) # create the directory for the copied data, if needed

# convert the existing df to a csv file and store locally
filename_csv = os.path.join(tutorials_scratch,'dummy_csv_s3_local.csv')
df.to_csv(filename_csv, index=False)

# upload file to s3 from local storage:
s3_object_key = os.path.join('scratch',user_name,'dummy_csv_s3_local.csv') # this defines the full path
print(s3_object_key)
s3.upload_file(Filename=filename_csv, Bucket=bucket_name, Key=s3_object_key)
#s3.upload_file(local_file_path, bucket_name, s3_key)

print(f"Successfully wrote data to 's3://{bucket_name}/{s3_object_key}'")

scratch/sserbin/dummy_csv_s3_local.csv
Successfully wrote data to 's3://airborne-smce-prod-user-bucket/scratch/sserbin/dummy_csv_s3_local.csv'


### 8. Download a file from s3

In [15]:
# second argument is the remote name/key, third argument is local name
filename_csv = os.path.join(tutorials_scratch,'dummy_csv_s3_local_s3.csv')
s3.download_file(bucket_name, s3_object_key, filename_csv)
#s3.download_file(Bucket, Key, Filename)

### 9. Write data to s3 using a custom s3 upload function

In [17]:
def upload_file_to_s3_folder(file_path, bucket_name, folder_name, s3_file_name):
    """
    Uploads a file to a specified "folder" within an S3 bucket.

    Args:
        file_path (str): The local path to the file you want to upload.
        bucket_name (str): The name of your S3 bucket.
        folder_name (str): The name of the "folder" within the S3 bucket.
        s3_file_name (str): The desired name of the file in S3.
    """
    s3 = boto3.client('s3')

    # Construct the full S3 key (object name) including the folder prefix
    s3_key = f"{folder_name}/{s3_file_name}"

    try:
        s3.upload_file(file_path, bucket_name, s3_key)
        print(f"Successfully uploaded '{file_path}' to s3://{bucket_name}/{s3_key}")
    except Exception as e:
        print(f"Error uploading file: {e}")

In [19]:
## Use the existing demo files and paths for this example
# Example usage:
#local_file = 'my_local_file.txt'  # Replace with your local file path
#s3_bucket = 'your-s3-bucket-name'  # Replace with your S3 bucket name
#s3_folder = 'new_data_folder'  # The "new folder" name
#s3_object_name = 'uploaded_document.txt'  # The name of the file in S3

local_file = filename_csv
s3_bucket = bucket_name
s3_folder = os.path.join('scratch',user_name,'file_upload')
s3_object_name = 'dummy_csv_s3_upload.csv'  # The name of the file in S3

# upload the file using the function
upload_file_to_s3_folder(local_file, s3_bucket, s3_folder, s3_object_name)

Successfully uploaded '/home/jovyan/tutorials/scratch/dummy_csv_s3_local.csv' to s3://airborne-smce-prod-user-bucket/scratch/sserbin/file_upload/dummy_csv_s3_upload.csv
