## A quick tutorial to demonstrate how to access and use Airborne SMCE s3 buckets in Python

Note: this is a work in progress and others with more experience should feel free to expand this tutorial

**sources:** <br>
https://sciwiki.fredhutch.org/compdemos/aws-s3/

### 1. Import libriaries and data

In [20]:
# Import modules 
import os, time, sys
from pathlib import Path
import boto3
import io # for StringIO, BytesIO. or could just be: from io import StringIO, BytesIO
# load pandas and dask to demo how to read/write to s3 tabular datasets
import numpy as np
import pandas as pd
import dask.dataframe as dd

### 2. Setup s3 client connection

In [25]:
s3 = boto3.client("s3")
s3_resource = boto3.resource('s3')

# set SMCE username - used later when creating a scratch space on s3
user_name = os.popen('whoami').read().strip()
print("*** SMCE username: " + user_name + " ***")

*** SMCE username: sserbin ***


### 3. Show all availible buckets

List the bucket names instead of displaying all of the metadata.  If you would like to show all the metadata you can use:

```print(response)```


In [7]:
response = s3.list_buckets() # show all buckets
for bucket in response['Buckets']:
    print(bucket['Name'])

aeronet-disaster-backup
airborne-data-transfer
airborne-pcluster
airborne-scratch
airborne-smce-dev-state
airborne-smce-prod-conda-store-bucket
airborne-smce-prod-state
airborne-smce-prod-user-bucket
config-bucket-445567107118
config-bucket-smce-445567107118
cur-reports-smce-445567107118
gliht
gliht-processed
gliht-raw
isofit-data
parallelcluster-0895790225836e61-v1-do-not-delete
parallelcluster-3cbb946982171420-v1-do-not-delete
smdc-prod-ap-southeast-2-445567107118-obs-outbound-bucket
smdc-prod-eu-north-1-445567107118-obs-outbound-bucket
smdc-prod-me-south-1-445567107118-obs-outbound-bucket
smdc-prod-us-east-1-445567107118-obs-outbound-bucket
smdc-prod-us-east-2-445567107118-obs-outbound-bucket
smdc-prod-us-west-1-445567107118-obs-outbound-bucket
smdc-prod-us-west-2-445567107118-obs-outbound-bucket
uas-data-storage
whymsie
whymsie-processed
whymsie-raw


### 3. Define an s3 bucket you want to work with and list the contents

In [8]:
bucket_name = "airborne-smce-prod-user-bucket" # substitute your actual bucket name

# List objects in a bucket
bucket_ls = s3.list_objects_v2(Bucket=bucket_name)
for item in bucket_ls['Contents']:
    print(item['Key'])

aist21/Matlab_data_format.zip
aist21/gliht/bnl2016/Brookhaven_Jun2016_mosaicked_refl_VIs.tar.gz
edlang/.ipynb_checkpoints/untitled-checkpoint.txt
edlang/.~test.txt
edlang/.~untitled.txt
edlang/test.txt
hyrsense_data/hyrsense.tar.gz
nquinteros/.ipynb_checkpoints/test-checkpoint.txt
nquinteros/.~test.txt
nquinteros/.~untitled.txt
nquinteros/test.txt
sserbin/.ipynb_checkpoints/test-checkpoint.txt
sserbin/.test.txt.swp
sserbin/.test.txt.swx
sserbin/.~test.txt
sserbin/test.txt
tutorial_data/aiml_tutorials/aqi/AirQualityUCI.csv
tutorial_data/aiml_tutorials/aqi/AirQualityUCI.xlsx
tutorial_data/hyrsense/hyrsense.tar.gz


### 4. Read some data from s3. For this we will source data from the an AI/ML tutorial csv file

In [59]:
# Create the connection to the csv file stored on s3
csv_filename = 'AirQualityUCI.csv'
csv_file_path = 'tutorial_data/aiml_tutorials/aqi/'
csv_file = s3.get_object(Bucket=bucket_name, Key=os.path.join(csv_file_path,csv_filename))
#print(csv_file)

# Now read the data into a Pandas dataframe
df = pd.read_csv(io.BytesIO(csv_file['Body'].read()), sep=';', decimal=',')
df.head()

# Finally display the data below confirming you have read in the data from s3

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,,
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,,
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,,
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,,
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,,


### 5. Write data to s3

In [69]:
# generate a pandas data frame of random numbers:
df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))
df.head()

Unnamed: 0,A,B,C,D
0,97,91,18,17
1,51,87,32,50
2,0,49,38,90
3,99,32,39,39
4,72,18,40,10


In [70]:
# First set s3 location and file name of the csv file
bucket_name = bucket_name # same bucket name from above
#print(bucket_name)
s3_object_key = os.path.join('scratch',user_name,'dummy_csv_s3.csv') # this defines the full path
#print(s3_object_key)

# Create the csv file from memory
csv_buffer = io.StringIO()
#print(csv_buffer)
df.to_csv(csv_buffer, index=False) # Write DataFrame to CSV string in buffer. Drop the index column
#df.to_csv(csv_buffer) # Write DataFrame to CSV string in buffer
df.head()

Unnamed: 0,A,B,C,D
0,97,91,18,17
1,51,87,32,50
2,0,49,38,90
3,99,32,39,39
4,72,18,40,10


In [71]:
# Show the csv_buffer
print(csv_buffer.getvalue())

A,B,C,D
97,91,18,17
51,87,32,50
0,49,38,90
99,32,39,39
72,18,40,10
88,95,1,58
0,22,79,97
68,26,69,52
20,31,1,26
42,21,72,49
70,5,19,43
4,99,89,33
33,36,96,48
67,23,72,30
5,54,68,60
93,70,57,38
79,63,7,38
28,80,22,97
93,86,46,19
84,90,45,23
7,40,53,92
30,79,73,39
55,27,75,86
39,67,97,78
14,76,98,79
78,17,41,96
10,53,66,93
6,62,37,25
43,73,90,54
54,3,44,13
12,4,38,15
24,77,66,53
79,15,81,78
45,4,44,6
24,51,32,41
63,30,45,62
3,45,7,70
64,49,92,10
31,1,13,88
66,89,41,46
43,11,51,41
91,49,95,66
62,91,89,78
95,6,62,85
26,99,98,6
92,62,67,96
8,72,99,58
58,29,74,3
61,76,7,54
20,72,62,57
68,49,3,83
18,16,28,89
14,40,30,27
52,5,24,65
89,29,3,34
85,51,68,43
36,37,10,17
60,70,22,74
0,74,17,97
30,25,7,53
27,61,45,90
53,10,7,26
95,15,90,92
27,60,0,29
43,13,25,3
44,44,56,84
76,68,0,67
28,57,45,99
65,57,39,82
53,7,14,20
87,73,25,43
39,13,59,82
8,10,28,12
32,60,14,54
23,53,91,61
60,38,60,38
31,66,3,13
80,69,4,93
52,64,43,12
36,26,22,59
92,47,97,7
48,81,99,89
64,22,80,13
5,84,61,72
10,88,60,37
46,10,73,

In [72]:
### save it in s3:
#s3_object = s3_resource.Object(bucket_name, s3_object_key)
#print(s3_object)
s3_resource.Object(bucket_name, s3_object_key).put(Body=csv_buffer.getvalue())
print(f"Successfully wrote data to 's3://{bucket_name}/{s3_object_key}'")

Successfully wrote data to 's3://airborne-smce-prod-user-bucket/scratch/sserbin/dummy_csv_s3.csv'


In [76]:
### convert data frame to dask:
dask_df = dd.from_pandas(df, 3)

# save dask data frame to s3 in parts:
save_dask_loc = os.path.join('s3://{}/','scratch',user_name,'dask_data_parts')
print(save_dask_loc)
dask_df.to_csv(save_dask_loc.format(bucket_name))

s3://{}/scratch/sserbin/dask_data_parts


['airborne-smce-prod-user-bucket/scratch/sserbin/dask_data_parts/0.part',
 'airborne-smce-prod-user-bucket/scratch/sserbin/dask_data_parts/1.part',
 'airborne-smce-prod-user-bucket/scratch/sserbin/dask_data_parts/2.part']

### 6. Read dask from s3

In [78]:
# We will create a dask df using the dask dataframe you just saved on the scratch space on s3
save_dask_loc = os.path.join('s3://{}/','scratch',user_name,'dask_data_parts','*')
print(save_dask_loc)
dask_df2 = dd.read_csv("s3://{}/dask_data_parts/*".format(bucket_name))

s3://{}/scratch/sserbin/dask_data_parts/*


In [79]:
# Now display the new datagframe
dask_df2.head()

Unnamed: 0.1,Unnamed: 0,A,B,C,D
0,0,97,91,18,17
1,1,51,87,32,50
2,2,0,49,38,90
3,3,99,32,39,39
4,4,72,18,40,10


In [80]:
print(dask_df2)

Dask DataFrame Structure:
              Unnamed: 0      A      B      C      D
npartitions=3                                       
                   int64  int64  int64  int64  int64
                     ...    ...    ...    ...    ...
                     ...    ...    ...    ...    ...
                     ...    ...    ...    ...    ...
Dask Name: to_string_dtype, 2 expressions
Expr=ArrowStringConversion(frame=FromMapProjectable(8fade93))


### 7. Upload a file to s3

In [90]:
# write the example data frame to a local file. We will use our local tutorial folder and make a file in scratch space
tutorials_scratch = os.path.join(os.path.expanduser("~"),"tutorials","scratch")
if not os.path.exists(tutorials_scratch):
    dest_dir_path = Path(tutorials_scratch)
    dest_dir_path.mkdir(parents=True, mode=0o777, exist_ok=True) # create the directory for the copied data, if needed

# convert the existing df to a csv file and store locally
tutorials_scratch
filename_csv = os.path.join(tutorials_scratch,'dummy_csv_s3_local.csv')
df.to_csv(filename_csv, index=False)

# upload file to s3 from local storage:
s3_object_key = os.path.join('scratch',user_name,'dummy_csv_s3_local.csv') # this defines the full path
print(s3_object_key)
s3.upload_file(Filename=filename_csv, Bucket=bucket_name, Key=s3_object_key)
#s3.upload_file(local_file_path, bucket_name, s3_key)

print(f"Successfully wrote data to 's3://{bucket_name}/{s3_object_key}'")

scratch/sserbin/dummy_csv_s3_local.csv
Successfully wrote data to 's3://airborne-smce-prod-user-bucket/scratch/sserbin/dummy_csv_s3_local.csv'


### 8. Download a file from s3

In [91]:
# second argument is the remote name/key, third argument is local name
filename_csv = os.path.join(tutorials_scratch,'dummy_csv_s3_local_s3.csv')
s3.download_file(bucket_name, s3_object_key, filename_csv)
#s3.download_file(Bucket, Key, Filename)