In [1]:
import boto3
from pprint import pprint as pp
import json
import pandas as pd

# Connecting to the S3 Bucket

In [24]:
s3_client = boto3.client('s3')  # low level
s3_resource = boto3.resource('s3')  # high level, object-orientated

# Listing the Buckets

In [26]:
bucket_list = s3_client.list_buckets()  # List of all buckets

# Listing all objects inside a bucket

In [25]:
bucket_name = 'data-eng-resources'
bucket = s3_resource.Bucket(bucket_name)  # Creating an instance of the bucket
objects = bucket.objects
for object in objects.all():
    print(object.key)

Data210/Fish/Megha.csv
Data210/fish/Diana.csv
Data210/fish/Lilly.csv
Data210/fish/Megha.csv
Data210/fish/Tom.csv
Data210/fish/andre.csv
Data210/fish/edmund.csv
Data210/fish/ethan.csv
Data210/fish/ewan.csv
Data210/fish/harry.csv
Data210/fish/rob.csv
big-data/CAT.csv
big-data/TSLA.csv
big-data/adventureworks/
big-data/adventureworks/employees.csv
big-data/adventureworks/people.csv
big-data/adventureworks/salespeople.csv
big-data/adventureworks/territories.csv
big-data/big.txt
big-data/movie-ratings/movie_ratings.txt
big-data/s3wordcount.txt
big-data/spartans.csv
big-data/tv_shows.json
big-data/vgsales.csv
big-data/weekday_mapper.py
my_dictionary.json
python/
python/chatbot-intent.json
python/fish-market-mon.csv
python/fish-market-tues.csv
python/fish-market.csv
python/happiness-2019.csv


# Reading a JSON file from S3 bucket then using json.loads to convert from binary string to json object

In [5]:
s3_object = s3_client.get_object(Bucket=bucket_name, Key='python/chatbot-intent.json')
str_body = s3_object['Body'].read()
data = json.loads(str_body)
pp(data)

{'intents': [{'context': {'clear': False,
                          'in': '',
                          'out': 'GreetingUserRequest'},
              'entities': [],
              'entityType': 'NA',
              'extension': {'entities': False, 'function': '', 'responses': []},
              'intent': 'Greeting',
              'responses': ['Hi human, please tell me your GeniSys user',
                            'Hello human, please tell me your GeniSys user',
                            'Hola human, please tell me your GeniSys user'],
              'text': ['Hi',
                       'Hi there',
                       'Hola',
                       'Hello',
                       'Hello there',
                       'Hya',
                       'Hya there']},
             {'context': {'clear': True,
                          'in': 'GreetingUserRequest',
                          'out': ''},
              'entities': [{'entity': 'HUMAN', 'rangeFrom': 3, 'rangeTo': 4},
           

# Reading a CSV from s3 bucket into a pandas dataframe

In [6]:
s3_object = s3_client.get_object(Bucket=bucket_name, Key='python/happiness-2019.csv')
df = pd.read_csv(s3_object['Body'])
print(df.head())
print(df.info())

   Overall rank Country or region  Score  GDP per capita  Social support  \
0             1           Finland  7.769           1.340           1.587   
1             2           Denmark  7.600           1.383           1.573   
2             3            Norway  7.554           1.488           1.582   
3             4           Iceland  7.494           1.380           1.624   
4             5       Netherlands  7.488           1.396           1.522   

   Healthy life expectancy  Freedom to make life choices  Generosity  \
0                    0.986                         0.596       0.153   
1                    0.996                         0.592       0.252   
2                    1.028                         0.603       0.271   
3                    1.026                         0.591       0.354   
4                    0.999                         0.557       0.322   

   Perceptions of corruption  
0                      0.393  
1                      0.410  
2                

# Uploading a file to the S3 bucket

In [None]:
# s3_client.upload_file(Filename='data210.json', Bucket=bucket_name, Key='Data210/ethan.json')

# Deleting a file from the S3 bucket

In [None]:
# s3_client.delete_object(Bucket=bucket_name, Key='Data210/ethan.json')

# Using put_object to upload the serialised python dictionary straight to the S3 bucket

In [None]:
# s3_client.put_object(Body=json.dumps(dict_to_upload), Bucket=bucket_name, Key='Data210/ethan.json')

# Using put_object to upload a pandas dataframe directly from python to th S3 bucket

In [None]:
# s3_object = s3_client.put_object(Body=df.to_csv(index=False), Bucket=bucket_name, Key='Data210/ethan_csv.csv')

In [None]:
# print(type(df.to_csv(index=False)))

# Lab: Collect the 3 fish data files, transform data averaged by species, then reupload the new data

### Creating a function which returns a pandas dataframe from the S3 Bucket

In [37]:
def get_pandas_from_bucket(bucket_name, key_name):
    s3_object = s3_client.get_object(Bucket=bucket_name, Key=key_name)
    return pd.read_csv(s3_object['Body'])

### Creating a function which uploads a pandas dataframe to a S3 Bucket

In [44]:
def put_pandas_to_bucket(dataframe, bucket_name, key_name):
    s3_client.put_object(Body=dataframe.to_csv().encode('utf-8'), Bucket=bucket_name, Key=key_name)

### Collecting the 3 fish files and concatenating them in to a single dataframe

In [38]:
files = ['python/fish-market.csv', 'python/fish-market-mon.csv', 'python/fish-market-tues.csv']
dataframes = [get_pandas_from_bucket(bucket_name, file) for file in files]

df = pd.concat(dataframes)
df

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.000000,23.200000,25.400000,30.000000,11.520000,4.020000
1,Bream,290.000000,24.000000,26.300000,31.200000,12.480000,4.305600
2,Bream,340.000000,23.900000,26.500000,31.100000,12.377800,4.696100
3,Bream,363.000000,26.300000,29.000000,33.500000,12.730000,4.455500
4,Bream,430.000000,26.500000,29.000000,34.000000,12.444000,5.134000
...,...,...,...,...,...,...,...
154,Smelt,13.336589,12.212875,21.365101,13.858652,6.443556,3.291263
155,Smelt,13.887449,15.310886,20.384650,15.245665,7.932646,1.950612
156,Smelt,17.686340,20.487403,13.919309,21.916424,10.822290,1.312548
157,Smelt,29.578849,14.613228,16.548283,19.982241,12.237406,2.421519


### Transforming the data by calculating the averages for each species. Then uploading that as a new file.

In [52]:
fish_agg = df.groupby(['Species']).agg(avg_weight=('Weight', 'mean'), 
                                       avg_length1=('Length1', 'mean'), 
                                       avg_length2=('Length2', 'mean'), 
                                       avg_length3=('Length3', 'mean'), 
                                       avg_height=('Height', 'mean'), 
                                       avg_width=('Width', 'mean'))
put_pandas_to_bucket(fish_agg, bucket_name, 'Data210/fish/ethan.csv')

### Retrieving the new transformed file, checking that it uploaded successfully

In [53]:
get_pandas_from_bucket(bucket_name, 'Data210/fish/ethan.csv')

Unnamed: 0,Species,avg_weight,avg_length1,avg_length2,avg_length3,avg_height,avg_width
0,Bream,621.034259,33.30624,36.638641,41.387836,18.699552,6.17562
1,Parkki,157.127433,22.067386,24.161097,25.846385,12.465449,3.73623
2,Perch,385.464303,29.162474,31.024313,32.855416,10.897862,5.481619
3,Pike,721.859936,46.044273,48.242703,52.279477,11.566545,5.719653
4,Roach,155.495394,23.992542,25.703156,28.12681,10.112788,4.338152
5,Smelt,14.434189,14.423119,15.041544,15.985054,6.029688,1.940348
6,Whitefish,534.355346,32.79305,34.970353,36.995182,13.099232,6.1953
