In [None]:
import boto3
import configparser
import json
import pandas as pd
import os

In [None]:
# Load configuration
config = configparser.ConfigParser()
config.read('dwh.cfg')

LOG_DATA=config.get("S3","LOG_DATA")
SONG_DATA=config.get("S3","SONG_DATA")
LOG_JSON_PATH=config.get("S3","LOG_JSON_PATH")

def list_objects_v2(bucket : str, prefix : str = None) -> pd.DataFrame:
    """
    List objects in an S3 bucket with a given prefix.
    """
    s3 = boto3.client('s3')
    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    return pd.DataFrame(response['Contents'])

def get_s3_object(bucket : str, key : str) -> dict:
    """
    Get an object from S3 and return it as a DataFrame.
    """
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=key)
    return obj['Body']

def download_s3_file(bucket : str, key : str, local_path : str) -> None:
    """
    Download a file from S3 to a local path.
    """
    s3 = boto3.client('s3')
    s3.download_file(bucket, key, local_path)

def get_bucket_region(bucket : str) -> str:
    """
    Get the region of an S3 bucket.
    """
    s3 = boto3.client('s3')
    response = s3.get_bucket_location(Bucket=bucket)
    return response['LocationConstraint'] if response['LocationConstraint'] else 'us-east-1'

def read_json_from_s3(bucket: str, key: str) -> dict:
    """
    Read a JSON file from S3 and return it as a dictionary.
    """
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=key)
    data = obj['Body'].read()
    return json.loads(data)

### List log data files

In [4]:
# List objects in the S3 bucket
bucket_name = LOG_DATA.split('/')[2]
list_objects_v2(bucket_name, 'log_data')

Unnamed: 0,Key,LastModified,ETag,Size,StorageClass
0,log_data/,2019-04-17 12:02:38+00:00,"""d41d8cd98f00b204e9800998ecf8427e""",0,STANDARD
1,log_data/2018/11/2018-11-01-events.json,2019-04-17 12:03:13+00:00,"""21bae37b41c56b66973312a07322b5e4""",7151,STANDARD
2,log_data/2018/11/2018-11-02-events.json,2019-04-17 12:03:13+00:00,"""c726b249410a532cce10fa166ce8616c""",83585,STANDARD
3,log_data/2018/11/2018-11-03-events.json,2019-04-17 12:03:13+00:00,"""696af259d3203446b846de7937b5810c""",54084,STANDARD
4,log_data/2018/11/2018-11-04-events.json,2019-04-17 12:03:13+00:00,"""3d10cad17e2279b29da00756a885659b""",85671,STANDARD
5,log_data/2018/11/2018-11-05-events.json,2019-04-17 12:03:13+00:00,"""5055eafc2dcd43d7a39486683857da01""",189295,STANDARD
6,log_data/2018/11/2018-11-06-events.json,2019-04-17 12:03:13+00:00,"""c0594b7fad6851d5d47650caffa48ea0""",85373,STANDARD
7,log_data/2018/11/2018-11-07-events.json,2019-04-17 12:03:13+00:00,"""e6b529f7305fe26eca5e5ef85aa2e014""",97519,STANDARD
8,log_data/2018/11/2018-11-08-events.json,2019-04-17 12:03:13+00:00,"""8a037906ea14c4bd6ea23e2d5333aa13""",102218,STANDARD
9,log_data/2018/11/2018-11-09-events.json,2019-04-17 12:03:13+00:00,"""ea4f34c437d8caa2634ff98bef315f2a""",134804,STANDARD


In [5]:
# Get sample data from S3
pd.read_json(get_s3_object(bucket_name, 'log_data/2018/11/2018-11-08-events.json'), lines=True)

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,,Logged In,Dominick,M,0,Norris,,free,"Los Angeles-Long Beach-Anaheim, CA",GET,Home,1.540976e+12,44,,200,1541635950796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",45
1,Slipknot,Logged In,Aiden,M,0,Ramirez,192.57424,paid,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1.540284e+12,19,Opium Of The People (Album Version),200,1541639510796,"""Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537....",20
2,Cat Stevens,Logged In,Aiden,M,1,Ramirez,170.57914,paid,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1.540284e+12,19,Don't Be Shy,200,1541639702796,"""Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537....",20
3,Collective Soul,Logged In,Aiden,M,2,Ramirez,273.47546,paid,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1.540284e+12,19,Run (LP Version),200,1541639872796,"""Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537....",20
4,Taylor Swift,Logged In,Aiden,M,3,Ramirez,233.89995,paid,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1.540284e+12,19,Love Story,200,1541640145796,"""Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537....",20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,,Logged In,Dominick,M,1,Norris,,free,"Los Angeles-Long Beach-Anaheim, CA",GET,Settings,1.540976e+12,365,,200,1541713358796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",45
219,Irish Tenors,Logged In,Rylan,M,2,George,187.48036,free,"Birmingham-Hoover, AL",PUT,NextSong,1.541020e+12,393,Danny Boy,200,1541713521796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",16
220,Justin Bieber,Logged In,Rylan,M,3,George,212.53179,free,"Birmingham-Hoover, AL",PUT,NextSong,1.541020e+12,393,Runaway Love,200,1541713708796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",16
221,,Logged In,Dominick,M,0,Norris,,free,"Los Angeles-Long Beach-Anaheim, CA",GET,Home,1.540976e+12,401,,200,1541719996796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",45


### List log files

In [6]:
list_objects_v2(bucket_name, 'log_json_path')

Unnamed: 0,Key,LastModified,ETag,Size,StorageClass
0,log_json_path.json,2019-04-02 16:58:44+00:00,"""6ee138e5504cffd12cded1323dfbd6a4""",456,STANDARD


In [7]:
# Download a file from S3
file_name = 'log_json_path.json'
dowload_path = os.path.join(os.getcwd(), file_name)
download_s3_file(bucket_name, file_name, dowload_path)
# Get sample data from S3
pd.read_json(get_s3_object(bucket_name, 'log_json_path.json'))

Unnamed: 0,jsonpaths
0,$['artist']
1,$['auth']
2,$['firstName']
3,$['gender']
4,$['itemInSession']
5,$['lastName']
6,$['length']
7,$['level']
8,$['location']
9,$['method']


### List song data

In [8]:
list_objects_v2(bucket_name, 'song_data')

Unnamed: 0,Key,LastModified,ETag,Size,StorageClass
0,song_data/,2019-04-17 03:12:52+00:00,"""d41d8cd98f00b204e9800998ecf8427e""",0,STANDARD
1,song_data/A/A/A/TRAAAAK128F9318786.json,2019-04-17 03:20:41+00:00,"""48e073986610c4997c26f8a394b8fd6e""",225,STANDARD
2,song_data/A/A/A/TRAAAAV128F421A322.json,2019-04-17 03:20:41+00:00,"""fc84ae8a04e72df00ac2a25218f14a18""",284,STANDARD
3,song_data/A/A/A/TRAAABD128F429CF47.json,2019-04-17 03:20:41+00:00,"""df6d7bb1676146a859af35cb3468e8dd""",249,STANDARD
4,song_data/A/A/A/TRAAACN128F9355673.json,2019-04-17 03:20:41+00:00,"""86f1807dd1cfa21826cf374c6c2625a7""",243,STANDARD
...,...,...,...,...,...
995,song_data/A/B/R/TRABRFP128F42B7B7A.json,2019-04-17 03:20:51+00:00,"""977f88d5aeca9208780bbbf30dae214e""",257,STANDARD
996,song_data/A/B/R/TRABRHL128F4274DB0.json,2019-04-17 03:20:51+00:00,"""ff3e142ed97c25a461133c67c16d15d0""",241,STANDARD
997,song_data/A/B/R/TRABRHT12903CF274F.json,2019-04-17 03:20:51+00:00,"""12e3bc2bfbaf6ea0f224798647244890""",255,STANDARD
998,song_data/A/B/R/TRABRJZ12903CE9196.json,2019-04-17 03:20:51+00:00,"""3b09e7cd7d50350f0376a544ff680fec""",234,STANDARD


In [18]:
# Get sample data from S3
pd.read_json(get_s3_object(bucket_name, 'song_data/A/A/A/TRAAABD128F429CF47.json'), lines=True)

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARMJAGH1187FB546F3,35.14968,"Memphis, TN",-90.04892,The Box Tops,148.03546,1,SOCIWDW12A8C13D406,Soul Deep,1969
