In [2]:
#in-memory byte buffer used when reading the S3 object back
import io

#random uuid generator library for unique bucket name
import uuid

#s3 SDK library
import boto3

#data management library
import pandas as pd

In [124]:
#data source url of the JSON file to load
url = (
    'https://raw.githubusercontent.com/'
    'localytics/data-viz-challenge/master/data.json'
)

In [125]:
#fetch the JSON document behind url and parse it (split orientation) into a DataFrame
df = pd.read_json(path_or_buf=url, orient='split')

In [126]:
#pull the 'state' entry out of each nested location object into its own column
df['state'] = df['location'].map(lambda loc: loc['state'])

In [127]:
#keep only the columns used below by removing the unneeded ones
df = df.drop(
    columns=[
        'category',
        'event_name',
        'marital_status',
        'session_id',
        'location',
    ]
)

In [128]:
#keep only the rows whose gender is 'F' and whose state is 'CA'
df = df.loc[df['gender'].eq('F') & df['state'].eq('CA')]

In [129]:
#convert client_time to datetimes, then render each value as a 'yyyy-mm-dd' string
df['client_time'] = pd.to_datetime(df['client_time']).map(
    lambda stamp: stamp.strftime('%Y-%m-%d'))

In [130]:
#preview the first five filtered rows
df.head(n=5)

Unnamed: 0,age,amount,client_time,device,gender,state
7,35-44,,2014-03-01,android,F,CA
68,55+,,2014-03-01,android,F,CA
71,35-44,33.0,2014-03-01,android,F,CA
72,35-44,,2014-03-01,android,F,CA
90,45-54,,2014-03-01,android,F,CA


In [131]:
#fill null values of amount with 0
#the filled result is assigned back to df: calling fillna(inplace=True) on the
#Series returned by df.loc[:,'amount'] only changes that temporary object under
#pandas Copy-on-Write, which would leave the NaN values in df untouched
df = df.fillna({'amount': 0})

In [133]:
#group the rows by gender, age, device and client_time
grouped = df.groupby(
    by=['gender', 'age', 'device', 'client_time']
)

In [134]:
#count the non-null entries of every remaining column within each group;
#the result replaces df and is indexed by the four grouping keys
df = grouped.count()

In [135]:
#add a 'sum' column holding the per-group total of amount
df = df.assign(sum=grouped['amount'].sum())

In [136]:
#remove the state column, which is not needed in the aggregated frame
df = df.drop(columns=['state'])

In [137]:
#the amount column now holds per-group counts, so name it 'count'
df = df.rename(columns={'amount': 'count'})

In [82]:
#move the grouping keys out of the index back into ordinary columns
df = df.reset_index()

In [119]:
#preview the first five aggregated rows
df.head(n=5)

Unnamed: 0,gender,age,device,client_time,count,sum
0,F,18-24,android,2014-03-01,13,31.0
1,F,18-24,android,2014-03-02,5,69.0
2,F,18-24,android,2014-03-03,6,40.0
3,F,18-24,android,2014-03-04,7,49.0
4,F,18-24,android,2014-03-05,5,113.0


In [114]:
#write the DataFrame (index included) to a local csv file named total_events.csv
df.to_csv(path_or_buf='total_events.csv')

In [105]:
#create the boto3 resource object used for all s3 calls below
s3_resource = boto3.resource(service_name='s3')

In [106]:
#function to generate randomized bucket name from prefix
def create_bucket_name(bucket_prefix):
    """Return bucket_prefix followed by a freshly generated random UUID4 string."""
    random_suffix = str(uuid.uuid4())
    return bucket_prefix + random_suffix

In [107]:
#function that creates bucket with randomized name
def create_bucket(bucket_prefix, s3_connection):
    """Create an S3 bucket with a randomized name in the default session's region.

    Parameters
    ----------
    bucket_prefix : str
        Prefix handed to create_bucket_name to build the bucket name.
    s3_connection
        Object exposing create_bucket(**kwargs); the notebook passes the
        boto3 S3 resource.

    Returns
    -------
    tuple
        (bucket_name, bucket_response), where bucket_response is whatever
        s3_connection.create_bucket returned.
    """
    current_region = boto3.session.Session().region_name
    bucket_name = create_bucket_name(bucket_prefix)
    create_kwargs = {'Bucket': bucket_name}
    # S3 rejects 'us-east-1' as an explicit LocationConstraint (it is the
    # default location), and a missing region (None) fails parameter
    # validation, so the configuration is only sent for other regions.
    if current_region not in (None, 'us-east-1'):
        create_kwargs['CreateBucketConfiguration'] = {
            'LocationConstraint': current_region}
    bucket_response = s3_connection.create_bucket(**create_kwargs)
    print(bucket_name, current_region)
    return bucket_name, bucket_response

In [108]:
#create a bucket with the 'ecosia' prefix and keep its generated name and the response
bucket_name, response = create_bucket(
    bucket_prefix='ecosia',
    s3_connection=s3_resource,
)

ecosiaf11a7f7f-61ab-417f-a960-2fd1734ab1bb eu-central-1


In [115]:
#send the local csv to the bucket under the object key total_events.csv
s3_resource.Object(bucket_name, 'total_events.csv').upload_file('total_events.csv')

In [122]:
#get file back from s3
#read the object through the s3 resource using the bucket_name created in this
#run; a hard-coded URL would point at the bucket of a previous run (the name is
#random) and would also depend on the object being publicly readable
s3_file = pd.read_csv(
    io.BytesIO(
        s3_resource.Object(bucket_name, 'total_events.csv').get()['Body'].read()),
    index_col=0)

In [123]:
#preview the first five rows read back from s3
s3_file.head(n=5)

Unnamed: 0,gender,age,device,client_time,count,sum
0,F,18-24,android,2014-03-01,13,31.0
1,F,18-24,android,2014-03-02,5,69.0
2,F,18-24,android,2014-03-03,6,40.0
3,F,18-24,android,2014-03-04,7,49.0
4,F,18-24,android,2014-03-05,5,113.0
