Converting pip cells

In [None]:
# Line magic `horus` with the `convert` sub-command; per the heading above it converts
# pip cells. NOTE(review): presumably provided by a notebook extension loaded outside
# this notebook - confirm it is available in the target environment.
%horus convert

In [None]:
# Upgrade pip itself before installing the requirements.
# NOTE(review): there is no `!` or `%` prefix, so this line presumably relies on
# IPython's automagic resolving `pip` to the `%pip` magic - confirm automagic is enabled.
pip install --upgrade pip

In [None]:
# Install requirements
# Runs pip in a shell via `!`; Requirements.txt is given as a relative path, so it is
# looked up relative to the current working directory.
# NOTE(review): `!pip` uses whichever pip is first on the shell PATH, which may not be
# the kernel's environment - confirm they match.
!pip install -r Requirements.txt 

In [None]:
# View AWS Configuration
# Note: the first command does more than view - it sets the default region to us-east-1
# in the AWS CLI configuration. The second command then lists the resulting configuration.
!aws configure set default.region us-east-1
!aws configure list


Restart kernel to load the new libraries.

----Code Start----

Import statements

In [None]:
# Prerequisite - Imports 
import os
from ray.util import inspect_serializability
import ray
import pyarrow.fs as pq
import pandas as pd

# Making use of datetime for dates, works for every day of the year (30,31,28 day problems go away)
from datetime import date, timedelta
from dateutil.relativedelta import relativedelta

Connect to ray cluster

In [None]:
# Prerequisite - Connect to the Ray cluster on OpenShift
from ray.util.client import ray as rayclient

# Disconnect an existing client session, if any, before opening a new one.
if rayclient.is_connected():
    ray.util.disconnect()

# The head node host comes from the RAY_CLUSTER environment variable (KeyError if unset);
# port 10001 is appended to form the client address.
ray_head = os.environ['RAY_CLUSTER']
ray.util.connect(f'{ray_head}:10001')

Reusable definitions here

In [None]:
# Prerequisite - Specify reusable definitions

# Metric whose data is compacted (alternative used previously: 'cluster_version').
metricName = 'cluster_feature_set'

# S3 bucket and the custom endpoint it is served from.
bucketName = 'DH-SECURE-THANOS-RAY-USE'
endpoint = 'https://s3.upshift.redhat.com'

# Key prefixes inside the bucket: one for the source data, one for the compacted output.
prefixPathRead, prefixPathWrite = 'raydev', 'raydev-write-demo'

# Default date components, stored as zero-padded strings.
# NOTE(review): later cells format month/day with `:02d`, which needs ints; those cells
# reassign year/month/day from a `date` first, so these string defaults are not used there.
year, month, day = '2021', '01', '01'

# Read path and write path share the same `metric=<name>` layout under their prefixes.
read_path, write_path = (
    f'{prefix}/metric={metricName}' for prefix in (prefixPathRead, prefixPathWrite)
)


Create a filesystem object

In [None]:
# Prerequisite - Create S3FileSystem in PyArrow
# Why: Allows us to specify a custom endpoint
AWS_ACCESS_KEY = %env AWS_ACCESS_KEY_ID
AWS_SECRET_KEY = %env AWS_SECRET_ACCESS_KEY
fs_pyarrow = pq.S3FileSystem(access_key=AWS_ACCESS_KEY, secret_key=AWS_SECRET_KEY, endpoint_override=endpoint)

Compact every day and store each day as one parquet file (365 files)

In [None]:
#%%capture output 
# Compact every day of 2021, one month at a time: read the day's parquet data from the
# read path and write it back to the write path as a single parquet file.
firstDay = date(2021,1,1) #January onwards
for _ in range(12):
    # First day of the following month; used as the exclusive upper bound for this month.
    lastDay = firstDay + relativedelta(months = 1)
    # Must be recomputed for every month and must be a timedelta: `.days` then gives the
    # actual month length (28/30/31). Assigning the date itself here would break the
    # `duration.days` lookup on the next iteration.
    duration = lastDay - firstDay
    for i in range(duration.days):
        # Keep the date object under its own name so `day` is only ever the int below.
        currentDay = firstDay + timedelta(days = i)
        # Set year, month and day here
        year, month, day = (currentDay.year, currentDay.month, currentDay.day)
        # Code here:
        # <Read dataframe>
        currentReadPath = f's3://{bucketName}/{read_path}/year={year}/month={month:02d}/day={day:02d}'
        print("Reading from:",currentReadPath)
        df = ray.data.read_parquet(paths=currentReadPath, filesystem=fs_pyarrow)
        # <Writeback single parquet for the day>
        currentWritePath = f's3://{bucketName}/{write_path}/year={year}/month={month:02d}/day={day:02d}'
        print("Writing to:", currentWritePath)
        # repartition(1) collapses the dataset to one block before writing.
        df.repartition(1).write_parquet(path=currentWritePath, filesystem=fs_pyarrow)
    # Advance to the next month.
    firstDay = lastDay

Trying to see if Ray remote functions make a difference
- Not working as of now; it requires initializing S3 inside the remote function.

In [None]:
#%%capture output 
@ray.remote
def compaction():
    """Compact every day of 2021 into a single parquet file per day, as a Ray remote task.

    For each day, reads the parquet data under the read path and writes it back under
    the write path after repartitioning to one block. Uses the notebook-level
    `bucketName`, `read_path`, `write_path` and `fs_pyarrow`.

    Returns:
        None.
    """
    firstDay = date(2021,1,1)
    # Trying to initialize S3 for remote Ray function
    # NOTE(review): `InitializeS3` is not defined in the visible notebook cells - confirm
    # where it comes from; the notebook text marks this approach as not working yet.
    InitializeS3()
    for _ in range(12):
        # First day of the following month; exclusive upper bound for this month.
        lastDay = firstDay + relativedelta(months = 1)
        # Recomputed per month as a timedelta so `.days` is the real month length.
        # Assigning the date itself here would break `duration.days` on the next pass.
        duration = lastDay - firstDay
        for i in range(duration.days):
            currentDay = firstDay + timedelta(days = i)
            # Set year, month and day here
            year, month, day = (currentDay.year, currentDay.month, currentDay.day)
            currentReadPath = f's3://{bucketName}/{read_path}/year={year}/month={month:02d}/day={day:02d}'
            print("Reading from:",currentReadPath)
            # Code here:
            # <Read dataframe>
            df = ray.data.read_parquet(paths=currentReadPath, filesystem=fs_pyarrow)
            # <Writeback single parquet for the day>
            currentWritePath = f's3://{bucketName}/{write_path}/year={year}/month={month:02d}/day={day:02d}'
            # Announce the destination before writing, so a failed write is easy to locate.
            print("Writing to:", currentWritePath)
            df.repartition(1).write_parquet(path=currentWritePath, filesystem=fs_pyarrow)
        # Advance to the next month.
        firstDay = lastDay

In [None]:
# Submit the remote compaction function defined above. Nothing in this cell waits for
# the task to finish or fetches its result; as the last expression, the value returned
# by `.remote()` is what the cell displays.
compaction.remote()

Display cached output

In [None]:
# `output` is only created by the `%%capture output` cell magic, which is commented out
# at the top of the compaction cells above. Enable it there (as the first line of the
# cell) before running this, otherwise `output` is undefined.
output.show()

Run compaction section-wise for a single day:

In [None]:
# Prerequisite - Specify day to compact
compactDay = date(2021, 1, 1)

# Split the date into integer components; the read/write cells format month and day
# with `:02d`, which requires ints.
year, month, day = compactDay.year, compactDay.month, compactDay.day

In [None]:
# (1/2) Read one day's worth
# The partition path is built from the year/month/day set in the previous cell.
currentReadPath = f's3://{bucketName}/{read_path}/year={year}/month={month:02d}/day={day:02d}'
print("Reading from:", currentReadPath)

# <Read dataframe>
df = ray.data.read_parquet(
    paths=currentReadPath,
    filesystem=fs_pyarrow,
)


In [None]:
# View dataframe details (Number of blocks, schema)
# `df` is whatever `ray.data.read_parquet` returned in the read cell above; printing it
# shows its text representation rather than the data itself.
print(df)

In [None]:
# (2/2) Compact and write back to S3
# Same year/month/day partition layout as the read path, but under the write prefix.
currentWritePath = f's3://{bucketName}/{write_path}/year={year}/month={month:02d}/day={day:02d}'
print("Writing to:", currentWritePath)

# Collapse to a single block first so the day is written as one parquet file.
compacted = df.repartition(1)
compacted.write_parquet(path=currentWritePath, filesystem=fs_pyarrow)