In [2]:
import boto3
from botocore.exceptions import NoCredentialsError
import s3fs
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from io import StringIO, BytesIO
from datetime import datetime, timedelta
from botocore.client import Config

## Adapter Layer

In [3]:
def read_parq_to_df(bucket_name, key, s3_params):
    """Download one Parquet object from an S3-compatible store into a DataFrame.

    Parameters
    ----------
    bucket_name : str
        Bucket that holds the object.
    key : str
        Full key of the Parquet object to read.
    s3_params : dict
        Connection settings. The entries read here are ``"endpoint_url"``,
        ``"key"`` (access key id), ``"secret"`` (secret access key) and
        ``"client_kwargs"`` (extra keyword arguments forwarded to
        ``boto3.client``).

    Returns
    -------
    pandas.DataFrame
        Contents of the Parquet object.
    """
    s3 = boto3.client(
        's3',
        **s3_params["client_kwargs"],
        endpoint_url=s3_params["endpoint_url"],
        aws_access_key_id=s3_params["key"],
        aws_secret_access_key=s3_params["secret"],
    )

    # The object is fetched directly by its key, so no prefix listing is
    # needed before the download.
    response = s3.get_object(Bucket=bucket_name, Key=key)
    data_bytes = response['Body'].read()

    # Wrap the downloaded bytes in an in-memory file so pyarrow can read them.
    return pq.read_table(BytesIO(data_bytes)).to_pandas()

def write_df_to_s3(df, bucket_name: str, key: str):
    """Serialize a DataFrame to Parquet and upload it to S3 under the ETL role.

    The function assumes the ``ETL_S3`` IAM role through STS and uses the
    resulting temporary credentials for the upload.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write.
    bucket_name : str
        Destination bucket.
    key : str
        Destination object key.

    Returns
    -------
    None
        A confirmation line is printed once the upload has finished.
    """
    sts_client = boto3.client('sts')
    assumed_role = sts_client.assume_role(
        RoleArn='arn:aws:iam::211125758361:role/ETL_S3',
        RoleSessionName='SESSION_NAME',
    )
    credentials = assumed_role['Credentials']

    # This client carries the assumed-role credentials and must be the one
    # that performs the upload; creating a second default client here would
    # discard them.
    s3 = boto3.client(
        's3',
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken'],
        region_name='ap-south-1',
    )

    # Convert DataFrame to PyArrow Table and write it to an in-memory buffer
    table = pa.Table.from_pandas(df)
    buffer = BytesIO()
    pq.write_table(table, buffer)

    # Rewind so the upload starts at the first byte of the Parquet payload
    buffer.seek(0)
    s3.upload_fileobj(buffer, bucket_name, key)

    # Print to confirm
    print(f"Data written to S3 bucket: {bucket_name}/{key}")

def read_from_s3(bucket_name: str, key: str):
    """Fetch a Parquet object from S3 under the ETL role and return a DataFrame.

    Parameters
    ----------
    bucket_name : str
        Bucket that holds the object.
    key : str
        Key of the Parquet object.

    Returns
    -------
    pandas.DataFrame
        Contents of the Parquet object.
    """
    # Obtain temporary credentials by assuming the ETL role through STS.
    role_response = boto3.client('sts').assume_role(
        RoleArn='arn:aws:iam::211125758361:role/ETL_S3',
        RoleSessionName='SESSION_NAME',
    )
    creds = role_response['Credentials']

    client = boto3.client(
        's3',
        aws_access_key_id=creds['AccessKeyId'],
        aws_secret_access_key=creds['SecretAccessKey'],
        aws_session_token=creds['SessionToken'],
        region_name='ap-south-1',
    )

    # Pull the object into memory, then rewind before handing it to pyarrow.
    payload = BytesIO()
    client.download_fileobj(bucket_name, key, payload)
    payload.seek(0)

    return pq.read_table(payload).to_pandas()

def return_objects(s3_url, s3_params, arg_date):
    """List the object keys under an S3 URL, filtered by a date in the key.

    The fourth ``/``-separated segment of every key is parsed as a
    ``%Y-%m-%d`` date, and only keys dated on or after ``arg_date`` are kept.
    If that segment is missing or is not a date for any key, the filter is
    abandoned and every listed key is returned.

    Parameters
    ----------
    s3_url : str
        URL of the form ``s3://<bucket>/<prefix>``.
    s3_params : dict
        Connection settings. The entries read here are ``"key"``,
        ``"secret"``, ``"endpoint_url"`` and
        ``"client_kwargs"["region_name"]``.
    arg_date : str
        Lower bound (inclusive) in ``%Y-%m-%d`` format.

    Returns
    -------
    list[str]
        Matching object keys, or ``[]`` when no credentials are available.

    Raises
    ------
    ValueError
        If ``arg_date`` is not in ``%Y-%m-%d`` format.
    """
    min_date = datetime.strptime(arg_date, '%Y-%m-%d')

    # Parse the S3 URL to get bucket name and prefix
    s3_url_parts = s3_url.split("/")
    bucket_name = s3_url_parts[2]
    prefix = "/".join(s3_url_parts[3:])

    # Initialize S3 client
    s3 = boto3.client(
        's3',
        aws_access_key_id=s3_params["key"],
        aws_secret_access_key=s3_params["secret"],
        endpoint_url=s3_params["endpoint_url"],
        region_name=s3_params["client_kwargs"]["region_name"]
    )

    try:
        # A single list_objects_v2 response is capped at 1000 keys, so walk
        # every page to avoid silently dropping objects.
        paginator = s3.get_paginator('list_objects_v2')
        all_keys = [
            obj['Key']
            for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
            for obj in page.get('Contents', [])
        ]
    except NoCredentialsError:
        print("Credentials not available or not valid.")
        return []

    try:
        return [
            object_key
            for object_key in all_keys
            if datetime.strptime(object_key.split('/')[3], '%Y-%m-%d') >= min_date
        ]
    except (ValueError, IndexError) as date_parse_error:
        # IndexError covers keys with fewer than four path segments; both
        # cases mean the keys are not date-partitioned as expected.
        print(f"Date parsing error: {date_parse_error}")
        print("Falling back to all object keys.")
        # Fall back to all object keys if date parsing fails
        return all_keys

## Application Layer

In [4]:
def extract(src_bucket, object_keys, s3_params=None):
    """Read every listed Parquet object and stack the rows into one DataFrame.

    Parameters
    ----------
    src_bucket : str
        Bucket that holds the source objects.
    object_keys : iterable of str
        Keys of the Parquet objects to read.
    s3_params : dict, optional
        Connection settings forwarded to ``read_parq_to_df``. When omitted,
        the module-level ``s3_params`` variable is used, which keeps existing
        two-argument callers working.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all objects with a fresh integer index.

    Raises
    ------
    ValueError
        If ``object_keys`` is empty.
    NameError
        If ``s3_params`` is neither passed nor defined at module level.
    """
    object_keys = list(object_keys)
    if not object_keys:
        # pd.concat([]) would raise an unspecific ValueError; say what is wrong.
        raise ValueError("No objects to concatenate: object_keys is empty")

    if s3_params is None:
        try:
            s3_params = globals()['s3_params']
        except KeyError:
            raise NameError(
                "name 's3_params' is not defined; pass s3_params=... to extract()"
            ) from None

    frames = [
        read_parq_to_df(src_bucket, key=obj, s3_params=s3_params)
        for obj in object_keys
    ]
    return pd.concat(frames, ignore_index=True)

def transform(df, arg_date):
    """Aggregate candle rows into one row per calendar date.

    Rows with ``date`` earlier than ``arg_date`` are discarded. The remaining
    rows are grouped by the calendar date of ``date`` and summarised as mean
    ``open``, max ``high``, min ``low`` and mean ``close``. Two derived
    columns are added: ``prev_close`` (the previous row's ``close``) and
    ``change_prev_closing_%`` (percentage change of ``close`` against
    ``prev_close``). The first row has no predecessor, so both derived
    columns are NaN there.

    Parameters
    ----------
    df : pandas.DataFrame
        Input with a datetime-like ``date`` column and ``open``, ``high``,
        ``low`` and ``close`` columns. The frame is not modified.
    arg_date : str or datetime-like
        Inclusive lower bound compared against ``df['date']``.

    Returns
    -------
    pandas.DataFrame
        Indexed by date with the columns ``open``, ``high``, ``low``,
        ``close``, ``prev_close`` and ``change_prev_closing_%``.
    """
    recent = df.loc[df['date'] >= arg_date]

    # assign() builds a new frame, so the filtered slice is never written to
    # (no SettingWithCopyWarning) and the caller's frame stays untouched.
    daily = recent.assign(date=recent['date'].dt.date)

    # 'volume' is not part of the report, so it is not aggregated at all.
    aggregations = {
        'open': 'mean',
        'high': 'max',
        'low': 'min',
        'close': 'mean',
    }
    result_df = daily.groupby('date').agg(aggregations)

    result_df['prev_close'] = result_df['close'].shift(1)
    result_df['change_prev_closing_%'] = (
        (result_df['close'] - result_df['prev_close']) / result_df['prev_close']
    ) * 100
    return result_df

def load(trg_bucket, df, trg_key, trg_format):
    """Upload the report under a timestamped key and report success.

    The object key is ``trg_key`` followed by the current local time formatted
    as ``%Y%m%d_%H%M%S`` and then ``trg_format``.

    Returns
    -------
    bool
        Always ``True`` once the upload call has returned.
    """
    timestamp = datetime.today().strftime("%Y%m%d_%H%M%S")
    target_key = trg_key + timestamp + trg_format
    write_df_to_s3(df, trg_bucket, target_key)
    return True

def etl_report(src_bucket, trg_bucket, object_keys, arg_date, trg_key, trg_format):
    """Run extract, transform and load in sequence for the given source keys.

    Returns
    -------
    bool
        Always ``True`` once all three stages have completed.
    """
    raw_frame = extract(src_bucket, object_keys)
    report_frame = transform(raw_frame, arg_date)
    load(trg_bucket, report_frame, trg_key, trg_format)
    return True

In [17]:
# main function entrypoint

def main():
    """Configure and run the NIFTY50 ETL report end to end.

    Lists the source objects under the configured URL, then runs
    ``etl_report`` to read them, aggregate them per day and upload the result
    to the target bucket.
    """
    import os  # local import: only needed for the credential overrides below

    # extract() reads ``s3_params`` from module scope, so the settings are
    # published globally; otherwise a fresh kernel fails with NameError.
    global s3_params

    # Parameters/Configurations
    # Later read config
    url = "s3://desiquant/data/candles/NIFTY50/EQ.parquet.gz"

    # NOTE(security): these credentials are committed in the notebook. They
    # remain as defaults so the notebook still runs, but they can now be
    # overridden through the environment. Prefer moving them out of the file.
    s3_params = {
        "endpoint_url": "https://cbabd13f6c54798a9ec05df5b8070a6e.r2.cloudflarestorage.com",
        "key": os.environ.get("DESIQUANT_S3_KEY", "5c8ea9c516abfc78987bc98c70d2868a"),
        "secret": os.environ.get(
            "DESIQUANT_S3_SECRET",
            "0cf64f9f0b64f6008cf5efe1529c6772daa7d7d0822f5db42a7c6a1e41b3cadf",
        ),
        "client_kwargs": {"region_name": "auto"},
    }
    src_bucket = 'desiquant'
    trg_bucket = 'etl-nifty50'
    arg_date = '2023-10-10'
    trg_key = 'etl_nifty_report_'
    trg_format = '.parquet'

    # No AWS client is created here: write_df_to_s3() assumes the ETL role
    # itself when the report is uploaded.

    # run application
    object_keys = return_objects(url, s3_params, arg_date)
    etl_report(src_bucket, trg_bucket, object_keys, arg_date, trg_key, trg_format)

In [18]:
# Run the ETL pipeline with the configuration defined inside main().
main()

Date parsing error: time data 'EQ.parquet.gz' does not match format '%Y-%m-%d'
Falling back to all object keys.
Data written to S3 bucket: etl-nifty50/etl_nifty_report_20240123_124136.parquet


## Reading the file

In [21]:
# The bucket name is set here because main() keeps its own copy local, so
# this cell would otherwise fail with NameError on a fresh kernel.
trg_bucket = 'etl-nifty50'

# read_from_s3() assumes the ETL role, downloads the object into memory and
# converts the Parquet table to a pandas DataFrame.
df = read_from_s3(trg_bucket, 'etl_nifty_report_20240123_124136.parquet')

# Now, 'df' contains the data from the Parquet file
df.head()

Unnamed: 0_level_0,open,high,low,close,prev_close,change_prev_closing_%
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-10-10,19652.5412,19717.8,19565.45,19652.9416,,
2023-10-11,19809.464933,19839.2,19756.95,19809.6836,19652.9416,0.79755
2023-10-12,19806.4024,19843.3,19772.65,19806.229467,19809.6836,-0.017437
2023-10-13,19722.9664,19805.4,19635.3,19721.895733,19806.229467,-0.425794
2023-10-16,19751.463467,19781.3,19691.85,19751.343333,19721.895733,0.149314
