In [0]:
import warnings
import pandas as pd
import numpy as np
import boto3
import os
from datetime import datetime
from dataheroes import CoresetTreeServiceDTC

# Silence deprecation and future-change warnings for the rest of the notebook.
for ignored_category in (DeprecationWarning, FutureWarning):
    warnings.simplefilter('ignore', ignored_category)

In [0]:
def get_s3_client():
    """Create a boto3 S3 client from explicit access-key credentials.

    The two string literals below are placeholders: replace them with your
    own credentials before running. Avoid committing real secrets to a
    shared notebook.

    Returns:
        The object returned by ``boto3.client('s3', ...)``.
    """
    # replace with your credentials
    aws_access_key_id = 'your-access-key'
    # This name must match the keyword value used below; the previous
    # misspelling made every call fail with NameError.
    aws_secret_access_key = 'your-secret-key'
    return boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )

def upload_folder_to_s3(folder_path, bucket_name, s3_prefix=""):
    """Recursively upload every file found under ``folder_path`` to S3.

    Each object's key is ``s3_prefix`` joined with the file's path relative
    to ``folder_path``, with backslashes turned into forward slashes.
    A failure on one file is printed and does not stop the remaining uploads.

    Args:
        folder_path: Local directory that is walked recursively.
        bucket_name: Name of the destination bucket.
        s3_prefix: Key prefix placed in front of each relative file path.
    """
    client = get_s3_client()
    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            source = os.path.join(current_dir, name)
            key = os.path.join(s3_prefix, os.path.relpath(source, folder_path))
            key = key.replace("\\", "/")  # Ensure correct path format for S3
            try:
                client.upload_file(source, bucket_name, key)
                print(f'Successfully uploaded {source} to s3://{bucket_name}/{key}')
            except Exception as error:
                print(f'Failed to upload {source} to s3://{bucket_name}/{key}: {error}')

def download_folder_from_s3(bucket_name, s3_prefix, local_dir):
    """Download every object stored under an S3 "folder" into ``local_dir``.

    ``s3_prefix`` is treated as a folder: only keys below ``<s3_prefix>/`` are
    fetched, so a sibling prefix that merely shares the same leading characters
    (e.g. ``tree_old/`` when asking for ``tree``) is ignored. Keys ending in
    ``/`` (folder markers) are skipped, and keys containing a ``..`` segment
    are skipped so nothing is written outside ``local_dir``. A failure on one
    object is printed and does not stop the remaining downloads.

    Args:
        bucket_name: Name of the source bucket.
        s3_prefix: Folder-like key prefix to download; an empty string means
            the whole bucket.
        local_dir: Local directory that receives the files; sub-directories
            are created as needed.
    """
    s3_client = get_s3_client()

    # Normalise to a folder-style prefix ('a/b' -> 'a/b/') so the listing does
    # not also match sibling prefixes such as 'a/b_old/'.
    trimmed_prefix = s3_prefix.rstrip('/')
    folder_prefix = f'{trimmed_prefix}/' if trimmed_prefix else ''

    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix):
        # Pages without a 'Contents' entry contribute nothing.
        for obj in page.get('Contents', []):
            s3_key = obj['Key']
            # Keys are '/'-separated strings, not local paths, so strip the
            # prefix by slicing instead of using os.path.relpath.
            relative_key = s3_key[len(folder_prefix):]
            if not relative_key or relative_key.endswith('/'):
                # Folder marker: saving it as a file would later block
                # creation of the directory with the same name.
                continue
            parts = [part for part in relative_key.split('/') if part]
            if '..' in parts:
                print(f'Skipping {s3_key}: key would be written outside {local_dir}')
                continue
            local_path = os.path.join(local_dir, *parts)
            parent_dir = os.path.dirname(local_path)
            if parent_dir:  # os.makedirs('') raises, e.g. when local_dir is ''
                os.makedirs(parent_dir, exist_ok=True)
            try:
                s3_client.download_file(bucket_name, s3_key, local_path)
                print(f'Successfully downloaded {s3_key} to {local_path}')
            except Exception as e:
                print(f'Failed to download {s3_key}: {e}')

In [0]:
# create a reproducible synthetic dataset as a pandas DataFrame
num_rows = 10_000
num_float_cols = 10
random_seed = 42  # fixed seed so a restart-and-run-all produces the same data
rng = np.random.default_rng(random_seed)
float_data = rng.random((num_rows, num_float_cols))  # uniform floats in [0, 1)
target_data = rng.integers(0, 2, num_rows)  # binary labels: 0 or 1
df = pd.DataFrame(float_data, columns=[f'feature_{i+1}' for i in range(num_float_cols)])
df['target'] = target_data

In [None]:
# Settings passed to the Coreset tree service; data_params names 'target'
# as the target column of the DataFrame.
tree_settings = {
    'data_params': {'target': {'name': 'target'}},
    'coreset_size': 1_000,
    'chunk_size': 1_000,
    'optimized_for': 'training',
    'working_directory': '/local_disk0/',
}
service_obj = CoresetTreeServiceDTC(**tree_settings)
# build the Coreset Tree from the DataFrame
service_obj.build_from_df(df)

In [None]:
# Save the tree into a timestamped local folder, then upload that folder to S3.
bucket_name = "bucket_name"
save_stamp = datetime.now().strftime("%d%m%Y_%H%M%S")
service_obj_save_path = f'/local_disk0/service_tree_{save_stamp}'
# The S3 prefix is the local path with the '/local_disk0/' root removed.
s3_path = service_obj_save_path.replace('/local_disk0/', '')

service_obj.save(service_obj_save_path)
upload_folder_to_s3(service_obj_save_path, bucket_name, s3_path)


In [None]:
# Download the uploaded tree from S3 into a new, timestamped local folder.
download_stamp = datetime.now().strftime("%d%m%Y_%H%M%S")
service_obj_download_save_path = f'/local_disk0/download_service_tree_{download_stamp}'
download_folder_from_s3(
    bucket_name=bucket_name,
    s3_prefix=s3_path,
    local_dir=service_obj_download_save_path,
)
# Load the downloaded tree into a new service object and print it as a check.
service_obj_new = CoresetTreeServiceDTC.load(service_obj_download_save_path)
service_obj_new.print()
